In [1]:
import findspark
findspark.init()

import os
print(os.environ['JAVA_HOME'])
print(os.environ['SPARK_HOME'])

/usr/lib/jvm/java-11-openjdk-amd64
/usr/local/spark


In [2]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON

from delta import *

# 공통 데이터 위치
home_jovyan = "/home/jovyan"
work_data = f"{home_jovyan}/work/data"
work_dir=!pwd
work_dir = work_dir[0]
warehouse_dir = f"{work_dir}/spark-warehouse"

# Create spark session with hive enabled
builder = (
    SparkSession
    .builder
    .appName("pyspark-notebook")
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.sql.warehouse.dir", warehouse_dir)
    .enableHiveSupport()
)

In [3]:
# 델타 레이크 생성시에 반드시 `configure_spark_with_delta_pip` 구성을 통해 실행되어야 정상적인 델타 의존성이 로딩됩니다
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [4]:
# 노트북에서 테이블 형태로 데이터 프레임 출력을 위한 설정을 합니다
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # display enabled
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100) # display output columns size

# 로컬 환경 최적화
spark.conf.set("spark.sql.shuffle.partitions", 5) # the number of partitions to use when shuffling data for joins or aggregations.
spark.conf.set("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")
spark.conf.set("spark.sql.decimalOperations.allowPrecisionLoss", "true")
spark

In [5]:
def show(queries, num_rows = 20):
    for query in queries.split(";"):
        spark.sql(query).show(num_rows, truncate=False)

def sql(query):
    return spark.sql(query)

def ls(command):
    !ls -al {command}

def cat(filename):
    !cat {filename}

def grep(keyword, filename):
    !grep -i {keyword} {filename}

def grep_and_json(keyword, filename):
    !grep {keyword} {filename} | python -m json.tool

def grep_sed_json(keyword, lineno, filename):
    !grep {keyword} {filename} | sed -n {lineno}p | python -m json.tool

In [6]:
import time
import json
from datetime import date, timedelta, datetime

def transform(spark, params):
    base_date = params["base_date"]
    p_date = datetime.strptime(base_date, '%Y%m%d')
    bds_db = params["bds_db"]
    interim_db = params["interim_db"]

In [74]:
spark.read.option("header", "true").option("inferSchema", "true").csv("data/sample").createOrReplaceTempView("bs_gamelog")
# spark.sql("select * from bs_gamelog").show()

In [75]:
df = spark.sql(f"""
with base as ( 
    select 
        concat(target_object_id, "_", target_name) as monster_id
        , sort_array(collect_list(concat(actor_id, "_", actor_group))) as player_group
    from ( 
        select actor_group, target_object_id, target_name, actor_id
        from bs_gamelog
        group by actor_group, target_object_id, target_name, actor_id 
    ) a
    group by concat(target_object_id, "_", target_name)
    having size(sort_array(collect_list(concat(actor_id, "_", actor_group)))) between 0 and 100
)

select player_group, count(*) as hunt_count
from base
group by player_group
""")
# where plogdate = '{p_date}' and logid = 1208 and actor_group in (4501, 4502, ... 3504) and target_code = 30

In [76]:
from collections import defaultdict
import itertools

def generate_combinations(group, min_size=4):
    for size in range(min_size, len(group)+1):
        for comb in itertools.combinations(group, size):
            yield comb

def count_combinations(player_group, hunt_count):
    frequency = defaultdict(int)
    for comb in generate_combinations(player_group):
        comb_str = ",".join(map(str, sorted(comb)))
        frequency[comb_str] += hunt_count
        return dict(frequency)

In [77]:
from pyspark.sql.types import MapType, StringType, IntegerType
from pyspark.sql.functions import date_format, concat_ws, collect_list, size, explode, split, count

@udf(MapType(StringType(), IntegerType()))
def calculate_combinations(player_group, hunt_count):
    return count_combinations(player_group, hunt_count)

In [78]:
df_with_combinations = df.withColumn("combinations", calculate_combinations(col("player_group"), col("hunt_count")))
df_exploded = df_with_combinations.select(explode(col("combinations")).alias("combination", "count"))
df_filtered = df_exploded.groupBy("combination").agg({"count":"sum"})

In [79]:
df.withColumn("combinations", calculate_combinations(col("player_group"), col("hunt_count"))).show()

+--------------------+----------+------------+
|        player_group|hunt_count|combinations|
+--------------------+----------+------------+
|[actor_1_group_a,...|         1|        NULL|
|[actor_2_group_a,...|         1|        NULL|
|[actor_1_group_c,...|         1|        NULL|
|[actor_3_group_b,...|         1|        NULL|
|[actor_4_group_b,...|         1|        NULL|
+--------------------+----------+------------+



In [80]:
df_filtered = df_filtered.withColumnRenamed("sum(count)", "total_hunts")
df_filtered = df_filtered.filter(col("total_hunts") >= 1)
df_players = df_filtered.withColumn("player", explode(split(col("combination"), ",")))
df_players = df_players.withColumn("char_id", split(col("player"), "_")[0]).withColumn("server_cd", split(col("player"), "_")[1])

In [81]:
df_max_hunts = df_players.groupBy("char_id", "server_cd").agg(max("total_hunts").alias("max_hunt_count"))

In [82]:
df_max_hunts.select("char_id", "server_cd").show()

+-------+---------+
|char_id|server_cd|
+-------+---------+
+-------+---------+



In [None]:
having size (sort_array(collect_list(concat(actor_id, "_", actor_group))))
between 4 and 20 -- 실패 (OOM)
between 4 and 12 -- 성공