In [61]:
from pubg_util import mysql, load_schema, notifier
from sphynx import sphynx, NODE_SMALL, NODE_MEDIUM, NODE_LARGE
from pyspark.sql.functions import *
import pandas as pd
import pickle

# Start a Spark session through the project's sphynx helper: 16 executors on the
# NODE_LARGE spec (the startup log below reports 48G RAM per executor).
spark = sphynx.get_spark(executor_count=16, app_name='west0_mclaren', node_spec=NODE_LARGE)

Spark cluster not assigned. creating a new one...
Node spec: 16 executors with 48G RAM each
Job Port 4049 is assigned for requested cluster
Waiting for Spark master to be available...
Spark master launched!
Creating new Spark session, name: west0_mclaren...
Waiting for all executors ready...
All executors connected!
Complete! elapsed time: 00:00:28


In [65]:
# NOTE(review): this cell has execution count 65, i.e. it was run after every
# other cell. Run top-to-bottom from here it stops the Spark session before the
# analysis below has executed -- run it only as the final step.
sphynx.stop()

Stopping Spark session...
Destroying Spark cluster...
Done!


In [12]:
# Read the user-segment tables through the project MySQL helper
# (the second argument looks like the schema/database name -- TODO confirm).
pass_user = mysql.read_table(spark, "labs", "pass_user_total")
m1_user = mysql.read_table(spark, "labs", "m1_user")
m2_user = mysql.read_table(spark, "labs", "m2_user")
mclaren_owner = mysql.read_table(spark, "labs", "mclaren_user_group")

In [3]:
# Give both M-segment frames the same set of columns so they can be combined
# with unionByName in the next cell. m2_user is assumed to already carry
# account_id, group, paid_use and total_use -- TODO confirm.
m2_user = (
    m2_user
    .withColumn("pass_gcoin", lit(None))
    .withColumn("product_gcoin", col("total_use"))
    .withColumn("level_up", lit(None))
    .withColumn("level_up_gcoin", lit(None))
)

# m1_user keeps only the distinct (account_id, group) pairs; every metric
# column is added as a null placeholder.
m1_user = (
    m1_user
    .select("account_id", "group")
    .distinct()
    .withColumn("pass_gcoin", lit(None))
    .withColumn("product_gcoin", lit(None))
    .withColumn("level_up", lit(None))
    .withColumn("level_up_gcoin", lit(None))
    .withColumn("paid_use", lit(None))
    .withColumn("total_use", lit(None))
)

In [4]:
# Stack m1_user and m2_user; unionByName matches columns by name, not position.
m_user = m1_user.unionByName(m2_user)

In [5]:
# Row count next to distinct account_id count: equal values mean no account
# appears more than once in m_user.
(
    m_user
    .select(count("*"), countDistinct("account_id"))
    .show(truncate=False)
)

+--------+--------------------------+
|count(1)|count(DISTINCT account_id)|
+--------+--------------------------+
|3369930 |3369930                   |
+--------+--------------------------+



In [6]:
# Append the combined M segments to the pass users; columns are matched by name.
user_group = pass_user.unionByName(m_user)

In [7]:
# Row count next to distinct account_id count for the combined frame: equal
# values mean each account belongs to exactly one row.
(
    user_group
    .select(count("*"), countDistinct("account_id"))
    .show(truncate=False)
)

+--------+--------------------------+
|count(1)|count(DISTINCT account_id)|
+--------+--------------------------+
|3851527 |3851527                   |
+--------+--------------------------+



In [8]:
# Segment sizes: distinct accounts and raw rows per group value.
(
    user_group
    .groupBy("group")
    .agg(countDistinct("account_id"), count("*"))
    .show(truncate=False)
)

+-----+--------------------------+--------+
|group|count(DISTINCT account_id)|count(1)|
+-----+--------------------------+--------+
|P-1  |147774                    |147774  |
|P-3  |277093                    |277093  |
|P-2  |56730                     |56730   |
|M-1  |3167117                   |3167117 |
|M-2  |202813                    |202813  |
+-----+--------------------------+--------+



In [32]:
# Cross-tabulate segment (group) against mclaren_group. The full outer join
# keeps accounts that exist on only one side, so unmatched rows show up as null.
(
    user_group
    .join(
        mclaren_owner.withColumnRenamed("AccountId", "account_id"),
        on="account_id",
        how="full_outer",
    )
    .groupBy("group", "mclaren_group")
    .agg(countDistinct("account_id").alias("user_cnt"))
    .orderBy("group", "mclaren_group")
    .show(40, truncate=False)
)

+-----+-------------+--------+
|group|mclaren_group|user_cnt|
+-----+-------------+--------+
|M-1  |null         |3153642 |
|M-1  |EY           |1       |
|M-1  |SB           |13467   |
|M-1  |SW           |6       |
|M-1  |TM           |1       |
|M-2  |null         |190435  |
|M-2  |EY           |392     |
|M-2  |SB           |10259   |
|M-2  |SW           |1108    |
|M-2  |TM           |619     |
|P-1  |null         |71176   |
|P-1  |EY           |71      |
|P-1  |SB           |4761    |
|P-1  |SW           |71053   |
|P-1  |TM           |713     |
|P-2  |null         |14933   |
|P-2  |EY           |85      |
|P-2  |SB           |1309    |
|P-2  |SW           |40094   |
|P-2  |TM           |309     |
|P-3  |null         |40879   |
|P-3  |EY           |35163   |
|P-3  |SB           |21271   |
|P-3  |SW           |147475  |
|P-3  |TM           |32305   |
+-----+-------------+--------+



In [13]:
# Attach the mclaren_owner columns to every user. The owner table's AccountId
# is renamed so both sides share the account_id join key.
user_group_with_mclaren_owner = user_group.join(
    mclaren_owner.withColumnRenamed("AccountId", "account_id"),
    on="account_id",
    how="full_outer",
)

In [None]:
# If group is null, assign group = "M-1" (not implemented below; the join output above shows no null group).
# If mclaren_group is null, assign mclaren_group = "N".

In [14]:
# Accounts with no mclaren_group value are labelled "N"; existing values are kept.
user_group_with_mclaren_owner = user_group_with_mclaren_owner.withColumn(
    "mclaren_group",
    coalesce(col("mclaren_group"), lit("N")),
)

In [15]:
# Distinct accounts per (group, mclaren_group) pair after the null fill.
(
    user_group_with_mclaren_owner
    .groupBy("group", "mclaren_group")
    .agg(countDistinct("account_id").alias("user_cnt"))
    .orderBy("group", "mclaren_group")
    .show(40, truncate=False)
)

+-----+-------------+--------+
|group|mclaren_group|user_cnt|
+-----+-------------+--------+
|M-1  |EY           |1       |
|M-1  |N            |3153642 |
|M-1  |SB           |13467   |
|M-1  |SW           |6       |
|M-1  |TM           |1       |
|M-2  |EY           |392     |
|M-2  |N            |190435  |
|M-2  |SB           |10259   |
|M-2  |SW           |1108    |
|M-2  |TM           |619     |
|P-1  |EY           |71      |
|P-1  |N            |71176   |
|P-1  |SB           |4761    |
|P-1  |SW           |71053   |
|P-1  |TM           |713     |
|P-2  |EY           |85      |
|P-2  |N            |14933   |
|P-2  |SB           |1309    |
|P-2  |SW           |40094   |
|P-2  |TM           |309     |
|P-3  |EY           |35163   |
|P-3  |N            |40879   |
|P-3  |SB           |21271   |
|P-3  |SW           |147475  |
|P-3  |TM           |32305   |
+-----+-------------+--------+



In [16]:
from datetime import datetime, timedelta

def load_data_mart(table, startdate, enddate, device=None):
    """Load and union the daily data-mart parquet partitions of a date range.

    For every day from ``startdate`` to ``enddate`` (inclusive) this reads
    ``s3a://pubg-log-labs/data_mart/{table}/{date}`` -- or
    ``.../{date}/{DEVICE}`` when ``device`` is given -- through the
    notebook-level ``spark`` session and unions whatever could be read.

    Loading is best-effort: a day that cannot be read is skipped, but the
    skipped path and the error are logged as a warning instead of being
    discarded silently.

    Args:
        table: Directory name under ``data_mart``.
        startdate: First day to load, formatted ``'YYYY-MM-DD'``.
        enddate: Last day to load (inclusive), formatted ``'YYYY-MM-DD'``.
        device: Optional sub-directory under each date; upper-cased before use.

    Returns:
        The union of all partitions that were read, or ``None`` when nothing
        could be read (including when ``enddate`` is before ``startdate``).

    Raises:
        ValueError: If either date does not match ``'YYYY-MM-DD'``.
    """
    import logging

    logger = logging.getLogger(__name__)
    date_format = '%Y-%m-%d'

    # Parse both bounds once instead of re-parsing the start date every iteration.
    first_day = datetime.strptime(startdate, date_format)
    day_count = (datetime.strptime(enddate, date_format) - first_day).days + 1

    # The only difference between the device / no-device cases is a path suffix.
    suffix = "" if device is None else "/" + device.upper()

    df = None
    for offset in range(day_count):
        day = (first_day + timedelta(days=offset)).strftime(date_format)
        path = "s3a://pubg-log-labs/data_mart/{table}/{day}{suffix}".format(
            table=table, day=day, suffix=suffix
        )
        try:
            partition = spark.read.parquet(path)
            df = partition if df is None else df.union(partition)
        except Exception as exc:
            # Deliberately best-effort (some days may have no partition), but
            # leave a trace so unexpected failures are not mistaken for missing data.
            logger.warning("load_data_mart: skipping %s (%s)", path, exc)
    return df


In [17]:
# McLaren pass period: daily official_gameplay_master partitions from
# 2022-09-07 through 2022-11-01 (inclusive).
gameplay = load_data_mart('official_gameplay_master', "2022-09-07", "2022-11-01")

In [18]:
# Per-account play activity over the loaded window: distinct play days, total
# matches and total minutes across the six TPP/FPP x solo/duo/squad columns.
# `sum` is pyspark.sql.functions.sum here (star import), not the builtin.
# NOTE(review): `+` yields null when any operand is null, and sum() ignores
# nulls, so a row with a null in any mode column would contribute nothing --
# confirm these columns are never null.
gameplay_df = gameplay.groupBy("AccountId").agg(
    countDistinct("date").alias("play_date_cnt"),
    sum(
        col("TppSoloPlayCount")
        + col("TppDuoPlayCount")
        + col("TppSquadPlayCount")
        + col("FppSoloPlayCount")
        + col("FppDuoPlayCount")
        + col("FppSquadPlayCount")
    ).alias("play_cnt"),
    sum(
        col("TppSoloGameMinute")
        + col("TppDuoGameMinute")
        + col("TppSquadGameMinute")
        + col("FppSoloGameMinute")
        + col("FppDuoGameMinute")
        + col("FppSquadGameMinute")
    ).alias("play_min"),
)

In [19]:
# Left join keeps every user; accounts without gameplay rows get null play_* values.
user_group_with_mclaren_owner = user_group_with_mclaren_owner.join(
    gameplay_df.withColumnRenamed("AccountId", "account_id"),
    on="account_id",
    how="left",
)

In [20]:
# The join must not multiply rows: row count and distinct account_id count
# should still match.
(
    user_group_with_mclaren_owner
    .select(count("*"), countDistinct("account_id"))
    .show(truncate=False)
)

+--------+--------------------------+
|count(1)|count(DISTINCT account_id)|
+--------+--------------------------+
|3851527 |3851527                   |
+--------+--------------------------+



In [22]:
# Same date-partitioned loader, here with table="pc" and device="gcoin_use"
# (upper-cased to GCOIN_USE inside load_data_mart).
# NOTE(review): the end date is 2022-11-02, one day later than the gameplay
# window above (2022-11-01) -- confirm this is intentional.
gcoin = load_data_mart("pc", "2022-09-07", "2022-11-02", "gcoin_use")

In [23]:
# Keep only the gcoin rows whose event_name is exactly "202207_workshop".
workshop_gcoin = gcoin.where(col("event_name") == "202207_workshop")

In [24]:
# Per-account workshop spend: total (paid + free) and the paid part alone.
# `sum` is pyspark.sql.functions.sum here (star import), not the builtin.
workshop_gcoin_by_user = workshop_gcoin.groupBy("account_id").agg(
    sum(col("paid_use") + col("free_use")).alias("workshop_total_use"),
    sum("paid_use").alias("workshop_paid_use"),
)

In [25]:
# Left join keeps every user; accounts with no workshop rows get nulls in the
# workshop_* columns.
user_group_with_mclaren_owner = user_group_with_mclaren_owner.join(workshop_gcoin_by_user, "account_id", "left")

In [26]:
# Per-account spend on everything except the "202209_season_workshop" event.
# NOTE(review): `!=` evaluates to null for rows whose event_name is null, so
# such rows are dropped by this filter as well -- confirm that is intended.
# NOTE(review): the workshop filter earlier uses "202207_workshop" while this
# one excludes "202209_season_workshop" -- confirm both event names are correct.
non_mclaren_gcoin = gcoin.where(col("event_name") != "202209_season_workshop")
non_mclaren_gcoin_by_user = non_mclaren_gcoin.groupBy("account_id").agg(
    sum("paid_use").alias("non_mclaren_paid_use"),
    sum(col("paid_use") + col("free_use")).alias("non_mclaren_total_use"),
)

In [27]:
# Left join keeps every user; accounts with no matching gcoin rows get nulls in
# the non_mclaren_* columns.
user_group_with_mclaren_owner = user_group_with_mclaren_owner.join(non_mclaren_gcoin_by_user, "account_id", "left")

In [62]:
## Pass Reward
# Load the RewardLevel data through the project's load_schema.lobby helper for
# 2022-09-07 .. 2022-11-01 (the meaning of the other arguments is not visible
# here -- presumably platform and environment; TODO confirm).
reward_level = load_schema.lobby(spark, "pc", "live", "RewardLevel", "2022-09-07", "2022-11-01")

In [64]:
# Highest Level present in the loaded RewardLevel data.
# `max` resolves to pyspark.sql.functions.max here (star import), not the builtin.
reward_level.select(max("Level")).show()

+----------+
|max(Level)|
+----------+
|       171|
+----------+



In [29]:
# Highest level per AccountId. The column is referenced as "level" here and as
# "Level" in the check above; this relies on Spark's default case-insensitive
# column resolution.
reward_level_max = reward_level.groupBy("AccountId").agg(max("level").alias("max_level"))

In [30]:
# Left join keeps every user; accounts without reward rows get a null max_level.
user_group_with_mclaren_owner = user_group_with_mclaren_owner.join(
    reward_level_max.withColumnRenamed("AccountId", "account_id"),
    on="account_id",
    how="left",
)

In [32]:
# NOTE(review): this reads the same table ("labs"."mclaren_user_group") that was
# already loaded as `mclaren_owner` near the top of the notebook.
mclaren_user_group = mysql.read_table(spark, "labs", "mclaren_user_group")

In [49]:
# Remove mclaren_own_date_num, presumably so that re-joining it in the next
# cell does not create a duplicate column -- TODO confirm.
# NOTE(review): execution count 49 is out of sequence with the surrounding
# cells, so this was added after the fact.
user_group_with_mclaren_owner = user_group_with_mclaren_owner.drop("mclaren_own_date_num")

In [51]:
# Bring in mclaren_own_date_num per account; the left join keeps every user.
user_group_with_mclaren_owner = user_group_with_mclaren_owner.join(
    mclaren_user_group
    .withColumnRenamed("AccountId", "account_id")
    .select("account_id", "mclaren_own_date_num"),
    on="account_id",
    how="left",
)

In [52]:
# Inspect the assembled columns and their types before writing the table.
user_group_with_mclaren_owner.printSchema()

root
 |-- account_id: string (nullable = true)
 |-- pass_gcoin: long (nullable = true)
 |-- paid_use: long (nullable = true)
 |-- total_use: long (nullable = true)
 |-- level_up: long (nullable = true)
 |-- level_up_gcoin: long (nullable = true)
 |-- product_gcoin: long (nullable = true)
 |-- group: string (nullable = true)
 |-- mclaren_group: string (nullable = true)
 |-- play_date_cnt: long (nullable = true)
 |-- play_cnt: long (nullable = true)
 |-- play_min: double (nullable = true)
 |-- workshop_total_use: long (nullable = true)
 |-- workshop_paid_use: long (nullable = true)
 |-- non_mclaren_paid_use: long (nullable = true)
 |-- non_mclaren_total_use: long (nullable = true)
 |-- max_level: long (nullable = true)
 |-- mclaren_own_date_num: integer (nullable = true)



In [53]:
# Write the per-user frame to labs.user_group_with_mclaren_owner through the
# project helper (whether it appends or overwrites is not visible here -- check
# before re-running).
mysql.insert_table(user_group_with_mclaren_owner, "labs", "user_group_with_mclaren_owner")

In [55]:
# Load labs.gcoin_by_user; the variable name suggests this holds the
# pre-period figures -- TODO confirm.
pre_gcoin_user = mysql.read_table(spark, "labs", "gcoin_by_user")

In [56]:
# Inspect the columns before they are renamed with a pre_ prefix below.
pre_gcoin_user.printSchema()

root
 |-- account_id: string (nullable = true)
 |-- paid_use: long (nullable = true)
 |-- total_use: long (nullable = true)
 |-- gcoin_use_group: string (nullable = true)
 |-- workshop_gcoin: long (nullable = true)
 |-- if_workshop: string (nullable = true)
 |-- main_use_event_type: string (nullable = true)
 |-- type_total_use: long (nullable = true)
 |-- play_date_cnt: long (nullable = true)
 |-- play_cnt: long (nullable = true)
 |-- play_min: double (nullable = true)



In [57]:
# Rename every non-key column to a pre_-prefixed name so they do not collide
# with the same-named columns (paid_use, total_use, play_* ...) of
# user_group_with_mclaren_owner in the join that follows.
pre_gcoin_user = (
    pre_gcoin_user
    .withColumnRenamed("paid_use", "pre_paid_use")
    .withColumnRenamed("total_use", "pre_total_use")
    .withColumnRenamed("gcoin_use_group", "pre_gcoin_use_group")
    .withColumnRenamed("workshop_gcoin", "pre_workshop_gcoin")
    .withColumnRenamed("if_workshop", "pre_if_workshop")
    .withColumnRenamed("main_use_event_type", "pre_main_use_event_type")
    .withColumnRenamed("type_total_use", "pre_main_use_event_type_total_use")
    .withColumnRenamed("play_date_cnt", "pre_play_date_cnt")
    .withColumnRenamed("play_cnt", "pre_play_cnt")
    .withColumnRenamed("play_min", "pre_play_min")
)

In [58]:
# Attach the pre_* columns to every user; the left join leaves them null for
# accounts that are absent from pre_gcoin_user.
df = user_group_with_mclaren_owner.join(pre_gcoin_user, "account_id", "left")

In [59]:
# Write the combined frame to labs.mclaren_seg_pre_and_post_info through the
# project helper (whether it appends or overwrites is not visible here -- check
# before re-running).
mysql.insert_table(df, "labs", "mclaren_seg_pre_and_post_info")