In [74]:
from pubg_util import mysql, load_schema, notifier
from sphynx import sphynx, NODE_SMALL, NODE_MEDIUM, NODE_LARGE
from pyspark.sql.functions import *
import pandas as pd

spark = sphynx.get_spark(executor_count=8, app_name='pdu_syp', node_spec=NODE_LARGE)

Spark session already exists. Return it
If there are no Spark clusters, please do sphynx.stop() first


In [2]:
def load_inventory(device, target_date, explode=None):
    """ select explode from ('currencies', 'equips', 'items', None) """
    inventory_bucket = "s3a://pubg-log-labs/data_mart/economy/inventory_master/"
     
    # inventorySnapshot seemed to have duplicate entries;
    df = spark.read.parquet("{}{}/{}".format(inventory_bucket, device, target_date))#.dropDuplicates()
         
    if explode is None:
        return df
    else:
        df.createOrReplaceTempView("temp")
        result = spark.sql("""
        SELECT date
              ,accountid
              --,netid
              ,inline({})
              ,date_updated
          FROM temp
        """.format(explode))
        return result

In [6]:
tmp=load_inventory(device="pc", target_date="2021-07-30")

In [10]:
device = 'pc' # pc / console
env = 'live' # live /
log_name = 'pass_user_meta'
start_date = '2021-08-01' # YYYY-MM-DD
end_date = '2021-08-01' # YYYY-MM-DD

In [15]:
df = spark.read.parquet("s3a://pubg-log-labs/data_mart/pass_master/pc/"+start_date)

In [18]:
df.count()

74296120

In [20]:
df.limit(10).toPandas()

Unnamed: 0,date,platform,AccountId,period_type,period_status,pass_start_date,daily_levelup_xp,daily_play_xp,daily_outgame_xp,total_levelup_xp,total_play_xp,total_outgame_xp,final_level
0,2021-08-01,STEAM,account.d42a38b5332940178edb534769347653,taego,active,,0,2180,16000,0,14970,52800,7
1,2021-08-01,KAKAO,account.be91556859e54898a5f79713f25827ab,taego,non_active,,0,0,0,0,0,0,1
2,2021-08-01,STEAM,account.738e6aa86bcb4f389fc799e48a4a4209,taego,non_active,,0,0,0,0,0,0,1
3,2021-08-01,STEAM,account.47b6aeb227f94f939e4f9257eef35357,taego,non_active,,0,0,0,0,0,0,1
4,2021-08-01,STEAM,account.8f3d79f16e1944608a572e96ca51610f,taego,non_active,,0,0,0,0,0,0,1
5,2021-08-01,STEAM,account.7b876a830bcc4bba988b23f867155ace,taego,non_active,,0,0,0,0,0,0,1
6,2021-08-01,STEAM,account.f27c42a4ff884a82840e3ca90da6633a,taego,active,,0,0,0,0,5900,23400,3
7,2021-08-01,STEAM,account.8e86b682b17a4985a313ddc73462950c,taego,non_active,,0,0,0,0,0,0,1
8,2021-08-01,STEAM,account.698d386fe4a74d6daa6bc53046914e67,taego,non_active,,0,0,0,0,0,0,1
9,2021-08-01,STEAM,account.a9327686e47c4f7d9756485513fdfee5,taego,non_active,,0,0,0,0,0,0,1


In [23]:
df.where(col("period_type") == "taego").count()

74296120

In [25]:
df.where(col("final_level") == 30).show(10, truncate=False)

+----------+--------+----------------------------------------+-----------+-------------+---------------+----------------+-------------+----------------+----------------+-------------+----------------+-----------+
|date      |platform|AccountId                               |period_type|period_status|pass_start_date|daily_levelup_xp|daily_play_xp|daily_outgame_xp|total_levelup_xp|total_play_xp|total_outgame_xp|final_level|
+----------+--------+----------------------------------------+-----------+-------------+---------------+----------------+-------------+----------------+----------------+-------------+----------------+-----------+
|2021-08-01|STEAM   |account.9ac9386d414c46cea58ab32e0eabb5d8|taego      |active       |null           |0               |5120         |11800           |0               |73820        |219400          |30         |
|2021-08-01|STEAM   |account.c3d14ce892df451f89c237b64ef759e5|taego      |active       |null           |0               |5180         |13800        

In [26]:
df.where((col("period_status") == "pass") & (col("final_level") >= 30) & (col("final_level") <= 35)).groupBy("final_level").agg(count("AccountId")).orderBy("final_level").show()

+-----------+----------------+
|final_level|count(AccountId)|
+-----------+----------------+
|         30|            7153|
|         31|            9021|
|         32|            8442|
|         33|            8346|
|         34|            7989|
|         35|            7691|
+-----------+----------------+



### Pubg Region

In [60]:
from datetime import datetime, timedelta

def pass_user(device, startdate, enddate):
    df = None
    startdate = datetime.strptime(startdate, '%Y-%m-%d')
    enddate = datetime.strptime(enddate, '%Y-%m-%d')
    while startdate <= enddate :
        startdate = startdate.strftime('%Y-%m-%d')
        if df is None:
            df = spark.read.parquet("s3a://pubg-log-labs/data_mart/pass_master/"+device+"/"+startdate)
        else :
            df = df.union(spark.read.parquet("s3a://pubg-log-labs/data_mart/pass_master/"+device+"/"+startdate))
        startdate = datetime.strptime(startdate, '%Y-%m-%d') + timedelta(days = 1)
    return df


In [62]:
# user 지역
from pyspark.sql.types import * 

def classify_country(country_os, country_ip):
    if country_os != 'CN':
        return country_ip
    else:
        return country_os

country_type_udf = udf(classify_country, StringType())

meta_region = mysql.read_table(spark, 'metainfo', 'meta_bi_regions')

# df_user: user_master
# df_user = df_user.withColumn("country_new", country_type_udf("country_os", "country_ip"))
# df_user = df_user.join(meta_region, df_user.country_new == meta_region.country_code_iso2, "left")

In [9]:
pass_pc = pass_user(device="pc", startdate="2021-07-07", enddate="2021-08-18")

In [5]:
pass_console = pass_user(device="console", startdate="2021-07-15", enddate="2021-08-26")

In [10]:
user_pc = load_data_mart(start_date="2021-07-07", end_date="2021-08-18", device="pc", table_name="user_master").where(col("lastlogindate") >= "2021-07-07").select(col("accountid").alias("AccountId"), "country_os", "country_ip", "platform", "device").distinct()

In [28]:
user_console = load_data_mart(start_date="2021-07-07", end_date="2021-08-18", device="console", table_name="user_master").where(col("lastlogindate") >= "2021-07-07").select(col("accountid").alias("AccountId"), "country_os", "country_ip", "platform", "device").distinct()

In [15]:
pass_pc_user = pass_pc.join(user_pc, "AccountId", "left")

In [29]:
pass_console_user = pass_console.join(user_console, "AccountId", "left")

In [17]:
pass_pc_user = pass_pc_user.withColumn("country_new", country_type_udf("country_os", "country_ip"))
pass_pc_user = pass_pc_user.join(meta_region, pass_pc_user.country_new == meta_region.country_code_iso2, "left")

In [31]:
pass_console_user = pass_console_user.withColumn("country_new", country_type_udf("country_os", "country_ip"))
pass_console_user = pass_console_user.join(meta_region, pass_console_user.country_new == meta_region.country_code_iso2, "left")

In [46]:
pc_region_pd = pass_pc_user.where(col("period_status").isin(["active", "pass"])).groupBy("pubg_region").agg(countDistinct("AccountId").alias("account_cnt"), countDistinct(when(col("period_status") == "pass", col("AccountId"))).alias("pass_cnt")).withColumn("PassUserRate", col("pass_cnt")/col("account_cnt")).toPandas()

In [53]:
pc_level_region = pass_pc_user.where(col("period_status") == "pass").groupBy("pubg_region", "final_level").agg(countDistinct("AccountId").alias("user_cnt")).toPandas()
pc_level_region.to_csv("./data/pc_level_region.csv", index=False)

In [57]:
# pc_level_region = pd.read_csv("./data/pc_level_region.csv")

In [48]:
pc_region_pd.to_csv("./data/pc_region_pd.csv", index=False)

In [49]:
pc_region_pd

Unnamed: 0,pubg_region,account_cnt,pass_cnt,PassUserRate
0,CN,5941048,242382,0.040798
1,,245560,7245,0.029504
2,SA,212981,3840,0.01803
3,KR,1710793,100545,0.058771
4,SEA,428253,10419,0.024329
5,Undefined,8342,4,0.00048
6,JP,48483,1291,0.026628
7,TW/HK,101619,2479,0.024395
8,CIS,515406,5367,0.010413
9,EMEA,672288,15058,0.022398


In [64]:
pc_tmp = pd.merge(pc_region_pd, pc_level_region, on="pubg_region")
pc_tmp["level_user_rate"] = pc_tmp["user_cnt"]/pc_tmp["pass_cnt"]

In [67]:
pc_tmp.sort_values(["pubg_region", "final_level"]).to_csv("./data/pc_tmp.csv", index=False)

In [69]:
pc_tmp[(pc_tmp['final_level'] >= 30) & (pc_tmp['final_level'] <= 35)].groupby("pubg_region")['user_c']

Unnamed: 0,pubg_region,account_cnt,pass_cnt,PassUserRate,final_level,user_cnt,level_user_rate
5,CN,5941048,242382,0.040798,34,73827,0.304589
7,CN,5941048,242382,0.040798,33,74491,0.307329
17,CN,5941048,242382,0.040798,30,64433,0.265832
23,CN,5941048,242382,0.040798,31,75764,0.312581
96,CN,5941048,242382,0.040798,32,73326,0.302522
132,CN,5941048,242382,0.040798,35,71557,0.295224
168,SA,212981,3840,0.01803,33,1423,0.370573
208,SA,212981,3840,0.01803,30,1313,0.341927
220,SA,212981,3840,0.01803,32,1360,0.354167
277,SA,212981,3840,0.01803,31,1412,0.367708


In [50]:
console_region_pd = pass_console_user.where(col("period_status").isin(["active", "pass"])).groupBy("pubg_region").agg(countDistinct("AccountId").alias("account_cnt"), countDistinct(when(col("period_status") == "pass", col("AccountId"))).alias("pass_cnt")).withColumn("PassUserRate", col("pass_cnt")/col("account_cnt")).toPandas()

In [54]:
console_level_region = pass_console_user.where(col("period_status") == "pass").groupBy("pubg_region", "final_level").agg(countDistinct("AccountId").alias("user_cnt")).toPandas()
console_level_region.to_csv("./data/console_level_region.csv", index=False)

In [70]:
console_level_region = pd.read_csv("./data/console_level_region.csv")

In [51]:
console_region_pd.to_csv("./data/console_region_pd.csv", index=False)

In [71]:
console_region_pd

Unnamed: 0,pubg_region,account_cnt,pass_cnt,PassUserRate
0,CN,9944,121,0.012168
1,,646493,21179,0.03276
2,,139661,178,0.001275
3,SA,154184,2891,0.01875
4,KR,7985,405,0.05072
5,SEA,13846,218,0.015745
6,Undefined,845,2,0.002367
7,JP,23041,1360,0.059025
8,TW/HK,6171,173,0.028034
9,CIS,26826,308,0.011481


In [72]:
console_tmp = pd.merge(console_region_pd, console_level_region, on="pubg_region")
console_tmp["level_user_rate"] = console_tmp["user_cnt"]/console_tmp["pass_cnt"]

In [73]:
console_tmp.sort_values(["pubg_region", "final_level"]).to_csv("./data/console_tmp.csv", index=False)

In [77]:
pass_pc.select("period_status").distinct().show()

+-------------+
|period_status|
+-------------+
|         null|
|       active|
|   non_active|
|         pass|
+-------------+



In [86]:
pass_pc.limit(10).toPandas()

Unnamed: 0,date,platform,AccountId,period_type,period_status,pass_start_date,daily_levelup_xp,daily_play_xp,daily_outgame_xp,total_levelup_xp,total_play_xp,total_outgame_xp,final_level
0,2021-07-07,STEAM,account.27d7e1ef9da54b5bab3b2d1f7800b793,taego,non_active,,0,0,0,0,0,0,1
1,2021-07-07,STEAM,account.0fa49bf05c884503be411044735f61a2,taego,non_active,,0,0,0,0,0,0,1
2,2021-07-07,STEAM,account.044ba6368c944ad1bc3f40cc733137b5,taego,non_active,,0,0,0,0,0,0,1
3,2021-07-07,KAKAO,account.69af1aa7e28a46b489d339d999630408,taego,non_active,,0,0,0,0,0,0,1
4,2021-07-07,STEAM,account.714cba09d8a54df1933fd092663bcf24,taego,non_active,,0,0,0,0,0,0,1
5,2021-07-07,STEAM,account.8e32a62c0fc347ffbddbe6de41ad2a8a,taego,non_active,,0,0,0,0,0,0,1
6,2021-07-07,STEAM,account.4220d836d21e49a9aa587003372a8c82,taego,non_active,,0,0,0,0,0,0,1
7,2021-07-07,STEAM,account.a88505ced0f64d509ee75008521da23c,taego,non_active,,0,0,0,0,0,0,1
8,2021-07-07,STEAM,account.0ccd676153c14593bff99779506e1638,taego,non_active,,0,0,0,0,0,0,1
9,2021-07-07,KAKAO,account.b192c20ed27545468dcb388dbdfc6c2a,taego,non_active,,0,0,0,0,0,0,1


In [87]:
pass_pc.where(col("period_status") == "pass").limit(10).toPandas()

Unnamed: 0,date,platform,AccountId,period_type,period_status,pass_start_date,daily_levelup_xp,daily_play_xp,daily_outgame_xp,total_levelup_xp,total_play_xp,total_outgame_xp,final_level
0,2021-07-07,KAKAO,account.62be97e449da4c50a3c42c6dae8e8c13,taego,pass,2021-07-07,0,5210,9800,0,5210,9800,2
1,2021-07-07,KAKAO,account.c76d9f5fbc2c466992e20b8cd750dfe2,taego,pass,2021-07-07,50000,1640,6000,50000,1640,6000,6
2,2021-07-07,STEAM,account.381623327da34585aa9cbc1682bd3d65,taego,pass,2021-07-07,0,2810,12000,0,2810,12000,2
3,2021-07-07,STEAM,account.d39fcda4dcbb44818cf8ae7c78f0a70f,taego,pass,2021-07-07,490000,0,0,490000,0,0,50
4,2021-07-07,STEAM,account.c1ba0609c4d847708b2461ac036be0f5,taego,pass,2021-07-07,300000,3250,23800,300000,3250,23800,33
5,2021-07-07,STEAM,account.791987d406564ec4acb69c63e58b8ea4,taego,pass,2021-07-07,0,700,2200,0,700,2200,1
6,2021-07-07,STEAM,account.51814af24a0540fba626be01d31e6257,taego,pass,2021-07-07,0,4390,23800,0,4390,23800,3
7,2021-07-07,STEAM,account.90843f6cf65d47fb88a5c36d1f40ab64,taego,pass,2021-07-07,490000,1980,11800,490000,1980,11800,51
8,2021-07-07,STEAM,account.30136daf624c4ba8bbec8f7dc0b48ca1,taego,pass,2021-07-07,0,3250,9800,0,3250,9800,2
9,2021-07-07,STEAM,account.a396ad243e05431aa9220af3006d81fe,taego,pass,2021-07-07,0,2440,12000,0,2440,12000,2


In [90]:
pass_pc.where(col("period_status") == "pass").groupBy("date").agg(sum("daily_levelup_xp").alias("sum_daily_levelup_xp"), countDistinct("AccountId").alias("account_cnt"), sum(when(col("daily_levelup_xp") > 0, 1)).alias("levelup_cnt")).withColumn("levelup_rate", round(col("levelup_cnt")/col("account_cnt"), 3)).orderBy("date").toPandas()

Unnamed: 0,date,sum_daily_levelup_xp,account_cnt,levelup_cnt,levelup_rate
0,2021-07-07,9130680000,137484,30080,0.219
1,2021-07-08,4171360000,195539,16671,0.085
2,2021-07-09,2675170000,225880,11727,0.052
3,2021-07-10,2376870000,248430,10922,0.044
4,2021-07-11,1734680000,263970,8808,0.033
5,2021-07-12,1132350000,274075,6290,0.023
6,2021-07-13,946730000,282068,5555,0.02
7,2021-07-14,835790000,288962,5338,0.018
8,2021-07-15,856060000,295908,5184,0.018
9,2021-07-16,869400000,302385,5340,0.018


In [91]:
pass_console.where(col("period_status") == "pass").groupBy("date").agg(sum("daily_levelup_xp").alias("sum_daily_levelup_xp"), countDistinct("AccountId").alias("account_cnt"), sum(when(col("daily_levelup_xp") > 0, 1)).alias("levelup_cnt")).withColumn("levelup_rate", round(col("levelup_cnt")/col("account_cnt"), 3)).orderBy("date").toPandas()

Unnamed: 0,date,sum_daily_levelup_xp,account_cnt,levelup_cnt,levelup_rate
0,2021-07-15,537970000,10510,2179,0.207
1,2021-07-16,359180000,16442,1722,0.105
2,2021-07-17,216340000,19588,1127,0.058
3,2021-07-18,158520000,21601,907,0.042
4,2021-07-19,92880000,22828,623,0.027
5,2021-07-20,87000000,23743,550,0.023
6,2021-07-21,55780000,24424,417,0.017
7,2021-07-22,53060000,25035,409,0.016
8,2021-07-23,68170000,25681,455,0.018
9,2021-07-24,67260000,26329,458,0.017


In [84]:
pass_pc.where(col("period_status").isin(["active", "pass"])).groupBy("date").agg(sum(when(col("period_status") == "active", 1)).alias("active_cnt"), sum(when(col("period_status") == "pass", 1)).alias("pass_cnt")).orderBy("date").toPandas()

Unnamed: 0,date,active_cnt,pass_cnt
0,2021-07-07,1185956,137489
1,2021-07-08,1942282,195563
2,2021-07-09,2472483,225943
3,2021-07-10,2983132,248610
4,2021-07-11,3379040,264404
5,2021-07-12,3648450,275020
6,2021-07-13,3884888,283973
7,2021-07-14,4094519,292787
8,2021-07-15,4312689,303573
9,2021-07-16,4574847,317730


In [85]:
pass_console.where(col("period_status").isin(["active", "pass"])).groupBy("date").agg(sum(when(col("period_status") == "active", 1)).alias("active_cnt"), sum(when(col("period_status") == "pass", 1)).alias("pass_cnt")).orderBy("date").toPandas()

Unnamed: 0,date,active_cnt,pass_cnt
0,2021-07-15,147196,10510
1,2021-07-16,228021,16442
2,2021-07-17,293473,19588
3,2021-07-18,349012,21601
4,2021-07-19,390163,22828
5,2021-07-20,427422,23743
6,2021-07-21,460446,24424
7,2021-07-22,492241,25035
8,2021-07-23,524613,25681
9,2021-07-24,561217,26329


In [48]:
pass_pc_level = pass_pc.where(col("period_status") == "pass").groupBy("date", "platform", "final_level").agg(countDistinct("AccountId")).orderBy("date").toPandas()

In [49]:
pass_console_level = pass_console.where(col("period_status") == "pass").groupBy("date", "platform", "final_level").agg(countDistinct("AccountId")).orderBy("date").toPandas()

In [54]:
pass_pc_level.columns = ["date", "platform", "final_level", "account_cnt"]
pass_console_level.columns = ["date", "platform", "final_level", "account_cnt"]

In [55]:
pass_pc_level.to_csv("./data/pass_pc_level.csv", index=False)

In [56]:
pass_console_level.to_csv("./data/pass_console_level.csv", index=False)

In [61]:
don_lee_pc = pass_pc_level[(pass_pc_level.final_level >= 30) & (pass_pc_level.final_level <= 35)]
don_lee_console = pass_console_level[(pass_console_level.final_level >= 30) & (pass_console_level.final_level <= 35)]

In [65]:
don_lee_pc.groupby(["date", "final_level"])["account_cnt"].sum()

date        final_level
2021-07-07  30               11
            31             7839
            32             4035
            33             2821
            34              957
            35              504
2021-07-08  30               15
            31             6503
            32             4571
            33             4871
            34             3724
            35             1871
2021-07-09  30               30
            31             5371
            32             4072
            33             4960
            34             4987
            35             3387
2021-07-10  30               24
            31             4435
            32             3613
            33             4724
            34             5281
            35             4436
2021-07-11  30               34
            31             3754
            32             3181
            33             4196
            34             5088
            35             4720
                

PC

In [70]:
pass_pc.where((col("final_level") >= 30) & (col("final_level") <= 35)).groupBy("final_level").agg(countDistinct("AccountId")).orderBy("final_level").show()

+-----------+-------------------------+
|final_level|count(DISTINCT AccountId)|
+-----------+-------------------------+
|         30|                   473300|
|         31|                   460838|
|         32|                   427516|
|         33|                   401471|
|         34|                   374491|
|         35|                   347316|
+-----------+-------------------------+



In [74]:
pass_pc.where((col("final_level") >= 30) & (col("final_level") <= 35)).groupBy("final_level", "platform").agg(countDistinct("AccountId")).orderBy("final_level", "platform").toPandas()

Unnamed: 0,final_level,platform,count(DISTINCT AccountId)
0,30,DMM,26
1,30,KAKAO,35430
2,30,MAILRU,394
3,30,STEAM,437476
4,31,DMM,30
5,31,KAKAO,36205
6,31,MAILRU,390
7,31,STEAM,424243
8,32,DMM,27
9,32,KAKAO,34070


CONSOLE

In [71]:
pass_console.where((col("final_level") >= 30) & (col("final_level") <= 35)).groupBy("final_level").agg(countDistinct("AccountId")).orderBy("final_level").show()

+-----------+-------------------------+
|final_level|count(DISTINCT AccountId)|
+-----------+-------------------------+
|         30|                    32747|
|         31|                    32126|
|         32|                    29596|
|         33|                    28085|
|         34|                    26161|
|         35|                    24261|
+-----------+-------------------------+



In [75]:
pass_console.where((col("final_level") >= 30) & (col("final_level") <= 35)).groupBy("final_level", "platform").agg(countDistinct("AccountId")).orderBy("final_level", "platform").toPandas()

Unnamed: 0,final_level,platform,count(DISTINCT AccountId)
0,30,PSN,9507
1,30,STADIA,89
2,30,XBOX,23151
3,31,PSN,9234
4,31,STADIA,85
5,31,XBOX,22807
6,32,PSN,8620
7,32,STADIA,75
8,32,XBOX,20901
9,33,PSN,8143
