In [247]:
# Project-local helpers; only `mysql` is referenced by name in the cells below.
from pubg_util import mysql, load_schema, notifier
from sphynx import sphynx, NODE_SMALL, NODE_MEDIUM, NODE_LARGE
# Star import: Spark's col/sum/count/countDistinct/udf become bare names, so
# `sum(...)` in later cells is the Spark aggregate, not the Python builtin.
from pyspark.sql.functions import *
import pandas as pd
import pickle

# NOTE(review): `load_data_mart` is called in later cells but is not imported by
# name in this cell — presumably it comes from pubg_util or leftover kernel
# state; confirm and import it explicitly so a fresh kernel can run the notebook.
# Request a Spark session from sphynx (8 executors, NODE_LARGE spec).
spark = sphynx.get_spark(executor_count=8, app_name='pdu_syp2', node_spec=NODE_LARGE)

Spark cluster not assigned. creating a new one...
Node spec: 8 executors with 48G RAM each
Job Port 4049 is assigned for requested cluster
Waiting for Spark master to be available...
Spark master launched!
Creating new Spark session, name: pdu_syp2...
Waiting for all executors ready...
All executors connected!
Complete! elapsed time: 00:00:25


In [248]:
# Date window passed as start_date/end_date to every load_data_mart call below.
start_date = "2021-11-12"
end_date = "2021-12-18"

In [69]:
# PC user_master rows for the date window, kept only where the row's
# lastlogindate equals its date.
pc_au = (
    load_data_mart(
        table_name="user_master",
        device="pc",
        start_date=start_date,
        end_date=end_date,
    )
    .where(col("lastlogindate") == col("date"))
)

In [267]:
# Console user_master rows for the date window, kept only where the row's
# lastlogindate equals its date.
console_au = (
    load_data_mart(
        table_name="user_master",
        device="console",
        start_date=start_date,
        end_date=end_date,
    )
    .where(col("lastlogindate") == col("date"))
)

In [268]:
# User region
from pyspark.sql.types import * 

def classify_country(country_os, country_ip):
    """Choose the country code used for the region lookup.

    Returns ``country_os`` only when it is exactly ``'CN'``; for any other
    value (including ``None``) returns ``country_ip`` unchanged.
    """
    return country_os if country_os == 'CN' else country_ip

# Wrap classify_country as a Spark UDF that yields a string column.
country_type_udf = udf(classify_country, StringType())

# Region metadata read through the project's mysql helper ('metainfo' and
# 'meta_bi_regions' are presumably the database and table names — confirm).
# It is joined below on its country_code_iso2 column.
meta_region = mysql.read_table(spark, 'metainfo', 'meta_bi_regions')

In [269]:
# Derive the lookup country for each user row (country_os when it is 'CN',
# otherwise country_ip), then attach the meta_region row whose ISO-2 code
# matches it. The left join keeps users whose country has no region match.
#
# The PC enrichment must run as well: later cells group pc_au by "pubg_region",
# which would otherwise be missing on a fresh kernel.
# NOTE: this cell reassigns pc_au / console_au, so re-running it without
# re-running the load cells joins meta_region a second time.
pc_au = pc_au.withColumn("country_new", country_type_udf("country_os", "country_ip"))
pc_au = pc_au.join(meta_region, pc_au.country_new == meta_region.country_code_iso2, "left")

console_au = console_au.withColumn("country_new", country_type_udf("country_os", "country_ip"))
console_au = console_au.join(meta_region, console_au.country_new == meta_region.country_code_iso2, "left")

In [270]:
# NOTE(review): the PC lookup is disabled here, but the (also commented-out)
# pc_gcoin_with_region join further down needs pc_user, and later cells use
# pc_gcoin_with_region. Re-enable once pc_au carries a pubg_region column.
# pc_user = pc_au.select("accountid", "pubg_region").distinct()
# One row per distinct (accountid, pubg_region) pair among console users.
console_user = console_au.select("accountid", "pubg_region").distinct()

In [99]:
# Distinct accountid count ("au") per (date, platform, pubg_region),
# collected to pandas for each device.
pc_au_df = (
    pc_au
    .groupBy("date", "platform", "pubg_region")
    .agg(countDistinct("accountid").alias("au"))
    .toPandas()
)
console_au_df = (
    console_au
    .groupBy("date", "platform", "pubg_region")
    .agg(countDistinct("accountid").alias("au"))
    .toPandas()
)

In [100]:
pc_au_df.head()

Unnamed: 0,date,platform,pubg_region,au
0,2021-12-04,STEAM,EMEA,98663
1,2021-11-20,STEAM,CIS,60547
2,2021-12-04,MAILRU,CIS,1179
3,2021-12-06,KAKAO,CN,27
4,2021-12-11,KAKAO,CN,23


In [102]:
# Persist the AU tables without the pandas index column.
pc_au_df.to_csv(path_or_buf="./data/pc_au_df.csv", index=False)
console_au_df.to_csv(path_or_buf="./data/console_au_df.csv", index=False)

In [227]:
# product_id values used to filter gcoin_use rows, per device.
pc_product_list = [
    "itemdesc.13000383",
    "itemdesc.13000384",
    "itemdesc.13000385",
    "itemdesc.13000386",
]
console_product_list = [
    "itemdesc.13000387",
    "itemdesc.13000388",
    "itemdesc.16100298",
    "itemdesc.13000389",
    "itemdesc.12032102",
    "itemdesc.12031102",
    "itemdesc.12020036",
    "itemdesc.12010510",
    "itemdesc.12010501",
]

In [251]:
len(console_product_list)

9

In [249]:
# gcoin_use rows for the date window, restricted to the product ids listed
# for each device.
pc_gcoin = (
    load_data_mart(
        table_name="gcoin_use",
        device="pc",
        start_date=start_date,
        end_date=end_date,
    )
    .where(col("product_id").isin(pc_product_list))
)
console_gcoin = (
    load_data_mart(
        table_name="gcoin_use",
        device="console",
        start_date=start_date,
        end_date=end_date,
    )
    .where(col("product_id").isin(console_product_list))
)

In [283]:
pc_gcoin.select(sum("free_use") + sum("paid_use")).show(truncate=False)

+-------------------------------+
|(sum(free_use) + sum(paid_use))|
+-------------------------------+
|1337673360                     |
+-------------------------------+



In [271]:
# NOTE(review): pc_gcoin_with_region is used by the region PU cells further
# down but is only defined in this commented-out line (and needs pc_user, which
# is commented out as well) — a fresh kernel will raise NameError there.
# pc_gcoin_with_region = pc_gcoin.join(pc_user, pc_gcoin.account_id == pc_user.accountid, how="left")
# Attach each console purchase row to the buyer's region(s). Left join: rows
# whose account has no console_user entry are kept with a null pubg_region.
# An account with more than one region in console_user yields one row per region.
console_gcoin_with_region = console_gcoin.join(console_user, console_gcoin.account_id == console_user.accountid, how="left")

In [235]:
# Distinct account_id count ("pu") per date among the filtered gcoin rows.
pc_daily_pu = (
    pc_gcoin
    .groupBy("date")
    .agg(countDistinct("account_id").alias("pu"))
    .toPandas()
)
console_daily_pu = (
    console_gcoin
    .groupBy("date")
    .agg(countDistinct("account_id").alias("pu"))
    .toPandas()
)

In [236]:
# Daily AU: sum of the per-(platform, pubg_region) distinct counts for each date.
# NOTE(review): this equals a true daily distinct count only if an account
# appears under a single platform/region per day — confirm.
pc_daily_au = (
    pc_au_df
    .groupby(by="date", as_index=False)["au"]
    .sum()
)
console_daily_au = (
    console_au_df
    .groupby(by="date", as_index=False)["au"]
    .sum()
)

In [240]:
# Join daily AU with daily PU on date (default inner join) and export.
pc_daily_au.merge(pc_daily_pu, on="date").to_csv(
    path_or_buf="./data/pc_daily_pur.csv", index=False
)
console_daily_au.merge(console_daily_pu, on="date").to_csv(
    path_or_buf="./data/console_daily_pur.csv", index=False
)

### Daily AU & PU by Platform

In [149]:
# Distinct account_id count ("pu") per (date, platform).
pc_platform_daily_pu = (
    pc_gcoin
    .groupBy("date", "platform")
    .agg(countDistinct("account_id").alias("pu"))
    .toPandas()
)
console_platform_daily_pu = (
    console_gcoin
    .groupBy("date", "platform")
    .agg(countDistinct("account_id").alias("pu"))
    .toPandas()
)

In [150]:
# Persist the per-platform daily PU tables without the pandas index column.
pc_platform_daily_pu.to_csv(path_or_buf="./data/pc_platform_daily_pu.csv", index=False)
console_platform_daily_pu.to_csv(path_or_buf="./data/console_platform_daily_pu.csv", index=False)

In [151]:
pc_platform_daily_pu.sort_values(["date", "platform"]).head()

Unnamed: 0,date,platform,pu
0,2021-11-12,KAKAO,5395
79,2021-11-12,MAILRU,9
37,2021-11-12,STEAM,84473
32,2021-11-13,KAKAO,3100
74,2021-11-13,MAILRU,4


In [188]:
# Daily AU per platform: per-region distinct counts summed within (date, platform).
pc_platform_daily_au = (
    pc_au_df
    .groupby(by=["date", "platform"], as_index=False)["au"]
    .sum()
)
console_platform_daily_au = (
    console_au_df
    .groupby(by=["date", "platform"], as_index=False)["au"]
    .sum()
)

In [189]:
# pc_platform_daily_pu.date = pc_platform_daily_pu.date.astype(str)
# pc_platform_daily_au.date = pc_platform_daily_au.date.astype(str)
# console_platform_daily_au.date = console_platform_daily_au.date.astype(str)
# console_platform_daily_pu.date = console_platform_daily_pu.date.astype(str)

In [187]:
# AU and PU side by side per (date, platform); default inner join, then export.
pc_platform_daily = pc_platform_daily_au.merge(
    pc_platform_daily_pu,
    on=["date", "platform"],
)
pc_platform_daily.to_csv(path_or_buf="./data/pc_platform_daily.csv", index=False)

In [202]:
pc_platform_daily.head()

Unnamed: 0,date,platform,au,pu
0,2021-11-12,KAKAO,105555,5395
1,2021-11-12,MAILRU,1872,9
2,2021-11-12,STEAM,1136535,84473
3,2021-11-13,KAKAO,118307,3100
4,2021-11-13,MAILRU,1960,4


In [191]:
# AU and PU side by side per (date, platform); default inner join, then export.
console_platform_daily = console_platform_daily_au.merge(
    console_platform_daily_pu,
    on=["date", "platform"],
)
console_platform_daily.to_csv(path_or_buf="./data/console_platform_daily.csv", index=False)

### Total AU & PU by Platform

In [152]:
# Distinct account_id count ("pu") per platform over the whole date window.
pc_platform_pu = (
    pc_gcoin
    .groupBy("platform")
    .agg(countDistinct("account_id").alias("pu"))
    .toPandas()
)
console_platform_pu = (
    console_gcoin
    .groupBy("platform")
    .agg(countDistinct("account_id").alias("pu"))
    .toPandas()
)

In [153]:
# Persist the per-platform PU totals without the pandas index column.
pc_platform_pu.to_csv(path_or_buf="./data/pc_platform_pu.csv", index=False)
console_platform_pu.to_csv(path_or_buf="./data/console_platform_pu.csv", index=False)

In [154]:
pc_platform_pu

Unnamed: 0,platform,pu
0,KAKAO,25541
1,STEAM,262868
2,MAILRU,29


In [42]:
# Distinct accountid count ("au") per platform over the whole date window,
# computed in Spark rather than by summing the daily counts.
pc_platform_au = (
    pc_au
    .groupBy("platform")
    .agg(countDistinct("accountid").alias("au"))
    .toPandas()
)
console_platform_au = (
    console_au
    .groupBy("platform")
    .agg(countDistinct("accountid").alias("au"))
    .toPandas()
)

# pc_platform_au.to_csv("./data/pc_platform_au.csv", index=False)
# console_platform_au.to_csv("./data/console_platform_au.csv", index=False)

In [86]:
pc_platform_au

Unnamed: 0,platform,au
0,KAKAO,726358
1,STEAM,5782798
2,MAILRU,7682


In [193]:
pd.merge(pc_platform_pu, pc_platform_au, on="platform")

Unnamed: 0,platform,pu,au
0,KAKAO,25541,726358
1,STEAM,262868,5782798
2,MAILRU,29,7682


In [194]:
pd.merge(console_platform_pu, console_platform_au, on="platform")

Unnamed: 0,platform,pu,au
0,XBOX,11625,677807
1,STADIA,39,7441
2,PSN,4406,306905


### Daily AU & PU by Region

In [155]:
# Distinct account_id count ("pu") per (date, pubg_region).
# NOTE(review): pc_gcoin_with_region is only defined in a commented-out line
# earlier in this notebook, so this cell depends on leftover kernel state.
pc_region_daily_pu = (
    pc_gcoin_with_region
    .groupBy("date", "pubg_region")
    .agg(countDistinct("account_id").alias("pu"))
    .toPandas()
)
console_region_daily_pu = (
    console_gcoin_with_region
    .groupBy("date", "pubg_region")
    .agg(countDistinct("account_id").alias("pu"))
    .toPandas()
)

In [156]:
# Persist the per-region daily PU tables without the pandas index column.
pc_region_daily_pu.to_csv(path_or_buf="./data/pc_region_daily_pu.csv", index=False)
console_region_daily_pu.to_csv(path_or_buf="./data/console_region_daily_pu.csv", index=False)

In [157]:
pc_region_daily_pu.sort_values(["date", "pubg_region"]).head()

Unnamed: 0,date,pubg_region,pu
357,2021-11-12,CIS,1191
242,2021-11-12,CN,69040
150,2021-11-12,EMEA,2101
333,2021-11-12,JP,169
290,2021-11-12,KR,13037


In [195]:
# Daily AU per region: per-platform distinct counts summed within (date, pubg_region).
pc_region_daily_au = (
    pc_au_df
    .groupby(by=["date", "pubg_region"], as_index=False)["au"]
    .sum()
)
console_region_daily_au = (
    console_au_df
    .groupby(by=["date", "pubg_region"], as_index=False)["au"]
    .sum()
)

# Cast the date columns on both sides to str so the AU and PU frames share a
# key dtype when they are merged on "date".
pc_region_daily_au["date"] = pc_region_daily_au["date"].astype(str)
console_region_daily_au["date"] = console_region_daily_au["date"].astype(str)
pc_region_daily_pu["date"] = pc_region_daily_pu["date"].astype(str)
console_region_daily_pu["date"] = console_region_daily_pu["date"].astype(str)

In [198]:
# AU and PU side by side per (date, pubg_region); default inner join, then export.
pc_region_daily = pc_region_daily_au.merge(
    pc_region_daily_pu,
    on=["date", "pubg_region"],
)
pc_region_daily.to_csv(path_or_buf="./data/pc_region_daily.csv", index=False)

console_region_daily = console_region_daily_au.merge(
    console_region_daily_pu,
    on=["date", "pubg_region"],
)
console_region_daily.to_csv(path_or_buf="./data/console_region_daily.csv", index=False)

### Total AU & PU by Region

In [158]:
# Distinct account_id count ("pu") per pubg_region over the whole date window.
# NOTE(review): pc_gcoin_with_region is only defined in a commented-out line
# earlier in this notebook, so this cell depends on leftover kernel state.
pc_region_pu = (
    pc_gcoin_with_region
    .groupBy("pubg_region")
    .agg(countDistinct("account_id").alias("pu"))
    .toPandas()
)
console_region_pu = (
    console_gcoin_with_region
    .groupBy("pubg_region")
    .agg(countDistinct("account_id").alias("pu"))
    .toPandas()
)

In [159]:
# Persist the per-region PU totals without the pandas index column.
pc_region_pu.to_csv(path_or_buf="./data/pc_region_pu.csv", index=False)
console_region_pu.to_csv(path_or_buf="./data/console_region_pu.csv", index=False)

In [111]:
# Distinct accountid count ("au") per pubg_region over the whole date window.
pc_region_au = (
    pc_au
    .groupBy("pubg_region")
    .agg(countDistinct("accountid").alias("au"))
    .toPandas()
)
console_region_au = (
    console_au
    .groupBy("pubg_region")
    .agg(countDistinct("accountid").alias("au"))
    .toPandas()
)

pc_region_au.to_csv(path_or_buf="./data/pc_region_au.csv", index=False)
console_region_au.to_csv(path_or_buf="./data/console_region_au.csv", index=False)

In [199]:
pd.merge(pc_region_au, pc_region_pu, on="pubg_region")

Unnamed: 0,pubg_region,au,pu
0,CN,3759605,199891
1,,151755,3162
2,SA,110434,1894
3,KR,1376148,57098
4,SEA,249764,9870
5,Undefined,558,6
6,JP,26119,690
7,TW/HK,52970,1930
8,CIS,297871,4663
9,EMEA,468911,8564


In [200]:
pd.merge(console_region_au, console_region_pu, on="pubg_region")

Unnamed: 0,pubg_region,au,pu
0,CN,6502,57
1,,537764,10218
2,SA,135219,1391
3,KR,5543,130
4,SEA,8381,111
5,JP,15807,482
6,TW/HK,3464,88
7,CIS,26849,258
8,EMEA,227671,2791
9,OC,24498,544


In [45]:
pc_au.select(countDistinct("accountid")).show(truncate=False)

+-------------------------+
|count(DISTINCT accountid)|
+-------------------------+
|6514824                  |
+-------------------------+



In [46]:
console_au.select(countDistinct("accountid")).show(truncate=False)

+-------------------------+
|count(DISTINCT accountid)|
+-------------------------+
|992153                   |
+-------------------------+



In [160]:
pc_gcoin.select(countDistinct("account_id")).show(truncate=False)

+--------------------------+
|count(DISTINCT account_id)|
+--------------------------+
|288435                    |
+--------------------------+



In [161]:
console_gcoin.select(countDistinct("account_id")).show(truncate=False)

+--------------------------+
|count(DISTINCT account_id)|
+--------------------------+
|16070                     |
+--------------------------+



In [163]:
# Daily PC sales export.
# keep_default_na=False stops pandas from parsing the region code "NA" as a
# missing value; previously fillna(0) then rewrote that region to 0, so it no
# longer matched pubg_region == "NA" in the AU tables when merged.
# na_values=[""] still treats genuinely empty cells as missing so fillna(0)
# keeps filling them.
pc_sales = pd.read_csv(
    "./data/pc_daily_sales.csv",
    keep_default_na=False,
    na_values=[""],
).fillna(0)

In [255]:
# Daily console sales export.
# keep_default_na=False stops pandas from parsing the region code "NA" as a
# missing value, which would drop that region from groupby results and from
# merges on pubg_region. Only genuinely empty cells are read as missing.
console_sales = pd.read_csv(
    "./data/console_daily_sales.csv",
    keep_default_na=False,
    na_values=[""],
)

In [170]:
pc_au_df.head()

Unnamed: 0,date,platform,pubg_region,au
0,2021-12-04,STEAM,EMEA,98663
1,2021-11-20,STEAM,CIS,60547
2,2021-12-04,MAILRU,CIS,1179
3,2021-12-06,KAKAO,CN,27
4,2021-12-11,KAKAO,CN,23


In [176]:
# pc_au_df.date = pc_au_df.date.astype(str)
# pc_sales.date = pc_sales.date.astype(str)

# pd.merge(pc_au_df, pc_sales, on=["date", "platform", "pubg_region"]).sort_values(["date", "platform", "pubg_region", "product_id"]).to_csv("./data/pc_pur.csv", index=False)
# Reload the previously exported table (presumably written by the disabled
# merge above — confirm). keep_default_na=False keeps a literal "NA" region
# code as a string instead of NaN; only empty cells are read as missing.
pc_pur = pd.read_csv(
    "./data/pc_pur.csv",
    keep_default_na=False,
    na_values=[""],
)

In [177]:
# console_au_df.date = console_au_df.date.astype(str)
# console_sales.date = console_sales.date.astype(str)

# pd.merge(console_au_df, console_sales, on=["date", "platform", "pubg_region"]).sort_values(["date", "platform", "pubg_region", "product_id"]).to_csv("./data/console_pur.csv", index=False)
# Reload the previously exported table (presumably written by the disabled
# merge above — confirm). keep_default_na=False keeps a literal "NA" region
# code as a string instead of NaN; only empty cells are read as missing.
console_pur = pd.read_csv(
    "./data/console_pur.csv",
    keep_default_na=False,
    na_values=[""],
)

In [60]:
pc_sales.paid_in_dollar.sum()

9338152.15

In [61]:
console_sales.paid_in_dollar.sum()

501302.39999999997

In [181]:
9.23948e+07

92394800.0

In [242]:
pc_sales.groupby("platform", as_index=False)["sales"].sum()

Unnamed: 0,platform,sales
0,KAKAO,1074412.0
1,MAILRU,636.9
2,STEAM,12301528.2


In [243]:
console_sales.groupby("platform", as_index=False)["sales"].sum()

Unnamed: 0,platform,sales
0,PSN,271402.4
1,STADIA,1523.6
2,XBOX,651021.6


In [210]:
pc_sales.groupby("pubg_region", as_index=False)["sales"].sum()

Unnamed: 0,pubg_region,sales
0,0,127280.2
1,CIS,178704.9
2,CN,9489277.3
3,EMEA,371896.7
4,JP,30909.6
5,KR,2490809.9
6,OC,38310.5
7,SA,80853.1
8,SEA,478793.4
9,TW/HK,89622.7


In [211]:
console_sales.groupby("pubg_region", as_index=False)["sales"].sum()

Unnamed: 0,pubg_region,sales
0,0,548189.2
1,CIS,15831.2
2,CN,4773.4
3,EMEA,169937.8
4,JP,33703.0
5,KR,10229.8
6,OC,30187.6
7,SA,97101.0
8,SEA,6454.4
9,TW/HK,7540.2


In [224]:
pc_sales.groupby("product_name", as_index=False)[["unit_sold", "sales", "paid_sales"]].sum().sort_values("sales", ascending=False)

Unnamed: 0,product_name,unit_sold,sales,paid_sales
3,PGC 2021 MEGA BUNDLE + VOTING COUPONS,166566.0,9894020.4,6896149.1
0,PGC 2021 CONTINGENT CHAOS DUSTER BUNDLE + VOTI...,68954.0,2185841.8,1647499.45
1,PGC 2021 CONTINGENT CHAOS JACKET BUNDLE + VOTI...,26897.0,852634.9,608678.15
2,PGC 2021 DANCE SPLASH EMOTE + VOTING COUPON BU...,88816.0,444080.0,185825.45


In [225]:
console_sales.groupby("product_name", as_index=False)[["unit_sold", "sales", "paid_sales"]].sum().sort_values("sales", ascending=False)

Unnamed: 0,product_name,unit_sold,sales,paid_sales
8,PGC 2021 MEGA BUNDLE,6252.0,371368.8,273436.2
1,PGC 2021 CONTINGENT CHAOS DUSTER BUNDLE,5342.0,169341.4,117309.0
5,PGC 2021 Contingent Chaos - Pan,9926.0,98267.4,53888.2
2,PGC 2021 CONTINGENT CHAOS JACKET BUNDLE,2998.0,95036.6,64374.2
6,PGC 2021 Dance - Splash Damage,14312.0,71560.0,46183.0
0,"""PGC 2021 Contingent Chaos"" UAZ",3560.0,35244.0,17264.8
4,PGC 2021 Contingent Chaos - P90,3506.0,34709.4,17874.2
3,PGC 2021 Contingent Chaos - Helmet (Level 3),4842.0,24210.0,10906.8
7,PGC 2021 Giltslate Contagion Backpack (Level 3),4842.0,24210.0,11771.8


In [207]:
# Cast the date keys on both sides to str, then join AU with sales on
# (date, platform, pubg_region) using the default inner join and export.
_sales_keys = ["date", "platform", "pubg_region"]

pc_au_df["date"] = pc_au_df["date"].astype(str)
pc_sales["date"] = pc_sales["date"].astype(str)
pc_au_df.merge(pc_sales, on=_sales_keys).to_csv(
    path_or_buf="./data/pc_daily_product_sales.csv", index=False
)

console_au_df["date"] = console_au_df["date"].astype(str)
console_sales["date"] = console_sales["date"].astype(str)
console_au_df.merge(console_sales, on=_sales_keys).to_csv(
    path_or_buf="./data/console_daily_product_sales.csv", index=False
)

In [276]:
console_gcoin_with_region.select(((sum("paid_use") + sum("free_use")))).show(truncate=False)

+-------------------------------+
|(sum(paid_use) + sum(free_use))|
+-------------------------------+
|46197380                       |
+-------------------------------+



In [277]:
console_gcoin_with_region.groupBy("product_name").agg(count("*")).show(truncate=False)

+-----------------------------------------------+--------+
|product_name                                   |count(1)|
+-----------------------------------------------+--------+
|PGC 2021 CONTINGENT CHAOS DUSTER BUNDLE        |2671    |
|PGC 2021 Dance - Splash Damage                 |7156    |
|PGC 2021 Giltslate Contagion Backpack (Level 3)|2421    |
|PGC 2021 CONTINGENT CHAOS JACKET BUNDLE        |1499    |
|PGC 2021 MEGA BUNDLE                           |3126    |
|PGC 2021 Contingent Chaos - P90                |1753    |
|PGC 2021 Contingent Chaos - Pan                |4963    |
|PGC 2021 Contingent Chaos - Helmet (Level 3)   |2421    |
|"PGC 2021 Contingent Chaos" UAZ                |1780    |
+-----------------------------------------------+--------+



In [274]:
console_gcoin_with_region.groupBy("product_name").agg(((sum("paid_use") + sum("free_use"))/100)).show(truncate=False)

+-----------------------------------------------+---------------------------------------+
|product_name                                   |((sum(paid_use) + sum(free_use)) / 100)|
+-----------------------------------------------+---------------------------------------+
|PGC 2021 CONTINGENT CHAOS DUSTER BUNDLE        |84670.7                                |
|PGC 2021 Dance - Splash Damage                 |35780.0                                |
|PGC 2021 Giltslate Contagion Backpack (Level 3)|12105.0                                |
|PGC 2021 CONTINGENT CHAOS JACKET BUNDLE        |47518.3                                |
|PGC 2021 MEGA BUNDLE                           |185684.4                               |
|PGC 2021 Contingent Chaos - P90                |17354.7                                |
|PGC 2021 Contingent Chaos - Pan                |49133.7                                |
|PGC 2021 Contingent Chaos - Helmet (Level 3)   |12105.0                                |
|"PGC 2021

In [278]:
console_gcoin_with_region.groupBy("platform").agg(((sum("paid_use") + sum("free_use"))/100)).show(truncate=False)

+--------+---------------------------------------+
|platform|((sum(paid_use) + sum(free_use)) / 100)|
+--------+---------------------------------------+
|XBOX    |325510.8                               |
|STADIA  |761.8                                  |
|PSN     |135701.2                               |
+--------+---------------------------------------+



In [281]:
console_gcoin_with_region.groupBy("platform").agg(countDistinct("account_id")).show(truncate=False)

+--------+--------------------------+
|platform|count(DISTINCT account_id)|
+--------+--------------------------+
|XBOX    |11625                     |
|STADIA  |39                        |
|PSN     |4406                      |
+--------+--------------------------+



In [279]:
console_gcoin_with_region.groupBy("pubg_region").agg(((sum("paid_use") + sum("free_use"))/100)).show(truncate=False)

+-----------+---------------------------------------+
|pubg_region|((sum(paid_use) + sum(free_use)) / 100)|
+-----------+---------------------------------------+
|CN         |2386.7                                 |
|NA         |274094.6                               |
|SA         |48550.5                                |
|KR         |5114.9                                 |
|SEA        |3227.2                                 |
|JP         |16851.5                                |
|TW/HK      |3770.1                                 |
|CIS        |7915.6                                 |
|EMEA       |84968.9                                |
|OC         |15093.8                                |
+-----------+---------------------------------------+

