In [1]:
from pubg_util import mysql, load_schema, notifier
from sphynx import sphynx, NODE_SMALL, NODE_MEDIUM, NODE_LARGE
from pyspark.sql.functions import *
import pandas as pd
import pickle

# Obtain the Spark session used by every cell below: 16 executors on the
# NODE_LARGE spec, registered under this notebook's application name.
spark = sphynx.get_spark(
    app_name='west0_mclaren_p',
    node_spec=NODE_LARGE,
    executor_count=16,
)

Spark cluster not assigned. creating a new one...
Node spec: 16 executors with 48G RAM each
Job Port 4049 is assigned for requested cluster
Waiting for Spark master to be available...
Spark master launched!
Creating new Spark session, name: west0_mclaren_p...
Waiting for all executors ready...
All executors connected!
Complete! elapsed time: 00:00:25


In [4]:
# Analysis windows handed to load_data_mart. PC and console each have their
# own (start, end) pair, written as "YYYY-MM-DD" strings.
pc_start_date, pc_end_date = "2022-09-07", "2022-11-02"
console_start_date, console_end_date = "2022-09-15", "2022-11-10"

# Value matched against the `event_name` column when filtering gcoin_use rows.
event_name = "202209_season_workshop"

In [42]:
# gcoin_use rows for the target event, loaded per platform over that
# platform's own date window.
# NOTE(review): load_data_mart is not among the imports visible at the top of
# this notebook - confirm where it is defined before a fresh-kernel run.
event_filter = col("event_name") == event_name

pc_gcoin = load_data_mart(
    "pc", pc_start_date, pc_end_date, "gcoin_use"
).filter(event_filter)
console_gcoin = load_data_mart(
    "console", console_start_date, console_end_date, "gcoin_use"
).filter(event_filter)

In [56]:
# PC gcoin_use rows for an earlier event, loaded over its own date window.
# Presumably the baseline the 2022 event is compared against - confirm.
wsus_start_date, wsus_end_date = "2021-06-02", "2021-09-30"
wsus_gcoin = (
    load_data_mart("pc", wsus_start_date, wsus_end_date, "gcoin_use")
    .filter(col("event_name") == "202106_wsus_progressive_apocalypse")
)

In [59]:
# PC user_master loaded for the single date wsus_end_date, restricted to
# accounts whose last login is on or after the start of the WSUS window.
# NOTE(review): no server_type == "LIVE" filter is applied here, unlike the
# 2022 active-user counts further down - confirm that is intentional.
wsus_user = load_data_mart(
    "pc", wsus_end_date, wsus_end_date, "user_master"
).filter(col("lastlogindate") >= wsus_start_date)

wsus_user.select(countDistinct("accountid")).show(truncate=False)

+-------------------------+
|count(DISTINCT accountid)|
+-------------------------+
|15055970                 |
+-------------------------+



In [57]:
# Tag paying rows: paid_account_id holds account_id where paid_use > 0 and is
# null otherwise, so a distinct count over it yields the number of payers.
wsus_paid_row = col("paid_use") > 0
wsus_gcoin = wsus_gcoin.withColumn(
    "paid_account_id",
    when(wsus_paid_row, col("account_id")).otherwise(lit(None)),
)

In [58]:
# WSUS event totals: paid spend, paid + free spend, distinct paying accounts
# and distinct accounts overall.
wsus_gcoin.select(
    sum("paid_use"),
    sum(col("paid_use") + col("free_use")),
    countDistinct("paid_account_id"),
    countDistinct("account_id"),
).show(truncate=False)

+-------------+--------------------------+-------------------------------+--------------------------+
|sum(paid_use)|sum((paid_use + free_use))|count(DISTINCT paid_account_id)|count(DISTINCT account_id)|
+-------------+--------------------------+-------------------------------+--------------------------+
|4497710380   |5054651200                |627910                         |848050                    |
+-------------+--------------------------+-------------------------------+--------------------------+



In [38]:
# user_master per platform, loaded for a single date: the last day of that
# platform's analysis window.
pc_user = load_data_mart(
    "pc", pc_end_date, pc_end_date, "user_master"
)
console_user = load_data_mart(
    "console", console_end_date, console_end_date, "user_master"
)

In [39]:
# User region
from pyspark.sql.types import * 

def classify_country(country_os, country_ip):
    """Pick the country code used for the region lookup.

    Returns ``country_os`` when it is exactly ``'CN'``. For every other
    value, including ``None``, returns ``country_ip`` unchanged.
    """
    return country_os if country_os == 'CN' else country_ip

# Kept so any cell that still references the Python UDF keeps working; the
# country_new column below no longer goes through it.
country_type_udf = udf(classify_country, StringType())

# Same rule as classify_country, expressed as a native Column expression so
# Spark evaluates it without sending every user_master row to a Python worker.
# A null country_os makes the == comparison null, which `when` treats as
# "not matched", so the row falls through to country_ip - the same outcome as
# `None != 'CN'` in the Python function.
country_new_expr = when(col("country_os") == "CN", col("country_os")).otherwise(col("country_ip"))

# Region lookup table, matched on its ISO-2 country code column.
meta_region = mysql.read_table(spark, 'metainfo', 'meta_bi_regions')

# Left joins keep accounts whose country_new has no row in meta_region.
# NOTE: both frames are reassigned in place, so re-running this cell joins
# meta_region onto the already-joined frames a second time.
pc_user = pc_user.withColumn("country_new", country_new_expr)
pc_user = pc_user.join(meta_region, pc_user.country_new == meta_region.country_code_iso2, "left")

console_user = console_user.withColumn("country_new", country_new_expr)
console_user = console_user.join(meta_region, console_user.country_new == meta_region.country_code_iso2, "left")

In [13]:
# Distinct PC accounts with server_type "LIVE" whose last login is on or
# after the start of the PC window.
pc_active_filter = (col("lastlogindate") >= pc_start_date) & (col("server_type") == "LIVE")
pc_user.filter(pc_active_filter).select(countDistinct("accountid")).show(truncate=False)

+-------------------------+
|count(DISTINCT accountid)|
+-------------------------+
|14072022                 |
+-------------------------+



In [49]:
# Same PC active-account filter as the overall count, broken down by pubg_region.
(
    pc_user
    .filter((col("lastlogindate") >= pc_start_date) & (col("server_type") == "LIVE"))
    .groupBy("pubg_region")
    .agg(countDistinct("accountid"))
    .toPandas()
)

Unnamed: 0,pubg_region,count(DISTINCT accountid)
0,CN,8606890
1,,396802
2,SA,376320
3,KR,1556485
4,SEA,677685
5,Undefined,1678
6,JP,53996
7,TW/HK,86422
8,CIS,1107531
9,EMEA,1151710


In [40]:
# Distinct console accounts with server_type "LIVE" whose last login is on or
# after the start of the console window.
console_active_filter = (col("lastlogindate") >= console_start_date) & (col("server_type") == "LIVE")
console_user.filter(console_active_filter).select(countDistinct("accountid")).show(truncate=False)

+-------------------------+
|count(DISTINCT accountid)|
+-------------------------+
|2291967                  |
+-------------------------+



In [50]:
# Same console active-account filter as the overall count, broken down by
# pubg_region.
(
    console_user
    .filter((col("lastlogindate") >= console_start_date) & (col("server_type") == "LIVE"))
    .groupBy("pubg_region")
    .agg(countDistinct("accountid"))
    .toPandas()
)

Unnamed: 0,pubg_region,count(DISTINCT accountid)
0,CN,32737
1,,982831
2,SA,392095
3,KR,10291
4,SEA,45239
5,Undefined,769
6,JP,36233
7,TW/HK,8383
8,CIS,90759
9,EMEA,641592


In [6]:
# PC event totals: paid spend, and paid + free spend combined.
pc_gcoin.select(
    sum("paid_use"),
    sum(col("paid_use") + col("free_use")),
).show(truncate=False)

+-------------+--------------------------+
|sum(paid_use)|sum((paid_use + free_use))|
+-------------+--------------------------+
|3588856956   |4340361840                |
+-------------+--------------------------+



In [47]:
# Distinct accounts vs. distinct paying accounts for the PC event.
# paid_account_id (account_id on rows with paid_use > 0, null otherwise) is
# derived inside this cell so a top-to-bottom run does not fail on a column
# that is only added further down the notebook. withColumn replaces the
# column if it already exists, so running the cells out of order is also safe.
pc_gcoin.withColumn(
    "paid_account_id",
    when(col("paid_use") > 0, col("account_id")).otherwise(lit(None)),
).select(
    countDistinct("account_id"),
    countDistinct("paid_account_id"),
).show(truncate=False)

+--------------------------+-------------------------------+
|count(DISTINCT account_id)|count(DISTINCT paid_account_id)|
+--------------------------+-------------------------------+
|684410                    |452889                         |
+--------------------------+-------------------------------+



In [7]:
# Console event totals: paid spend, and paid + free spend combined.
console_gcoin.select(
    sum("paid_use"),
    sum(col("paid_use") + col("free_use")),
).show(truncate=False)

+-------------+--------------------------+
|sum(paid_use)|sum((paid_use + free_use))|
+-------------+--------------------------+
|201132180    |245824860                 |
+-------------+--------------------------+



In [48]:
# Distinct accounts vs. distinct paying accounts for the console event.
# paid_account_id (account_id on rows with paid_use > 0, null otherwise) is
# derived inside this cell so a top-to-bottom run does not fail on a column
# that is only added further down the notebook. withColumn replaces the
# column if it already exists, so running the cells out of order is also safe.
console_gcoin.withColumn(
    "paid_account_id",
    when(col("paid_use") > 0, col("account_id")).otherwise(lit(None)),
).select(
    countDistinct("account_id"),
    countDistinct("paid_account_id"),
).show(truncate=False)

+--------------------------+-------------------------------+
|count(DISTINCT account_id)|count(DISTINCT paid_account_id)|
+--------------------------+-------------------------------+
|54306                     |42443                          |
+--------------------------+-------------------------------+



In [43]:
# Attach pubg_region to every usage row via the platform's user table.
# join() is called without `how`, so this is an inner join: usage rows with
# no matching account_id in the user table are dropped.
# NOTE: both frames are reassigned in place, so re-running this cell joins a
# second pubg_region column onto them.
pc_regions = pc_user.withColumnRenamed("accountid", "account_id").select("account_id", "pubg_region")
console_regions = console_user.withColumnRenamed("accountid", "account_id").select("account_id", "pubg_region")

pc_gcoin = pc_gcoin.join(pc_regions, "account_id")
console_gcoin = console_gcoin.join(console_regions, "account_id")

In [44]:
# Tag paying rows on both platforms: paid_account_id holds account_id where
# paid_use > 0 and is null otherwise.
pc_gcoin = pc_gcoin.withColumn(
    "paid_account_id",
    when(col("paid_use") > 0, col("account_id")).otherwise(lit(None)),
)
console_gcoin = console_gcoin.withColumn(
    "paid_account_id",
    when(col("paid_use") > 0, col("account_id")).otherwise(lit(None)),
)

In [36]:
# PC event figures per pubg_region: paid spend, paid + free spend, distinct
# accounts and distinct paying accounts.
pc_gcoin.groupBy("pubg_region").agg(
    sum("paid_use"),
    sum(col("paid_use") + col("free_use")),
    countDistinct("account_id"),
    countDistinct("paid_account_id"),
).toPandas()

Unnamed: 0,pubg_region,sum(paid_use),sum((paid_use + free_use)),count(DISTINCT account_id),count(DISTINCT paid_account_id)
0,CN,2615241155,3166162830,484164,323739
1,,77625250,92793370,14506,10333
2,SA,17136400,29526770,13436,6331
3,KR,627388950,721356030,78210,58255
4,SEA,40429500,53886310,14532,8750
5,JP,7401650,9054070,1326,906
6,TW/HK,26638930,30961230,3363,2519
7,CIS,50096561,70751820,28260,14683
8,EMEA,111956750,148219090,43638,25128
9,OC,14941810,17650320,2975,2245


In [45]:
# Console event figures per pubg_region: paid spend, paid + free spend,
# distinct accounts and distinct paying accounts.
console_gcoin.groupBy("pubg_region").agg(
    sum("paid_use"),
    sum(col("paid_use") + col("free_use")),
    countDistinct("account_id"),
    countDistinct("paid_account_id"),
).toPandas()

Unnamed: 0,pubg_region,sum(paid_use),sum((paid_use + free_use)),count(DISTINCT account_id),count(DISTINCT paid_account_id)
0,CN,1471360,1767090,225,172
1,,140954970,169759390,34263,27524
2,SA,10263690,13147340,5027,3623
3,KR,1856650,2202250,287,243
4,SEA,600270,732050,187,145
5,JP,5514050,6516450,958,844
6,TW/HK,1232820,1454660,178,157
7,CIS,1517170,3898400,719,234
8,EMEA,30439100,37638570,10572,7939
9,OC,7282100,8708660,1890,1562


In [54]:
# PC event figures per product_name: quantity, paid spend, paid + free spend.
pc_gcoin.groupBy("product_name").agg(
    sum("qty"),
    sum("paid_use"),
    sum(col("paid_use") + col("free_use")),
).toPandas()

Unnamed: 0,product_name,sum(qty),sum(paid_use),sum((paid_use + free_use))
0,CRAFTER PASS: McLAREN TOKEN & LEVEL PACK,100483,290783410,328579410
1,McLAREN LOOT CACHE PACK (x11),263883,599719359,659707500
2,McLaren Level UP,3096871,251001176,309687100
3,McLAREN LOOT CACHE PACK (x27),121476,697807760,759225000
4,McLAREN LOOT CACHE PACK (x55),121974,1416681225,1524675000
5,CRAFTER PASS: McLAREN TOKEN PACK,381117,164100299,377305830
6,McLAREN LOOT CACHE PACK (x1),1524728,168763727,381182000


In [55]:
# Console event figures per product_name: quantity, paid spend, paid + free
# spend.
console_gcoin.groupBy("product_name").agg(
    sum("qty"),
    sum("paid_use"),
    sum(col("paid_use") + col("free_use")),
).toPandas()

Unnamed: 0,product_name,sum(qty),sum(paid_use),sum((paid_use + free_use))
0,CRAFTER PASS: McLAREN TOKEN & LEVEL PACK,9112,27441110,29796240
1,McLAREN LOOT CACHE PACK (x11),15968,34675720,39920000
2,,206685,15537350,20668500
3,McLAREN LOOT CACHE PACK (x27),7620,41685430,47625000
4,McLAREN LOOT CACHE PACK (x55),3688,40350170,46100000
5,CRAFTER PASS: McLAREN TOKEN PACK,35313,25719180,34959870
6,McLAREN LOOT CACHE PACK (x1),107021,15723220,26755250
