In [161]:
from pubg_util import mysql, load_schema, notifier
from sphynx import sphynx, NODE_SMALL, NODE_MEDIUM, NODE_LARGE
from pyspark.sql.functions import *
import pandas as pd
import pickle

# Request a Spark session from sphynx: 8 executors on the NODE_LARGE spec.
spark = sphynx.get_spark(
    app_name='west0_double',
    node_spec=NODE_LARGE,
    executor_count=8,
)

Spark cluster not assigned. creating a new one...
Node spec: 8 executors with 48G RAM each
Job Port 4049 is assigned for requested cluster
Waiting for Spark master to be available...
Spark master launched!
Creating new Spark session, name: west0_double...
Waiting for all executors ready...
All executors connected!
Complete! elapsed time: 00:00:24


In [160]:
# NOTE(review): this cell's execution count (160) is lower than that of the
# get_spark cell above (161). Run top-to-bottom it stops the Spark session
# before any later cell can use it — confirm whether this cell belongs at the
# end of the notebook.
sphynx.stop()

Stopping Spark session...
Destroying Spark cluster...
Done!


In [163]:
# cash_mtx rows from 2021-11-10 to 2021-12-07, restricted to the
# "202111_gcoinx2" event.
# NOTE(review): load_data_mart is not imported in any visible cell — confirm
# where it comes from.
cash2021 = (
    load_data_mart("pc", "2021-11-10", "2021-12-07", "cash_mtx")
    .where(col("event_name") == "202111_gcoinx2")
)

In [164]:
# User region: classify each account's country, then join it to the BI region metadata
from pyspark.sql.types import * 

def classify_country(country_os, country_ip):
    """Pick the country code used to look up an account's region.

    Returns country_os when it is exactly 'CN'; in every other case
    (including a missing country_os) returns country_ip unchanged.
    """
    return country_os if country_os == 'CN' else country_ip

# Expose the Python classifier as a Spark UDF returning a string column.
country_type_udf = udf(classify_country, StringType())

# Region metadata read from MySQL (metainfo.meta_bi_regions).
meta_region = mysql.read_table(spark, 'metainfo', 'meta_bi_regions')

# user_master for 2021-12-07 with the classified country code attached.
user2021 = (
    load_data_mart("pc", "2021-12-07", "2021-12-07", "user_master")
    .withColumn("country_new", country_type_udf("country_os", "country_ip"))
)
# Left join so accounts whose country_new has no metadata match are kept.
user2021 = user2021.join(
    meta_region,
    on=user2021.country_new == meta_region.country_code_iso2,
    how="left",
)

In [165]:
# Attach pubg_region to every purchase row (inner join on account_id).
# NOTE: this reassigns cash2021, so re-running the cell joins pubg_region a
# second time onto the already-joined frame.
user_region_2021 = (
    user2021
    .withColumnRenamed("accountid", "account_id")
    .select("account_id", "pubg_region")
)
cash2021 = cash2021.join(user_region_2021, "account_id")

In [166]:
# pu_master for the single date 2021-12-07.
pu_master2021 = load_data_mart(
    "pc",
    "2021-12-07",
    "2021-12-07",
    "pu_master",
)

In [167]:
from pyspark.sql import SparkSession, Window

# Number each account's pu_master rows by ascending date.
window = Window.partitionBy("account_id").orderBy("date")

# Keep one row per account (the first by date) and remove the helper column.
pu_master2021 = (
    pu_master2021
    .withColumn("row_number", row_number().over(window))
    .where("row_number = 1")
    # Fix: the helper column is named "row_number"; the previous drop("row")
    # matched no column, so it was a silent no-op and the helper column stayed.
    .drop("row_number")
)

In [168]:
# Flag every purchase row: "npu" when the purchase date equals the account's
# first_mtx_date, otherwise "pu". The left join keeps buyers without a
# pu_master row; their null first_mtx_date falls through to "pu".
first_mtx_by_account = pu_master2021.select("account_id", "first_mtx_date")
bought_on_first_mtx_date = col("date") == col("first_mtx_date")
cash2021 = (
    cash2021
    .join(first_mtx_by_account, "account_id", "left")
    .withColumn("npu", when(bought_on_first_mtx_date, "npu").otherwise("pu"))
)

In [124]:
# Distinct (account_id, pubg_region, npu) combinations among event buyers.
# An account can appear more than once if it has both "npu" and "pu" rows.
double_user_2021 = (
    cash2021
    .select("account_id", "pubg_region", "npu")
    .distinct()
)

In [60]:
# Distinct account ids of event buyers whose pubg_region is "CN".
in_cn_region = col("pubg_region") == "CN"
cn_double_user_2021 = (
    cash2021
    .where(in_cn_region)
    .select("account_id")
    .distinct()
)

## Daily Balance

In [63]:
# gcoin_topup rows from 2021-10-10 to 2022-01-07, restricted (inner join) to
# all event buyers and to CN event buyers respectively.
topup_mart_args = ('pc', "2021-10-10", "2022-01-07", "gcoin_topup")
topup_2021 = load_data_mart(*topup_mart_args).join(double_user_2021, "account_id")
cn_topup_2021 = load_data_mart(*topup_mart_args).join(cn_double_user_2021, "account_id")

In [64]:
from pyspark.sql import SparkSession, Window

# Newest record first within each (date, account_id) pair.
window = Window.partitionBy("date", "account_id").orderBy(desc("time"))


def _keep_first_in_window(df, win):
    """Return df reduced to the row ranked first by `win` in each partition.

    The temporary "row_number" column used for ranking is removed again.
    """
    return (
        df
        .withColumn("row_number", row_number().over(win))
        .where("row_number = 1")
        # Fix: drop the helper column by its real name; the previous
        # drop("row") matched no column and left "row_number" in the frame.
        .drop("row_number")
    )


# One row per account per day: the record with the latest `time`.
topup_2021 = _keep_first_in_window(topup_2021, window)
cn_topup_2021 = _keep_first_in_window(cn_topup_2021, window)

In [67]:
# Sum free and paid balances per date over all event buyers, then export.
daily_balance_all = (
    topup_2021
    .groupBy("date")
    .agg(
        sum("free_balance").alias("free_balance"),
        sum("paid_balance").alias("paid_balance"),
    )
    .orderBy("date")
)
topup_daily_2021 = daily_balance_all.toPandas()
topup_daily_2021.to_csv("./topup_daily_2021.csv", index=False)

In [68]:
# Same daily balance totals, restricted to CN event buyers, then export.
daily_balance_cn = (
    cn_topup_2021
    .groupBy("date")
    .agg(
        sum("free_balance").alias("free_balance"),
        sum("paid_balance").alias("paid_balance"),
    )
    .orderBy("date")
)
cn_topup_daily_2021 = daily_balance_cn.toPandas()
cn_topup_daily_2021.to_csv("./cn_topup_daily_2021.csv", index=False)

## Pu / Npu

In [123]:
# user_master rows for the whole event window, with the id column renamed to
# account_id so it can be joined against the purchase frames.
user_master2021 = (
    load_data_mart("pc", "2021-11-10", "2021-12-07", "user_master")
    .withColumnRenamed("accountid", "account_id")
)

In [45]:
# Earliest event purchase date per account. min("npu") compares strings, so
# "npu" wins over "pu" whenever the account has at least one "npu" row.
cash2021_first_buy_date = cash2021.groupBy("account_id").agg(
    min("date").alias("date"),
    min("npu").alias("npu"),
)

# Look up usertype on the first purchase date and count distinct accounts per
# (npu, usertype).
(
    cash2021_first_buy_date
    .join(user_master2021, ["date", "account_id"])
    .groupBy("npu", "usertype")
    .agg(countDistinct("account_id").alias("user_cnt"))
    .orderBy("npu", "usertype")
    .toPandas()
)

Unnamed: 0,npu,usertype,user_cnt
0,npu,Exist,67224
1,npu,New,7007
2,npu,Return,7543
3,pu,Exist,430681
4,pu,New,1
5,pu,Return,29291


In [69]:
# CN-only variant: earliest event purchase date and npu flag per account.
cn_cash2021_first_buy_date = (
    cash2021
    .where(col("pubg_region") == "CN")
    .groupBy("account_id")
    .agg(
        min("date").alias("date"),
        min("npu").alias("npu"),
    )
)

# Distinct CN accounts per (npu, usertype), usertype taken on the first-buy date.
(
    cn_cash2021_first_buy_date
    .join(user_master2021, ["date", "account_id"])
    .groupBy("npu", "usertype")
    .agg(countDistinct("account_id").alias("user_cnt"))
    .orderBy("npu", "usertype")
    .toPandas()
)

Unnamed: 0,npu,usertype,user_cnt
0,npu,Exist,42764
1,npu,New,4907
2,npu,Return,4299
3,pu,Exist,296291
4,pu,Return,20097


## Product Revenue

In [35]:
# Units sold and ingame revenue per product, split by npu flag.
(
    cash2021
    .groupBy("npu", "product_id", "product_name")
    .agg(
        sum("unit_sold").alias("unit_sold"),
        sum("ingame_revenue").alias("total_revenue"),
    )
    .orderBy("npu", "product_id")
    .toPandas()
)

Unnamed: 0,npu,product_id,product_name,unit_sold,total_revenue
0,npu,gcoinbundledesc.1012,PUBG - Double G-coin 1020 G-COIN (500 + 520 BO...,51705,258068.6
1,npu,gcoinbundledesc.1013,PUBG - Double G-coin 5400 G-COIN (2500 + 2900 ...,29380,734243.15
2,npu,gcoinbundledesc.1014,PUBG - Double G-coin 11000 G-COIN (5000 + 6000...,16497,824702.04
3,pu,gcoinbundledesc.1012,PUBG - Double G-coin 1020 G-COIN (500 + 520 BO...,327644,1635191.0
4,pu,gcoinbundledesc.1013,PUBG - Double G-coin 5400 G-COIN (2500 + 2900 ...,291390,7282076.31
5,pu,gcoinbundledesc.1014,PUBG - Double G-coin 11000 G-COIN (5000 + 6000...,204592,10227727.42


In [70]:
# Units sold and ingame revenue per product for pubg_region "CN" only.
(
    cash2021
    .where(col("pubg_region") == "CN")
    .groupBy("npu", "product_id", "product_name")
    .agg(
        sum("unit_sold").alias("unit_sold"),
        sum("ingame_revenue").alias("total_revenue"),
    )
    .orderBy("npu", "product_id")
    .toPandas()
)

Unnamed: 0,npu,product_id,product_name,unit_sold,total_revenue
0,npu,gcoinbundledesc.1012,PUBG - Double G-coin 1020 G-COIN (500 + 520 BO...,33338,166356.63
1,npu,gcoinbundledesc.1013,PUBG - Double G-coin 5400 G-COIN (2500 + 2900 ...,18651,466088.51
2,npu,gcoinbundledesc.1014,PUBG - Double G-coin 11000 G-COIN (5000 + 6000...,11500,574885.04
3,pu,gcoinbundledesc.1012,PUBG - Double G-coin 1020 G-COIN (500 + 520 BO...,233884,1167081.22
4,pu,gcoinbundledesc.1013,PUBG - Double G-coin 5400 G-COIN (2500 + 2900 ...,209522,5235954.89
5,pu,gcoinbundledesc.1014,PUBG - Double G-coin 11000 G-COIN (5000 + 6000...,149402,7468606.03


In [170]:
# gcoin_use rows for October 2021 (the month before the event window).
gcoin_use_2021 = load_data_mart(
    "pc",
    "2021-10-01",
    "2021-10-31",
    "gcoin_use",
)

In [171]:
# Spend tier from each account's summed paid_use. Rows where paid_use is null
# or <= 0 match no branch and therefore get a null group.
paid_use_col = col("paid_use")
spend_group = (
    when((paid_use_col > 0) & (paid_use_col <= 1500), "lowlight")
    .when((paid_use_col > 1500) & (paid_use_col <= 3920), "light")
    .when((paid_use_col > 3920) & (paid_use_col <= 10000), "normal")
    .when((paid_use_col > 10000) & (paid_use_col <= 80240), "heavy")
    .when(paid_use_col > 80240, "superheavy")
)

gcoin_group_2021 = (
    gcoin_use_2021
    .groupBy("account_id")
    .agg(sum('paid_use').alias("paid_use"))
    .withColumn("group", spend_group)
)

In [93]:
# Distinct buyers and ingame revenue per (npu, spend group). The left join
# keeps buyers with no gcoin_group_2021 row; they land in the null group.
(
    cash2021
    .join(gcoin_group_2021, "account_id", "left")
    .groupBy("npu", "group")
    .agg(
        countDistinct("account_id").alias("user_cnt"),
        sum("ingame_revenue").alias("ingame_revenue"),
    )
    .orderBy("npu", "group")
    .toPandas()
)

Unnamed: 0,npu,group,user_cnt,ingame_revenue
0,npu,,81760,1816638.97
1,npu,heavy,1,49.99
2,npu,light,4,59.96
3,npu,lowlight,11,274.89
4,npu,normal,1,29.98
5,npu,superheavy,1,79.97
6,pu,,239481,8170009.22
7,pu,heavy,28558,1978000.09
8,pu,light,45424,2144808.29
9,pu,lowlight,125122,4734935.64


In [94]:
# Same (npu, spend group) breakdown, restricted to pubg_region "CN" after the
# left join.
(
    cash2021
    .join(gcoin_group_2021, "account_id", "left")
    .where(col("pubg_region") == "CN")
    .groupBy("npu", "group")
    .agg(
        countDistinct("account_id").alias("user_cnt"),
        sum("ingame_revenue").alias("ingame_revenue"),
    )
    .orderBy("npu", "group")
    .toPandas()
)

Unnamed: 0,npu,group,user_cnt,ingame_revenue
0,npu,,51968,1207230.2
1,npu,lowlight,2,99.98
2,pu,,155186,5603579.22
3,pu,heavy,21303,1498141.89
4,pu,light,30547,1525368.2
5,pu,lowlight,95363,3739409.84
6,pu,normal,21895,1301687.59
7,pu,superheavy,2706,203455.4


In [172]:
# Persist the spend group on cash2021 itself (left join, so buyers without a
# gcoin_group_2021 row keep a null group).
# NOTE: this reassigns cash2021; re-running the cell joins "group" again.
group_by_account = gcoin_group_2021.select("account_id", "group")
cash2021 = cash2021.join(group_by_account, "account_id", "left")

In [155]:
def _weekly_gcoin_use(start, end, label):
    """Aggregate gcoin_use per account for one date range.

    Loads the "gcoin_use" data mart for "pc" between `start` and `end`
    (date strings as accepted by load_data_mart), sums paid_use and
    paid_use + free_use per account_id, and tags every row with `label`
    in a constant "wk" column.
    """
    return (
        load_data_mart("pc", start, end, "gcoin_use")
        .groupBy("account_id")
        .agg(
            sum("paid_use").alias("paid_use"),
            sum(col("paid_use") + col("free_use")).alias("total_use"),
        )
        .withColumn("wk", lit(label))
    )


# Two 7-day ranges before 2021-11-10 ("w_2", "w_1") and two from that date
# on ("w+1", "w+2"). Previously four copy-pasted pipelines that differed only
# in dates and label.
gcoin_w_2 = _weekly_gcoin_use("2021-10-27", "2021-11-02", "w_2")
gcoin_w_1 = _weekly_gcoin_use("2021-11-03", "2021-11-09", "w_1")
gcoin_w__1 = _weekly_gcoin_use("2021-11-10", "2021-11-16", "w+1")
gcoin_w__2 = _weekly_gcoin_use("2021-11-17", "2021-11-23", "w+2")

# Stack the four weekly frames; unionByName matches columns by name.
gcoin_wk = gcoin_w_2.unionByName(gcoin_w_1).unionByName(gcoin_w__1).unionByName(gcoin_w__2)

In [107]:
# Weekly paid / total gcoin use per (npu, spend group, week) for event buyers.
buyer_segments = cash2021.select("account_id", "npu", "group").distinct()
(
    buyer_segments
    .join(gcoin_wk, "account_id")
    .groupBy("npu", "group", "wk")
    .agg(sum("paid_use"), sum("total_use"))
    .orderBy("npu", "group", "wk")
    .toPandas()
)

Unnamed: 0,npu,group,wk,sum(paid_use),sum(total_use)
0,npu,,w+1,59162560,106737230
1,npu,,w+2,52927620,95942320
2,npu,,w_1,3200,61460
3,npu,,w_2,530,117950
4,npu,heavy,w_1,3520,3600
5,npu,light,w+1,440,500
6,npu,light,w+2,240,300
7,npu,light,w_2,4090,4290
8,npu,lowlight,w+1,6670,8240
9,npu,lowlight,w+2,1190,2690


In [156]:
# Same weekly breakdown, limited to buyers with pubg_region "CN" on platform
# "STEAM".
cn_steam_segments = (
    cash2021
    .where((col("pubg_region") == "CN") & (col("platform") == "STEAM"))
    .select("account_id", "npu", "group")
    .distinct()
)
(
    cn_steam_segments
    .join(gcoin_wk, "account_id")
    .groupBy("npu", "group", "wk")
    .agg(sum("paid_use"), sum("total_use"))
    .orderBy("npu", "group", "wk")
    .toPandas()
)

Unnamed: 0,npu,group,wk,sum(paid_use),sum(total_use)
0,npu,,w+1,41125440,75313950
1,npu,,w+2,37370930,68485950
2,npu,,w_1,0,44730
3,npu,,w_2,0,98020
4,npu,lowlight,w+1,5030,6440
5,npu,lowlight,w+2,690,2190
6,pu,,w+1,276595930,480135770
7,pu,,w+2,130358100,233778880
8,pu,,w_1,39078050,44151970
9,pu,,w_2,8327780,13005470


## Retention

In [125]:
# Event window used by the retention cells below.
start_date = "2021-11-10"
end_date = "2021-12-07"

# LIVE-server accounts in the end_date user_master snapshot whose last login
# is on or after start_date.
logged_in_since_start = col("lastlogindate") >= start_date
on_live_server = col("server_type") == "LIVE"
user = (
    load_data_mart("pc", end_date, end_date, "user_master")
    .where(logged_in_since_start & on_live_server)
)

In [140]:
# Number of distinct event buyers.
(
    double_user_2021
    .select(countDistinct("account_id"))
    .show(truncate=False)
)

+--------------------------+
|count(DISTINCT account_id)|
+--------------------------+
|541747                    |
+--------------------------+



In [129]:
# Number of distinct accounts in `user` (column is still named accountid here).
(
    user
    .select(countDistinct("accountid"))
    .show(truncate=False)
)

+-------------------------+
|count(DISTINCT accountid)|
+-------------------------+
|5614273                  |
+-------------------------+



In [130]:
# Accounts in `user` that are NOT event buyers: the left-anti join keeps only
# rows with no matching account_id in double_user_2021.
non_double_user_2021 = (
    user
    .withColumnRenamed("accountid", "account_id")
    .join(double_user_2021, "account_id", "leftanti")
    .select("account_id")
    .distinct()
)

In [131]:
# Row count of the non-buyer frame; rows are distinct account ids, so this is
# the number of non-buyer accounts.
non_double_user_2021.count()

5072526

In [134]:
# Imported here because no visible cell brings datetime/timedelta into scope;
# without it this cell only works with leftover kernel state.
from datetime import datetime, timedelta


def _days_after(base_date, days):
    """Return the 'YYYY-MM-DD' string for the day `days` days after base_date.

    base_date must itself be a 'YYYY-MM-DD' string; datetime.strptime raises
    ValueError otherwise.
    """
    shifted = datetime.strptime(base_date, '%Y-%m-%d') + timedelta(days)
    return shifted.strftime('%Y-%m-%d')


# Snapshot dates 1, 3, 7, 14 and 28 days after end_date.
d1, d3, d7, d14, d28 = (_days_after(end_date, offset) for offset in (1, 3, 7, 14, 28))

In [141]:
# Display the computed retention snapshot dates as a sanity check.
[d1, d3, d7, d14, d28]

['2021-12-08', '2021-12-10', '2021-12-14', '2021-12-21', '2022-01-04']

In [146]:
# LIVE-server accounts whose last login in the end_date snapshot is end_date.
end_date_user = (
    load_data_mart("pc", end_date, end_date, "user_master")
    .where((col("lastlogindate") == end_date) & (col("server_type") == "LIVE"))
    .withColumnRenamed("accountid", "account_id")
)

# How many of those accounts are event buyers.
(
    end_date_user
    .join(double_user_2021, "account_id")
    .select(countDistinct("account_id"))
    .show(truncate=False)
)

+--------------------------+
|count(DISTINCT account_id)|
+--------------------------+
|335070                    |
+--------------------------+



In [147]:
# How many of the end_date logins are non-buyers.
(
    end_date_user
    .join(non_double_user_2021, "account_id")
    .select(countDistinct("account_id"))
    .show(truncate=False)
)

+--------------------------+
|count(DISTINCT account_id)|
+--------------------------+
|1024123                   |
+--------------------------+



In [136]:
# Row labels for the retention table; the pu / non_pu columns are filled in
# from the two lists built below.
retention = {"date": ["D+1", "D+3", "D+7", "D+14", "D+28"]}
pu_list, non_pu_list = [], []


def _count_accounts_in(snapshot_df, cohort_df):
    """Distinct account_id count of snapshot_df rows that also appear in cohort_df."""
    joined = snapshot_df.join(cohort_df, "account_id")
    return joined.select(countDistinct("account_id")).collect()[0][0]


for target_date in (d1, d3, d7, d14, d28):
    # LIVE accounts in that day's user_master whose last login is after end_date.
    user_df = (
        load_data_mart('pc', target_date, target_date, "user_master")
        .where((col("lastlogindate") > end_date) & (col("server_type") == "LIVE"))
        .withColumnRenamed("accountid", "account_id")
    )
    pu = _count_accounts_in(user_df, double_user_2021)
    non_pu = _count_accounts_in(user_df, non_double_user_2021)
    pu_list.append(pu)
    non_pu_list.append(non_pu)

In [137]:
# Add the two count columns (pu first, then non_pu) and build the table.
retention.update(pu=pu_list, non_pu=non_pu_list)
retention_pd = pd.DataFrame.from_dict(retention)

In [138]:
# Display the retention table: one row per D+N snapshot, with buyer (pu) and
# non-buyer (non_pu) account counts.
retention_pd

Unnamed: 0,date,pu,non_pu
0,D+1,329863,972573
1,D+3,332688,1007720
2,D+7,281420,833946
3,D+14,269616,787452
4,D+28,216099,747062


## TOP Product

In [173]:
# Distinct (account_id, npu, group) for CN / STEAM buyers. For "npu" rows the
# spend group is blanked to null so all of them fall into a single bucket.
is_steam_cn = (col("pubg_region") == "CN") & (col("platform") == "STEAM")
group_or_null_for_npu = when(col("npu") == "npu", lit(None)).otherwise(col("group"))
steam_cn_user = (
    cash2021
    .where(is_steam_cn)
    .withColumn("group", group_or_null_for_npu)
    .select("account_id", "npu", "group")
    .distinct()
)

In [174]:
from pyspark.sql import SparkSession, Window

# Paid and total (paid + free) gcoin use per (npu, group, product_name) for
# CN / STEAM buyers, from 2021-11-10 to 2022-01-07.
gcoin_by_product = (
    load_data_mart("pc", "2021-11-10", "2022-01-07", "gcoin_use")
    .join(steam_cn_user, "account_id")
    .groupBy("npu", "group", "product_name")
    .agg(
        sum('paid_use').alias("paid_use"),
        sum(col("paid_use") + col("free_use")).alias("total_use"),
    )
)

# Rank products by paid_use (descending) inside each (npu, group).
window_paid = Window.partitionBy("npu", "group").orderBy(desc("paid_use"))

# Keep the five top-ranked products per segment and export them.
gcoin_by_product_paid = (
    gcoin_by_product
    .withColumn("row_number", row_number().over(window_paid))
    .where("row_number <= 5")
)
gcoin_by_product_paid.toPandas().to_csv("./gcoin_by_product_paid.csv", index=False)

In [175]:
# Rank products by total_use (descending) inside each (npu, group).
window_total = Window.partitionBy("npu", "group").orderBy(desc("total_use"))

# Keep the five top-ranked products per segment and export them.
gcoin_by_product_total = (
    gcoin_by_product
    .withColumn("row_number", row_number().over(window_total))
    .where("row_number <= 5")
)
gcoin_by_product_total.toPandas().to_csv("./gcoin_by_product_total.csv", index=False)