In [90]:
from pubg_util import mysql, load_schema, notifier
from sphynx import sphynx, NODE_SMALL, NODE_MEDIUM, NODE_LARGE
from pyspark.sql.functions import *
import pandas as pd
import pickle

# Obtain a Spark session through sphynx: 8 executors on the NODE_LARGE spec,
# registered under the app name 'west0_double_2022'.
spark = sphynx.get_spark(executor_count=8, app_name='west0_double_2022', node_spec=NODE_LARGE)

Spark cluster not assigned. creating a new one...
Node spec: 8 executors with 48G RAM each
Job Port 4048 is assigned for requested cluster
Waiting for Spark master to be available...
Spark master launched!
Creating new Spark session, name: west0_double_2022...
Waiting for all executors ready...
All executors connected!
Complete! elapsed time: 00:00:24


In [89]:
# NOTE(review): this teardown cell sits directly after the cell that creates the
# session, so a top-to-bottom run would stop Spark before any analysis cell executes.
# Its execution count (89) precedes the creation cell (90), so it was presumably run
# to reset an earlier session — consider moving it to the end of the notebook.
sphynx.stop()

Stopping Spark session...
Destroying Spark cluster...
Done!


In [92]:
# cash_mtx data mart rows for 2022-11-23 .. 2022-11-28, restricted to the
# "202211_gcoinx2" event.
cash2022 = (
    load_data_mart("pc", "2022-11-23", "2022-11-28", "cash_mtx")
    .filter(col("event_name") == "202211_gcoinx2")
)

In [93]:
# User region
from pyspark.sql.types import * 

def classify_country(country_os, country_ip):
    """Choose which country code to use for a user.

    Args:
        country_os: first country code candidate (from the "country_os" column).
        country_ip: second country code candidate (from the "country_ip" column).

    Returns:
        ``country_os`` when it is exactly 'CN'; otherwise ``country_ip``
        (including when ``country_os`` is None).
    """
    return country_os if country_os == 'CN' else country_ip

# Expose the Python classifier to Spark as a string-returning UDF.
country_type_udf = udf(classify_country, StringType())

# Region lookup table read from MySQL (metainfo.meta_bi_regions).
meta_region = mysql.read_table(spark, 'metainfo', 'meta_bi_regions')

# user_master snapshot for 2022-11-28 with the resolved country code attached.
user2022 = (
    load_data_mart("pc", "2022-11-28", "2022-11-28", "user_master")
    .withColumn("country_new", country_type_udf("country_os", "country_ip"))
)

# Left join keeps users whose resolved country has no entry in the lookup table.
region_match = user2022.country_new == meta_region.country_code_iso2
user2022 = user2022.join(meta_region, region_match, "left")

In [94]:
# Attach pubg_region to every purchase row. user2022 keys on "accountid", so it is
# renamed to "account_id" first; the default (inner) join drops purchases whose
# account is missing from user2022.
cash2022 = cash2022.join(
    user2022
    .withColumnRenamed("accountid", "account_id")
    .select("account_id", "pubg_region"),
    "account_id",
)

In [95]:
# pu_master data mart snapshot for the single date 2022-11-28 (the last day of the
# cash2022 window loaded earlier).
pu_master2022 = load_data_mart("pc", "2022-11-28", "2022-11-28", "pu_master")

In [96]:
from pyspark.sql import SparkSession, Window
# Keep one pu_master row per account: rows are numbered within each account in
# ascending "date" order and only the first one is retained.
window = Window.partitionBy("account_id").orderBy("date")

pu_master2022 = (
    pu_master2022
    .withColumn("row_number", row_number().over(window))
    .where("row_number = 1")
    # The helper column is called "row_number"; the previous drop("row") matched no
    # column (DataFrame.drop silently ignores unknown names) and left it in the frame.
    .drop("row_number")
)

In [97]:
# Label each purchase row: "npu" when the purchase date equals the account's
# first_mtx_date, otherwise "pu" (this also covers accounts with no pu_master match,
# where first_mtx_date is null after the left join).
cash2022 = (
    cash2022
    .join(pu_master2022.select("account_id", "first_mtx_date"), "account_id", "left")
    .withColumn(
        "npu",
        when(col("date") == col("first_mtx_date"), "npu").otherwise("pu"),
    )
)

In [65]:
# Distinct (account, region, npu/pu) combinations among the event purchases.
# An account can appear more than once if it has both "npu" and "pu" rows.
double_user_2022 = (
    cash2022
    .select("account_id", "pubg_region", "npu")
    .distinct()
)

In [19]:
# Distinct account ids of event buyers whose pubg_region is "CN".
cn_double_user_2022 = (
    cash2022
    .filter(col("pubg_region") == "CN")
    .select("account_id")
    .distinct()
)

## Daily Balance

In [21]:
# gcoin_topup rows from 2022-10-23 to 2022-12-27, limited to event buyers (all
# regions, then CN only). The captured output reports that no data exists for
# 2022-12-27.
topup_2022 = (
    load_data_mart('pc', "2022-10-23", "2022-12-27", "gcoin_topup")
    .join(double_user_2022, "account_id")
)
cn_topup_2022 = (
    load_data_mart('pc', "2022-10-23", "2022-12-27", "gcoin_topup")
    .join(cn_double_user_2022, "account_id")
)

no data exists in
s3a://pubg-log-labs/data_mart/economy_v3/gcoin_topup/pc/2022-12-27
no data exists in
s3a://pubg-log-labs/data_mart/economy_v3/gcoin_topup/pc/2022-12-27


In [22]:
from pyspark.sql import SparkSession, Window
# Keep one top-up row per (date, account): rows are ranked by "time" descending
# within each date/account pair and only the top-ranked (latest) row is retained.
window = Window.partitionBy("date", "account_id").orderBy(desc("time"))

topup_2022 = (
    topup_2022
    .withColumn("row_number", row_number().over(window))
    .where("row_number = 1")
    # Drop the helper column by its real name; drop("row") matched nothing and was a
    # silent no-op, so "row_number" used to stay in the frame.
    .drop("row_number")
)
cn_topup_2022 = (
    cn_topup_2022
    .withColumn("row_number", row_number().over(window))
    .where("row_number = 1")
    .drop("row_number")
)

In [23]:
# Per-date sums of free_balance and paid_balance over the deduplicated top-up rows,
# collected to pandas and written to CSV.
topup_daily_2022 = (
    topup_2022
    .groupBy("date")
    .agg(
        sum("free_balance").alias("free_balance"),
        sum("paid_balance").alias("paid_balance"),
    )
    .orderBy("date")
    .toPandas()
)
topup_daily_2022.to_csv("./topup_daily_2022.csv", index=False)

In [24]:
# Same per-date balance sums as above, for the CN buyers only.
cn_topup_daily_2022 = (
    cn_topup_2022
    .groupBy("date")
    .agg(
        sum("free_balance").alias("free_balance"),
        sum("paid_balance").alias("paid_balance"),
    )
    .orderBy("date")
    .toPandas()
)
cn_topup_daily_2022.to_csv("./cn_topup_daily_2022.csv", index=False)

## Pu / Npu

In [25]:
# user_master rows for every day of the 2022-11-23 .. 2022-11-28 window, with the
# key renamed to "account_id" so it can be joined against cash2022-derived frames.
user_master2022 = (
    load_data_mart("pc", "2022-11-23", "2022-11-28", "user_master")
    .withColumnRenamed("accountid", "account_id")
)

In [26]:
# Earliest purchase date per account. min("npu") is a string minimum, and "npu"
# sorts before "pu", so an account with any "npu" row is labelled "npu".
cash2022_first_buy_date = (
    cash2022
    .groupBy("account_id")
    .agg(min("date").alias("date"), min("npu").alias("npu"))
)

# Look up usertype on that first purchase date and count accounts per npu/usertype.
(
    cash2022_first_buy_date
    .join(user_master2022, ["date", "account_id"])
    .groupBy("npu", "usertype")
    .agg(countDistinct("account_id").alias("user_cnt"))
    .orderBy("npu", "usertype")
    .toPandas()
)

Unnamed: 0,npu,usertype,user_cnt
0,npu,Exist,11573
1,npu,New,671
2,npu,Return,1555
3,pu,Exist,228902
4,pu,Return,5743


In [27]:
# CN-only version of the first-purchase breakdown: earliest purchase date and
# string-minimum npu label per account, restricted to pubg_region == "CN".
cn_cash2022_first_buy_date = (
    cash2022
    .filter(col("pubg_region") == "CN")
    .groupBy("account_id")
    .agg(min("date").alias("date"), min("npu").alias("npu"))
)

# Count accounts per npu/usertype using usertype on the first purchase date.
(
    cn_cash2022_first_buy_date
    .join(user_master2022, ["date", "account_id"])
    .groupBy("npu", "usertype")
    .agg(countDistinct("account_id").alias("user_cnt"))
    .orderBy("npu", "usertype")
    .toPandas()
)

Unnamed: 0,npu,usertype,user_cnt
0,npu,Exist,8810
1,npu,New,520
2,npu,Return,1188
3,pu,Exist,176078
4,pu,Return,3979


## Product Revenue

In [28]:
# Units sold and summed ingame_revenue per product, split by npu/pu.
(
    cash2022
    .groupBy("npu", "product_id", "product_name")
    .agg(
        sum("unit_sold").alias("unit_sold"),
        sum("ingame_revenue").alias("total_revenue"),
    )
    .orderBy("npu", "product_id")
    .toPandas()
)

Unnamed: 0,npu,product_id,product_name,unit_sold,total_revenue
0,npu,gcoinbundledesc.1012,PUBG - Double G-coin 1020 G-COIN (500 + 520 BO...,8024,40043.91
1,npu,gcoinbundledesc.1013,PUBG - Double G-coin 5400 G-COIN (2500 + 2900 ...,6146,153591.66
2,npu,gcoinbundledesc.1014,PUBG - Double G-coin 11000 G-COIN (5000 + 6000...,2617,130825.52
3,pu,gcoinbundledesc.1012,PUBG - Double G-coin 1020 G-COIN (500 + 520 BO...,152469,760884.26
4,pu,gcoinbundledesc.1013,PUBG - Double G-coin 5400 G-COIN (2500 + 2900 ...,146843,3669678.29
5,pu,gcoinbundledesc.1014,PUBG - Double G-coin 11000 G-COIN (5000 + 6000...,116256,5811700.13


In [29]:
# Same product breakdown, restricted to purchases with pubg_region == "CN".
(
    cash2022
    .filter(col("pubg_region") == "CN")
    .groupBy("npu", "product_id", "product_name")
    .agg(
        sum("unit_sold").alias("unit_sold"),
        sum("ingame_revenue").alias("total_revenue"),
    )
    .orderBy("npu", "product_id")
    .toPandas()
)

Unnamed: 0,npu,product_id,product_name,unit_sold,total_revenue
0,npu,gcoinbundledesc.1012,PUBG - Double G-coin 1020 G-COIN (500 + 520 BO...,6022,30049.78
1,npu,gcoinbundledesc.1013,PUBG - Double G-coin 5400 G-COIN (2500 + 2900 ...,4891,122226.09
2,npu,gcoinbundledesc.1014,PUBG - Double G-coin 11000 G-COIN (5000 + 6000...,2034,101679.66
3,pu,gcoinbundledesc.1012,PUBG - Double G-coin 1020 G-COIN (500 + 520 BO...,120583,601709.18
4,pu,gcoinbundledesc.1013,PUBG - Double G-coin 5400 G-COIN (2500 + 2900 ...,117460,2935325.43
5,pu,gcoinbundledesc.1014,PUBG - Double G-coin 11000 G-COIN (5000 + 6000...,94495,4723805.06


In [98]:
# gcoin_use data mart rows for October 2022 (2022-10-01 .. 2022-10-31), i.e. the
# month before the event purchase window loaded into cash2022.
# NOTE(review): the saved output of this cell is a py4j "Connection refused" trace,
# so the Spark gateway was not reachable the last time it ran — re-run with a live
# session before trusting downstream results.
gcoin_use_2022 = load_data_mart("pc", "2022-10-01", "2022-10-31", "gcoin_use")

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36149)
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36149)
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception

In [99]:
# Sum paid_use per account over gcoin_use_2022 and bucket the total into spend
# groups. Totals that are null or <= 0 match no branch, so "group" is null for them.
_paid_total = col("paid_use")

gcoin_group_2022 = (
    gcoin_use_2022
    .groupBy("account_id")
    .agg(sum('paid_use').alias("paid_use"))
    .withColumn(
        "group",
        when((_paid_total > 0) & (_paid_total <= 1500), "lowlight")
        .when((_paid_total > 1500) & (_paid_total <= 3920), "light")
        .when((_paid_total > 3920) & (_paid_total <= 10000), "normal")
        .when((_paid_total > 10000) & (_paid_total <= 80240), "heavy")
        .when(_paid_total > 80240, "superheavy"),
    )
)

In [48]:
# Buyer count and summed ingame_revenue per npu/spend-group. The left join keeps
# buyers with no gcoin_group_2022 row; they fall into the null group.
(
    cash2022
    .join(gcoin_group_2022, "account_id", "left")
    .groupBy("npu", "group")
    .agg(
        countDistinct("account_id").alias("user_cnt"),
        sum("ingame_revenue").alias("ingame_revenue"),
    )
    .orderBy("npu", "group")
    .toPandas()
)

Unnamed: 0,npu,group,user_cnt,ingame_revenue
0,npu,,13576,315734.44
1,npu,heavy,34,1759.38
2,npu,light,53,1964.24
3,npu,lowlight,79,2024.0
4,npu,normal,57,2979.03
5,pu,,108262,3807951.95
6,pu,heavy,31422,2051676.58
7,pu,light,21233,932930.31
8,pu,lowlight,38807,1461190.03
9,pu,normal,33742,1842144.92


In [49]:
# Same npu/spend-group breakdown, restricted to pubg_region == "CN" after the join.
(
    cash2022
    .join(gcoin_group_2022, "account_id", "left")
    .filter(col("pubg_region") == "CN")
    .groupBy("npu", "group")
    .agg(
        countDistinct("account_id").alias("user_cnt"),
        sum("ingame_revenue").alias("ingame_revenue"),
    )
    .orderBy("npu", "group")
    .toPandas()
)

Unnamed: 0,npu,group,user_cnt,ingame_revenue
0,npu,,10302,245363.81
1,npu,heavy,31,1704.41
2,npu,light,52,1939.25
3,npu,lowlight,77,1994.02
4,npu,normal,56,2954.04
5,pu,,80275,2984962.83
6,pu,heavy,25378,1688343.3
7,pu,light,15970,736166.3
8,pu,lowlight,30299,1200611.52
9,pu,normal,27087,1524601.66


In [100]:
# Persist the spend group on cash2022 itself; the left join leaves "group" null for
# buyers that have no row in gcoin_group_2022.
cash2022 = cash2022.join(
    gcoin_group_2022.select("account_id", "group"),
    "account_id",
    "left",
)

In [84]:
def _weekly_gcoin_use(start, end, label):
    """Aggregate gcoin_use per account for one date range and tag it with a week label.

    Args:
        start: first date passed to load_data_mart ('YYYY-MM-DD').
        end: last date passed to load_data_mart ('YYYY-MM-DD').
        label: literal stored in the "wk" column for every row.

    Returns:
        DataFrame with account_id, paid_use (sum of paid_use),
        total_use (sum of paid_use + free_use) and wk.
    """
    return (
        load_data_mart('pc', start, end, "gcoin_use")
        .groupBy("account_id")
        .agg(
            # `sum` is the Spark aggregate brought in by the star import, not the builtin.
            sum("paid_use").alias("paid_use"),
            sum(col("paid_use") + col("free_use")).alias("total_use"),
        )
        .withColumn("wk", lit(label))
    )


# Four consecutive 7-day ranges, two starting before 2022-11-23 and two from it on.
# Previously the same pipeline was copy-pasted once per range.
gcoin_w_2 = _weekly_gcoin_use("2022-11-09", "2022-11-15", "w_2")
gcoin_w_1 = _weekly_gcoin_use("2022-11-16", "2022-11-22", "w_1")
gcoin_w__1 = _weekly_gcoin_use("2022-11-23", "2022-11-29", "w+1")
gcoin_w__2 = _weekly_gcoin_use("2022-11-30", "2022-12-06", "w+2")

gcoin_wk = gcoin_w_2.unionByName(gcoin_w_1).unionByName(gcoin_w__1).unionByName(gcoin_w__2)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36149)
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36149)
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception

In [52]:
# Weekly paid/total G-coin use of the event buyers, per npu/spend-group/week.
# The inner join keeps only buyers that have usage rows in gcoin_wk.
(
    cash2022
    .select("account_id", "npu", "group")
    .distinct()
    .join(gcoin_wk, "account_id")
    .groupBy("npu", "group", "wk")
    .agg(sum("paid_use"), sum("total_use"))
    .orderBy("npu", "group", "wk")
    .toPandas()
)

Unnamed: 0,npu,group,wk,sum(paid_use),sum(total_use)
0,npu,,w+1,33258840,67016320
1,npu,,w+2,8338130,13668770
2,npu,,w_1,439380,1385270
3,npu,,w_2,261570,884250
4,npu,heavy,w+1,274230,486900
5,npu,heavy,w+2,234040,304920
6,npu,heavy,w_1,22360,29000
7,npu,heavy,w_2,191110,213140
8,npu,light,w+1,201110,410010
9,npu,light,w+2,33790,54920


In [85]:
# Same weekly usage breakdown, limited to buyers with pubg_region "CN" on the
# "STEAM" platform.
(
    cash2022
    .filter((col("pubg_region") == "CN") & (col("platform") == "STEAM"))
    .select("account_id", "npu", "group")
    .distinct()
    .join(gcoin_wk, "account_id")
    .groupBy("npu", "group", "wk")
    .agg(sum("paid_use"), sum("total_use"))
    .orderBy("npu", "group", "wk")
    .toPandas()
)

Unnamed: 0,npu,group,wk,sum(paid_use),sum(total_use)
0,npu,,w+1,25247190,51917780
1,npu,,w+2,6879290,11076790
2,npu,,w_1,424190,1155690
3,npu,,w_2,222670,719040
4,npu,heavy,w+1,264430,466740
5,npu,heavy,w+2,234040,304920
6,npu,heavy,w_1,22360,28040
7,npu,heavy,w_2,181110,202840
8,npu,light,w+1,198610,404110
9,npu,light,w+2,33790,54920


## Retention

In [66]:
# Retention base window (same dates as the cash2022 load).
start_date, end_date = "2022-11-23", "2022-11-28"

# LIVE-server users in the end_date snapshot whose last login is on or after start_date.
user = (
    load_data_mart("pc", end_date, end_date, "user_master")
    .filter((col("lastlogindate") >= start_date) & (col("server_type") == "LIVE"))
)

In [68]:
# Sanity check: number of distinct accounts among the event buyers.
double_user_2022.select(countDistinct("account_id")).show(truncate=False)

+--------------------------+
|count(DISTINCT account_id)|
+--------------------------+
|248444                    |
+--------------------------+



In [69]:
# Sanity check: number of distinct accounts in the active-user base frame.
user.select(countDistinct("accountid")).show(truncate=False)

+-------------------------+
|count(DISTINCT accountid)|
+-------------------------+
|4025640                  |
+-------------------------+



In [70]:
# Active users that are NOT event buyers: the left-anti join keeps only accounts
# with no match in double_user_2022.
non_double_user_2022 = (
    user
    .withColumnRenamed("accountid", "account_id")
    .join(double_user_2022, "account_id", "leftanti")
    .select("account_id")
    .distinct()
)

In [71]:
# Size of the non-buyer comparison cohort (one row per account after distinct()).
non_double_user_2022.count()

3777196

In [72]:
# datetime/timedelta are not imported by any visible cell, so this cell relied on
# leftover kernel state; import them here so it runs on a fresh kernel.
from datetime import datetime, timedelta


def _days_after(date_str, days):
    """Return the 'YYYY-MM-DD' date that lies `days` days after `date_str`."""
    return (datetime.strptime(date_str, '%Y-%m-%d') + timedelta(days)).strftime('%Y-%m-%d')


# Retention checkpoints measured from end_date.
d1 = _days_after(end_date, 1)
d3 = _days_after(end_date, 3)
d7 = _days_after(end_date, 7)
d14 = _days_after(end_date, 14)
d28 = _days_after(end_date, 28)

In [77]:
# Display the computed checkpoint dates for a visual check.
[d1, d3, d7, d14, d28]

['2022-11-29', '2022-12-01', '2022-12-05', '2022-12-12', '2022-12-26']

In [78]:
# LIVE-server users whose last login is exactly end_date (the D+0 cohort), keyed
# on "account_id".
end_date_user = (
    load_data_mart("pc", end_date, end_date, "user_master")
    .filter((col("lastlogindate") == end_date) & (col("server_type") == "LIVE"))
    .withColumnRenamed("accountid", "account_id")
)

# D+0 count of event buyers.
(
    end_date_user
    .join(double_user_2022, "account_id")
    .select(countDistinct("account_id"))
    .show(truncate=False)
)

+--------------------------+
|count(DISTINCT account_id)|
+--------------------------+
|166850                    |
+--------------------------+



In [79]:
# D+0 count of non-buyers.
(
    end_date_user
    .join(non_double_user_2022, "account_id")
    .select(countDistinct("account_id"))
    .show(truncate=False)
)

+--------------------------+
|count(DISTINCT account_id)|
+--------------------------+
|1404187                   |
+--------------------------+



In [74]:
# Row labels for the retention table; the count lists below are filled in the same order.
retention = {"date": ["D+1", "D+3", "D+7", "D+14", "D+28"]}
pu_list, non_pu_list = [], []

for target_date in (d1, d3, d7, d14, d28):
    # LIVE-server users in the target_date snapshot with a last login on/after that date.
    logged_in = (col("lastlogindate") >= target_date) & (col("server_type") == "LIVE")
    user_df = (
        load_data_mart('pc', target_date, target_date, "user_master")
        .where(logged_in)
        .withColumnRenamed("accountid", "account_id")
    )

    # Distinct accounts from each cohort that appear among those users.
    pu = (
        user_df
        .join(double_user_2022, "account_id")
        .select(countDistinct("account_id"))
        .collect()[0][0]
    )
    non_pu = (
        user_df
        .join(non_double_user_2022, "account_id")
        .select(countDistinct("account_id"))
        .collect()[0][0]
    )

    pu_list.append(pu)
    non_pu_list.append(non_pu)

In [75]:
# Add the two count columns and turn the dict into a pandas table
# (columns: date, pu, non_pu).
retention.update({"pu": pu_list, "non_pu": non_pu_list})
retention_pd = pd.DataFrame.from_dict(retention)

In [76]:
# Display the retention table (buyer vs. non-buyer counts per checkpoint).
retention_pd

Unnamed: 0,date,pu,non_pu
0,D+1,162345,1120317
1,D+3,154656,999513
2,D+7,140750,921562
3,D+14,146256,856297
4,D+28,131947,775494


## TOP Product

In [101]:
# CN / STEAM event buyers with their npu label and spend group. For "npu" rows the
# group is forced to null, so those buyers form a single bucket.
steam_cn_user = (
    cash2022
    .filter((col("pubg_region") == "CN") & (col("platform") == "STEAM"))
    .withColumn(
        "group",
        when(col("npu") == "npu", lit(None)).otherwise(col("group")),
    )
    .select("account_id", "npu", "group")
    .distinct()
)

In [102]:
# G-coin use from 2022-11-23 to 2022-12-27 by the CN / STEAM buyers, summed per
# npu / spend-group / product.
gcoin_by_product = (
    load_data_mart("pc", "2022-11-23", "2022-12-27", "gcoin_use")
    .join(steam_cn_user, "account_id")
    .groupBy("npu", "group", "product_name")
    .agg(
        sum('paid_use').alias("paid_use"),
        sum(col("paid_use") + col("free_use")).alias("total_use"),
    )
)

from pyspark.sql import SparkSession, Window
# window_paid = Window.partitionBy("npu", "group").orderBy(desc("paid_use"))

# gcoin_by_product_paid = gcoin_by_product\
#     .withColumn("row_number", row_number().over(window_paid))\
#     .where("row_number <= 5")
# gcoin_by_product_paid.toPandas().to_csv("./gcoin_by_product_paid_2022.csv", index=False)

In [103]:
# Rank products by total_use (descending) within each npu/group and keep the top
# five; the rank stays in the export as the "row_number" column.
window_total = Window.partitionBy("npu", "group").orderBy(desc("total_use"))

gcoin_by_product_total = (
    gcoin_by_product
    .withColumn("row_number", row_number().over(window_total))
    .where("row_number <= 5")
)
gcoin_by_product_total.toPandas().to_csv("./gcoin_by_product_total_2022.csv", index=False)