In [2]:
from pubg_util import mysql, load_schema, notifier
from sphynx import sphynx, NODE_SMALL, NODE_MEDIUM, NODE_LARGE
from pyspark.sql.functions import *
import pandas as pd
import pickle

# Obtain the Spark session from the project `sphynx` helper. The recorded run
# below shows it provisioning a new cluster with 8 executors of 48G RAM each.
spark = sphynx.get_spark(
    app_name='west0_mclaren',
    node_spec=NODE_LARGE,
    executor_count=8,
)

Spark cluster not assigned. creating a new one...
Node spec: 8 executors with 48G RAM each
Job Port 4049 is assigned for requested cluster
Waiting for Spark master to be available...
Spark master launched!
Creating new Spark session, name: west0_mclaren...
Waiting for all executors ready...
All executors connected!
Complete! elapsed time: 00:00:25


In [63]:
# Tear down the Spark session and cluster (see the recorded output below).
# NOTE(review): this cell carries the highest execution count (In [63]) but is
# positioned before every analysis cell; on a top-to-bottom run it would stop
# Spark before it is used. Consider moving it to the end of the notebook.
sphynx.stop()

Stopping Spark session...
Destroying Spark cluster...




Done!


In [1]:
# Pre-period window, as YYYY-MM-DD strings. load_data_mart reads both the
# start and the end date, so the bounds are inclusive.
pre_start_date, pre_end_date = "2022-07-13", "2022-09-06"

# Post-period window: starts the day after the pre period ends and has two
# separate end dates (presumably one per metric: gcoin vs. craft — confirm).
post_start_date = "2022-09-07"
post_gcoin_end_date, post_craft_end_date = "2022-11-02", "2022-11-07"

In [26]:
# Load the pre-period G-Coin usage rows.
# load_data_mart(table, startdate, enddate, device) reads from
# data_mart/<table>/<date>/<DEVICE>, so the mart name goes in `table` and the
# platform in `device`. Keyword arguments are used because both are plain
# strings and easy to transpose.
# NOTE(review): assumes the mart is stored under gcoin_use/<date>/PC — confirm
# against the bucket layout.
gcoin = load_data_mart(
    table="gcoin_use",
    startdate=pre_start_date,
    enddate=pre_end_date,
    device="pc",
)

In [4]:
# Split users by how much they spent during the pre period: one row per
# account with its paid usage and its total (paid + free) usage.
gcoin_by_user = (
    gcoin
    .groupBy("account_id")
    .agg(
        sum('paid_use').alias("paid_use"),
        sum(col("paid_use") + col("free_use")).alias("total_use"),
    )
)

In [27]:
# Show the 20th/40th/60th/80th percentiles of per-user total usage, one
# column per quantile, each labelled with the quantile itself.
gcoin_by_user.select(
    *(
        expr('percentile(total_use, {q})'.format(q=q)).alias(q)
        for q in ("0.2", "0.4", "0.6", "0.8")
    )
).show(truncate=False)

+-----+-----+------+------+
|0.2  |0.4  |0.6   |0.8   |
+-----+-----+------+------+
|390.0|580.0|1230.0|2090.0|
+-----+-----+------+------+



In [29]:
# Inspect the raw usage rows of accounts whose pre-period total is below 100.
gcoin.join(
    gcoin_by_user.filter(col("total_use") < 100),
    on="account_id",
).show(truncate=False)

+----------------------------------------+----------+--------+-----------------+--------+------------+--------+------------+-----+---+-------------+----------------------------+-------------------+---------------------------------------------------------+------------------+----------------+-----------------------+------+--------+-----------------------+-----------+-------------------------+------------+--------+------------+----------+----------+----+-----+----+------+-------------------+-------------------+---------------+-------------------+--------+---------+
|account_id                              |date      |platform|product_id       |free_use|free_balance|paid_use|paid_balance|price|qty|reason       |time                        |transaction_id     |gcoin_by_expiry                                          |type              |is_salesid_exist|sales_id_              |device|currency|sales_id               |exchange_id|product_name             |product_type|category|sub_category|eve

In [30]:
# Bucket every account by its pre-period total usage:
#   A: <= 2000 | B: (2000, 10000] | C: (10000, 20000]
#   D: (20000, 40000] | E: > 40000
# Rows matching no bucket (e.g. a null total) get a null group.
gcoin_by_user = gcoin_by_user.withColumn(
    "gcoin_use_group",
    when(col("total_use") <= 2000, lit("A"))
    .when((col("total_use") > 2000) & (col("total_use") <= 10000), lit("B"))
    .when((col("total_use") > 10000) & (col("total_use") <= 20000), lit("C"))
    .when((col("total_use") > 20000) & (col("total_use") <= 40000), lit("D"))
    .when(col("total_use") > 40000, lit("E"))
    .otherwise(None),
)

In [32]:
# Total (paid + free) usage per account on the "202207_workshop" event.
workshop_gcoin = (
    gcoin
    .where(col("event_name") == "202207_workshop")
    .groupBy("account_id")
    .agg(sum(col("paid_use") + col("free_use")).alias("workshop_gcoin"))
)

# Left-join so accounts without workshop usage are kept with a null amount,
# then flag participation: "0" when there is no workshop usage, "1" otherwise.
# Both branches must differ, otherwise the flag is constant and carries no
# information.
gcoin_by_user = (
    gcoin_by_user
    .join(workshop_gcoin, "account_id", "left")
    .withColumn(
        "if_workshop",
        when(col("workshop_gcoin").isNull(), lit("0")).otherwise(lit("1")),
    )
)

In [35]:
# List the distinct event types present in the pre-period usage rows.
# NOTE(review): the recorded output shows `other` and `streamer` twice, which
# suggests values that differ only in whitespace or invisible characters —
# worth checking before grouping on this column.
(
    gcoin
    .select("event_type")
    .distinct()
    .show(truncate=False)
)

+-------------+
|event_type   |
+-------------+
|yourshop     |
|other        |
|wsus         |
|other        |
|survivorpass |
|workshop     |
|streamer     |
|esports      |
|streamer     |
|holiday      |
|collaboration|
+-------------+



In [38]:
from pyspark.sql.window import Window

# Total (paid + free) usage per account and event type.
gcoin_by_user_and_type = (
    gcoin
    .groupBy("account_id", "event_type")
    .agg(sum(col("paid_use") + col("free_use")).alias("type_total_use"))
)

# Per-account ranking of event types by usage, highest first. The event type
# is a secondary key so that ties on usage are ranked the same way on every
# run; without it the row picked by row_number() is arbitrary among ties.
windowDept = Window.partitionBy("account_id").orderBy(
    col("type_total_use").desc(),
    col("event_type").asc(),
)

In [40]:
# Keep, for every account, only the first-ranked event type according to
# `windowDept`; the helper rank column is dropped afterwards.
user_top_type = (
    gcoin_by_user_and_type
    .withColumn("row", row_number().over(windowDept))
    .where(col("row") == 1)
    .drop("row")
)

In [42]:
# Attach each account's top event type (and its usage) to the per-user frame,
# exposing the event type under the name `main_use_event_type`.
gcoin_by_user = (
    gcoin_by_user
    .join(user_top_type, on="account_id", how="left")
    .withColumnRenamed("event_type", "main_use_event_type")
)

In [43]:
# Publish the per-user frame as MySQL table labs.gcoin_by_user: drop the
# table first, then insert the current contents.
# NOTE(review): drop_table/insert_table come from pubg_util and their exact
# semantics (e.g. behaviour when the table is absent) are not visible here.
mysql.drop_table("labs", "gcoin_by_user")
mysql.insert_table(gcoin_by_user, "labs", "gcoin_by_user")

In [44]:
def load_data_mart(table, startdate, enddate, device=None):
    """Load and union the daily parquet partitions of a data-mart table.

    One partition is read per calendar day from ``startdate`` to ``enddate``
    (both inclusive) at
    ``s3a://pubg-log-labs/data_mart/<table>/<YYYY-MM-DD>[/<DEVICE>]`` using
    the notebook-level ``spark`` session, and the daily frames are combined
    with ``unionAll``.

    Args:
        table: Data-mart table name (first path component under data_mart).
        startdate: First day to load, formatted ``YYYY-MM-DD``.
        enddate: Last day to load, formatted ``YYYY-MM-DD``.
        device: Optional sub-directory under each day; it is upper-cased
            before being appended to the path. ``None`` reads the day
            directory itself.

    Returns:
        The union of every partition that could be loaded, or ``None`` when
        nothing was loaded (empty date range or every partition failed).

    Raises:
        ValueError: If ``startdate`` or ``enddate`` does not parse as
            ``YYYY-MM-DD``.

    Loading is best-effort: a partition that cannot be read or unioned is
    skipped. Skipped partitions are reported through a single ``UserWarning``
    instead of being dropped silently.
    """
    # Imported locally so the function does not depend on another cell having
    # brought these names into the kernel namespace.
    import warnings
    from datetime import datetime, timedelta

    date_format = '%Y-%m-%d'
    first_day = datetime.strptime(startdate, date_format)
    last_day = datetime.strptime(enddate, date_format)
    suffix = "" if device is None else "/" + device.upper()

    df = None
    skipped = []
    for offset in range((last_day - first_day).days + 1):
        day = (first_day + timedelta(days=offset)).strftime(date_format)
        path = "s3a://pubg-log-labs/data_mart/{table}/{startdate}".format(
            table=table, startdate=day
        ) + suffix
        try:
            daily = spark.read.parquet(path)
            df = daily if df is None else df.unionAll(daily)
        except Exception as e:
            # Keep going on a bad or missing day, but remember it for the report.
            skipped.append("{path} ({error})".format(path=path, error=e))

    if skipped:
        warnings.warn(
            "load_data_mart skipped {count} partition(s): {details}".format(
                count=len(skipped), details="; ".join(skipped)
            ),
            stacklevel=2,
        )
    return df


In [56]:
# Load the pre-period gameplay mart; no device is given, so each day's
# directory is read as a whole.
gameplay = load_data_mart(
    table='official_gameplay_master',
    startdate=pre_start_date,
    enddate=pre_end_date,
)

In [57]:
# List the distinct platforms present in the gameplay rows.
(
    gameplay
    .select("Platform")
    .distinct()
    .show(truncate=False)
)

+--------+
|Platform|
+--------+
|KAKAO   |
|STEAM   |
+--------+



In [58]:
# Per-account play summary over the loaded period:
#   play_date_cnt - number of distinct dates with a gameplay row
#   play_cnt      - play counts summed over the six TPP/FPP x Solo/Duo/Squad columns
#   play_min      - game minutes summed over the same six mode columns
gameplay_df = gameplay.groupBy("AccountId").agg(
    countDistinct("date").alias("play_date_cnt"),
    sum(
        col("TppSoloPlayCount") + col("TppDuoPlayCount") + col("TppSquadPlayCount")
        + col("FppSoloPlayCount") + col("FppDuoPlayCount") + col("FppSquadPlayCount")
    ).alias("play_cnt"),
    sum(
        col("TppSoloGameMinute") + col("TppDuoGameMinute") + col("TppSquadGameMinute")
        + col("FppSoloGameMinute") + col("FppDuoGameMinute") + col("FppSquadGameMinute")
    ).alias("play_min"),
)

In [59]:
# Combine spending and gameplay per account. The full outer join keeps
# accounts that appear on only one side, with nulls for the other side.
df = gcoin_by_user.join(
    gameplay_df.withColumnRenamed("AccountId", "account_id"),
    on="account_id",
    how="full_outer",
)

In [62]:
# Re-publish MySQL table labs.gcoin_by_user, this time from `df` (the per-user
# spending frame joined with the gameplay summary): drop, then insert.
# NOTE(review): this reuses the table name written by the earlier export cell,
# so that earlier content is replaced — confirm this is intended.
mysql.drop_table("labs", "gcoin_by_user")
mysql.insert_table(df, "labs", "gcoin_by_user")