In [39]:
from pubg_util import mysql, load_schema, notifier
from sphynx import sphynx, NODE_SMALL, NODE_MEDIUM, NODE_LARGE
from pyspark.sql.functions import *
import pandas as pd
import pickle

# Obtain the Spark session from sphynx; the cells below all go through `spark`.
spark = sphynx.get_spark(
    executor_count=8,
    app_name='west0_mclaren_m1',
    node_spec=NODE_LARGE,
)

Spark cluster not assigned. creating a new one...
Node spec: 8 executors with 48G RAM each
Job Port 4048 is assigned for requested cluster
Waiting for Spark master to be available...
Spark master launched!
Creating new Spark session, name: west0_mclaren_m1...
Waiting for all executors ready...
3/8 executors ready
All executors connected!
Complete! elapsed time: 00:00:46


In [62]:
# NOTE(review): this cell shows execution count 62, higher than every cell visible
# below it, yet it sits ahead of the analysis cells that use `spark`. Run
# top-to-bottom it would stop the session before they execute -- presumably it
# belongs at the very end of the notebook; confirm before using "Run All".
sphynx.stop()

Stopping Spark session...
Destroying Spark cluster...
Done!


In [40]:
from datetime import datetime, timedelta

# Date bounds for the analysis, as 'YYYY-MM-DD' strings.
# Start date passed to both load_schema.lobby calls below.
post_start_date = "2022-09-07"
# Date of the pass_master partition that is loaded, and end date of the RewardLevel load.
post_pass_end_date = "2022-11-01"
# End date of the WorkshopCrateIncreased load.
post_craft_end_date = "2022-11-07"

In [41]:
def pass_user(startdate, enddate):
    """Read the daily ``pass_user_meta`` parquet partitions and stack them.

    One partition is read per calendar day from ``startdate`` through
    ``enddate`` (both inclusive) and the results are combined with
    ``DataFrame.union`` in chronological order.

    Args:
        startdate: First day to load, formatted as 'YYYY-MM-DD'.
        enddate: Last day to load, formatted as 'YYYY-MM-DD'.

    Returns:
        The unioned DataFrame, or ``None`` when ``startdate`` is later than
        ``enddate`` (nothing is read in that case).

    Raises:
        ValueError: If either argument does not match '%Y-%m-%d'.
    """
    base_path = "s3a://pubg-log-labs/data_mart/pass_user_meta/"
    first_day = datetime.strptime(startdate, '%Y-%m-%d')
    last_day = datetime.strptime(enddate, '%Y-%m-%d')

    combined = None
    # Both bounds parse to midnight, so the gap is a whole number of days;
    # a negative gap yields an empty range and leaves `combined` as None.
    for offset in range((last_day - first_day).days + 1):
        day_label = (first_day + timedelta(days=offset)).strftime('%Y-%m-%d')
        daily = spark.read.parquet(base_path + day_label)
        combined = daily if combined is None else combined.union(daily)
    return combined


In [42]:
def load_pass(startdate, enddate):
    """Read the daily PC ``pass_master`` parquet partitions and stack them.

    Walks day by day from ``startdate`` to ``enddate`` (inclusive), reading
    one partition per day and combining them with ``DataFrame.union``.

    Args:
        startdate: First day to load, formatted as 'YYYY-MM-DD'.
        enddate: Last day to load, formatted as 'YYYY-MM-DD'.

    Returns:
        The unioned DataFrame, or ``None`` if ``startdate`` falls after
        ``enddate`` (no partition is read).

    Raises:
        ValueError: If either argument does not match '%Y-%m-%d'.
    """
    root = "s3a://pubg-log-labs/data_mart/pass_master/pc/"
    cursor = datetime.strptime(startdate, '%Y-%m-%d')
    final_day = datetime.strptime(enddate, '%Y-%m-%d')
    one_day = timedelta(days = 1)

    merged = None
    while not cursor > final_day:
        snapshot = spark.read.parquet(root + cursor.strftime('%Y-%m-%d'))
        if merged is None:
            merged = snapshot
        else:
            merged = merged.union(snapshot)
        cursor += one_day
    return merged


In [43]:
# Start and end are the same date, so exactly one daily pass_master partition is read.
pass_master = load_pass(post_pass_end_date, post_pass_end_date)

In [44]:
# Restrict the snapshot to rows whose period_type is "mclaren".
mclaren_pass_master = pass_master.filter(col("period_type") == "mclaren")

In [45]:
# Distinct AccountId count for each period_status value.
(
    mclaren_pass_master
    .groupBy("period_status")
    .agg(countDistinct("AccountId"))
    .show(truncate=False)
)

+-------------+-------------------------+
|period_status|count(DISTINCT AccountId)|
+-------------+-------------------------+
|active       |13466229                 |
|non_active   |91974395                 |
|pass         |481939                   |
+-------------+-------------------------+



- active: 접속하였지만 pass를 구매하지 않은 유저  
- non_active: 접속하지 않은 유저  
- pass: pass를 가지고 있는 유저  

In [46]:
# Per the legend in this notebook, "active" means the user logged in but did not buy the pass.
active_mclaren_pass_user = mclaren_pass_master.filter(col("period_status") == "active")

In [47]:
## Pass Reward
# RewardLevel lobby logs between post_start_date and post_pass_end_date.
# NOTE(review): "pc" / "live" are presumably platform and environment -- confirm
# against load_schema.lobby's signature.
reward_level = load_schema.lobby(
    spark,
    "pc",
    "live",
    "RewardLevel",
    post_start_date,
    post_pass_end_date,
)

In [48]:
# Join on AccountId with the default join type: keeps the RewardLevel rows
# that belong to the "active" users selected earlier.
active_mclaren_pass_user_reward = reward_level.join(
    active_mclaren_pass_user,
    "AccountId",
)

In [49]:
# One row per account that has at least one RewardLevel row after the join.
free_pass_user = (
    active_mclaren_pass_user_reward
    .select("AccountId")
    .distinct()
)

In [50]:
# event loot cache = itemdesc.14300005
# WorkshopCrateIncreased lobby logs between post_start_date and post_craft_end_date.
crate_increased = load_schema.lobby(
    spark,
    "pc",
    "live",
    "WorkshopCrateIncreased",
    post_start_date,
    post_craft_end_date,
)

In [24]:
# Total Amount and distinct accounts over the negative-Amount rows of the event
# loot cache item (presumably crates being spent -- confirm the sign convention).
# `sum` here is pyspark.sql.functions.sum, brought in by the wildcard import,
# not the Python builtin.
(
    crate_increased
    .where((col("ItemDescId") == "itemdesc.14300005") & (col("Amount") < 0))
    .select(sum("Amount"), countDistinct("AccountId"))
    .show(truncate=False)
)

+-----------+-------------------------+
|sum(Amount)|count(DISTINCT AccountId)|
+-----------+-------------------------+
|-47952309  |1826885                  |
+-----------+-------------------------+



In [52]:
# Distinct accounts with at least one negative-Amount row for the event loot cache item.
event_loot_cache_user = (
    crate_increased
    .where((col("ItemDescId") == "itemdesc.14300005") & (col("Amount") < 0))
    .select("AccountId")
    .distinct()
)

## M-1 User

In [53]:
# Previously built user groups, read from the "labs" MySQL database; both are
# used further down to exclude their accounts from the M-1 group.
p_user = mysql.read_table(spark, "labs", "pass_user_total")
m2_user = mysql.read_table(spark, "labs", "m2_user")

In [55]:
# Row count vs. distinct account_id per group; equal numbers mean no duplicate accounts.
(
    p_user
    .groupBy("group")
    .agg(count("*"), countDistinct("account_id"))
    .show(truncate=False)
)

+-----+--------+--------------------------+
|group|count(1)|count(DISTINCT account_id)|
+-----+--------+--------------------------+
|P-1  |147774  |147774                    |
|P-3  |277093  |277093                    |
|P-2  |56730   |56730                     |
+-----+--------+--------------------------+



In [26]:
# Bare expression: the cell output is just the DataFrame repr (column names and types).
free_pass_user

DataFrame[AccountId: string]

In [27]:
# Bare expression: the cell output is just the DataFrame repr (column names and types).
event_loot_cache_user

DataFrame[AccountId: string]

In [54]:
# Tag each source with a `history` label and stack them. Nothing is deduplicated
# here, so an account found in both sources ends up with two rows.
m1_user_tmp = (
    free_pass_user
    .withColumn("history", lit("free_pass"))
    .unionByName(
        event_loot_cache_user.withColumn("history", lit("event_loot_cache"))
    )
)

In [30]:
# Total rows vs. distinct accounts in the stacked candidate set.
(
    m1_user_tmp
    .select(count("*"), countDistinct("AccountId"))
    .show(truncate=False)
)

+--------+-------------------------+
|count(1)|count(DISTINCT AccountId)|
+--------+-------------------------+
|5116143 |3750342                  |
+--------+-------------------------+



In [57]:
# Rename the key to match the MySQL tables, then anti-join twice so that any
# account already present in p_user or in m2_user is removed.
m1_user = (
    m1_user_tmp
    .withColumnRenamed("AccountId", "account_id")
    .join(p_user, "account_id", "leftanti")
    .join(m2_user, "account_id", "leftanti")
)

In [59]:
# Total rows vs. distinct accounts left after the two exclusions.
(
    m1_user
    .select(count("*"), countDistinct("account_id"))
    .show(truncate=False)
)

+--------+--------------------------+
|count(1)|count(DISTINCT account_id)|
+--------+--------------------------+
|4433658 |3167117                   |
+--------+--------------------------+



In [61]:
# Label every remaining row as group "M-1", then replace the labs.m1_user table:
# the existing table is dropped first and the labelled frame is inserted afterwards.
# NOTE(review): m1_user is not unique on account_id (the count cell above shows
# more rows than distinct accounts, one row per `history` value) -- confirm the
# table is meant to hold multiple rows per account.
m1_user = m1_user.withColumn("group", lit("M-1"))
mysql.drop_table("labs", "m1_user")
mysql.insert_table(m1_user, "labs", "m1_user")