In [362]:
from pubg_util import mysql, load_schema, notifier
from sphynx import sphynx, NODE_SMALL, NODE_MEDIUM, NODE_LARGE
from pyspark.sql.functions import *
import pandas as pd
import pickle

spark = sphynx.get_spark(executor_count=8, app_name='west0_pgc', node_spec=NODE_LARGE)

Spark cluster not assigned. creating a new one...
Node spec: 8 executors with 48G RAM each
Job Port 4049 is assigned for requested cluster
Waiting for Spark master to be available...
Spark master launched!
Creating new Spark session, name: west0_pgc...
Waiting for all executors ready...
All executors connected!
Complete! elapsed time: 00:00:37


In [364]:
start_date = "2022-10-26"
end_date = "2022-12-07"
pc_gcoin = load_data_mart("pc", start_date, end_date, "gcoin_use").where(col("event_name").like("%pgc%"))
console_gcoin = load_data_mart("console", start_date, end_date, "gcoin_use").where(col("event_name").like("%pgc%"))

In [372]:
pc_gcoin_by_user = pc_gcoin.groupBy("account_id").agg(min("date").alias("first_buy_date"), min("time").alias("first_buy_time"), countDistinct("product_id").alias("buy_cnt"))

In [215]:
pc_gcoin_by_user.groupBy("first_buy_date").agg(countDistinct("account_id").alias("user_cnt")).orderBy("first_buy_date").toPandas().to_csv("./user_cnt_by_first_buy_date.csv", index=False)

In [216]:
pc_gcoin_by_user.groupBy("buy_cnt").agg(countDistinct("account_id")).show(truncate=False)

+-------+--------------------------+
|buy_cnt|count(DISTINCT account_id)|
+-------+--------------------------+
|1      |166258                    |
|3      |276                       |
|2      |6525                      |
|4      |118                       |
+-------+--------------------------+



In [296]:
cash = load_data_mart("pc", "2022-11-23", "2022-11-28", "cash_mtx")

In [297]:
double_gcoin = cash.where(col("event_name") == "202211_gcoinx2")

In [225]:
double_gcoin = double_gcoin.withColumnRenamed("account_id", "cash_account_id")
# double gcoin 구매 이후 pgc 상품 구매한 유저
pc_gcoin.join(double_gcoin, (pc_gcoin.account_id == double_gcoin.cash_account_id) & (pc_gcoin.time > double_gcoin.time)) \
    .select(countDistinct("account_id")).show(truncate=False)

+--------------------------+
|count(DISTINCT account_id)|
+--------------------------+
|39991                     |
+--------------------------+



In [300]:
double_gcoin = double_gcoin.withColumnRenamed("account_id", "cash_account_id")
double_gcoin_pgc_user = pc_gcoin_by_user.join(double_gcoin, (pc_gcoin_by_user.account_id == double_gcoin.cash_account_id) & (pc_gcoin_by_user.first_buy_time > double_gcoin.time)) \
    .groupBy("account_id").agg(min("first_buy_time").alias("pgc_by_time"))

In [None]:
double_gcoin_pgc_user = pc_gcoin_by_user.join(double_gcoin, (pc_gcoin_by_user.account_id == double_gcoin.cash_account_id) & (pc_gcoin_by_user.first_buy_time > double_gcoin.time)) \
    .groupBy("account_id").agg(min("first_buy_time").alias("pgc_by_time"))

In [230]:
# double gcoin 구매 이후 찻 pgc 상품 구매한 유저
double_gcoin_pgc_user.select(countDistinct("account_id")).show(truncate=False)

+--------------------------+
|count(DISTINCT account_id)|
+--------------------------+
|39149                     |
+--------------------------+



In [283]:
pgc2021_gcoin = load_data_mart("pc", "2021-11-12", "2021-12-18", "gcoin_use").where(col("event_name").like("%pgc%"))

In [264]:
pgc2021_gcoin.groupBy("account_id").agg(countDistinct("product_id").alias("product_cnt")).groupBy("product_cnt").agg(countDistinct("account_id")).show(truncate=False)

+-----------+--------------------------+
|product_cnt|count(DISTINCT account_id)|
+-----------+--------------------------+
|1          |227963                    |
|3          |2110                      |
|2          |58219                     |
|4          |117                       |
+-----------+--------------------------+



In [57]:
pc_gcoin = pc_gcoin.withColumn("paid_account_id", when(col("paid_use") > 0, col("account_id")).otherwise(lit(None)))
console_gcoin = console_gcoin.withColumn("paid_account_id", when(col("paid_use") > 0, col("account_id")).otherwise(lit(None)))

In [53]:
# user 지역
from pyspark.sql.types import * 

def classify_country(country_os, country_ip):
    if country_os != 'CN':
        return country_ip
    else:
        return country_os

country_type_udf = udf(classify_country, StringType())

meta_region = mysql.read_table(spark, 'metainfo', 'meta_bi_regions')
pc_user = load_data_mart("pc", end_date, end_date, "user_master")
pc_user = pc_user.withColumn("country_new", country_type_udf("country_os", "country_ip"))
pc_user = pc_user.join(meta_region, pc_user.country_new == meta_region.country_code_iso2, "left")


console_user = load_data_mart("console", end_date, end_date, "user_master")
console_user = console_user.withColumn("country_new", country_type_udf("country_os", "country_ip"))
console_user = console_user.join(meta_region, console_user.country_new == meta_region.country_code_iso2, "left")

## PC

In [10]:
pc_user.where((col("lastlogindate") >= start_date) & (col("server_type") == "LIVE")).select(countDistinct("accountid").alias("pc_au")).show(truncate=False)

+--------+
|pc_au   |
+--------+
|10680789|
+--------+



In [6]:
pc_gcoin.select(countDistinct("paid_account_id").alias("paid_pu"), countDistinct("account_id").alias("pu"), sum("paid_use").alias("paid_use"), sum(col("paid_use") + col("free_use")).alias("total_use")).show(truncate=False)

+-------+------+---------+---------+
|paid_pu|pu    |paid_use |total_use|
+-------+------+---------+---------+
|168165 |173177|735800759|870188960|
+-------+------+---------+---------+



In [6]:
pc_user.where((col("lastlogindate") >= start_date) & (col("server_type") == "LIVE")).groupBy("pubg_region").agg(countDistinct("accountid").alias("au")).toPandas()

Unnamed: 0,pubg_region,au
0,CN,6859470
1,,313289
2,SA,326091
3,KR,1198589
4,SEA,552008
5,Undefined,1172
6,JP,34685
7,TW/HK,73907
8,CIS,958799
9,EMEA,946661


In [58]:
pc_gcoin.join(pc_user.select("accountid", "pubg_region").withColumnRenamed("accountid", "account_id"), "account_id", "left").groupBy("pubg_region").agg(countDistinct("paid_account_id").alias("paid_pu"), countDistinct("account_id").alias("pu"), sum("paid_use").alias("paid_use"), sum(col("paid_use") + col("free_use")).alias("total_use")).toPandas()

Unnamed: 0,pubg_region,paid_pu,pu,paid_use,total_use
0,CN,128611,131552,565253970,667682590
1,,3019,3205,12537730,14919500
2,SA,1345,1666,5056230,7419400
3,KR,18004,18511,81865160,93291330
4,SEA,3945,4083,16486640,20406370
5,JP,347,363,1525720,1780690
6,TW/HK,1212,1258,5531790,6427770
7,CIS,4402,4624,17815329,21759760
8,EMEA,6631,7246,26974760,33330830
9,OC,649,669,2753430,3170720


In [59]:
pc_gcoin.groupBy("product_id", "product_name").agg(sum("qty").alias("unit_sold"), sum("paid_use").alias("paid_use"), sum(col("paid_use") + col("free_use")).alias("total_use")).toPandas()

Unnamed: 0,product_id,product_name,unit_sold,paid_use,total_use
0,itemdesc.13000673,PGC 2022 MEGA BUNDLE + VOTING COUPON,115289,586016579,680205100
1,itemdesc.13000670,PGC 2022 IRIDESCENT DANGER BUNDLE + VOTING COUPON,21092,53987480,67494400
2,itemdesc.13000671,PGC 2022 TAC-TECH BUNDLE + VOTING COUPON,35613,90462660,113961600
3,itemdesc.13000672,PGC 2022 EMOTE SET,8614,5334040,8527860


### pgc 2021 vs pgc 2022 유저당 상품 구매 갯수

In [289]:
pgc2021_gcoin.groupBy("account_id").agg(collect_set("product_name").alias("products")).groupBy("products").agg(count("*").alias("user_cnt")).orderBy("user_cnt", ascending=False).toPandas() \
    .to_csv("./pgc2021_products_by_user.csv", index=False)

In [290]:
pc_gcoin.groupBy("account_id").agg(collect_set("product_name").alias("products")).groupBy("products").agg(count("*").alias("user_cnt")).orderBy("user_cnt", ascending=False).toPandas() \
    .to_csv("./pgc2022_products_by_user.csv", index=False)

In [284]:
pgc2021_by_user = pgc2021_gcoin.groupBy("account_id").agg(countDistinct("product_name").alias("cnt"), sum("paid_use").alias("paid_use"), sum(col("paid_use") + col("free_use")).alias("total_use"))
pgc2022_by_user = pc_gcoin.groupBy("account_id").agg(count("*").alias("cnt"), sum("paid_use").alias("paid_use"), sum(col("paid_use") + col("free_use")).alias("total_use"))

In [91]:
pgc2021_by_user.groupBy("cnt").agg(countDistinct("account_id").alias("user_cnt"), sum("total_use").alias("total_use")).withColumn("arppu", col("total_use")/col("user_cnt")).orderBy("cnt").toPandas()

Unnamed: 0,cnt,user_cnt,total_use,arppu
0,1,227963,980367960,4300.557371
1,2,58219,339902680,5838.346244
2,3,2110,15853780,7513.63981
3,4,117,1495260,12780.0


In [285]:
purchase_2 = pgc2021_gcoin.join(pgc2021_by_user.where(col("cnt") == 2).select("account_id"), "account_id").groupBy("account_id").agg(collect_set('product_name').alias("products"))
purchase_2.groupBy("products").agg(count("*").alias("user_cnt")).orderBy("user_cnt").show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------+--------+
|products                                                                                                            |user_cnt|
+--------------------------------------------------------------------------------------------------------------------+--------+
|[PGC 2021 CONTINGENT CHAOS JACKET BUNDLE + VOTING COUPONS, PGC 2021 MEGA BUNDLE + VOTING COUPONS]                   |165     |
|[PGC 2021 MEGA BUNDLE + VOTING COUPONS, PGC 2021 CONTINGENT CHAOS DUSTER BUNDLE + VOTING COUPONS]                   |291     |
|[PGC 2021 CONTINGENT CHAOS JACKET BUNDLE + VOTING COUPONS, PGC 2021 DANCE SPLASH EMOTE + VOTING COUPON BUNDLE]      |4113    |
|[PGC 2021 CONTINGENT CHAOS JACKET BUNDLE + VOTING COUPONS, PGC 2021 CONTINGENT CHAOS DUSTER BUNDLE + VOTING COUPONS]|4308    |
|[PGC 2021 DANCE SPLASH EMOTE + VOTING COUPON BUNDLE, PGC 2021 CONTINGENT CHAOS DUSTER BUNDLE + VOTING C

In [89]:
purchase_3 = pgc2021_gcoin.join(pgc2021_by_user.where(col("cnt") == 3).select("account_id"), "account_id").groupBy("account_id").agg(collect_set('product_name').alias("products"))

In [90]:
purchase_3.groupBy("products").agg(count("*").alias("user_cnt")).orderBy("user_cnt").show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+
|products                                                                                                                                                                |user_cnt|
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+
|[PGC 2021 CONTINGENT CHAOS JACKET BUNDLE + VOTING COUPONS, PGC 2021 MEGA BUNDLE + VOTING COUPONS, PGC 2021 CONTINGENT CHAOS DUSTER BUNDLE + VOTING COUPONS]             |24      |
|[PGC 2021 CONTINGENT CHAOS JACKET BUNDLE + VOTING COUPONS, PGC 2021 DANCE SPLASH EMOTE + VOTING COUPON BUNDLE, PGC 2021 MEGA BUNDLE + VOTING COUPONS]                   |187     |
|[PGC 2021 DANCE SPLASH EMOTE + VOTING COUPON BUNDLE, PGC 2021 MEGA BUNDLE + VOTING COUPONS, PGC 202

In [77]:
pgc2022_by_user.groupBy("cnt").agg(countDistinct("account_id").alias("user_cnt"), sum("total_use").alias("total_use")).withColumn("arppu", col("total_use")/col("user_cnt")).orderBy("cnt").toPandas()

Unnamed: 0,cnt,user_cnt,total_use,arppu
0,1,166258,820477980,4934.968423
1,2,6525,45214720,6929.459004
2,3,276,2928040,10608.84058
3,4,118,1568220,13290.0


In [85]:
purchase_3_2022 = pc_gcoin.join(pgc2022_by_user.where(col("cnt") == 3).select("account_id"), "account_id").groupBy("account_id").agg(collect_set('product_name').alias("products"))
purchase_3_2022.groupBy("products").agg(count("*").alias("user_cnt")).orderBy("user_cnt").show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------+--------+
|products                                                                                                                           |user_cnt|
+-----------------------------------------------------------------------------------------------------------------------------------+--------+
|[PGC 2022 IRIDESCENT DANGER BUNDLE + VOTING COUPON, PGC 2022 TAC-TECH BUNDLE + VOTING COUPON, PGC 2022 EMOTE SET]                  |37      |
|[PGC 2022 IRIDESCENT DANGER BUNDLE + VOTING COUPON, PGC 2022 MEGA BUNDLE + VOTING COUPON, PGC 2022 EMOTE SET]                      |40      |
|[PGC 2022 MEGA BUNDLE + VOTING COUPON, PGC 2022 TAC-TECH BUNDLE + VOTING COUPON, PGC 2022 EMOTE SET]                               |89      |
|[PGC 2022 IRIDESCENT DANGER BUNDLE + VOTING COUPON, PGC 2022 MEGA BUNDLE + VOTING COUPON, PGC 2022 TAC-TECH BUNDLE + VOTING COUPON]|110     |

## Console

In [23]:
console_user.where((col("lastlogindate") >= start_date) & (col("server_type") == "LIVE")).select(countDistinct("accountid").alias("console_au")).show(truncate=False)

+----------+
|console_au|
+----------+
|1652826   |
+----------+



In [7]:
console_gcoin.select(countDistinct("paid_account_id").alias("paid_pu"), countDistinct("account_id").alias("pu"), sum("paid_use").alias("paid_use"), sum(col("paid_use") + col("free_use")).alias("total_use")).show(truncate=False)

+-------+-----+--------+---------+
|paid_pu|pu   |paid_use|total_use|
+-------+-----+--------+---------+
|13451  |17943|39248660|50331480 |
+-------+-----+--------+---------+



In [7]:
console_user.where((col("lastlogindate") >= start_date) & (col("server_type") == "LIVE")).groupBy("pubg_region").agg(countDistinct("accountid").alias("au")).toPandas()

Unnamed: 0,pubg_region,au
0,CN,23610
1,,709023
2,SA,300261
3,KR,7359
4,SEA,33484
5,Undefined,435
6,JP,23396
7,TW/HK,5767
8,CIS,75168
9,EMEA,438700


In [24]:
console_gcoin.join(console_user.select("accountid", "pubg_region").withColumnRenamed("accountid", "account_id"), "account_id").groupBy("pubg_region").agg(countDistinct("paid_account_id").alias("paid_pu"), countDistinct("account_id").alias("pu"), sum("paid_use").alias("paid_use"), sum(col("paid_use") + col("free_use")).alias("total_use")).toPandas()

Unnamed: 0,pubg_region,paid_pu,pu,paid_use,total_use
0,CN,102,163,359730,459810
1,,8549,10727,25216410,31314420
2,SA,1304,1992,3596110,4910020
3,KR,82,100,271860,345630
4,SEA,84,121,221780,307510
5,JP,376,445,1107860,1353870
6,TW/HK,76,92,276430,330850
7,CIS,130,437,433010,1376850
8,EMEA,2290,3270,6417260,8282990
9,OC,458,596,1348210,1649530


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 47216)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.6/socketserver.py", line 721, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 269, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pysp

In [63]:
console_gcoin.groupBy("product_id", "product_name").agg(sum("qty").alias("unit_sold"), sum("paid_use").alias("paid_use"), sum(col("paid_use") + col("free_use")).alias("total_use")).orderBy("total_use", ascending=False).toPandas()

Unnamed: 0,product_id,product_name,unit_sold,paid_use,total_use
0,itemdesc.13000676,PGC 2022 MEGA BUNDLE,4810,23584390,28379000
1,itemdesc.13000674,PGC 2022 IRIDESCENT DANGER BUNDLE,2010,5483870,6432000
2,itemdesc.13000675,PGC 2022 TAC-TECH BUNDLE,1515,4012220,4848000
3,itemdesc.12020040,"""PGC 2022"" Coupe RB",5582,1478830,2791000
4,itemdesc.16100356,"PGC 2022 Dance - Rise Up, Beat Down",1765,1050890,1747350
5,itemdesc.17210052,Hairstyle 36,2883,875250,1441500
6,itemdesc.12031144,PGC 2022 Tac-Tech - Helmet (Level 3),2858,705470,1429000
7,itemdesc.12010650,PGC 2022 - Pan,1367,919650,1353330
8,itemdesc.12032147,PGC 2022 Aced Backpack (Level 3),2069,572130,1034500
9,itemdesc.12010658,PGC 2022 Aced - Groza,720,472780,712800


## EP 가 상품에 포함됨에 따른 영향 분석

In [153]:
pickem_start_date = "2022-10-26"
pickem_end_date = "2022-11-18"

In [52]:
user_master = load_data_mart("pc", pickem_end_date, pickem_end_date, "user_master").where(col("lastlogindate") >= pickem_start_date).select(countDistinct("accountid")).show(truncate=False)

+-------------------------+
|count(DISTINCT accountid)|
+-------------------------+
|8083049                  |
+-------------------------+



In [155]:
pc_gcoin_user = pc_gcoin.select("account_id").distinct().withColumnRenamed("account_id", "AccountId")

In [366]:
device = "pc"
ep_purchase = load_schema.lobby(spark, device, "live", "PurchaseResult", start_date, end_date).where(col("currency").like("%esportspoint%"))

In [335]:
ep_purchase.count()

2294546

In [266]:
ep_purchase_user = ep_purchase.select("AccountId").distinct()

In [267]:
ep_purchase_user.count()

636394

In [270]:
pc_gcoin.join(ep_purchase_user, (pc_gcoin.account_id == ep_purchase_user.AccountId)).select(countDistinct("AccountId")).show(truncate=False)

+-------------------------+
|count(DISTINCT AccountId)|
+-------------------------+
|158480                   |
+-------------------------+



In [379]:
ep_purchase_gcoin = ep_purchase.join(pc_gcoin.select("account_id").distinct().withColumnRenamed("account_id", "AccountId"), "AccountId")

from pyspark.sql import functions as F
from pyspark.sql import Window

w = Window.partitionBy('AccountId')

ep_gcoin_purchase_products = ep_purchase_gcoin.withColumn(
            'product_set', F.collect_set('product_name').over(w)
        )

In [380]:
ep_gcoin_purchase_products.groupBy("product_set").agg(countDistinct("AccountId").alias("user_cnt")).orderBy("user_cnt", ascending=False).toPandas() \
    .to_csv("./ep_gcoin_purchase_products.csv", index=False)

In [268]:
ep_purchase_2021 = load_schema.lobby(spark, "pc", "live", "PurchaseResult", "2021-11-12", "2022-01-12").where(col("currency").like("%esportspoint%"))
ep_purchase_2021.select(countDistinct('AccountId')).show(truncate=False)

+-------------------------+
|count(DISTINCT AccountId)|
+-------------------------+
|507695                   |
+-------------------------+



In [272]:
pgc_ep_item = mysql.read_table(spark, "metainfo", "meta_vc_sales_items").where(col("currency") == "esportspoint").withColumn("product_name", when(col("product_name").isNull(), col("product_id")).otherwise(col("product_name")))
ep_purchase_2021 = ep_purchase_2021.join(pgc_ep_item.withColumnRenamed("product_id", "ProductId").select("ProductId", "product_name").distinct(), "ProductId")

In [273]:
ep_purchase_2021.groupBy("product_name").agg(countDistinct("AccountId"), sum("Amount").alias("amount"), sum(col("LocalPrice")*col("Amount"))).orderBy("amount", ascending=False).toPandas()

Unnamed: 0,product_name,count(DISTINCT AccountId),amount,sum((LocalPrice * Amount))
0,Nickname Change,268907,268920,16135200000
1,Hairstyle 28,230825,230842,13850520000
2,PGC 2021 Giltslate Contagion Backpack (Level 3),209634,209639,12578340000
3,"""PGC 2021 Contingent Chaos"" UAZ",189201,189213,22705560000
4,PGC 2021 Contingent Chaos - Pan,184562,184569,36913800000
5,PGC 2021 Contingent Chaos - Helmet (Level 3),118774,118778,14253360000
6,PGC 2021 Contingent Chaos - P90,115519,115527,13863240000


In [271]:
pgc2021_gcoin.join(ep_purchase_2021, (pgc2021_gcoin.account_id == ep_purchase_2021.AccountId)).select(countDistinct("AccountId")).show(truncate=False)

+-------------------------+
|count(DISTINCT AccountId)|
+-------------------------+
|236410                   |
+-------------------------+



In [242]:
# ep 상품 구매 -> 상점 상품 구매 -> ep 상품 구매
# ep_gcoin = ep 상품 구매 -> 상점 상품 구매 
ep_purchase = ep_purchase.withColumnRenamed("date", "ep_date").withColumnRenamed("Time", "ep_time")
# ep_gcoin = ep_purchase.join(pc_gcoin, (ep_purchase.ep_date == pc_gcoin.date) & (ep_purchase.AccountId == pc_gcoin.account_id) & (ep_purchase.ep_time < pc_gcoin.time))

In [237]:
ep_gcoin.select(count("*"), countDistinct("account_id")).show(truncate=False)

+--------+--------------------------+
|count(1)|count(DISTINCT account_id)|
+--------+--------------------------+
|7136    |3430                      |
+--------+--------------------------+



In [243]:
gcoin_ep = ep_purchase.join(ep_gcoin, (ep_purchase.ep_date == ep_gcoin.date) & (ep_purchase.AccountId == ep_gcoin.account_id) & (ep_purchase.ep_time > ep_gcoin.time))

In [244]:
gcoin_ep.select(count("*"), countDistinct("account_id")).show(truncate=False)

+--------+--------------------------+
|count(1)|count(DISTINCT account_id)|
+--------+--------------------------+
|14042   |2477                      |
+--------+--------------------------+



In [245]:
esports_store = load_schema.heimdall(spark, "prod", "{}-prod".format("pc"), start_date, end_date).where(col("category") == "store_explore") \
    .filter(col("action") == "click_store_esports")

In [259]:
# ep store 진입 -> 상품 구매
pc_gcoin = pc_gcoin.withColumn("timestamp", to_timestamp("time"))
ep_store_gcoin = esports_store.join(pc_gcoin, (esports_store.timestamp < pc_gcoin.timestamp) & (esports_store.dt == pc_gcoin.date) & (esports_store.user_id == pc_gcoin.account_id))

In [260]:
ep_store_gcoin.show(truncate=False)

+-------------------+-------------+-------------------+-----------+-----+----------------------------------------+-------------------------------------------------------------------------+---------------------------------------+-----------+----------+---+----------+--------+----------------------------------------+-----------------+--------+------------+--------+------------+-----+---+-------------+----------------------------+-------------------+--------------------------------------------------------------+------------------+----------------+---------+------+--------+--------+-----------------+----------------------------------------+------------+-----------+------------+----------+----------+-------+-----+----+-------+-------------------+-------------------+---------------+-------------------+--------------------------+
|timestamp          |category     |action             |label      |value|user_id                                 |session_id                                         

In [262]:
esports_store.select(countDistinct("user_id")).show(truncate=False)

+-----------------------+
|count(DISTINCT user_id)|
+-----------------------+
|1656553                |
+-----------------------+



In [261]:
ep_store_gcoin.select(countDistinct("account_id")).show(truncate=False)

+--------------------------+
|count(DISTINCT account_id)|
+--------------------------+
|88000                     |
+--------------------------+



In [154]:
pickem = load_schema.lobby(spark, device, "live", "PickStatusChanged", pickem_start_date, pickem_end_date)

In [119]:
pickem.select(countDistinct("AccountId")).show(truncate=False)

+-------------------------+
|count(DISTINCT AccountId)|
+-------------------------+
|1496187                  |
+-------------------------+



In [38]:
pickem = pickem.withColumn("game_id", when(col("GameId").like("%BR%"), "BR").when(col("GameId").like("%DUEL%"), "DUEL").otherwise(lit(None)))

In [39]:
pickem.groupBy("game_id").agg(countDistinct("AccountId")).show(truncate=False)

+-------+-------------------------+
|game_id|count(DISTINCT AccountId)|
+-------+-------------------------+
|BR     |1433189                  |
|DUEL   |721877                   |
+-------+-------------------------+



In [158]:
pickem_user = pickem.select("AccountId").distinct()

In [48]:
pc_gcoin_user = pc_gcoin_user.withColumn("part", lit(1))
ep_purchase_user = ep_purchase_user.withColumn("part", lit(3))
pickem_user = pickem_user.withColumn("part", lit(5))
participate = pc_gcoin_user.unionByName(ep_purchase_user).unionByName(pickem_user)

In [49]:
participate_by_user = participate.groupBy("AccountId").agg(sum("part").alias("part"))

In [50]:
participate_by_user.groupBy("part").agg(countDistinct("AccountId")).orderBy("part").show(truncate=False)

+----+-------------------------+
|part|count(DISTINCT AccountId)|
+----+-------------------------+
|1   |11239                    |
|3   |15773                    |
|4   |17571                    |
|5   |889679                   |
|6   |3458                     |
|8   |462141                   |
|9   |140909                   |
+----+-------------------------+



### 상품 구매 순서

In [14]:
from pyspark.sql import functions as F
from pyspark.sql import Window

w = Window.partitionBy('account_id').orderBy('time')

purchase_orderby_time = pc_gcoin.withColumn(
            'sorted_list', F.collect_list('product_name').over(w)
        ).groupBy("account_id").agg(max("sorted_list").alias("sorted_list"))

In [17]:
purchase_orderby_time = purchase_orderby_time.groupBy("sorted_list").agg(countDistinct("account_id").alias("user_cnt")).orderBy("user_cnt", ascending=False).toPandas()

In [19]:
purchase_orderby_time.to_csv("./pc_purchase_orderby_time.csv", index=False)

### EP 상품 구매 순서

In [376]:
pgc_ep_item = mysql.read_table(spark, "metainfo", "meta_vc_sales_items").where(col("currency") == "esportspoint").withColumn("product_name", when(col("product_name").isNull(), col("product_id")).otherwise(col("product_name")))
ep_purchase = ep_purchase.join(pgc_ep_item.withColumnRenamed("product_id", "ProductId").select("ProductId", "product_name").distinct(), "ProductId")

In [276]:
ep_purchase.groupBy("product_name").agg(countDistinct("AccountId"), sum("Amount").alias("amount"), sum(col("LocalPrice")*col("Amount"))).orderBy("amount", ascending=False).toPandas()

Unnamed: 0,product_name,count(DISTINCT AccountId),amount,sum((LocalPrice * Amount))
0,currencydesc.bp,474759,5567127,27835635000
1,currencydesc.chestkeypiece,229215,1450366,9427379000
2,Hairstyle 36,261986,261987,20958960000
3,Nickname Change,164896,164896,11542720000
4,PGC 2022 Aced Backpack (Level 3),158931,158931,12714480000
5,"""PGC 2022"" Coupe RB",123817,123817,26001570000
6,PGC 2022 Tac-Tech - Helmet (Level 3),123562,123564,9885120000
7,PGC 2022 - Pan,117880,117880,24754800000
8,PGC 2022 Aced - Groza,105789,105790,22215900000
9,PGC 2022 Emblem,53782,53782,3764740000


In [352]:
from pyspark.sql import functions as F
from pyspark.sql import Window

w = Window.partitionBy('AccountId').orderBy('Time')

ep_purchase_orderby_time = ep_purchase.withColumn(
            'sorted_list', F.collect_list('product_name').over(w)
        ).groupBy("AccountId").agg(max(col("sorted_list")).alias('sorted_list'))

In [353]:
ep_purchase_orderby_time = ep_purchase_orderby_time.groupBy("sorted_list").agg(countDistinct("AccountId").alias("user_cnt")).orderBy("user_cnt", ascending=False).toPandas()

In [354]:
ep_purchase_orderby_time.user_cnt.sum()

636394

In [355]:
ep_purchase_orderby_time.to_csv("./ep_purchase_orderby_time.csv", index=False)

In [356]:
from pyspark.sql import functions as F
from pyspark.sql import Window

w = Window.partitionBy('AccountId')

ep_purchase_products = ep_purchase.withColumn(
            'product_set', F.collect_set('product_name').over(w)
        )

In [357]:
ep_purchase_products_user_cnt = ep_purchase_products.groupBy("product_set").agg(countDistinct("AccountId").alias("user_cnt")).orderBy("user_cnt", ascending=False).toPandas()

In [358]:
ep_purchase_products_user_cnt.user_cnt.sum()

636394

In [359]:
ep_purchase_products_user_cnt.to_csv('./ep_purchase_products_user_cnt.csv', index=False)

### 픽뎀 전후 구매한 유저

In [303]:
pc_gcoin = pc_gcoin.withColumn("after_pickem", when(col("time")> "2022-11-18T09:00:00", lit("true")).otherwise(lit("false")))

In [96]:
pc_gcoin.groupBy("account_id").agg(countDistinct("after_pickem").alias("after_pickem")).groupBy("after_pickem").agg(count("*").alias("user_cnt")).show(truncate=False)

+------------+--------+
|after_pickem|user_cnt|
+------------+--------+
|1           |172021  |
|2           |1156    |
+------------+--------+



In [279]:
pc_gcoin.groupBy("after_pickem").agg(sum("paid_use"), sum(col("paid_use") + col("free_use")), countDistinct("account_id")).toPandas()

Unnamed: 0,after_pickem,sum(paid_use),sum((paid_use + free_use)),count(DISTINCT account_id)
0,False,554672360,616623010,121214
1,True,181128399,253565950,53119


In [199]:
after_pickem_user = pc_gcoin.groupBy("account_id").agg(countDistinct("after_pickem").alias("after_pickem")).filter(col("after_pickem") == 2)

In [201]:
pc_gcoin.join(after_pickem_user.select("account_id"), "account_id").groupBy("after_pickem").agg(sum("paid_use").alias("paid_use"), sum(col("paid_use")+col("free_use")).alias("total_use"), countDistinct("account_id").alias("pu_cnt")).show(truncate=False)

+------------+--------+---------+------+
|after_pickem|paid_use|total_use|pu_cnt|
+------------+--------+---------+------+
|false       |4087530 |4530900  |1156  |
|true        |2854870 |3796600  |1156  |
+------------+--------+---------+------+



In [116]:
# 픽뎀 전후로 구매한 유저들의 픽뎀 이후 상품 구매 후 ep 구매 여부
after_pickem_purchase_time = pc_gcoin.join(after_pickem_user.select("account_id"), "account_id").where(col("after_pickem") == "true").groupBy("account_id").agg(min("time").alias("purchase_time"))
after_pickem_ep_purchse = ep_purchase.join(after_pickem_purchase_time.withColumnRenamed("account_id", "AccountId"), "AccountId").where(col("Time") > col("purchase_time"))

In [117]:
after_pickem_ep_purchse.select(countDistinct("AccountId")).show(truncate=False)

+-------------------------+
|count(DISTINCT AccountId)|
+-------------------------+
|940                      |
+-------------------------+



In [None]:
# 1156명중 
1109: ep 상품 구매
1086: 픽뎀 종료 이후 ep 상품 구매
940: 픽뎀 종료 이후 상품 구매한 유저들의 ep 구매

In [114]:
after_pickem_ep_purchse.groupBy("ProductId").agg(sum("Amount").alias("amount"), mean("LocalPrice").alias("Price"), sum("LocalPrice").alias("sum_price"), countDistinct("AccountId").alias("user_cnt")).orderBy("user_cnt").show(truncate=False)

+--------------------------+------+--------+---------+--------+
|ProductId                 |amount|Price   |sum_price|user_cnt|
+--------------------------+------+--------+---------+--------+
|itemdesc.18040148         |291   |70000.0 |20370000 |291     |
|itemdesc.17210052         |303   |80000.0 |24240000 |303     |
|itemdesc.45000003         |353   |70000.0 |24710000 |353     |
|itemdesc.12020040         |423   |210000.0|88830000 |423     |
|currencydesc.chestkeypiece|4760  |6500.0  |5928000  |449     |
|itemdesc.12031144         |449   |80000.0 |35920000 |449     |
|itemdesc.12010650         |450   |210000.0|94500000 |450     |
|itemdesc.12032147         |462   |80000.0 |36960000 |462     |
|itemdesc.12010658         |463   |210000.0|97230000 |463     |
|currencydesc.bp           |17717 |5000.0  |7645000  |748     |
+--------------------------+------+--------+---------+--------+



In [183]:
ep_increased = load_schema.lobby(spark, "pc", "live", "CurrencyIncreased", start_date, end_date).where(col("Currency") == "currencydesc.esportspoint").where(col("Amount") > 0)

In [189]:
ep_increased = ep_increased.withColumn("before_pickem_end", when(col("Time") <= "2022-11-18T09:00:00", col("Amount")).otherwise(lit(None))) \
    .withColumn("after_pickem_end", when(col("Time") > "2022-11-18T09:00:00", col("Amount")).otherwise(lit(None)))

In [190]:
ep_increased_by_user = ep_increased.groupBy("AccountId").agg(sum("before_pickem_end").alias("before_pickem_end"), sum("after_pickem_end").alias("after_pickem_end"))

In [191]:
ep_increased_by_user.show(truncate=False)

+----------------------------------------+-----------------+----------------+
|AccountId                               |before_pickem_end|after_pickem_end|
+----------------------------------------+-----------------+----------------+
|account.9541443551274eaab0ec32fc31b5a138|null             |547600          |
|account.46ff42646894422e93e5856bbbed88eb|210000           |46800           |
|account.aba58be4146047c08f0e0873e3cd32aa|500000           |613600          |
|account.7f3c0937e6374bccac90fbaba241d5ea|210000           |223200          |
|account.dc4be04728e54ce8bfb0d8199a110a51|210000           |39200           |
|account.875bc3b4be474e38b1c6af8983747198|500000           |null            |
|account.fac4e636fffa4124b5dfeecf6a2ed64f|210000           |null            |
|account.ae42d88b7a88489fb9e3a24f9335df90|500000           |180000          |
|account.c13f26a2289743e1990e56000ae56da6|500000           |561600          |
|account.d48023a654c645cb8dda73c5524987ac|500000           |6248

In [192]:
# 전체 ep 획득 유저
ep_increased_by_user.select(countDistinct("AccountId")).show(truncate=False)

+-------------------------+
|count(DISTINCT AccountId)|
+-------------------------+
|741850                   |
+-------------------------+



In [194]:
after_ep = ep_increased_by_user.where((col("before_pickem_end").isNull()) & (~col("after_pickem_end").isNull())).select("AccountId").distinct()

In [193]:
# pickem 종료 이후에 ep 획득 유저수
after_ep.select(countDistinct("AccountId")).show(truncate=False)

+-------------------------+
|count(DISTINCT AccountId)|
+-------------------------+
|504899                   |
+-------------------------+



In [195]:
# pickem 종료이후로 ep를 획득해본 유저 중 첫 ep 획득이 상품 구매인 경우
after_pickem_first_ep_get = ep_increased.join(after_ep, "AccountId").groupBy("AccountId").agg(min("Time").alias("Time"))
ep_increased.join(after_pickem_first_ep_get, ["AccountId", "Time"]).filter(col("Reason") == "buy-cash-item").select(countDistinct("AccountId")).show(truncate=False)

+-------------------------+
|count(DISTINCT AccountId)|
+-------------------------+
|29098                    |
+-------------------------+



In [202]:
after_pickem_first_ep_by_purchase_user = ep_increased.join(after_pickem_first_ep_get, ["AccountId", "Time"]).filter(col("Reason") == "buy-cash-item").select("AccountId").distinct()

In [204]:
pc_gcoin.join(after_pickem_first_ep_by_purchase_user.withColumnRenamed("AccountId", "account_id"), "account_id").select(min("time"), max("time"), sum("paid_use"), sum(col("paid_use") + col("free_use"))).show(truncate=False)

+----------------------------+----------------------------+-------------+--------------------------+
|min(time)                   |max(time)                   |sum(paid_use)|sum((paid_use + free_use))|
+----------------------------+----------------------------+-------------+--------------------------+
|2022-10-26T05:43:56.3896756Z|2022-12-07T03:17:28.2478969Z|101174615    |141473140                 |
+----------------------------+----------------------------+-------------+--------------------------+



In [206]:
pc_gcoin.join(after_pickem_first_ep_by_purchase_user.withColumnRenamed("AccountId", "account_id"), "account_id").groupBy("product_id", "product_name").agg(sum("paid_use"), sum(col("paid_use") + col("free_use"))).toPandas()

Unnamed: 0,product_id,product_name,sum(paid_use),sum((paid_use + free_use))
0,itemdesc.13000673,PGC 2022 MEGA BUNDLE + VOTING COUPON,73207560,101816300
1,itemdesc.13000670,PGC 2022 IRIDESCENT DANGER BUNDLE + VOTING COUPON,9331490,13808000
2,itemdesc.13000671,PGC 2022 TAC-TECH BUNDLE + VOTING COUPON,18452205,25536000
3,itemdesc.13000672,PGC 2022 EMOTE SET,183360,312840


In [208]:
ep_purchase.join(after_pickem_first_ep_by_purchase_user, "AccountId").select(countDistinct("AccountId")).show(truncate=False)

+-------------------------+
|count(DISTINCT AccountId)|
+-------------------------+
|23512                    |
+-------------------------+



----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 55818)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.6/socketserver.py", line 721, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 269, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pysp

In [306]:
before_or_after_user = pc_gcoin.groupBy("account_id").agg(countDistinct("after_pickem").alias("before_and_after")).filter(col("before_and_after") == 1)
only_after_user = pc_gcoin.where(col("after_pickem") == "true").join(before_or_after_user, "account_id").groupBy("account_id").agg(min("time").alias("min_time"))

In [312]:
only_after_not_double_gcoin_user = only_after_user.join(double_gcoin_pgc_user, "account_id", "leftanti")

In [314]:
only_after_not_double_gcoin_user.select(countDistinct("account_id")).show(truncate=False)

+--------------------------+
|count(DISTINCT account_id)|
+--------------------------+
|12814                     |
+--------------------------+



In [315]:
only_after_not_double_gcoin_user.join(ep_purchase, (only_after_not_double_gcoin_user.account_id == ep_purchase.AccountId) & (only_after_not_double_gcoin_user.min_time < ep_purchase.Time)) \
    .select(countDistinct("account_id")).show(truncate=False)

+--------------------------+
|count(DISTINCT account_id)|
+--------------------------+
|9132                      |
+--------------------------+



In [145]:
# before or after
before_or_after_user.select(countDistinct("account_id")).show(truncate=False)

+--------------------------+
|count(DISTINCT account_id)|
+--------------------------+
|172021                    |
+--------------------------+



In [146]:
# before_and_after_user
# after_pickem_user = pc_gcoin.groupBy("account_id").agg(countDistinct("after_pickem").alias("after_pickem")).filter(col("after_pickem") == 2)
after_pickem_user.select(countDistinct("account_id")).show(truncate=False)

+--------------------------+
|count(DISTINCT account_id)|
+--------------------------+
|1156                      |
+--------------------------+



In [144]:
# pickem 종료 이후에 ep 획득한 유저 중 
# pickem 종료 이후에 상품 구매한 유저수
only_after_user.join(after_ep.withColumnRenamed("AccountId", "account_id"), "account_id").select(countDistinct("account_id")).show(truncate=False)

+--------------------------+
|count(DISTINCT account_id)|
+--------------------------+
|51004                     |
+--------------------------+



In [None]:
ep_increased.