In [1]:
from pubg_util import mysql, load_schema, notifier
from sphynx import sphynx, NODE_SMALL, NODE_MEDIUM, NODE_LARGE
from pyspark.sql.functions import *
import pandas as pd
import pickle

spark = sphynx.get_spark(executor_count=8, app_name='west0_ws_seg', node_spec=NODE_LARGE)

Spark cluster not assigned. creating a new one...
Node spec: 8 executors with 48G RAM each
Job Port 4049 is assigned for requested cluster
Waiting for Spark master to be available...
Spark master launched!
Creating new Spark session, name: west0_ws_seg...
Waiting for all executors ready...
All executors connected!
Complete! elapsed time: 00:00:29


In [94]:
sphynx.stop()

Stopping Spark session...
Destroying Spark cluster...
Done!


In [3]:
# 각 day 28
start_date = "2022-06-08"
end_date = "2022-07-05"

ws_start_date = "2022-07-13"
ws_day_28 = "2022-08-09"

In [4]:
pre_gcoin = load_data_mart("pc", start_date, end_date, "gcoin_use")

In [5]:
wsus_gcoin = pre_gcoin.where(col("event_name").like("%wsus%"))
wsus_user = wsus_gcoin.select("account_id").distinct()

In [6]:
costume = pre_gcoin.where((col("sub_category").isin(["costume", "gear", "vehicle"])) | \
                          ((col("sub_category") == "crate") & (~col("event_name").like("%wsus%")) & (~col("event_name").like("%survivorpass%"))))
costume_user = costume.select("account_id").distinct()

In [7]:
only_costume_gcoin = costume.join(wsus_user, "account_id", "leftanti")
costume_and_wsus_gcoin = wsus_gcoin.join(costume_user, "account_id")
only_wsus_gcoin= wsus_gcoin.join(costume_user, "account_id", "leftanti")

- a) only_costume_gcoin
- b) costume_and_wsus_gcoin
- c) only_wsus_gcoin

In [11]:
from pyspark.sql.types import * 

def get_user_info(end_date):
    def classify_country(country_os, country_ip):
        if country_os != 'CN':
            return country_ip
        else:
            return country_os

    country_type_udf = udf(classify_country, StringType())
    meta_region = mysql.read_table(spark, 'metainfo', 'meta_bi_regions')

    user = load_data_mart("pc", end_date, end_date, "user_master")
    user = user.withColumn("country_new", country_type_udf("country_os", "country_ip"))
    user = user.join(meta_region, user.country_new == meta_region.country_code_iso2, "left").withColumnRenamed("accountid", "AccountId").select("AccountId", "pubg_region", "lastlogindate")
    return user

In [9]:
def get_workshop_rate(start_date, end_date):
    user = get_user_info(end_date)
    
    bp_purchase = load_schema.lobby(spark, "pc", "live", "PurchaseResult", start_date, end_date).where((col("Currency").like("%bp%")) & (col("AnalyticEventType") == "workshop")) \
        .join(user, "AccountId")
    gcoin = load_data_mart("pc", start_date, end_date, "gcoin_use").where(col("event_name").like("%workshop%")).withColumnRenamed("account_id", "AccountId") \
        .join(user, "AccountId")
    open_crate = load_schema.lobby(spark, "pc", "live", "WorkshopCrateOpened", start_date, end_date) \
        .join(user, "AccountId")
    craft = load_schema.lobby(spark, "pc", "live", "WorkshopCrafted", start_date, end_date) \
        .join(user, "AccountId")
    disassemble = load_schema.lobby(spark, "pc", "live", "WorkshopDisassembled", start_date, end_date) \
        .join(user, "AccountId")
    repurpose = load_schema.lobby(spark, "pc", "live", "WorkshopRepurposed", start_date, end_date) \
        .join(user, "AccountId")
    special_craft = load_schema.lobby(spark, "pc", "live", "PurchaseResult", start_date, end_date).where(col("Currency") == "artisanstoken") \
        .join(user, "AccountId")
    
    workshop_user = (open_crate.groupBy("pubg_region", "AccountId").agg(sum("OpenAmount").alias("amount")).withColumn("action", lit("open"))) \
        .unionByName(craft.groupBy("pubg_region", "AccountId").agg(count("*").alias("amount")).withColumn("action", lit("craft"))) \
        .unionByName(disassemble.groupBy("pubg_region", "AccountId").agg(count("*").alias("amount")).withColumn("action", lit("disassemble"))) \
        .unionByName(repurpose.groupBy("pubg_region", "AccountId").agg(count("*").alias("amount")).withColumn("action", lit("repurpose"))) \
        .unionByName(special_craft.groupBy("pubg_region", "AccountId").agg(sum("Amount").alias("amount")).withColumn("action", lit("special_craft"))) \
        .unionByName(gcoin.groupBy("pubg_region", "AccountId").agg(sum("qty").alias("amount")).withColumn("action", lit("gcoin"))) \
        .unionByName(bp_purchase.groupBy("pubg_region", "AccountId").agg(count("*").alias("amount")).withColumn("action", lit("bp_workshop")))
    
    return workshop_user

In [12]:
user = get_user_info(ws_day_28)

In [13]:
# AU for Day 28 after workshop released
user.where(col('lastlogindate') >= ws_start_date).select(countDistinct("AccountId").alias("au")).show()

+-------+
|     au|
+-------+
|9660517|
+-------+



In [14]:
ws_user = get_workshop_rate(ws_start_date, ws_day_28)
ws_user.select(countDistinct("AccountId").alias("ws user cnt")).show()
ws_user.groupBy("action").agg(countDistinct("AccountId").alias("user_cnt")).toPandas()

+-----------+
|ws user cnt|
+-----------+
|    2845464|
+-----------+



Unnamed: 0,action,user_cnt
0,disassemble,1222985
1,repurpose,239705
2,bp_workshop,1966678
3,craft,1333668
4,gcoin,677314
5,special_craft,270200
6,open,2680502


In [15]:
def wsus_cnt(price):
    if price == 1800:
        return 10
    elif price == 200:
        return 1
    else:
        return 0
    
assign_wsus_cnt = udf(wsus_cnt, IntegerType())

In [16]:
ws_gcoin = load_data_mart("pc", ws_start_date, ws_day_28, "gcoin_use").where(col("event_name").like("%workshop%"))

In [17]:
gcoin_by_type = load_data_mart("pc", ws_start_date, ws_day_28, "gcoin_use") \
    .withColumn("type", when(col("event_name").like("%workshop%"), "workshop") \
                .when((col("sub_category").isin(["costume", "gear", "vehicle"])) | \
                          ((col("sub_category") == "crate") & (~col("event_name").like("%wsus%")) & (~col("event_name").like("%workshop%")) & (~col("event_name").like("%survivorpass%"))), "costume") \
                .when(col("event_name").like("%wsus%"), "wsus")) \
    .where(col("type").isin(["workshop", "costume", "wsus"])) \
    .withColumn("wsus_cnt", when(col("type") == "wsus", assign_wsus_cnt("price")).otherwise(lit(None)))

In [18]:
pre_gcoin_by_type = load_data_mart("pc", start_date, end_date, "gcoin_use") \
    .withColumn("type", when(col("event_name").like("%workshop%"), "workshop") \
                .when((col("sub_category").isin(["costume", "gear", "vehicle"])) | \
                          ((col("sub_category") == "crate") & (~col("event_name").like("%wsus%")) & (~col("event_name").like("%workshop%")) & (~col("event_name").like("%survivorpass%"))), "costume") \
                .when(col("event_name").like("%wsus%"), "wsus")) \
    .where(col("type").isin(["workshop", "costume", "wsus"])) \
    .withColumn("wsus_cnt", when(col("type") == "wsus", assign_wsus_cnt("price")).otherwise(lit(None)))

## a 그룹

In [19]:
only_costume_gcoin_by_user = only_costume_gcoin.groupBy("account_id").agg(sum(col("free_use") + col("paid_use")).alias("total_gcoin"), sum("paid_use").alias("paid_use"))

In [20]:
only_costume_gcoin_by_user = only_costume_gcoin_by_user.withColumn("group", \
                                         when(col("total_gcoin") <= 1000, "A") \
                                        .when((col("total_gcoin") > 1000) & (col("total_gcoin") <= 5000), "B") \
                                        .when((col("total_gcoin") > 5000) & (col("total_gcoin") <= 10000), "C") \
                                        .when((col("total_gcoin") > 10000) & (col("total_gcoin") <= 30000), "D") \
                                        .when((col("total_gcoin") > 30000), "E") \
                                     )
only_costume_gcoin_by_user.groupBy("group").agg(countDistinct("account_id").alias("user_cnt"), sum("total_gcoin").alias("total_gcoin"), sum("paid_use").alias("paid_use")).orderBy("group").toPandas()

Unnamed: 0,group,user_cnt,total_gcoin,paid_use
0,A,183076,99221880,12601580
1,B,107649,186890850,65420875
2,C,1818,11638190,9537840
3,D,102,1278380,1087550
4,E,1,39770,33580


In [21]:
only_costume_gcoin_by_user.where(col("paid_use")>0).groupBy("group").agg(countDistinct("account_id").alias("paid_pu")).orderBy("group").toPandas()

Unnamed: 0,group,paid_pu
0,A,34112
1,B,48610
2,C,1788
3,D,100
4,E,1


In [22]:
only_costume_gcoin_by_user.toPandas().to_csv("./only_costume_gcoin_by_user_day28.csv", index=False)

In [23]:
# AU by group
user.withColumnRenamed("AccountId", "account_id").where(col("lastlogindate") >= ws_start_date).join(only_costume_gcoin_by_user, "account_id").groupBy("group").agg(countDistinct("account_id").alias("au")).orderBy("group").toPandas()

Unnamed: 0,group,au
0,A,120369
1,B,71800
2,C,1574
3,D,91
4,E,1


In [25]:
# only_costume_gcoin_by_user = spark.createDataFrame(only_costume_gcoin_by_user)
workshop_a = ws_user.join(only_costume_gcoin_by_user.withColumnRenamed("account_id", "AccountId"), "AccountId")
workshop_a.groupBy("group", "action").agg(countDistinct("AccountId").alias("user_cnt")).orderBy("group", "action").toPandas()

Unnamed: 0,group,action,user_cnt
0,A,bp_workshop,60677
1,A,craft,48776
2,A,disassemble,46456
3,A,gcoin,27943
4,A,open,76490
5,A,repurpose,11555
6,A,special_craft,12987
7,B,bp_workshop,39562
8,B,craft,31837
9,B,disassemble,30579


In [26]:
workshop_a.groupBy("group").agg(countDistinct("AccountId").alias("ws user cnt")).orderBy("group").toPandas()

Unnamed: 0,group,ws user cnt
0,A,79132
1,B,49349
2,C,1344
3,D,77


In [27]:
pre_gcoin_by_type.join(only_costume_gcoin_by_user.select("account_id", "group"), "account_id") \
    .groupBy("account_id", "group", "type").agg(sum("paid_use").alias("paid_use"), sum("free_use").alias("free_use"), sum("wsus_cnt").alias("wsus_cnt")) \
    .withColumn("paid_user", when(col("paid_use") > 0, col("account_id")).otherwise(lit(None))) \
    .groupBy("group", "type").agg(countDistinct("account_id").alias("total_pu"), countDistinct("paid_user").alias("paid_pu"), sum("paid_use").alias("paid_use"), sum(col("paid_use") + col("free_use")).alias("total_use"), sum("wsus_cnt").alias("wsus_cnt")) \
    .orderBy("group", "type").toPandas()

Unnamed: 0,group,type,total_pu,paid_pu,paid_use,total_use,wsus_cnt
0,A,costume,183076,34112,12601580,99221880,
1,B,costume,107649,48610,65420875,186890850,
2,C,costume,1818,1788,9537840,11638190,
3,D,costume,102,100,1087550,1278380,
4,E,costume,1,1,33580,39770,


In [28]:
def chest_cnt(product_id):
    if product_id in ["itemdesc.14300001", "itemdesc.14300002", "itemdesc.13000577", "itemdesc.13000580"]:
        return 1
    elif product_id in ["itemdesc.13000578", "itemdesc.13000581"]:
        return 5
    elif product_id == "itemdesc.13000582":
        return 10
    else:
        return 0

assign_chest_cnt = udf(chest_cnt, IntegerType())
ws_gcoin = ws_gcoin.withColumn("chest_cnt", assign_chest_cnt("product_id"))

In [29]:
ws_gcoin.join(only_costume_gcoin_by_user.select("account_id", "group"), "account_id").groupBy("group") \
    .agg(countDistinct("account_id").alias("pu"), sum("paid_use").alias("paid_use"), sum(col("paid_use") + col("free_use")).alias("total_use"), sum("chest_cnt").alias("chest_cnt")).toPandas()

Unnamed: 0,group,pu,paid_use,total_use,chest_cnt
0,B,20900,112203370,131843150,420480
1,D,56,913440,1055940,2791
2,C,869,13720430,15641290,44412
3,A,27943,76749665,94791610,314490


In [30]:
gcoin_by_type.join(only_costume_gcoin_by_user.select("account_id", "group"), "account_id") \
    .groupBy("account_id", "group", "type").agg(sum("paid_use").alias("paid_use"), sum("free_use").alias("free_use"), sum("wsus_cnt").alias("wsus_cnt")) \
    .withColumn("paid_user", when(col("paid_use") > 0, col("account_id")).otherwise(lit(None))) \
    .groupBy("group", "type").agg(countDistinct("account_id").alias("total_pu"), countDistinct("paid_user").alias("paid_pu"), sum("paid_use").alias("paid_use"), sum(col("paid_use") + col("free_use")).alias("total_use"), sum("wsus_cnt").alias("wsus_cnt")) \
    .orderBy("group", "type").toPandas()

Unnamed: 0,group,type,total_pu,paid_pu,paid_use,total_use,wsus_cnt
0,A,costume,18206,10466,8741615,14503340,
1,A,workshop,27943,15515,76749665,94791610,
2,A,wsus,6066,2935,14817580,18184200,99351.0
3,B,costume,15290,11384,13037815,17752760,
4,B,workshop,20900,14611,112203370,131843150,
5,B,wsus,5533,3550,24290020,28369600,156072.0
6,C,costume,783,695,1440410,1735840,
7,C,workshop,869,763,13720430,15641290,
8,C,wsus,239,207,2735720,3132000,17315.0
9,D,costume,52,48,176620,201760,


In [31]:
# group별 G-COIN 사용 유저 수
load_data_mart("pc", ws_start_date, ws_day_28, "gcoin_use") \
    .join(only_costume_gcoin_by_user.select("account_id", "group"), "account_id") \
    .withColumn("type", when(col("event_name").like("%workshop%"), "workshop") \
                .when((col("sub_category").isin(["costume", "gear", "vehicle"])) | \
                          ((col("sub_category") == "crate") & (~col("event_name").like("%wsus%")) & (~col("event_name").like("%workshop%")) & (~col("event_name").like("%survivorpass%"))), "costume") \
                .when(col("event_name").like("%wsus%"), "wsus")) \
    .where(col("type").isin(["workshop", "costume", "wsus"])) \
    .groupBy("group").agg(countDistinct("account_id").alias("pu"), sum("paid_use").alias("paid_use"), sum(col("free_use") + col("paid_use")).alias("total_use")).orderBy("group").toPandas()

Unnamed: 0,group,pu,paid_use,total_use
0,A,40556,100308860,127479150
1,B,30533,149531205,177965510
2,C,1144,17896560,20509130
3,D,71,1308830,1492500


## b 그룹

In [33]:
costume_and_wsus_gcoin_user = wsus_gcoin.join(costume_user, "account_id").select("account_id").distinct()

In [34]:
costume_and_wsus_gcoin = costume.select("account_id", "product_id", "paid_use", "free_use", "sub_category", "event_type", "event_name", "price") \
    .unionByName(wsus_gcoin.select("account_id", "product_id", "paid_use", "free_use", "sub_category", "event_type", "event_name", "price")) \
    .join(costume_and_wsus_gcoin_user, "account_id")

In [35]:
costume_and_wsus_gcoin.select("event_name").distinct().show(truncate=False)

+-----------------------+
|event_name             |
+-----------------------+
|202205_bloom           |
|202205_bduck           |
|202204_10000days       |
|202204_rash            |
|202206_sanyang         |
|202203_wsus            |
|202106_vigilante       |
|202206_netenho         |
|202206_wsus_progressive|
|202204_kimblue         |
|yourshop6              |
|202203_ezqelusia       |
|202204_laborday        |
|202206_pnc             |
|202106_oceanfantasy    |
|202108_schoollook      |
|yourshop99             |
|202204_wsus_progressive|
|202206_highnoon        |
|202204_shaka           |
+-----------------------+
only showing top 20 rows



In [36]:
from pyspark.sql.types import * 

def wsus_cnt(price):
    if price == 1800:
        return 10
    elif price == 200:
        return 1
    else:
        return 0
    
assign_wsus_cnt = udf(wsus_cnt, IntegerType())
costume_and_wsus_gcoin = costume_and_wsus_gcoin.withColumn("wsus_cnt", assign_wsus_cnt("price"))

In [37]:
costume_and_wsus_gcoin_by_user = costume_and_wsus_gcoin.groupBy("account_id") \
    .agg(sum("wsus_cnt").alias("wsus_cnt"), sum("paid_use").alias("paid_use"), sum(col("paid_use") + col("free_use")).alias("total_use"))

In [38]:
costume_and_wsus_gcoin_by_user = costume_and_wsus_gcoin_by_user.withColumn("group", \
                                                                when(col("wsus_cnt") <=3, "A") \
                                                                .when((col("wsus_cnt") >= 4) & (col("wsus_cnt") <= 15), "B") \
                                                                .when((col("wsus_cnt") >= 16) & (col("wsus_cnt") <= 30), "C") \
                                                                .when((col("wsus_cnt") >= 31) & (col("wsus_cnt") <= 240), "D") \
                                                                .when((col("wsus_cnt") >= 241), "E") \
                                                                          )
costume_and_wsus_gcoin_by_user.groupBy("group").agg(countDistinct("account_id").alias("user_cnt"), sum("total_use").alias("total_use"), sum("paid_use").alias("paid_use"), sum("wsus_cnt").alias("wsus_cnt")).orderBy("group").toPandas()

Unnamed: 0,group,user_cnt,total_use,paid_use,wsus_cnt
0,A,51207,74445930,23289175,83636
1,B,25305,73393620,41200000,195467
2,C,8117,49701410,39951750,183376
3,D,18461,347137270,304634575,1675259
4,E,4500,465183050,415625365,2499080


In [39]:
costume_and_wsus_gcoin_by_user.where(col("paid_use")>0).groupBy("group").agg(countDistinct("account_id").alias("paid_pu")).orderBy("group").toPandas()

Unnamed: 0,group,paid_pu
0,A,19394
1,B,17376
2,C,7954
3,D,18354
4,E,4496


In [40]:
costume_and_wsus_gcoin_by_user.toPandas().to_csv("./costume_and_wsus_gcoin_by_user_day28.csv", index=False)

In [48]:
# costume_and_wsus_gcoin_by_user = pd.read_csv("./costume_and_wsus_gcoin_by_user.csv")

In [46]:
# AU by group
user.withColumnRenamed("AccountId", "account_id").where(col("lastlogindate") >= ws_start_date).join(costume_and_wsus_gcoin_by_user, "account_id").groupBy("group").agg(countDistinct("account_id").alias("au")).orderBy("group").toPandas()

Unnamed: 0,group,au
0,A,36061
1,B,19792
2,C,7148
3,D,16841
4,E,4267


In [47]:
# costume_and_wsus_gcoin_by_user = spark.createDataFrame(costume_and_wsus_gcoin_by_user)
workshop_b = ws_user.join(costume_and_wsus_gcoin_by_user.withColumnRenamed("account_id", "AccountId"), "AccountId")
workshop_b.groupBy("group", "action").agg(countDistinct("AccountId").alias("user_cnt")).orderBy("group").toPandas()

Unnamed: 0,group,action,user_cnt
0,A,craft,18720
1,A,open,26215
2,A,bp_workshop,21829
3,A,repurpose,6373
4,A,disassemble,18201
5,A,gcoin,13498
6,A,special_craft,7056
7,B,open,16172
8,B,gcoin,9723
9,B,craft,12426


In [48]:
workshop_b.groupBy("group").agg(countDistinct("AccountId").alias("ws user cnt")).orderBy("group").toPandas()

Unnamed: 0,group,ws user cnt
0,A,26901
1,B,16457
2,C,6426
3,D,15642
4,E,4095


In [49]:
pre_gcoin_by_type.join(costume_and_wsus_gcoin_by_user.select("account_id", "group"), "account_id") \
    .groupBy("account_id", "group", "type").agg(sum("paid_use").alias("paid_use"), sum("free_use").alias("free_use"), sum("wsus_cnt").alias("wsus_cnt")) \
    .withColumn("paid_user", when(col("paid_use") > 0, col("account_id")).otherwise(lit(None))) \
    .groupBy("group", "type").agg(countDistinct("account_id").alias("total_pu"), countDistinct("paid_user").alias("paid_pu"), sum("paid_use").alias("paid_use"), sum(col("paid_use") + col("free_use")).alias("total_use"), sum("wsus_cnt").alias("wsus_cnt")) \
    .orderBy("group", "type").toPandas()

Unnamed: 0,group,type,total_pu,paid_pu,paid_use,total_use,wsus_cnt
0,A,costume,51207,17212,20795880,58842330,
1,A,wsus,51207,12384,2493295,15603600,78018.0
2,B,costume,25305,14989,23424580,36827420,
3,B,wsus,25305,15371,17775420,36566200,191948.0
4,C,costume,8117,7085,12761785,16009610,
5,C,wsus,8117,7899,27189965,33691800,182671.0
6,D,costume,18461,16476,34814540,42982870,
7,D,wsus,18461,18341,269820035,304154400,1673484.0
8,E,costume,4500,4009,11230590,14066450,
9,E,wsus,4500,4496,404394775,451116600,2498619.0


In [50]:
ws_gcoin.join(costume_and_wsus_gcoin_by_user.select("account_id", "group"), "account_id").groupBy("group") \
    .agg(countDistinct("account_id").alias("pu"), sum("paid_use").alias("paid_use"), sum(col("paid_use") + col("free_use")).alias("total_use"), sum("chest_cnt").alias("chest_cnt")).toPandas()

Unnamed: 0,group,pu,paid_use,total_use,chest_cnt
0,E,3467,153650345,173485890,394559
1,B,9723,75442285,86695600,283869
2,D,12067,256895535,291041130,829645
3,C,4490,56319815,64051110,199238
4,A,13498,65679290,77234420,256949


In [51]:
gcoin_by_type.join(costume_and_wsus_gcoin_by_user.select("account_id", "group"), "account_id") \
    .groupBy("account_id", "group", "type").agg(sum("paid_use").alias("paid_use"), sum("free_use").alias("free_use"), sum("wsus_cnt").alias("wsus_cnt")) \
    .withColumn("paid_user", when(col("paid_use") > 0, col("account_id")).otherwise(lit(None))) \
    .groupBy("group", "type").agg(countDistinct("account_id").alias("total_pu"), countDistinct("paid_user").alias("paid_pu"), sum("paid_use").alias("paid_use"), sum(col("paid_use") + col("free_use")).alias("total_use"), sum("wsus_cnt").alias("wsus_cnt")) \
    .orderBy("group", "type").toPandas()

Unnamed: 0,group,type,total_pu,paid_pu,paid_use,total_use,wsus_cnt
0,A,costume,7060,5170,5210530,6900180,
1,A,workshop,13498,9076,65679290,77234420,
2,A,wsus,4242,2733,12430355,14776200,80645.0
3,B,costume,5324,4404,5158570,6313950,
4,B,workshop,9723,7846,75442285,86695600,
5,B,wsus,4331,3455,19790965,22916200,125431.0
6,C,costume,2628,2286,3042600,3651610,
7,C,workshop,4490,3894,56319815,64051110,
8,C,wsus,2477,2130,14285050,16513400,90618.0
9,D,costume,7147,6368,11066395,12995220,


In [52]:
# group별 G-COIN 사용 유저 수
load_data_mart("pc", ws_start_date, ws_day_28, "gcoin_use") \
    .join(costume_and_wsus_gcoin_by_user.select("account_id", "group"), "account_id") \
    .withColumn("type", when(col("event_name").like("%workshop%"), "workshop") \
                .when((col("sub_category").isin(["costume", "gear", "vehicle"])) | \
                          ((col("sub_category") == "crate") & (~col("event_name").like("%wsus%")) & (~col("event_name").like("%workshop%")) & (~col("event_name").like("%survivorpass%"))), "costume") \
                .when(col("event_name").like("%wsus%"), "wsus")) \
    .where(col("type").isin(["workshop", "costume", "wsus"])) \
    .groupBy("group").agg(countDistinct("account_id").alias("pu"), sum("paid_use").alias("paid_use"), sum(col("free_use") + col("paid_use")).alias("total_use")).orderBy("group").toPandas()

Unnamed: 0,group,pu,paid_use,total_use
0,A,17146,83320175,98910800
1,B,12068,100391820,115925750
2,C,5457,73647465,84216120
3,D,13946,341230510,387080950
4,E,3799,197305620,222501520


## c 그룹

In [41]:
only_wsus_gcoin = only_wsus_gcoin.withColumn("wsus_cnt", assign_wsus_cnt("price"))

In [42]:
only_wsus_gcoin_by_user = only_wsus_gcoin.groupBy("account_id") \
    .agg(sum("wsus_cnt").alias("wsus_cnt"), sum("paid_use").alias("paid_use"), sum(col("paid_use") + col("free_use")).alias("total_use"))

In [43]:
only_wsus_gcoin_by_user = only_wsus_gcoin_by_user.withColumn("group", \
                                                when(col("wsus_cnt") <=10, "A") \
                                                .when((col("wsus_cnt") >= 11) & (col("wsus_cnt") <= 50), "B") \
                                                .when((col("wsus_cnt") >= 51) & (col("wsus_cnt") <= 100), "C") \
                                                .when((col("wsus_cnt") >= 101) & (col("wsus_cnt") <= 800), "D") \
                                                .when((col("wsus_cnt") >= 801), "E") \
                                                          )
only_wsus_gcoin_by_user.groupBy("group").agg(countDistinct("account_id").alias("user_cnt"), sum("total_use").alias("total_use"), sum("paid_use").alias("paid_use"), sum("wsus_cnt").alias("wsus_cnt")).orderBy("group").toPandas()

Unnamed: 0,group,user_cnt,total_use,paid_use,wsus_cnt
0,A,446271,301497600,26486870,1565360
1,B,43819,179353400,134450045,972916
2,C,11413,144609200,127183550,794954
3,D,10819,428877400,382855700,2369900
4,E,471,85043200,76112820,471672


In [44]:
only_wsus_gcoin_by_user.where(col("paid_use")>0).groupBy("group").agg(countDistinct("account_id").alias("paid_pu")).orderBy("group").toPandas()

Unnamed: 0,group,paid_pu
0,A,72804
1,B,39044
2,C,11327
3,D,10784
4,E,471


In [45]:
only_wsus_gcoin_by_user.toPandas().to_csv("./only_wsus_gcoin_by_user.csv", index=False)

In [57]:
# only_wsus_gcoin_by_user = pd.read_csv("./only_wsus_gcoin_by_user.csv")

In [53]:
# AU by group
user.withColumnRenamed("AccountId", "account_id").where(col("lastlogindate") >= ws_start_date).join(only_wsus_gcoin_by_user, "account_id").groupBy("group").agg(countDistinct("account_id").alias("au")).orderBy("group").toPandas()

Unnamed: 0,group,au
0,A,271629
1,B,36774
2,C,10266
3,D,9854
4,E,448


In [54]:
# only_wsus_gcoin_by_user = spark.createDataFrame(only_wsus_gcoin_by_user)
workshop_c = ws_user.join(only_wsus_gcoin_by_user.withColumnRenamed("account_id", "AccountId"), "AccountId")
workshop_c.groupBy("group", "action").agg(countDistinct("AccountId").alias("user_cnt")).orderBy("group").toPandas()

Unnamed: 0,group,action,user_cnt
0,A,repurpose,26233
1,A,craft,106030
2,A,disassemble,100015
3,A,open,172426
4,A,bp_workshop,129855
5,A,gcoin,62466
6,A,special_craft,29081
7,B,open,30989
8,B,repurpose,9406
9,B,gcoin,16947


In [55]:
workshop_c.groupBy("group").agg(countDistinct("AccountId").alias("ws user cnt")).orderBy("group").toPandas()

Unnamed: 0,group,ws user cnt
0,A,178083
1,B,31468
2,C,9198
3,D,8999
4,E,421


In [56]:
pre_gcoin_by_type.join(only_wsus_gcoin_by_user.select("account_id", "group"), "account_id") \
    .groupBy("account_id", "group", "type").agg(sum("paid_use").alias("paid_use"), sum("free_use").alias("free_use"), sum("wsus_cnt").alias("wsus_cnt")) \
    .withColumn("paid_user", when(col("paid_use") > 0, col("account_id")).otherwise(lit(None))) \
    .groupBy("group", "type").agg(countDistinct("account_id").alias("total_pu"), countDistinct("paid_user").alias("paid_pu"), sum("paid_use").alias("paid_use"), sum(col("paid_use") + col("free_use")).alias("total_use"), sum("wsus_cnt").alias("wsus_cnt")) \
    .orderBy("group", "type").toPandas()

Unnamed: 0,group,type,total_pu,paid_pu,paid_use,total_use,wsus_cnt
0,A,wsus,446271,72804,26486870,301497600,1565360
1,B,wsus,43819,39044,134450045,179353400,972916
2,C,wsus,11413,11327,127183550,144609200,794954
3,D,wsus,10819,10784,382855700,428877400,2369900
4,E,wsus,471,471,76112820,85043200,471672


In [57]:
ws_gcoin.join(only_wsus_gcoin_by_user.select("account_id", "group"), "account_id").groupBy("group") \
    .agg(countDistinct("account_id").alias("pu"), sum("paid_use").alias("paid_use"), sum(col("paid_use") + col("free_use")).alias("total_use"), sum("chest_cnt").alias("chest_cnt")).toPandas()

Unnamed: 0,group,pu,paid_use,total_use,chest_cnt
0,E,288,9464470,10739010,22542
1,B,16947,108965790,125411580,418203
2,D,5889,98688835,112040880,317703
3,C,5583,61058930,69633980,212686
4,A,62466,159250550,196337630,658912


In [58]:
gcoin_by_type.join(only_wsus_gcoin_by_user.select("account_id", "group"), "account_id") \
    .groupBy("account_id", "group", "type").agg(sum("paid_use").alias("paid_use"), sum("free_use").alias("free_use"), sum("wsus_cnt").alias("wsus_cnt")) \
    .withColumn("paid_user", when(col("paid_use") > 0, col("account_id")).otherwise(lit(None))) \
    .groupBy("group", "type").agg(countDistinct("account_id").alias("total_pu"), countDistinct("paid_user").alias("paid_pu"), sum("paid_use").alias("paid_use"), sum(col("paid_use") + col("free_use")).alias("total_use"), sum("wsus_cnt").alias("wsus_cnt")) \
    .orderBy("group", "type").toPandas()

Unnamed: 0,group,type,total_pu,paid_pu,paid_use,total_use,wsus_cnt
0,A,costume,27491,16921,14366490,22068000,
1,A,workshop,62466,32646,159250550,196337630,
2,A,wsus,30320,14685,58216850,73913000,400901.0
3,B,costume,7753,6196,6111660,7822940,
4,B,workshop,16947,13022,108965790,125411580,
5,B,wsus,10255,8363,48090335,55836400,305390.0
6,C,costume,2793,2300,2758050,3433080,
7,C,workshop,5583,4561,61058930,69633980,
8,C,wsus,3667,3164,29590565,34014600,187121.0
9,D,costume,2931,2412,3110410,3880190,


In [59]:
# group별 G-COIN 사용 유저 수
load_data_mart("pc", ws_start_date, ws_day_28, "gcoin_use") \
    .join(only_wsus_gcoin_by_user.select("account_id", "group"), "account_id") \
    .withColumn("type", when(col("event_name").like("%workshop%"), "workshop") \
                .when((col("sub_category").isin(["costume", "gear", "vehicle"])) | \
                          ((col("sub_category") == "crate") & (~col("event_name").like("%wsus%")) & (~col("event_name").like("%workshop%")) & (~col("event_name").like("%survivorpass%"))), "costume") \
                .when(col("event_name").like("%wsus%"), "wsus")) \
    .where(col("type").isin(["workshop", "costume", "wsus"])) \
    .groupBy("group").agg(countDistinct("account_id").alias("pu"), sum("paid_use").alias("paid_use"), sum(col("free_use") + col("paid_use")).alias("total_use")).orderBy("group").toPandas()

Unnamed: 0,group,pu,paid_use,total_use
0,A,89471,231833890,292318630
1,B,22530,163167785,189070920
2,C,7174,93407545,107081660
3,D,7309,149373915,169434270
4,E,342,12430690,14073710


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 52262)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.6/socketserver.py", line 721, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 269, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pysp