In [1]:
# # # Parameters
# Reference date (YYYY-MM-DD). The look-back cut-offs and the `dt` value written
# to the output are both derived from it further down in this notebook.
DT = "2024-06-11"   

## **One Model Profile 생성 작업**

- 개요
    - One Model Profile 생성 작업
- 내용
    - item, cat1, cat2 level로 프로파일링 작업 수행
    - 단기 preference 정보
        - 7일 내 item 정보를 통해서 생성
    - 장기 preference 정보
        - 30일 내 cat1, cat2 정보를 통해서 생성
        - cat2가 더 상위 레벨임. 어떤 레벨로 장기 구성할지 선택 필요
        - 다른 프로파일은 60일 기준이지만, xdr은 데이터 규모가 너무 커서 30일로 제한함. 필요 시 기간 확대를 검토할 것 (TODO)

In [2]:
from skt.gcp import get_bigquery_client, bq_insert_overwrite, get_max_part, bq_to_df, bq_to_pandas, pandas_to_bq_table, load_query_result_to_table, df_to_bq_table
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from pyspark.sql import functions as F
from datetime import datetime, date, timedelta
from skt.ye import get_spark
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# 프로파일 : xdr

In [3]:
# DT comes from the parameters cell; both look-back cut-offs are derived from it.
current_date = datetime.strptime(DT, "%Y-%m-%d")
long_term_start = current_date - timedelta(days=30)
short_term_start = current_date - timedelta(days=7)

# DT_threshold1: start of the 30-day window used for the cat1 / cat2 lists.
DT_threshold1 = long_term_start.strftime("%Y-%m-%d")
# DT_threshold2: start of the 7-day window used for the item list.
DT_threshold2 = short_term_start.strftime("%Y-%m-%d")

for label, value in (
    ("DT : ", DT),
    ("DT_threshold : ", DT_threshold1),
    ("DT_threshold : ", DT_threshold2),
):
    print(label, value)

DT :  2024-06-11
DT_threshold :  2024-05-12
DT_threshold :  2024-06-04


In [4]:
# Pull the xdr log sequence for the look-back window [DT_threshold1, DT].
# The upper bound matters when the notebook is re-run (or backfilled) for a past
# DT: without it, partitions later than DT would leak into a profile that is
# stored under dt = DT.
query = f"""
    select *
    from adot_reco.recgpt_log_sequence_daily_prd
    where dt >= '{DT_threshold1}' and dt <= '{DT}' and type = 'xdr'
"""

In [5]:
# Run the query through the skt `bq_to_df` helper. The result is used as a Spark
# DataFrame by the cells below (`.show()`, `.select()`, `.filter()`).
data = bq_to_df(query)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/11 23:21:31 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
24/06/11 23:21:53 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.


In [10]:
# data1.count()

In [11]:
# Preview the first 10 rows of the loaded log data.
data.show(10)

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+----+--------+-------------+----+--------------------+--------+----+----+----------+----+
|        svc_mgmt_num|item|    cat1|         cat2|cat3|           unix_time|second_s|etc1|etc2|        dt|type|
+--------------------+----+--------+-------------+----+--------------------+--------+----+----+----------+----+
|1cc2330f0e6b3b22d...|  S1|Security|         Life|null|1715416300.000000000|   63100|null|null|2024-05-11| xdr|
|9ec09e3e8e662bf9d...|  S1|Security|         Life|null|1715426327.000000000|   73127|null|null|2024-05-11| xdr|
|94a3475462b10e657...|  S1|Security|         Life|null|1715399294.000000000|   46094|null|null|2024-05-11| xdr|
|c6139fc43c455dd49...|  S1|Security|         Life|null|1715381060.000000000|   27860|null|null|2024-05-11| xdr|
|9db8e0a583694d018...|  S1|Security|         Life|null|1715379336.000000000|   26136|null|null|2024-05-11| xdr|
|fa363be78a6b9712c...|  S1|Security|         Life|null|1715415173.000000000|   61973|null|null|2024-05-1

                                                                                

In [12]:
# List the distinct cat3 values present in the loaded data.
(
    data
    .select("cat3")
    .distinct()
    .show()
)



+----+
|cat3|
+----+
|null|
+----+



                                                                                

In [13]:
# List the distinct cat2 values present in the loaded data.
(
    data
    .select("cat2")
    .distinct()
    .show()
)



+-------------+
|         cat2|
+-------------+
|    Education|
|Entertainment|
|  Information|
|         Life|
|      Finance|
|     Shopping|
|      Utility|
|         Game|
|     Business|
|     Location|
|      Leisure|
|Communication|
+-------------+



                                                                                

In [14]:
# List distinct cat1 values (show() prints at most 20 rows by default).
(
    data
    .select("cat1")
    .distinct()
    .show()
)



+-----------------+
|             cat1|
+-----------------+
|      SmartDevice|
|       References|
|  Information_etc|
|   Fresh_delivery|
|     Location_etc|
|              Job|
|             Food|
|              SNS|
|          sharing|
|    OnlinePayment|
|            telco|
|             Used|
|         Security|
|      RealEstates|
| Culture and Arts|
|     Shopping_etc|
|Coupons_Discounts|
|       SportsGame|
|     Ride_request|
|           Sports|
+-----------------+
only showing top 20 rows



                                                                                

In [15]:
# List distinct item values (show() prints at most 20 rows by default).
(
    data
    .select("item")
    .distinct()
    .show()
)



+--------------------+
|                item|
+--------------------+
|                29cm|
|                BAND|
|             Zigbang|
|         efine.go.kr|
|  L.POINT with L.PAY|
|Garena Free Fire ...|
|         CHANEL CODE|
|  The New York Times|
|       GirlFrontLine|
|          WithNature|
|              PHO.TO|
|      EBS Elementary|
|          kric.go.kr|
|              Winwin|
|   SlotSocialCasino2|
|        EpicWar_Game|
|gwangju.childcare...|
|    Basketball Stars|
|            EashShop|
|      CasualJoyGames|
+--------------------+
only showing top 20 rows



                                                                                

In [6]:
def prefer_list(data, cate="cat1", thre=0.7):
    """Build, per user, a comma-separated list of their most frequent `cate` values.

    Rows are counted per (svc_mgmt_num, cate) pair and each count is divided by
    that user's largest count, so a user's most frequent value always scores 1.0.
    Values scoring strictly above `thre` are kept, capped at the 10 best per
    user, and joined with ", ".

    The list is emitted in descending score order, with ties broken by the value
    itself in ascending order. Both the explicit tie-break and the sort of the
    collected values are needed for a reproducible result: `row_number` over
    tied scores is arbitrary, and `collect_list` does not keep the window order
    after a shuffle.

    Args:
        data: Spark DataFrame containing at least `svc_mgmt_num` and `cate`.
        cate: Name of the column to profile. Rows where it is null are dropped.
        thre: Exclusive lower bound on count / per-user max count.

    Returns:
        Spark DataFrame with one row per user and two columns:
        `svc_mgmt_num` and `{cate}_list`.
    """
    user_key = "svc_mgmt_num"
    cnt_col = f"{cate}_cnt"
    max_col = f"{cate}_max_cnt"
    prop_col = f"{cate}_prop"

    counts = (
        data.select(user_key, cate)
        .na.drop("any", subset=[cate])
        .groupBy(user_key, cate)
        .agg(F.count("*").alias(cnt_col))
    )

    # Per-user maximum via a window aggregate instead of a groupBy + self-join.
    per_user = Window.partitionBy(user_key)
    scored = (
        counts.withColumn(max_col, F.max(cnt_col).over(per_user))
        .withColumn(prop_col, F.col(cnt_col) / F.col(max_col))
        .filter(F.col(prop_col) > thre)
    )

    # Secondary sort key makes the top-10 cut deterministic when scores tie.
    ranking = per_user.orderBy(F.desc(prop_col), F.asc(cate))
    top_values = (
        scored.withColumn("rank", F.row_number().over(ranking))
        .filter(F.col("rank") <= 10)
    )

    # Collect (rank, value) structs and sort them so the output follows the rank;
    # extracting the field from the array of structs yields an array of values.
    ranked = F.sort_array(
        F.collect_list(F.struct(F.col("rank"), F.col(cate).alias("value")))
    )
    return top_values.groupBy(user_key).agg(
        F.concat_ws(", ", ranked.getField("value")).alias(f"{cate}_list")
    )

## xdr item level

In [7]:
# Short-term item preference: restrict the logs to the 7-day window, keep only
# the user key and non-null items, then build the per-user item list.
data_item = prefer_list(
    data.filter(col("dt") >= DT_threshold2)
        .select("svc_mgmt_num", "item")
        .na.drop("any", subset="item"),
    cate="item",
    thre=0.5,
)

## xdr cat level
- cat2(더 상위 카테고리) > cat1

- cat1

In [8]:
# Long-term cat1 preference list, built from the full look-back window in `data`.
data_cat1 = prefer_list(
    data,
    cate="cat1",
    thre=0.6,
)

- cat2

In [9]:
# Long-term cat2 preference list, built from the full look-back window in `data`.
data_cat2 = prefer_list(
    data,
    cate="cat2",
    thre=0.6,
)

### 전체 데이터 Merge

In [10]:
# Combine the three preference tables on the user key and stamp the run date.
# The joins are inner joins, so a user missing from any one of the three tables
# (e.g. no item rows inside the 7-day window) does not appear in the result.
full_data = (
    data_item
    .join(data_cat1, on="svc_mgmt_num", how="inner")
    .join(data_cat2, on="svc_mgmt_num", how="inner")
    .withColumn("dt", lit(DT).cast("date"))
)

In [11]:
# full_data.show(n=10)

In [12]:
# Rename the generic *_list columns to the names used by the destination table.
prefer_column_names = {
    "item_list": "xdr_item_prefer",
    "cat1_list": "xdr_cat1_prefer",
    "cat2_list": "xdr_cat2_prefer",
}
for old_name, new_name in prefer_column_names.items():
    full_data = full_data.withColumnRenamed(old_name, new_name)

In [13]:
# Print the final column names and types so they can be checked against the
# destination table definition created below.
full_data.printSchema()

root
 |-- svc_mgmt_num: string (nullable = true)
 |-- xdr_item_prefer: string (nullable = false)
 |-- xdr_cat1_prefer: string (nullable = false)
 |-- xdr_cat2_prefer: string (nullable = false)
 |-- dt: date (nullable = true)



### AIDP 저장

In [14]:
# Destination BigQuery dataset and table for the generated profile.
# NOTE(review): the dataset name looks like a personal/sandbox dataset — confirm
# this is the intended target before scheduling the notebook.
dest_dataset = "x1113099"
partitioned_dest_table = "one_model_profile_xdr"

In [15]:
# DDL for the date-partitioned destination table. IF NOT EXISTS makes this a
# no-op when the table is already there, so the message below is printed on
# every run regardless of whether a table was actually created.
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {dest_dataset}.{partitioned_dest_table}
    (
        svc_mgmt_num STRING,
        xdr_item_prefer STRING,
        xdr_cat1_prefer STRING,
        xdr_cat2_prefer STRING,
        dt DATE,
    )
    PARTITION BY dt
"""
bq_client = get_bigquery_client()
create_job = bq_client.query(create_table_sql)
create_job.result()  # block until the DDL statement has finished

print(f"생성된 테이블 : {dest_dataset}.{partitioned_dest_table}")

생성된 테이블 : x1113099.one_model_profile_xdr


In [16]:
# Write the merged profile to the destination table via the skt helper.
# NOTE(review): mode="overwrite" is passed without any partition argument. If the
# helper overwrites the whole table rather than only the dt = DT partition, each
# daily run would discard earlier partitions — confirm against the
# skt.gcp.df_to_bq_table implementation.
df_to_bq_table(df=full_data,
               dataset=dest_dataset,
               table_name=partitioned_dest_table,
               mode="overwrite")

                                                                                