In [1]:
# Parameters
# DT: run date in YYYY-MM-DD form; later cells derive the look-back windows
# from it and stamp it on the output rows.
DT = "2024-06-11"

## **One Model Profile 생성 작업**

- 개요
    - One Model Profile 생성 작업
- 내용
    - item, cat1, cat2 level로 프로파일링 작업 수행
    - 단기 preference 정보
        - 7일 내 item 정보를 통해서 생성
    - 장기 preference 정보
        - 60일 내 cat1, cat2 정보를 통해서 생성
        - cat2가 cat1보다 상위 레벨임. **TODO**: 장기 preference를 어떤 레벨(cat1 / cat2)로 구성할지 선택 필요

In [2]:
from skt.gcp import get_bigquery_client, bq_insert_overwrite, get_max_part, bq_to_df, bq_to_pandas, pandas_to_bq_table, load_query_result_to_table, df_to_bq_table
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from pyspark.sql import functions as F
from datetime import datetime, date, timedelta
from skt.ye import get_spark
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# 프로파일 : 11st

In [3]:
# Parse the run date, then derive the two window starts from it:
#   DT_threshold1 = DT - 60 days (used by the log query)
#   DT_threshold2 = DT - 7 days  (used for the item-level filter)
current_date = datetime.strptime(DT, "%Y-%m-%d")
DT_threshold1, DT_threshold2 = (
    (current_date - timedelta(days=lookback_days)).strftime("%Y-%m-%d")
    for lookback_days in (60, 7)
)

# Both thresholds are printed with the same label: the first line is the
# 60-day threshold, the second the 7-day threshold.
print("DT : ", DT)
print("DT_threshold : ", DT_threshold1)
print("DT_threshold : ", DT_threshold2)

DT :  2024-06-11
DT_threshold :  2024-04-12
DT_threshold :  2024-06-04


In [4]:
# Select every column of the log-sequence table for type "11st", from the
# 60-day threshold date onward.
# NOTE(review): there is no upper bound on dt, so rows dated after DT would
# also be included if the table contains any — confirm this is intended.
query = f"""
    select *
    from adot_reco.recgpt_log_sequence_lag_daily_prd
    where dt >= '{DT_threshold1}' and type = "11st"
"""

In [5]:
# Run the query and keep the result as `data`; the cells below treat it as a
# Spark DataFrame (show / select / filter).
data = bq_to_df(query)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/11 15:24:53 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
24/06/11 15:25:16 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.


In [6]:
# Peek at the first 10 rows of the loaded log.
data.show(n=10)

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+----+---------+--------+--------+--------------------+--------+----+----+----------+----+
|        svc_mgmt_num|item|     cat1|    cat2|    cat3|           unix_time|second_s|etc1|etc2|        dt|type|
+--------------------+----+---------+--------+--------+--------------------+--------+----+----+----------+----+
|6c1f2db8c3b9819bf...| CPU|   PC부품|      PC|ORDCMPLT|1712881397.000000000|   33797|null|null|2024-04-12|11st|
|6c1f2db8c3b9819bf...| CPU|   PC부품|      PC|ORDCMPLT|1712881397.000000000|   33797|null|null|2024-04-12|11st|
|418e8f687b4f355e3...| SSD| 저장장치|      PC|ORDCMPLT|1712875968.000000000|   28368|null|null|2024-04-12|11st|
|55a4c32edece172e1...| SSD| 저장장치|      PC|ORDCMPLT|1712889741.000000000|   42141|null|null|2024-04-12|11st|
|45d07caef1d3e5f3a...|  빵|과자/간식|가공식품|ORDCMPLT|1712905196.000000000|   57596|null|null|2024-04-12|11st|
|da7a4f556718bc409...|  빵|과자/간식|가공식품|ORDCMPLT|1712884208.000000000|   36608|null|null|2024-04-12|11st|
|08c8a058a826bc3ec...|

                                                                                

In [7]:
# List the distinct cat3 values present in the loaded window.
data.select('cat3').distinct().show()

[Stage 1:>                                                          (0 + 1) / 1]

+--------+
|    cat3|
+--------+
|ORDCMPLT|
+--------+



                                                                                

In [8]:
# List the distinct cat2 values present in the loaded window.
data.select('cat2').distinct().show()

[Stage 4:>                                                          (0 + 1) / 1]

+------------+
|        cat2|
+------------+
|        여행|
|          PC|
|  스포츠패션|
|스마트디지털|
|해당사항없음|
|    해외쇼핑|
|      생필품|
|        리빙|
|    생활문화|
|    신선식품|
|       E쿠폰|
|  브랜드패션|
|    가공식품|
|        뷰티|
|        레저|
|        가전|
|     B2B사업|
|  트렌드패션|
+------------+



                                                                                

In [9]:
# Sample the distinct cat1 values (show() prints only the first 20 rows).
data.select('cat1').distinct().show()

[Stage 7:>                                                          (0 + 1) / 1]

+----------------+
|            cat1|
+----------------+
|            악기|
|      스포츠신발|
|  여행/숙박/항공|
|            조명|
|            시계|
|            낚시|
|        실버용품|
|        음향가전|
|         꽃/원예|
|            골프|
|   세탁기/건조기|
|      비즈11번가|
|        화방용품|
|   프린터/복합기|
|구기/라켓/스포츠|
|      자동차용품|
|  세제/방향/살충|
|          선케어|
|    DIY자재/용품|
|      이미용가전|
+----------------+
only showing top 20 rows



                                                                                

In [10]:
# Sample the distinct item values (show() prints only the first 20 rows).
data.select('item').distinct().show()

[Stage 10:>                                                         (0 + 1) / 1]

+-----------------+
|             item|
+-----------------+
|           후드티|
|      샤워기/수전|
|후방카메라/감지기|
|      식품/영양제|
|     헤어액세서리|
|        피자/치킨|
|           액션캠|
|             샴푸|
|           관상어|
|  아이스크림/빙수|
|         튜닝용품|
|   게임기주변기기|
|             사료|
|             시계|
|             조명|
|  디자인/팬시용품|
|           골프화|
|           젤네일|
|           파라솔|
|         메인보드|
+-----------------+
only showing top 20 rows



                                                                                

In [11]:
def prefer_list(data, cate="cat1", thre=0.7):
    """Build a per-user, comma-separated list of preferred values of one column.

    For every ``svc_mgmt_num`` the rows are counted per ``cate`` value and each
    count is divided by that user's largest count. Values whose ratio is
    strictly greater than ``thre`` are kept, capped at the 10 highest-ranked,
    and joined with ", " from most to least preferred.

    Args:
        data: Spark DataFrame containing ``svc_mgmt_num`` and the ``cate`` column.
        cate: Name of the column to profile (e.g. "item", "cat1", "cat2").
        thre: Exclusive lower bound on count / max-count for a value to be kept.
            The user's most frequent value has ratio 1.0.

    Returns:
        Spark DataFrame with one row per user and the columns
        ``svc_mgmt_num`` and ``f"{cate}_list"`` (a ", "-joined string).
    """
    count_col = f"{cate}_cnt"
    prop_col = f"{cate}_prop"
    list_col = f"{cate}_list"
    rank_col = "_rank"

    # Rows with a null user key never reached the output before (the former
    # self-join on svc_mgmt_num could not match null keys), so drop them up
    # front together with null category values.
    counts = (
        data.select("svc_mgmt_num", cate)
        .na.drop("any", subset=["svc_mgmt_num", cate])
        .groupBy("svc_mgmt_num", cate)
        .agg(F.count("*").alias(count_col))
    )

    # Ratio of each value's count to the user's largest count, computed with a
    # window instead of a self-join. F.max / F.count are referenced explicitly
    # so the builtins shadowed by the wildcard import are not relied upon.
    per_user = Window.partitionBy("svc_mgmt_num")
    preferred = counts.withColumn(
        prop_col, F.col(count_col) / F.max(count_col).over(per_user)
    ).filter(F.col(prop_col) > thre)

    # Tie-break on the value itself so the top-10 cut is deterministic.
    ranking = per_user.orderBy(F.desc(prop_col), F.asc(cate))
    top_values = preferred.withColumn(
        rank_col, F.row_number().over(ranking)
    ).filter(F.col(rank_col) <= 10)

    # collect_list gives no ordering guarantee after a shuffle; collect
    # (rank, value) structs, sort them by rank, then extract the values.
    ranked = F.sort_array(F.collect_list(F.struct(F.col(rank_col), F.col(cate))))
    return top_values.groupBy("svc_mgmt_num").agg(
        F.concat_ws(", ", ranked.getField(cate)).alias(list_col)
    )

## 11st item level

In [12]:
# Short-term (item-level) preference: only rows from the 7-day threshold
# onward are profiled. prefer_list already projects the needed columns and
# drops null items, so no extra select / na.drop is needed here.
data_item = prefer_list(
    data.filter(F.col("dt") >= DT_threshold2),
    cate="item",
    thre=0.4,
)

## 11st cat level
- cat2(더 상위 카테고리) > cat1

- cat1

In [13]:
# cat1-level preference over the whole loaded window, keeping values whose
# count ratio exceeds 0.5.
data_cat1 = prefer_list(data, cate="cat1", thre=0.5)

- cat2

In [14]:
# cat2-level preference over the whole loaded window, keeping values whose
# count ratio exceeds 0.5.
data_cat2 = prefer_list(data, cate="cat2", thre=0.5)

### 전체 데이터 Merge

In [15]:
# Combine the three per-user preference lists into one row per user.
# NOTE(review): these are inner joins, so a user missing from any one of the
# inputs (e.g. no item activity since the 7-day threshold) gets no profile row
# at all — confirm that is intended rather than an outer join.
full_data = (
    data_item
    .join(data_cat1, on="svc_mgmt_num", how="inner")
    .join(data_cat2, on="svc_mgmt_num", how="inner")
)

In [16]:
# Stamp every row with the run date DT, cast to a DATE column named dt.
full_data = full_data.withColumn("dt", lit(DT).cast("date"))

In [17]:
# Peek at the first 10 merged profile rows.
full_data.show(n=10)

[Stage 38:>                                                         (0 + 1) / 1]

+--------------------+-------------------------------+------------------------------------+-------------------------------+----------+
|        svc_mgmt_num|                      item_list|                           cat1_list|                      cat2_list|        dt|
+--------------------+-------------------------------+------------------------------------+-------------------------------+----------+
|000477938a6a87581...|                       롤화장지|                              화장지|                         생필품|2024-06-11|
|000b234a19db8309f...|기타,종이컵/테이크아웃컵,염색약|                           과자/간식|              생활문화,가공식품|2024-06-11|
|001110011e9c3c4c8...|                             잼|                  가공식품,도서/음반|              생활문화,가공식품|2024-06-11|
|001281cdb47b6bf1d...|                       클렌징폼|                            침실가구|                           리빙|2024-06-11|
|0013405015c4ef6fe...|                         닭고기|               간편식/냉장/냉동,축산|              가공식품,신선식품|2024-06-11|
|00

                                                                                

In [18]:
# Rename the preference lists to the output column names. The former
# drop('cat3_list') was removed: no cat3_list column is ever produced, so the
# call had no effect.
full_data = (
    full_data
    .withColumnRenamed("item_list", "_11st_item_prefer")
    .withColumnRenamed("cat1_list", "_11st_cat1_prefer")
    .withColumnRenamed("cat2_list", "_11st_cat2_prefer")
)

In [19]:
# Check the final column names and types before writing.
full_data.printSchema()

root
 |-- svc_mgmt_num: string (nullable = true)
 |-- _11st_item_prefer: string (nullable = false)
 |-- _11st_cat1_prefer: string (nullable = false)
 |-- _11st_cat2_prefer: string (nullable = false)
 |-- dt: date (nullable = true)



### AIDP 저장

In [20]:
# Destination dataset and table name used by the CREATE TABLE and write cells.
dest_dataset = "x1113099"
partitioned_dest_table = "one_model_profile_11st"

In [21]:
# DDL for the destination table, partitioned by dt. IF NOT EXISTS means an
# existing table is left as it is.
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {dest_dataset}.{partitioned_dest_table}
    (
        svc_mgmt_num STRING,
        _11st_item_prefer STRING,
        _11st_cat1_prefer STRING,
        _11st_cat2_prefer STRING,
        dt DATE,
    )
    PARTITION BY dt
"""

# Submit the DDL and block until the job has finished.
bq_client = get_bigquery_client()
bq_client.query(create_table_sql).result()

# Printed whether or not the table already existed.
print(f"생성된 테이블 : {dest_dataset}.{partitioned_dest_table}")

생성된 테이블 : x1113099.one_model_profile_11st


In [22]:
# Write the profile rows to the destination table.
# NOTE(review): the table is partitioned by dt and full_data holds only the
# rows for DT. If mode="overwrite" replaces the whole table rather than just
# the DT partition, every earlier date's profile is lost on each run —
# confirm df_to_bq_table's overwrite semantics (or target the single
# partition explicitly).
df_to_bq_table(df=full_data,
               dataset=dest_dataset,
               table_name=partitioned_dest_table,
               mode="overwrite")

                                                                                