In [1]:
# # # Parameters
# DT: run date as a "YYYY-MM-DD" string (it is parsed with "%Y-%m-%d" further down).
DT = "2024-06-11"  

## **One Model Profile 생성 작업**

- 개요
    - One Model Profile 생성 작업
- 내용
    - 단기 preference 정보
        - 7일 내 item 정보를 통해서 생성

In [2]:
from skt.gcp import get_bigquery_client, bq_insert_overwrite, get_max_part, bq_to_df, bq_to_pandas, pandas_to_bq_table, load_query_result_to_table, df_to_bq_table
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from pyspark.sql import functions as F
from datetime import datetime, date, timedelta
from skt.ye import get_spark
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# 프로파일 : tmbr

In [3]:
# Derive the look-back cut-off dates from the run date DT.
current_date = datetime.strptime(DT, "%Y-%m-%d")
DT_threshold1, DT_threshold2 = (
    (current_date - timedelta(days=n_days)).strftime("%Y-%m-%d")
    for n_days in (60, 7)  # 60-day cut-off first, 7-day cut-off second
)

# Echo the resolved dates. Both threshold lines carry the same label:
# the first is the 60-day cut-off, the second the 7-day cut-off.
print("DT : ", DT)
print("DT_threshold : ", DT_threshold1)
print("DT_threshold : ", DT_threshold2)

DT :  2024-06-11
DT_threshold :  2024-04-12
DT_threshold :  2024-06-04


In [4]:
# Select the 'tmbr' rows of the daily log-sequence table for the window
# [DT_threshold1, DT]. The upper bound on dt keeps a re-run or backfill for a
# past DT from also picking up rows logged after that date.
query = f"""
    select *
    from adot_reco.recgpt_log_sequence_lag_daily_prd
    where dt >= '{DT_threshold1}' and dt <= '{DT}' and type='tmbr'
"""

In [5]:
# Run the query; the result is used below through DataFrame methods
# (.show, .select, .filter).
data = bq_to_df(query)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/11 23:16:51 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
24/06/11 23:17:14 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.


In [6]:
# Preview the first 10 rows of the loaded log data.
data.show(n=10)

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+----+----+----+----+--------------------+--------+----+----+----------+----+
|        svc_mgmt_num|item|cat1|cat2|cat3|           unix_time|second_s|etc1|etc2|        dt|type|
+--------------------+----+----+----+----+--------------------+--------+----+----+----------+----+
|96819fc6430838f5e...|  CU|null|null|null|1712906180.000000000| 58580.0|null|null|2024-04-12|tmbr|
|30780fb49ce9dbbd4...|  CU|null|null|null|1712930825.000000000| 83225.0|null|null|2024-04-12|tmbr|
|028f95b7d7f3fd667...|  CU|null|null|null|1712920092.000000000| 72492.0|null|null|2024-04-12|tmbr|
|a1d118c880cf7e740...|  CU|null|null|null|1712914777.000000000| 67177.0|null|null|2024-04-12|tmbr|
|ce96c0459609781b1...|  CU|null|null|null|1712927247.000000000| 79647.0|null|null|2024-04-12|tmbr|
|788b69b9176dd3874...|  CU|null|null|null|1712932187.000000000| 84587.0|null|null|2024-04-12|tmbr|
|3678400d0beebeee3...|  CU|null|null|null|1712918252.000000000| 70652.0|null|null|2024-04-12|tmbr|
|e38b67717

                                                                                

In [7]:
# List the distinct `item` values present in the loaded data (only the first rows are printed).
data.select('item').distinct().show()



+--------------------+
|                item|
+--------------------+
|          신라면세점|
|                씽씽|
|            온더보더|
|          안경매니져|
|롯데월드어드벤처부산|
|            카페베네|
|            SK스토아|
|              그리팅|
|              사이판|
|              피자헛|
|                던킨|
|              이월드|
|          파리크라상|
|          할리스커피|
|        스피드메이트|
|        배스킨라빈스|
|            착한소비|
|        신세계면세점|
|            퍼블로그|
|                 FLO|
+--------------------+
only showing top 20 rows



                                                                                

In [8]:
def prefer_list(data, cate="cat1", thre=0.7):
    """Build, per user, a comma-separated list of their preferred `cate` values.

    For every (svc_mgmt_num, cate) pair the number of rows is counted and
    divided by that user's largest count, giving a proportion in (0, 1].
    Values whose proportion is strictly greater than `thre` are kept, the
    top 10 per user are selected, and they are joined with ", " in
    preference order (highest proportion first).

    Args:
        data: Spark DataFrame containing at least `svc_mgmt_num` and the
            column named by `cate`.
        cate: Name of the column to build the preference list from.
        thre: Exclusive lower bound on the count / max-count proportion.

    Returns:
        Spark DataFrame with columns `svc_mgmt_num` and `{cate}_list`.
        Users with no non-null `cate` value do not appear in the result.
    """
    user_key = "svc_mgmt_num"
    cnt_col = f"{cate}_cnt"
    max_col = f"{cate}_max_cnt"
    prop_col = f"{cate}_prop"

    # Rows per (user, value); null values are ignored.
    counts = (
        data.select(user_key, cate)
        .na.drop("any", subset=[cate])
        .groupBy(user_key, cate)
        .agg(F.count("*").alias(cnt_col))
    )

    # Normalise each count by the user's most frequent value.
    user_max = counts.groupBy(user_key).agg(F.max(cnt_col).alias(max_col))
    scored = (
        counts.join(user_max, on=user_key, how="left")
        .withColumn(prop_col, F.col(cnt_col) / F.col(max_col))
        .filter(F.col(prop_col) > thre)
    )

    # Secondary sort on the value itself makes the top-10 cut deterministic
    # when several values share the same proportion.
    rank_window = Window.partitionBy(user_key).orderBy(F.desc(prop_col), F.asc(cate))
    ranked = (
        scored.withColumn("rank", F.row_number().over(rank_window))
        .filter(F.col("rank") <= 10)
    )

    # collect_list gives no ordering guarantee after the shuffle, so collect
    # (rank, value) structs, sort them by rank, and only then join the values.
    return (
        ranked.groupBy(user_key)
        .agg(F.sort_array(F.collect_list(F.struct("rank", cate))).alias("_ranked"))
        .select(
            user_key,
            F.concat_ws(", ", F.col("_ranked").getField(cate)).alias(f"{cate}_list"),
        )
    )

## tmbr item level

In [9]:
# Short-term item preference: restrict the log to rows dated on or after the
# 7-day cut-off, keep only user/item pairs with a non-null item, and build the
# per-user preferred-item list with a 0.3 proportion threshold.
data_item = prefer_list(
    data.filter(col("dt") >= DT_threshold2)
    .select("svc_mgmt_num", "item")
    .na.drop("any", subset=["item"]),
    cate="item",
    thre=0.3,
)

In [10]:
# Give the list column its final name and stamp every row with the run date as a DATE.
full_data = (
    data_item
    .withColumnRenamed("item_list", "tmbr_item_prefer")
    .withColumn("dt", lit(DT).cast("date"))
)

In [11]:
# Print the schema of the frame that is written out below.
full_data.printSchema()

root
 |-- svc_mgmt_num: string (nullable = true)
 |-- tmbr_item_prefer: string (nullable = false)
 |-- dt: date (nullable = true)



### AIDP 저장

In [12]:
# Destination BigQuery dataset and table name for the generated profile.
dest_dataset = "x1113099"
partitioned_dest_table = "one_model_profile_tmbr"

In [13]:
# Create the destination table if it does not exist yet; it is partitioned by
# the `dt` DATE column and has one string column per profile field.
get_bigquery_client().query(f"""
    CREATE TABLE IF NOT EXISTS {dest_dataset}.{partitioned_dest_table}
    (
        svc_mgmt_num STRING,
        tmbr_item_prefer STRING,
        dt DATE,
    )
    PARTITION BY dt
""").result()

# The message reads "created table: ..."; it is printed unconditionally, so it
# also appears when the table already existed.
print(f"생성된 테이블 : {dest_dataset}.{partitioned_dest_table}")

생성된 테이블 : x1113099.one_model_profile_tmbr


In [14]:
# Write the profile frame to the destination table.
# NOTE(review): mode="overwrite" — confirm whether the helper replaces only the
# `dt` partition of this run or the whole table; its semantics are not visible here.
df_to_bq_table(df=full_data,
               dataset=dest_dataset,
               table_name=partitioned_dest_table,
               mode="overwrite")

                                                                                