In [1]:
# Parameters
# DT: run date in "YYYY-MM-DD" format; every date window in this notebook is derived from it.
DT = "2024-07-19"

## **One Model Profile 생성 작업**

- 개요
    - One Model Profile 생성 작업
- 내용
    - item, cat1, cat2 level로 프로파일링 작업 수행
    - 단기 preference 정보
        - 7일 내 item 정보를 통해서 생성
        - 이때, item은 정규화 로직을 통해 추출된 단어를 기반으로 최종 단어들을 추출
    - 장기 preference 정보
        - 60일 내 cat1, cat2 정보를 통해서 생성
        - cat2가 더 상위 레벨임. 어떤 레벨로 장기 구성할지 선택 필요

In [2]:
from skt.gcp import get_bigquery_client, bq_insert_overwrite, get_max_part, bq_to_df, bq_to_pandas, pandas_to_bq_table, load_query_result_to_table, df_to_bq_table
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col, lit, count, log, exp, sum as spark_sum
from pyspark.sql import functions as F
from datetime import datetime, date, timedelta
from skt.ye import get_spark
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import math
from pyspark.sql.types import DoubleType

In [29]:
def tmbr_extract_profile(data, thre=0.6, freq_wei = 0.6, dt_cnt = 60, col_names1='tmbr_cat1_total', col_names2='tmbr_cat2_total'):
    """Build per-user (luna_id) lists of preferred cat1 / cat2 categories.

    Every (luna_id, cat1) pair is scored as

        score = freq_wei * cat1_prop + (1 - freq_wei) * rev_df_weight

    where
      * cat1_prop is the pair's row count divided by the largest cat1 row
        count of the same (svc_mgmt_num, luna_id), and
      * rev_df_weight = sigmoid(1 / log(dt_cnt / active_days + 1e-8)), with
        active_days being the number of distinct ``dt`` values of the pair.

    The ten highest-scoring rows per luna_id are kept, rows scoring below
    ``thre`` are dropped, and the remaining categories are joined into
    comma-separated strings.

    Args:
        data: Spark DataFrame providing the columns svc_mgmt_num, luna_id,
            cat1, cat2 and dt.
        thre: Minimum score a category needs to be included.
        freq_wei: Weight of the frequency term; the day-coverage term gets
            ``1 - freq_wei``.
        dt_cnt: Number of days covered by ``data``. It should match the
            slice that is passed in; if active_days exceeds dt_cnt the log
            turns negative and the category is penalised.
        col_names1: Output column name for the cat1 list.
        col_names2: Output column name for the cat2 list.

    Returns:
        Spark DataFrame with the columns luna_id, ``col_names1`` and
        ``col_names2``.
    """
    ##########################################
    ### Frequency-based proportion (cat1)  ###
    ##########################################
    data_freq = data.select(['svc_mgmt_num','luna_id','cat1']).na.drop("any", subset='cat1')
    data_freq = data_freq.groupby('svc_mgmt_num','luna_id','cat1').agg(count("*").alias('cat1_cnt'))
    # F.max is used explicitly so the aggregate does not depend on the star
    # import shadowing Python's builtin max().
    data_freq_user = data_freq.groupby(['svc_mgmt_num','luna_id']).agg(F.max('cat1_cnt').alias('cat1_max_cnt'))
    data_freq = data_freq.join(data_freq_user, on=['svc_mgmt_num','luna_id'], how="left")
    data_freq = data_freq.withColumn('cat1_prop', col('cat1_cnt')/col('cat1_max_cnt'))

    # One row per (luna_id, cat1); when several svc_mgmt_num share a luna_id
    # an arbitrary one of their rows is kept.
    result_df = data_freq.select('svc_mgmt_num','luna_id',"cat1", "cat1_prop").dropDuplicates(['luna_id',"cat1"])

    ####################################
    ### IDF-style day-coverage weight ###
    ####################################
    # Number of distinct days on which each (luna_id, cat1) pair appears.
    data_dt_cnt = data.select(['luna_id','cat1','dt']).distinct().groupby(['luna_id','cat1']).agg(count("*").alias("luna_cat1_cnt"))
    # The 1e-8 term keeps log() away from exactly 0 when the pair appears on every day.
    data_dt_cnt = data_dt_cnt.withColumn("df_weight", 1/log(lit(dt_cnt) / col("luna_cat1_cnt") + 1.0e-8))
    # Squash the weight into (0, 1) with a sigmoid.
    data_dt_cnt = data_dt_cnt.withColumn("rev_df_weight", 1.0/(1.0 + exp(-col("df_weight"))))

    #### Merge the two signals into one score ####
    merge_data = result_df.dropna(subset="luna_id").select(['luna_id','cat1','cat1_prop']).join(data_dt_cnt.select(['luna_id', 'cat1','rev_df_weight']),on=['luna_id','cat1'],how="left")
    merge_data = merge_data.withColumn('score', col('cat1_prop')*freq_wei + col('rev_df_weight')*(1-freq_wei))

    # Fix: derive the cat1 -> cat2 mapping from the `data` argument. The
    # original read the notebook-global `data_cat1`, which made the function
    # depend on hidden kernel state.
    data_cat1_distinct = data.select(['cat1','cat2']).distinct()
    merge_data = merge_data.join(data_cat1_distinct, on="cat1", how="left")

    # Keep the ten best-scoring rows per user.
    # NOTE(review): the cat2 join happens before ranking, so if a cat1 maps to
    # more than one cat2 the limit counts (cat1, cat2) rows - confirm intended.
    window_spec = Window.partitionBy(['luna_id']).orderBy(desc('score'))
    merge_data = merge_data.withColumn("rank", row_number().over(window_spec)) \
                    .filter(col("rank") <= 10) \
                    .drop("rank")

    # NOTE(review): collect_set gives no ordering guarantee, so the orderBy
    # before groupBy should not be relied on to order the joined lists.
    merge_data = merge_data.filter(col("score") >= thre).orderBy(desc("score")).groupBy("luna_id").agg(concat_ws(", ",collect_set("cat1")).alias("cat1_list"), concat_ws(", ",collect_set("cat2")).alias("cat2_list"))
    merge_data = merge_data.withColumnRenamed("cat1_list", col_names1)
    merge_data = merge_data.withColumnRenamed("cat2_list", col_names2)

    return merge_data

def calculate_days(start_date, end_date):
    """Count the days in the inclusive range [start_date, end_date].

    Args:
        start_date: First day of the range (anything pd.date_range accepts).
        end_date: Last day of the range, inclusive.

    Returns:
        Tuple of ints ``(total_days, weekend_days, weekday_days)``, where
        weekend days are Saturdays and Sundays.
    """
    days = pd.date_range(start=start_date, end=end_date)
    n_total = len(days)
    # dayofweek: Monday=0 ... Sunday=6, so >= 5 selects Saturday and Sunday.
    n_weekend = int((days.dayofweek >= 5).sum())
    n_weekday = n_total - n_weekend
    return n_total, n_weekend, n_weekday

# 프로파일 : tmbr

In [4]:
# The reference date is two days before the run date DT
# (e.g. DT = "2024-06-02" -> reference date 2024-05-31).
current_date = datetime.strptime(DT, "%Y-%m-%d") - timedelta(days=2)

# Inclusive window starts, counted back from the reference date:
#   DT_threshold1 -> 60-day window (59 days back)
#   DT_threshold2 -> 7-day window (6 days back)
DT_threshold1, DT_threshold2 = (
    (current_date - timedelta(days=days_back)).strftime("%Y-%m-%d")
    for days_back in (59, 6)
)

print("DT : ", DT)
print("DT_threshold : ", DT_threshold1)
print("DT_threshold : ", DT_threshold2)

DT :  2024-07-19
DT_threshold :  2024-05-19
DT_threshold :  2024-07-11


In [5]:
# Day counts for the inclusive window [DT_threshold1, current_date].
window_end = current_date.strftime("%Y-%m-%d")
total_days, weekend_days, weekday_days = calculate_days(DT_threshold1, window_end)

In [6]:
# Display the day counts as (total, weekend, weekday).
total_days, weekend_days, weekday_days

(60, 17, 43)

In [7]:
# Query for all tmbr cat1 rows with dt on or after the 60-day window start.
# NOTE(review): there is no upper bound on dt, while the day counts are
# computed only up to current_date. If the table holds rows later than
# current_date they are included here - confirm whether that is intended.
query_cat1 = f"""
    select *
    from adot_reco_dev.tmbr_cat1_cnt
    where dt >= '{DT_threshold1}'
"""

In [8]:
# Run the query and load the result; later cells use it as a Spark DataFrame
# (.filter(col(...))).
data_cat1 = bq_to_df(query_cat1)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/19 11:02:30 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
24/07/19 11:02:52 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.


In [9]:
# data_cat1.show(n=10)

## 전체, 주말, 주중(출근, 퇴근, 일과시간) 별 cat1 list 추출

In [10]:
# def adot_extract_profile(data, thre=0.6, ecdf_wei = 0.6, dt_cnt = 60, col_names='adot_cat1_total')
# total_days, weekend_days, weekday_days

In [30]:
# Overall profile: all rows, day coverage normalised by the full window length.
total_data = tmbr_extract_profile(data_cat1, 0.6, 0.6, total_days, 'tmbr_cat1_total', 'tmbr_cat2_total')

# Weekend-only profile, normalised by the number of weekend days in the window.
weekend_profile = tmbr_extract_profile(data_cat1.filter(col('is_weekend') == 1), 0.5, 0.6, weekend_days, 'tmbr_cat1_weekend', 'tmbr_cat2_weekend')

# Weekday-only profile.
# Fix: normalise by weekday_days. The original passed weekend_days here, so a
# category used on more weekdays than there are weekend days in the window got
# dt_cnt / active_days < 1, a negative log, and therefore a penalised score.
weekday_profile = tmbr_extract_profile(data_cat1.filter(col('is_weekend') == 0), 0.5, 0.6, weekday_days, 'tmbr_cat1_weekday', 'tmbr_cat2_weekday')

# Left joins keep every user of the overall profile; users without a
# weekend / weekday profile get nulls in those columns.
total_data = total_data.join(weekend_profile, on="luna_id", how="left")
total_data = total_data.join(weekday_profile, on="luna_id", how="left")

In [31]:
# Sanity check: list the columns of the assembled profile table.
total_data.columns

['luna_id',
 'tmbr_cat1_total',
 'tmbr_cat2_total',
 'tmbr_cat1_weekend',
 'tmbr_cat2_weekend',
 'tmbr_cat1_weekday',
 'tmbr_cat2_weekday']

In [32]:
# Preview ten rows of the profile table.
# NOTE(review): the rendered output contains raw luna_id values; clear the
# output before sharing if these identify users.
total_data.show(n=10)

[Stage 919:>                                                        (0 + 1) / 1]

+--------------------+-----------------------------+-------------------------------+-----------------+------------------------+-----------------------------+------------------------------+
|             luna_id|              tmbr_cat1_total|                tmbr_cat2_total|tmbr_cat1_weekend|       tmbr_cat2_weekend|            tmbr_cat1_weekday|             tmbr_cat2_weekday|
+--------------------+-----------------------------+-------------------------------+-----------------+------------------------+-----------------------------+------------------------------+
|APL00000BJ0H4R30LSLC|                       쇼핑몰|                           쇼핑|             null|                    null|                       쇼핑몰|                          쇼핑|
|APL00000BJ19IFQXK54W|                         영화|                 영화/공연/전시|     영화, 쇼핑몰|    쇼핑, 영화/공연/전시|                         영화|                영화/공연/전시|
|APL00000BJ1VXU5TVOQO|               영화, 베이커리|       영화/공연/전시, 베이커리|   영화, 베이커리|영화/공연/전시, 베이커리

                                                                                

## Profile text 화

In [33]:
# profile_template = bq_to_pandas("select * from adot_reco_dev.profile_template")

In [34]:
# template = list(profile_template[profile_template['source_domain']=="xtr"].template)[0]

In [35]:
# template = list(profile_template[profile_template['source_domain']=="xtr"].template)[0]
# Sentence template for the text profile; {cat1_profile} is replaced with the
# user's comma-joined cat1 list.
template = "저는 SKT 멤버십 할인 혜택 중 {cat1_profile} 관련 혜택을 주로 사용했어요."

def profile_text(tmbr_cat1):
    """Render the profile sentence for one user's cat1 list.

    Args:
        tmbr_cat1: Comma-joined cat1 string, or None.

    Returns:
        The filled-in template, or None when ``tmbr_cat1`` is None or blank.
        Without this guard a missing list would be rendered as the literal
        text "None" inside the sentence.
    """
    if tmbr_cat1 is None or not str(tmbr_cat1).strip():
        return None
    return template.format(cat1_profile=tmbr_cat1)

# Wrap profile_text as a Spark UDF. The return type is given as the DDL string
# "string" rather than StringType(): StringType is not imported explicitly in
# this notebook, so the original only worked if that name happened to leak
# through `from pyspark.sql.functions import *`.
profile_text_udf = udf(profile_text, "string")

# Add the rendered sentence, built from the overall cat1 list, as tmbr_profile.
total_data = total_data.withColumn("tmbr_profile", profile_text_udf(total_data["tmbr_cat1_total"]))

In [36]:
# Sanity check: tmbr_profile should now be the last column.
total_data.columns

['luna_id',
 'tmbr_cat1_total',
 'tmbr_cat2_total',
 'tmbr_cat1_weekend',
 'tmbr_cat2_weekend',
 'tmbr_cat1_weekday',
 'tmbr_cat2_weekday',
 'tmbr_profile']

In [37]:
# Preview ten rows including the rendered profile sentence.
# NOTE(review): the rendered output contains raw luna_id values; clear the
# output before sharing if these identify users.
total_data.show(n=10)

[Stage 1103:>                                                       (0 + 1) / 1]

+--------------------+-----------------------------+-------------------------------+-----------------+------------------------+-----------------------------+------------------------------+-----------------------------+
|             luna_id|              tmbr_cat1_total|                tmbr_cat2_total|tmbr_cat1_weekend|       tmbr_cat2_weekend|            tmbr_cat1_weekday|             tmbr_cat2_weekday|                 tmbr_profile|
+--------------------+-----------------------------+-------------------------------+-----------------+------------------------+-----------------------------+------------------------------+-----------------------------+
|APL00000BJ0H4R30LSLC|                       쇼핑몰|                           쇼핑|             null|                    null|                       쇼핑몰|                          쇼핑|저는 SKT 멤버십 할인 혜택 ...|
|APL00000BJ19IFQXK54W|                         영화|                 영화/공연/전시|     영화, 쇼핑몰|    쇼핑, 영화/공연/전시|                         영화|         

                                                                                

### AIDP 저장

In [38]:
# Destination BigQuery dataset and table for the text profile.
# NOTE(review): the variable is named "partitioned", but the CREATE TABLE
# statement in this notebook declares no partitioning - confirm which is intended.
dest_dataset = "x1113099"
partitioned_dest_table = "user_retrieval_profile_tmbr_text"

In [39]:
# Create the destination table if it does not exist yet; .result() blocks
# until the DDL job has finished. The schema mirrors total_data's columns,
# all typed STRING.
get_bigquery_client().query(f"""
    CREATE TABLE IF NOT EXISTS {dest_dataset}.{partitioned_dest_table}
    (
        luna_id STRING,
        tmbr_cat1_total STRING,
        tmbr_cat2_total STRING,
        tmbr_cat1_weekend STRING,
        tmbr_cat2_weekend STRING,
        tmbr_cat1_weekday STRING,
        tmbr_cat2_weekday STRING,
        tmbr_profile STRING
    )
""").result()

# The message is printed even when the table already existed (IF NOT EXISTS).
print(f"생성된 테이블 : {dest_dataset}.{partitioned_dest_table}")

생성된 테이블 : x1113099.user_retrieval_profile_tmbr_text


In [40]:
# Write the profile DataFrame to the destination table in "overwrite" mode.
# NOTE(review): the table has no date column, so presumably each run replaces
# the previous snapshot - confirm that no history needs to be kept.
df_to_bq_table(df=total_data,
               dataset=dest_dataset,
               table_name=partitioned_dest_table,
               mode="overwrite")

                                                                                