In [1]:
# Parameters
# DT is the run date as a "YYYY-MM-DD" string; every date window used below is derived from it.
# NOTE(review): presumably this cell is overridden by the scheduler (papermill-style parameters) — confirm.
DT = "2024-06-26"   

## **One Model Profile 생성 작업**

- 개요
    - One Model Profile 생성 작업
- 내용
    - item, cat1, cat2 level로 프로파일링 작업 수행
    - 단기 preference 정보
        - 7일 내 item 정보를 통해서 생성
        - 이때, item은 정규화 로직을 통해 추출된 단어를 기반으로 최종 단어들을 추출
    - 장기 preference 정보
        - 60일 내 cat1, cat2 정보를 통해서 생성
        - cat2가 더 상위 레벨임. 어떤 레벨로 장기 구성할지 선택 필요

In [2]:
from skt.gcp import get_bigquery_client, bq_insert_overwrite, get_max_part, bq_to_df, bq_to_pandas, pandas_to_bq_table, load_query_result_to_table, df_to_bq_table
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col, lit, count, log, exp, sum as spark_sum
from pyspark.sql import functions as F
from datetime import datetime, date, timedelta
from skt.ye import get_spark
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import math
from pyspark.sql.types import DoubleType, StringType

In [3]:
def calculate_days(start_date, end_date):
    """Count calendar, weekend and weekday days in an inclusive date range.

    Args:
        start_date: First day of the range (any value ``pd.date_range`` accepts).
        end_date: Last day of the range, inclusive.

    Returns:
        Tuple ``(total_days, weekend_days, weekday_days)`` of plain ints.
        Saturday and Sunday are counted as weekend days.
    """
    days = pd.date_range(start=start_date, end=end_date)
    n_total = len(days)
    # dayofweek is Monday=0 ... Sunday=6, so >= 5 selects Saturday and Sunday.
    n_weekend = int((days.dayofweek >= 5).sum())
    n_weekday = int(n_total - n_weekend)
    return n_total, n_weekend, n_weekday

# 프로파일 : tmap

In [4]:
# The profiling window ends two days before the run date DT.
# NOTE(review): presumably the 2-day offset covers upstream data lag — confirm.
current_date = datetime.strptime(DT, "%Y-%m-%d") - timedelta(days=2)
# Inclusive window starts: 59 days back gives a 60-day window, 6 days back a 7-day window.
DT_threshold1 = (current_date - timedelta(days=59)).strftime("%Y-%m-%d")
DT_threshold2 = (current_date - timedelta(days=6)).strftime("%Y-%m-%d")

# Each threshold gets its own label so the two values cannot be confused in the log.
print("DT : ", DT)
print("DT_threshold1 (60-day window start) : ", DT_threshold1)
print("DT_threshold2 (7-day window start) : ", DT_threshold2)

DT :  2024-06-26
DT_threshold :  2024-04-26
DT_threshold :  2024-06-18


In [5]:
# Day counts over the inclusive window [DT_threshold1, current_date].
# NOTE(review): none of these three values is referenced by a later cell visible in this notebook.
total_days, weekend_days, weekday_days = calculate_days(DT_threshold1, current_date.strftime("%Y-%m-%d"))

In [6]:
# Display (total, weekend, weekday) day counts as a sanity check of the window length.
total_days, weekend_days, weekday_days

(60, 18, 42)

In [7]:
# Item-level tmap rows for the 60-day window [DT_threshold1, current_date].
# The upper bound keeps the window aligned with the day counts computed from the
# same two dates, and keeps re-runs for a past DT from picking up later partitions.
query_item = f"""
    select *
    from adot_reco_dev.tmap_item_cnt
    where dt >= '{DT_threshold1}'
      and dt <= '{current_date:%Y-%m-%d}'
"""

In [8]:
# Run the BigQuery query; the result is used as a Spark DataFrame by the cells below (.show / .filter / .select).
data_item = bq_to_df(query_item)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/28 13:06:04 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
24/06/28 13:06:29 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.


In [10]:
# Preview the first 10 rows to sanity-check the loaded columns.
# NOTE(review): the rendered output includes svc_mgmt_num / luna_id values — consider clearing it before sharing.
data_item.show(n=10)

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+--------------------+------------+----+----+--------+--------+----------+----------+
|        svc_mgmt_num|             luna_id|        item|cat1|cat2|item_cnt|second_s|is_weekend|        dt|
+--------------------+--------------------+------------+----+----+--------+--------+----------+----------+
|58a0b20493b152fe3...|                null|      이케아|가구|쇼핑|       2|     0_6|         0|2024-04-26|
|c404add21c4e58a1b...|                null|      이케아|가구|쇼핑|       2|   13_18|         0|2024-04-26|
|bdaa84add3536a5bf...|APL00000D1MO3IPNKX6O|      이케아|가구|쇼핑|       2|   13_18|         0|2024-04-26|
|4bc57aac33df27c8c...|                null|    가구기타|가구|쇼핑|       2|   13_18|         0|2024-04-26|
|f0d641aa5fb457a40...|APL00000BPNBTD1BRE9S|    가구기타|가구|쇼핑|       4|   13_18|         0|2024-04-26|
|2bea244d24c088905...|APL00000DBGP8NSGVYF4|기타국내가구|가구|쇼핑|       2|   13_18|         0|2024-04-26|
|88f5c7b5b500ab920...|APL00000BU4IESM90CG0|      이케아|가구|쇼핑|       4|   18_21|       

                                                                                

In [9]:
def extract_prefer_list(data, thre=0.7, col_names='tmap_total',
                        unique_thre=0.7, top_n=10, unique_top_n=5):
    """Build per-``luna_id`` preferred item and category lists.

    Two item lists are produced:

    * frequent items - items whose row count, relative to the user's most
      frequent item, is above ``thre`` (at most ``top_n`` per user);
    * distinctive items - items ranked by a blend of an IDF-style rarity
      score and the relative frequency (at most ``unique_top_n`` per user),
      excluding anything already in the frequent list of the same ``luna_id``.

    Args:
        data: Spark DataFrame with columns ``svc_mgmt_num``, ``luna_id``,
            ``item``, ``cat1`` and ``cat2``.
        thre: Lower bound (exclusive) on ``item_prop`` for frequent items.
        col_names: Prefix used for the output column names.
        unique_thre: Lower bound (exclusive) on the blended score for
            distinctive items. Defaults to 0.7, the previously hard-coded value.
        top_n: Maximum number of frequent items kept per user.
        unique_top_n: Maximum number of distinctive items kept per user.

    Returns:
        Spark DataFrame with one row per ``luna_id`` and the ", "-joined string
        columns ``{col_names}_item_list``, ``{col_names}_item_list_unique``,
        ``{col_names}_cat1_list`` and ``{col_names}_cat2_list``.

    Note:
        Calling this function triggers a Spark action (a distinct-user count).
    """
    user_keys = ['svc_mgmt_num', 'luna_id']

    # item -> (cat1, cat2) lookup, used at the end to attach categories.
    item_categories = data.select('cat1', 'cat2', 'item').distinct()

    # Frequency of each item per user = number of input rows for that item.
    # NOTE(review): any pre-aggregated count column in `data` is not used here;
    # rows are counted instead — confirm that this is the intended frequency.
    item_counts = (
        data.select(*user_keys, 'item')
        .na.drop(how="any", subset=['item'])
        .groupBy(*user_keys, 'item')
        .agg(F.count("*").alias('item_cnt'))
    )
    user_max = item_counts.groupBy(*user_keys).agg(F.max('item_cnt').alias('item_max_cnt'))
    # NOTE(review): this is a plain equi-join, so rows with a null luna_id get a
    # null item_prop and are removed by the threshold filters below — confirm intended.
    item_props = (
        item_counts.join(user_max, on=user_keys, how="left")
        .withColumn('item_prop', F.col('item_cnt') / F.col('item_max_cnt'))
    )

    # --- Frequent items: ranked by relative frequency ---
    # `item` is a secondary sort key so that ties are broken deterministically;
    # row_number() over a tied ordering would otherwise pick arbitrary rows.
    freq_window = Window.partitionBy(*user_keys).orderBy(F.desc('item_prop'), F.col('item'))
    frequent_items = (
        item_props.filter(F.col('item_prop') > thre)
        .withColumn("rank", F.row_number().over(freq_window))
        .filter(F.col("rank") <= top_n)
        .dropDuplicates(['luna_id', 'item'])
        .select('luna_id', 'item')
    )

    # --- Distinctive items: IDF-weighted score ---
    user_cnt = item_props.select(*user_keys).distinct().count()
    item_user_cnt = (
        item_props.select(*user_keys, 'item').distinct()
        .groupBy('item')
        .agg(F.count("*").alias('item_idf_cnt'))
    )
    scored = (
        item_props.join(item_user_cnt, on='item', how="left")
        .withColumn('idf_wei', F.log(F.lit(user_cnt) / (1 + F.col("item_idf_cnt"))))
        # 0.6 * sigmoid(idf) + 0.4 * relative frequency
        .withColumn('rev_item_prop',
                    0.6 / (1.0 + F.exp(-F.col("idf_wei"))) + F.col('item_prop') * 0.4)
        .filter(F.col('rev_item_prop') > unique_thre)
    )
    unique_window = Window.partitionBy(*user_keys).orderBy(F.desc('rev_item_prop'), F.col('item'))
    unique_candidates = (
        scored.withColumn("rank", F.row_number().over(unique_window))
        .filter(F.col("rank") <= unique_top_n)
        .dropDuplicates(['luna_id', 'item'])
        .select('luna_id', 'item')
    )
    # Exclude items already in the frequent list. This is done per luna_id, the
    # same key used for de-duplication and for the final aggregation, so an item
    # can never appear in both lists of one luna_id.
    unique_items = (
        unique_candidates
        .join(frequent_items, on=['luna_id', 'item'], how='left_anti')
        .select('luna_id', 'item')
    )

    # --- Attach cat1 / cat2 to every selected item ---
    selected_items = frequent_items.union(unique_items).join(item_categories, on="item", how="left")

    # --- Assemble the per-luna_id output ---
    result = frequent_items.groupBy('luna_id').agg(
        F.concat_ws(", ", F.collect_set('item')).alias(f'{col_names}_item_list')
    )
    unique_lists = unique_items.groupBy('luna_id').agg(
        F.concat_ws(", ", F.collect_set('item')).alias(f'{col_names}_item_list_unique')
    )
    category_lists = selected_items.groupBy('luna_id').agg(
        F.concat_ws(", ", F.collect_set('cat1')).alias(f'{col_names}_cat1_list'),
        F.concat_ws(", ", F.collect_set('cat2')).alias(f'{col_names}_cat2_list'),
    )
    result = result.join(unique_lists, on=['luna_id'], how="left")
    result = result.join(category_lists, on=['luna_id'], how="left")

    return result

## item 기준으로 주말, 평일로 나누고, item에 해당하는 cat1,cat2도 함께 저장하는 형태

In [10]:
# Build one preference profile per slice (all days, weekend only, weekday only),
# then left-join the slices onto the all-days profile by luna_id.
overall_profile = extract_prefer_list(data_item, 0.5, 'tmap_total')
weekend_profile = extract_prefer_list(data_item.filter(col('is_weekend') == 1), 0.5, 'tmap_weekend')
weekday_profile = extract_prefer_list(data_item.filter(col('is_weekend') == 0), 0.5, 'tmap_weekday')

total_data = (
    overall_profile
    .join(weekend_profile, on="luna_id", how="left")
    .join(weekday_profile, on="luna_id", how="left")
)

                                                                                

In [11]:
# Check that all three slices (total / weekend / weekday) contributed their four list columns.
total_data.printSchema()

root
 |-- luna_id: string (nullable = true)
 |-- tmap_total_item_list: string (nullable = false)
 |-- tmap_total_item_list_unique: string (nullable = true)
 |-- tmap_total_cat1_list: string (nullable = true)
 |-- tmap_total_cat2_list: string (nullable = true)
 |-- tmap_weekend_item_list: string (nullable = true)
 |-- tmap_weekend_item_list_unique: string (nullable = true)
 |-- tmap_weekend_cat1_list: string (nullable = true)
 |-- tmap_weekend_cat2_list: string (nullable = true)
 |-- tmap_weekday_item_list: string (nullable = true)
 |-- tmap_weekday_item_list_unique: string (nullable = true)
 |-- tmap_weekday_cat1_list: string (nullable = true)
 |-- tmap_weekday_cat2_list: string (nullable = true)



In [12]:
# Append the distinctive-item list to the frequent-item list (concat_ws skips nulls).
# NOTE: this overwrites tmap_total_item_list in place, so re-running the cell
# appends the distinctive items a second time.
merged_item_list = concat_ws(
    ", ",
    col("tmap_total_item_list"),
    col("tmap_total_item_list_unique"),
)
total_data = total_data.withColumn("tmap_total_item_list", merged_item_list)

### Profile text 화

In [13]:
# Load the profile text templates; the result is indexed like a pandas DataFrame below (source_domain / template columns).
profile_template = bq_to_pandas("select * from adot_reco_dev.profile_template")

query: select * from adot_reco_dev.profile_template
destination: skt-datahub._775c5ccab1096b3cccd7ac34a5db11c0a354fb07.anonede30ccb69980634173a489a463132ab54db42983ba882a85f38a05248ccba31
total_rows: 6
slot_secs: 0.058





Downloading: 100%|[32m██████████[0m|


In [14]:
# Pick the text template registered for the tmap domain, failing loudly if it is missing.
tmap_templates = profile_template.loc[profile_template['source_domain'] == "tmap", 'template']
if tmap_templates.empty:
    raise ValueError("no profile template found for source_domain 'tmap'")
template = tmap_templates.iloc[0]


def profile_text(tmap_cat1, tmap_item):
    """Render the tmap profile sentence for one user.

    Args:
        tmap_cat1: ", "-joined cat1 list, or None when the column is null.
        tmap_item: ", "-joined item list, or None when the column is null.

    Returns:
        ``template`` with ``cat1_profile`` and ``item_profile`` filled in.
        Null inputs are rendered as empty strings instead of the literal "None".
    """
    return template.format(cat1_profile=tmap_cat1 or "", item_profile=tmap_item or "")


# F.udf / StringType are referenced explicitly rather than through the star import.
profile_text_udf = F.udf(profile_text, StringType())
total_data = total_data.withColumn(
    "tmap_profile",
    profile_text_udf(total_data["tmap_total_cat1_list"], total_data["tmap_total_item_list"]),
)

In [15]:
# Confirm the final column set; it should match the destination table definition further down.
total_data.columns

['luna_id',
 'tmap_total_item_list',
 'tmap_total_item_list_unique',
 'tmap_total_cat1_list',
 'tmap_total_cat2_list',
 'tmap_weekend_item_list',
 'tmap_weekend_item_list_unique',
 'tmap_weekend_cat1_list',
 'tmap_weekend_cat2_list',
 'tmap_weekday_item_list',
 'tmap_weekday_item_list_unique',
 'tmap_weekday_cat1_list',
 'tmap_weekday_cat2_list',
 'tmap_profile']

### AIDP 저장

In [16]:
# Destination BigQuery dataset and table for the tmap text profile.
# NOTE(review): despite the variable name, the CREATE TABLE statement in this notebook declares no partitioning.
dest_dataset = "x1113099"
partitioned_dest_table = "user_retrieval_profile_tmap_text"

In [17]:
# DDL for the destination table; a no-op when the table already exists.
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {dest_dataset}.{partitioned_dest_table}
    (
        luna_id STRING,
        tmap_total_item_list STRING,
        tmap_total_item_list_unique STRING,
        tmap_total_cat1_list STRING,
        tmap_total_cat2_list STRING,
        tmap_weekend_item_list STRING,
        tmap_weekend_item_list_unique STRING,
        tmap_weekend_cat1_list STRING,
        tmap_weekend_cat2_list STRING,
        tmap_weekday_item_list STRING,
        tmap_weekday_item_list_unique STRING,
        tmap_weekday_cat1_list STRING,
        tmap_weekday_cat2_list STRING,
        tmap_profile STRING
    )
"""

# Submit the statement and block until the job has finished.
bq_client = get_bigquery_client()
bq_client.query(create_table_sql).result()

print(f"생성된 테이블 : {dest_dataset}.{partitioned_dest_table}")

생성된 테이블 : x1113099.user_retrieval_profile_tmap_text


In [18]:
# Write the finished profile to the destination table, replacing its current contents.
df_to_bq_table(
    df=total_data,
    dataset=dest_dataset,
    table_name=partitioned_dest_table,
    mode="overwrite",
)

                                                                                