In [1]:
# Parameters
# DT is the run date as a "YYYY-MM-DD" string. A later cell parses it with
# "%Y-%m-%d" and subtracts one day to get the last day of the look-back windows.
DT = "2024-06-26"   

## **One Model Profile 생성 작업**

- 개요
    - One Model Profile 생성 작업
- 내용
    - item, cat1, cat2 level로 프로파일링 작업 수행
    - 단기 preference 정보
        - 7일 내 item 정보를 통해서 생성
        - 이때, item은 정규화 로직을 통해 추출된 단어를 기반으로 최종 단어들을 추출
    - 장기 preference 정보
        - 60일 내 cat1, cat2 정보를 통해서 생성
        - cat2가 더 상위 레벨임. 어떤 레벨로 장기 구성할지 선택 필요

In [1]:
from skt.gcp import get_bigquery_client, bq_insert_overwrite, get_max_part, bq_to_df, bq_to_pandas, pandas_to_bq_table, load_query_result_to_table, df_to_bq_table
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col, lit, count, log, exp, sum as spark_sum
from pyspark.sql import functions as F
from datetime import datetime, date, timedelta
from skt.ye import get_spark
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import math
from pyspark.sql.types import DoubleType

In [10]:
def adot_extract_profile(data, thre=0.6, ecdf_wei = 0.6, dt_cnt = 60, col_names='adot_cat1_total'):
    """Build, per ``luna_id``, a ", "-joined list of the ``cat1`` values that score above a threshold.

    The score of a (luna_id, cat1) pair is
    ``ecdf * ecdf_wei + rev_df_weight * (1 - ecdf_wei)`` where

    * ``ecdf`` is the empirical CDF of the pair's summed ``cat1_cnt`` among all
      rows of the same ``cat1``;
    * ``rev_df_weight`` is ``sigmoid(1 / log(dt_cnt / active_days))`` with
      ``active_days`` the number of distinct ``dt`` values on which the user
      has a row for that ``cat1``.

    Args:
        data: Spark DataFrame with at least the columns ``svc_mgmt_num``,
            ``luna_id``, ``cat1``, ``cat1_cnt`` and ``dt``.
        thre: Minimum score (inclusive) for a ``cat1`` to be kept.
        ecdf_wei: Weight of the ECDF term; the day-frequency term gets
            ``1 - ecdf_wei``.
        dt_cnt: Number of days in the observation window covered by ``data``.
        col_names: Name of the output list column.

    Returns:
        Spark DataFrame with columns ``luna_id`` and ``col_names``. Users with
        no ``cat1`` at or above ``thre`` are absent. Within each list the
        entries are ordered by descending score (ties by descending ``cat1``).
    """
    ########################
    ### ECDF computation ###
    ########################
    # Total usage per (svc_mgmt_num, luna_id, cat1) over the whole input.
    data_sum = data.groupBy(["svc_mgmt_num", "luna_id", "cat1"]).agg(
        F.sum("cat1_cnt").alias("cat1_cnt_sum")
    )

    # cume_dist() = (#rows with value <= current) / (#rows in partition), i.e.
    # the ECDF. Unlike a row-based running count it gives tied values the same
    # ECDF, so the result no longer depends on how Spark happens to order ties.
    ecdf_window = Window.partitionBy("cat1").orderBy("cat1_cnt_sum")
    result_df = (
        data_sum
        .withColumn("ecdf", F.cume_dist().over(ecdf_window))
        .select("svc_mgmt_num", "luna_id", "cat1", "cat1_cnt_sum", "ecdf")
        # NOTE(review): if one luna_id maps to several svc_mgmt_num values the
        # surviving row is arbitrary; confirm whether that can happen.
        .dropDuplicates(["luna_id", "cat1"])
    )

    ###################################
    ### Day-frequency (IDF) weight  ###
    ###################################
    # Number of distinct days on which the user has a row for the cat1.
    data_dt_cnt = (
        data.select("luna_id", "cat1", "dt")
        .distinct()
        .groupBy("luna_id", "cat1")
        .agg(F.count("*").alias("luna_cat1_cnt"))
    )
    # Clamp to dt_cnt: if the input holds more distinct days than dt_cnt the
    # ratio drops below 1, the log turns negative and the sigmoid collapses to
    # ~0 for the *most* active users.
    active_days = F.least(F.col("luna_cat1_cnt"), F.lit(dt_cnt))
    # The 1e-8 keeps log() away from exactly 0 when active_days == dt_cnt.
    data_dt_cnt = data_dt_cnt.withColumn(
        "df_weight", 1 / F.log(F.lit(dt_cnt) / active_days + 1.0e-8)
    )
    # Squash the weight into (0, 1) with a logistic function.
    data_dt_cnt = data_dt_cnt.withColumn(
        "rev_df_weight", 1.0 / (1.0 + F.exp(-F.col("df_weight")))
    )

    #### Merge ####
    merge_data = (
        result_df.dropna(subset=["luna_id"])
        .select("luna_id", "cat1", "ecdf")
        .join(
            data_dt_cnt.select("luna_id", "cat1", "rev_df_weight"),
            on=["luna_id", "cat1"],
            how="left",
        )
        .withColumn(
            "score",
            F.col("ecdf") * ecdf_wei + F.col("rev_df_weight") * (1 - ecdf_wei),
        )
        .filter(F.col("score") >= thre)
    )

    # An orderBy() placed before groupBy() does not control the order inside
    # the collected list, so sort explicitly: collect (score, cat1) structs,
    # sort them descending, then keep only the cat1 field.
    ranked = merge_data.groupBy("luna_id").agg(
        F.sort_array(F.collect_list(F.struct("score", "cat1")), asc=False).alias("ranked")
    )
    return ranked.select(
        "luna_id",
        F.concat_ws(", ", F.col("ranked.cat1")).alias(col_names),
    )

def calculate_days(start_date, end_date):
    """Count the days in the inclusive range ``start_date``..``end_date``.

    Returns:
        ``(total_days, weekend_days, weekday_days)`` as plain ints, where a
        weekend day is a Saturday or Sunday.
    """
    days = pd.date_range(start=start_date, end=end_date)
    n_total = len(days)
    # dayofweek: Monday == 0 ... Sunday == 6, so >= 5 selects Sat/Sun.
    n_weekend = int((days.dayofweek >= 5).sum())
    n_weekday = int(n_total - n_weekend)
    return n_total, n_weekend, n_weekday

# 프로파일 : adot

In [11]:
# Look-back windows end on the day before DT:
#   DT_threshold1 -> first day of the 60-day window (59 days earlier)
#   DT_threshold2 -> first day of the 7-day window (6 days earlier)
DATE_FMT = "%Y-%m-%d"
current_date = datetime.strptime(DT, DATE_FMT) - timedelta(days=1)
DT_threshold1, DT_threshold2 = (
    (current_date - timedelta(days=days_back)).strftime(DATE_FMT)
    for days_back in (59, 6)
)

for label, value in (
    ("DT : ", DT),
    ("DT_threshold : ", DT_threshold1),
    ("DT_threshold : ", DT_threshold2),
):
    print(label, value)

DT :  2024-06-26
DT_threshold :  2024-04-27
DT_threshold :  2024-06-19


In [12]:
# Count all days, weekend days and weekdays between DT_threshold1 and
# current_date (inclusive); later cells pass these as the dt_cnt argument of
# adot_extract_profile.
total_days, weekend_days, weekday_days = calculate_days(DT_threshold1, current_date.strftime("%Y-%m-%d"))

In [13]:
total_days, weekend_days, weekday_days

(60, 18, 42)

In [14]:
# Inclusive last day of the look-back window (the day before DT). Without an
# upper bound a re-run or backfill for an older DT would also read every later
# partition, so the data would span more days than total_days / weekend_days /
# weekday_days assume.
DT_end = current_date.strftime("%Y-%m-%d")

# query_cat2 = f"""
#     select *
#     from adot_reco_dev.adot_cat2_cnt
#     where dt >= '{DT_threshold1}'
# """

# cat1-level usage counts for the 60-day window [DT_threshold1, DT_end].
query_cat1 = f"""
    select *
    from adot_reco_dev.adot_cat1_cnt
    where dt >= '{DT_threshold1}'
      and dt <= '{DT_end}'
"""

# query_item = f"""
#     select *
#     from adot_reco_dev.adot_item_cnt
#     where dt >= '{DT_threshold2}'
# """

In [15]:
# Run the cat1 query through the bq_to_df helper. Later cells call .show() and
# .withColumn() on the result, i.e. they treat it as a Spark DataFrame.
# The cat2 and item loads are currently disabled.
# data_cat2 = bq_to_df(query_cat2)
data_cat1 = bq_to_df(query_cat1)
# data_item = bq_to_df(query_item)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/27 17:14:04 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
24/06/27 17:14:27 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.
----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 32771)
Traceback (most recent call last):
  File "/usr/lib/python3.8/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.8/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.8/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, 

In [16]:
data_cat1.show(n=10)

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+--------------------+----+------------------+--------+--------+----------+----------+
|        svc_mgmt_num|             luna_id|cat2|              cat1|cat1_cnt|second_s|is_weekend|        dt|
+--------------------+--------------------+----+------------------+--------+--------+----------+----------+
|0f037d5a2f42ba8ab...|APL00000D1QENHBVQPS0|    |      apollo_radio|       1|   13_18|         1|2024-06-22|
|98107bdf0577d2ac8...|APL00000CXD2M7KGYI2O|    |      apollo_alarm|       1|   21_24|         1|2024-06-22|
|                null|APL00000DOEYDJI0LDKW|    |      apollo_radio|       1|   11_13|         1|2024-06-22|
|52cc2bdc6157c97cc...|APL00000BLHBPJQJ4RNK|    |      apollo_radio|       1|    9_11|         0|2024-06-12|
|5204e1b27da2f33a1...|APL00000D6M8ZFJVWLJ4|    |      apollo_radio|       1|     0_6|         0|2024-05-27|
|2fd8fe1dadaca3d99...|APL00000D5310BS3WGSG|    |      apollo_radio|       1|   13_18|         0|2024-06-18|
|8e7d69711b4196406...|APL000

                                                                                

In [17]:
data_cat1.select('second_s').distinct().show()



+--------+
|second_s|
+--------+
|   13_18|
|   21_24|
|   18_21|
|   11_13|
|    9_11|
|     0_6|
|     6_9|
+--------+



                                                                                

In [18]:
# Maps raw adot domain codes (the `apollo_*` values found in the cat1 column)
# to the display names that are put into the profile text.
# NOTE(review): despite the name this is a dict, and "sevice" is a typo for
# "service"; the name is kept because other cells reference it.
sevice_set = {'apollo_alarm':"알람", "apollo_anniversary":"생일", "apollo_calendar":"캘린더", "apollo_calendar2":"특일정보", "apollo_call":"전화", "apollo_campaign":"캠페인", "apollo_character":"캐릭터꾸미기",
              "apollo_chatgpt":"챗T", "apollo_congestion":"혼잡도", "apollo_cqa":"CQA", "apollo_dailycostume":"오늘뭐입지", "apollo_dictionary":"어학사전", "apollo_english":"튜터", "apollo_fortune":"운세",
              "apollo_friendsbitna":"프렌즈_길빛나", "apollo_friendsharu":"프렌즈_강하루", "apollo_friendsj":"프렌즈_육제이", "apollo_game":"게임", "apollo_glm":"감성대화", "apollo_glmtosqa":"GLMtoSQA", "apollo_keep":"keep",
              "apollo_liveqa":"LiveQA", "apollo_menu":"메뉴추천", "apollo_message":"문자", "apollo_mj":"맛집추천", "apollo_mno":"T서비스", "apollo_music":"음악", "apollo_mytv":"TV", "apollo_news":"뉴스", "apollo_photo":"포토",
              "apollo_podcast":"팟캐스트", "apollo_qfeed":"큐피드", "apollo_quest":"퀘스트", "apollo_radio":"라디오", "apollo_recipe":"레시피", "apollo_reward":"리워드", "apollo_ring":"링", "apollo_routine":"루틴",
              "apollo_samsungstock":"증권", "apollo_sleep":"sleep", "apollo_sports":"스포츠", "apollo_sqa":"SQA", "apollo_survey":"선호도조사", "apollo_time":"시간", "apollo_tmap":"TMAP", "apollo_tmembership":"T 멤버십",
              "apollo_tworld":"T 월드", "apollo_video":"비디오", "apollo_weather":"날씨", "apollo_wordchain":"끝말잇기", "apollo_friends":"프렌즈톡", "apollo_media":"미디어", "apollo_qa":"대표발화"}

In [19]:
def adot_domain_to_text(adot_cat1):
    """Return the display name for a raw adot domain code.

    Args:
        adot_cat1: Raw ``cat1`` value (e.g. ``"apollo_radio"``), or ``None``.

    Returns:
        The mapped name from ``sevice_set``, or ``"기타 서비스"`` when the code
        is not a key of the mapping (including ``None``).
    """
    # Direct dict lookup instead of rebuilding list(keys()) for every row.
    return sevice_set.get(adot_cat1, "기타 서비스")

# "string" is the DDL spelling of StringType(); using it avoids depending on
# StringType being in scope (it is not among the notebook's explicit imports).
adot_domain_to_text_udf = udf(adot_domain_to_text, "string")
# NOTE: this overwrites cat1 in place, so re-running the cell on the already
# mapped frame would turn every value into "기타 서비스".
data_cat1 = data_cat1.withColumn("cat1", adot_domain_to_text_udf(data_cat1["cat1"]))

## 전체, 주말, 주중(출근, 퇴근, 일과시간) 별 cat1 list 추출

In [20]:
# def adot_extract_profile(data, thre=0.6, ecdf_wei = 0.6, dt_cnt = 60, col_names='adot_cat1_total')
# total_days, weekend_days, weekday_days

In [21]:
# Base profile over the whole window; every user that appears here keeps a row.
total_data = adot_extract_profile(data_cat1, 0.5, 0.6, total_days, 'adot_cat1_total')

# One extra column per usage segment: (row filter, days in segment, output column).
is_weekday = col('is_weekend') == 0
segment_specs = [
    (col('is_weekend') == 1, weekend_days, 'adot_cat1_weekend'),
    (is_weekday & (col('second_s') == "6_9"), weekday_days, 'adot_cat1_go_to_work'),
    (is_weekday & col('second_s').isin("18_21", "21_24"), weekday_days, 'adot_cat1_leave_work'),
    (is_weekday & col('second_s').isin("9_11", "11_13", "13_18"), weekday_days, 'adot_cat1_working_hour'),
]

for segment_filter, segment_days, segment_col in segment_specs:
    segment_profile = adot_extract_profile(
        data_cat1.filter(segment_filter), 0.4, 0.6, segment_days, segment_col
    )
    # Left join: users without a qualifying cat1 in the segment get NULL.
    total_data = total_data.join(segment_profile, on="luna_id", how="left")

In [22]:
total_data.columns

['luna_id',
 'adot_cat1_total',
 'adot_cat1_weekend',
 'adot_cat1_go_to_work',
 'adot_cat1_leave_work',
 'adot_cat1_working_hour']

## Profile text 화

In [2]:
# Load every row of the profile template table; a later cell filters it by
# source_domain with a boolean mask and reads its `template` column.
# NOTE(review): this cell carries execution count In [2] although it sits after
# In [22] - confirm the notebook still works with Restart & Run All.
profile_template = bq_to_pandas("select * from adot_reco_dev.profile_template")

unsupported operand type(s) for /: 'NoneType' and 'int'




Downloading: 100%|[32m██████████[0m|


In [23]:
# First template whose source_domain is "adot"; raises IndexError if none exists.
template = list(profile_template[profile_template['source_domain']=="adot"].template)[0]

def profile_text(adot_cat1):
    """Fill the adot template's ``{cat1_profile}`` placeholder with ``adot_cat1``."""
    return template.format(cat1_profile=adot_cat1)

# "string" is the DDL spelling of StringType(); using it avoids depending on
# StringType being in scope (it is not among the notebook's explicit imports).
profile_text_udf = udf(profile_text, "string")
total_data = total_data.withColumn("adot_profile", profile_text_udf(total_data["adot_cat1_total"]))

In [25]:
total_data.columns

['luna_id',
 'adot_cat1_total',
 'adot_cat1_weekend',
 'adot_cat1_go_to_work',
 'adot_cat1_leave_work',
 'adot_cat1_working_hour',
 'adot_profile']

[Stage 4:=> (21 + 8) / 52][Stage 5:>   (0 + 0) / 52][Stage 6:>   (0 + 0) / 52]

In [4]:
profile_template

Unnamed: 0,template,source_domain,target_domain,dt,property
0,저는 에이닷 서비스 중 주로 {cat1_profile} 서비스를 사용했어요,adot,adot,2024-06-27,
1,저는 주로 {cat1_profile} 카테고리와 관련된 앱을 사용했어요,xtr,adot,2024-06-27,
2,저는 이동 시 주로 {cat1_profile} 관련 장소 중 {item_profil...,tmap,adot,2024-06-27,
3,평소 {longterm_profile} 관련 용품 구매에 관심,tdeal,adot,2024-06-27,long
4,최근 {shortterm_profile} 관련 용품 구매,tdeal,adot,2024-06-27,short
5,특징: {state_profile},tdeal,adot,2024-06-27,state


### AIDP 저장

In [26]:
# Destination dataset and table for the generated profile text.
# NOTE(review): the dataset id looks like a personal sandbox - confirm the
# intended target. The CREATE TABLE statement in this notebook declares no
# partitioning despite the variable name.
dest_dataset = "x1113099"
partitioned_dest_table = "user_retrieval_profile_adot_text"

In [27]:
# Create the destination table if it is missing. The column names must match
# the DataFrame that is written afterwards; the last cat1 column is
# `adot_cat1_working_hour` (singular), as produced when total_data is built.
# NOTE: IF NOT EXISTS leaves an already existing table (and its schema) as is.
get_bigquery_client().query(f"""
    CREATE TABLE IF NOT EXISTS {dest_dataset}.{partitioned_dest_table}
    (
        luna_id STRING,
        adot_cat1_total STRING,
        adot_cat1_weekend STRING,
        adot_cat1_go_to_work STRING,
        adot_cat1_leave_work STRING,
        adot_cat1_working_hour STRING,
        adot_profile STRING
    )
""").result()

print(f"생성된 테이블 : {dest_dataset}.{partitioned_dest_table}")

[Stage 4:=> (22 + 8) / 52][Stage 5:>   (0 + 0) / 52][Stage 6:>   (0 + 0) / 52]

생성된 테이블 : x1113099.user_retrieval_profile_adot_text


In [28]:
# Write total_data to the destination table, replacing its current contents.
# NOTE(review): confirm whether mode="overwrite" also replaces the table schema.
df_to_bq_table(df=total_data,
               dataset=dest_dataset,
               table_name=partitioned_dest_table,
               mode="overwrite")

                                                                                

### ECDF 계산

In [8]:
# 그룹별로 데이터 정렬
windowSpec = Window.partitionBy("cat1").orderBy("cat1_cnt_sum").rowsBetween(Window.unboundedPreceding, 0)


data_cat1_sum = data_cat1.groupBy(["svc_mgmt_num","luna_id","cat1"]).agg(sum('cat1_cnt').alias('cat1_cnt_sum'))

# 그룹별로 값의 개수 세기
count_by_group = data_cat1_sum.groupBy("cat1").agg(count("*").alias("count"))

# 원래 데이터프레임에 그룹별 카운트를 조인
df_with_count = data_cat1_sum.join(count_by_group, on="cat1", how="left")

# 누적합 계산
df_with_cumsum = df_with_count.withColumn("cum_sum", spark_sum(lit(1)).over(windowSpec))

# ECDF 계산
df_with_ecdf = df_with_cumsum.withColumn("ecdf", col("cum_sum") / col("count"))

# 필요한 열 선택
result_df = df_with_ecdf.select('svc_mgmt_num','luna_id',"cat1", "cat1_cnt_sum", "ecdf")

In [9]:
# result_df.show(n=10)

In [10]:
# result_df.filter(col('cat1')=="apollo_calendar").orderBy(desc('ecdf')).show(n=10)

### IDF 기반 weight 계산

In [11]:
# Distinct active days per (luna_id, cat1).
data_cat1_dt_cnt = (
    data_cat1.select("luna_id", "cat1", "dt")
    .distinct()
    .groupBy("luna_id", "cat1")
    .agg(F.count("*").alias("luna_cat1_cnt"))
)
# Inverse-log weight against a fixed 60-day window; the 1e-8 keeps log() away
# from exactly 0 when the user was active on all 60 days.
data_cat1_dt_cnt = data_cat1_dt_cnt.withColumn(
    "df_weight",
    1 / F.log(F.lit(60.0) / F.col("luna_cat1_cnt") + 1.0e-8),
)

In [12]:
# data_cat1_dt_cnt.show(n=10)

In [13]:
# Squash df_weight into (0, 1) with a logistic function, written with native
# column expressions rather than a Python UDF.
neg_weight = -F.col("df_weight")
data_cat1_dt_cnt = data_cat1_dt_cnt.withColumn(
    "rev_df_weight", 1.0 / (1.0 + F.exp(neg_weight))
)

In [14]:
# data_cat1_dt_cnt.show(n=10)

In [15]:
# Join the ECDF and the day-frequency weight per (luna_id, cat1).
merge_data = (
    result_df.dropna(subset=["luna_id"])
    .select("luna_id", "cat1", "ecdf")
    .join(
        data_cat1_dt_cnt.select("luna_id", "cat1", "rev_df_weight"),
        on=["luna_id", "cat1"],
        how="left",
    )
)
# Weighted score: 60% ECDF, 40% day-frequency weight.
merge_data = merge_data.withColumn('score', col('ecdf')*0.6 + col('rev_df_weight')*0.4)

# An orderBy() placed before groupBy() does not control the order inside
# collect_list, so sort explicitly: collect (score, cat1) structs, sort them
# descending, then keep only the cat1 field.
merge_data1 = (
    merge_data.filter(col("score") >= 0.6)
    .groupBy("luna_id")
    .agg(F.sort_array(F.collect_list(F.struct("score", "cat1")), asc=False).alias("ranked"))
    .select("luna_id", F.concat_ws(",", F.col("ranked.cat1")).alias("cat1_list"))
)

In [16]:
merge_data1_pd = merge_data1.limit(100).toPandas()

                                                                                

In [18]:
merge_data1_pd.loc[0,'cat1_list']

'apollo_news,apollo_call,apollo_music,apollo_routine,apollo_game'

In [19]:
merge_data1_pd.loc[1,'cat1_list']

'apollo_congestion'

In [20]:
merge_data1_pd.head(20)

Unnamed: 0,luna_id,cat1_list
0,APL00000BJ19IFQXK54W,"apollo_news,apollo_call,apollo_music,apollo_ro..."
1,APL00000BJ1GGZ9T2MM8,apollo_congestion
2,APL00000BJ3M81SBZEO0,"apollo_music,apollo_congestion"
3,APL00000BJ3T4RRPWT1C,"apollo_weather,apollo_mno,apollo_news,apollo_r..."
4,APL00000BJ450BY2BSOW,apollo_mno
5,APL00000BJ7880KK6QKG,apollo_mytv
6,APL00000BJB0ZL9A77K0,apollo_music
7,APL00000BJBSY57E9S00,apollo_mno
8,APL00000BJCAJNIZHBLS,"apollo_fortune,apollo_routine,apollo_congestio..."
9,APL00000BJEGDEVI6OLC,apollo_congestion
