In [35]:
# # # Parameters
# Run date as a "YYYY-MM-DD" string; the date-window cell parses it with "%Y-%m-%d".
# NOTE(review): the "Parameters" header suggests this value is injected by a notebook runner — confirm.
DT = "2024-06-26"   

## **One Model Profile 생성 작업**

- 개요
    - One Model Profile 생성 작업
- 내용
    - item, cat1, cat2 level로 프로파일링 작업 수행
    - 단기 preference 정보
        - 7일 내 item 정보를 통해서 생성
        - 이때, item은 정규화 로직을 통해 추출된 단어를 기반으로 최종 단어들을 추출
    - 장기 preference 정보
        - 60일 내 cat1, cat2 정보를 통해서 생성
        - cat2가 더 상위 레벨임. 어떤 레벨로 장기 구성할지 선택 필요

In [36]:
from skt.gcp import get_bigquery_client, bq_insert_overwrite, get_max_part, bq_to_df, bq_to_pandas, pandas_to_bq_table, load_query_result_to_table, df_to_bq_table
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col, lit, count, log, exp, sum as spark_sum
from pyspark.sql import functions as F
from datetime import datetime, date, timedelta
from skt.ye import get_spark
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import math
from pyspark.sql.types import DoubleType

In [37]:
def xdr_extract_profile(data, thre=0.6, ecdf_wei = 0.6, dt_cnt = 60, col_names1='xdr_cat1_total', col_names2='xdr_cat2_total', top_n=10):
    """Build, per luna_id, comma-joined lists of the preferred cat1 / cat2 labels.

    Every (luna_id, cat1) pair is scored as
    ``score = ecdf * ecdf_wei + rev_df_weight * (1 - ecdf_wei)`` where

    * ``ecdf`` is the running row position of the pair's summed ``cat1_cnt``
      among all (svc_mgmt_num, luna_id) rows of the same cat1, divided by the
      number of rows of that cat1;
    * ``rev_df_weight`` is ``sigmoid(1 / log(dt_cnt / active_days + 1e-8))``,
      ``active_days`` being the number of distinct ``dt`` values of the pair.

    For each luna_id the ``top_n`` best-scoring cat1 values are kept, those
    with ``score >= thre`` survive, and the distinct cat1 values and their
    cat2 parents are concatenated with ",".

    Args:
        data: Spark DataFrame with the columns svc_mgmt_num, luna_id, cat1,
            cat2, cat1_cnt and dt.
        thre: Minimum score a cat1 needs to be listed.
        ecdf_wei: Weight of the ECDF term; the day-frequency term gets
            ``1 - ecdf_wei``.
        dt_cnt: Number of days covered by ``data`` (denominator of the
            day-frequency weight).
        col_names1: Name of the output column holding the cat1 list.
        col_names2: Name of the output column holding the cat2 list.
        top_n: Maximum number of cat1 values considered per luna_id.

    Returns:
        Spark DataFrame with the columns luna_id, ``col_names1`` and
        ``col_names2``. The order of the labels inside each list is not
        defined (they come from ``collect_set``).
    """
    #################
    ### ECDF      ###
    #################
    # spark_sum is used explicitly so the aggregation does not depend on the
    # star import shadowing Python's builtin ``sum``.
    usage = data.groupBy("svc_mgmt_num", "luna_id", "cat1").agg(spark_sum("cat1_cnt").alias("cat1_cnt_sum"))

    # Number of (svc_mgmt_num, luna_id) rows per cat1 = ECDF denominator.
    rows_per_cat1 = usage.groupBy("cat1").agg(F.count("*").alias("count"))

    # NOTE(review): rows that tie on cat1_cnt_sum receive consecutive running
    # counts in whatever order Spark sorts them, so their ECDF values differ
    # and are not reproducible between runs; F.cume_dist() would give tied
    # rows one shared value. Left unchanged because it shifts every score.
    running = (
        Window.partitionBy("cat1")
        .orderBy("cat1_cnt_sum")
        .rowsBetween(Window.unboundedPreceding, Window.currentRow)
    )
    ecdf_df = (
        usage.join(rows_per_cat1, on="cat1", how="left")
        .withColumn("cum_sum", spark_sum(F.lit(1)).over(running))
        .withColumn("ecdf", F.col("cum_sum") / F.col("count"))
        .select("svc_mgmt_num", "luna_id", "cat1", "cat1_cnt_sum", "ecdf")
        # One row per (luna_id, cat1); which svc_mgmt_num row survives is
        # whatever dropDuplicates keeps.
        .dropDuplicates(["luna_id", "cat1"])
    )

    ############################
    ### Day-frequency weight ###
    ############################
    active_days = (
        data.select("luna_id", "cat1", "dt")
        .distinct()
        .groupBy("luna_id", "cat1")
        .agg(F.count("*").alias("luna_cat1_cnt"))
    )
    # Clamp the day count to dt_cnt: if ``data`` spans more days than dt_cnt,
    # dt_cnt / days drops below 1, the log turns negative and the most active
    # users would end up with the *lowest* weight.
    capped_days = F.least(F.col("luna_cat1_cnt"), F.lit(dt_cnt))
    active_days = (
        active_days
        .withColumn("df_weight", 1 / F.log(F.lit(dt_cnt) / capped_days + 1.0e-8))
        # Logistic squashing into (0, 1).
        .withColumn("rev_df_weight", 1.0 / (1.0 + F.exp(-F.col("df_weight"))))
    )

    #### Merge ####
    scored = (
        ecdf_df.dropna(subset=["luna_id"])
        .select("luna_id", "cat1", "ecdf")
        .join(active_days.select("luna_id", "cat1", "rev_df_weight"), on=["luna_id", "cat1"], how="left")
        .withColumn("score", F.col("ecdf") * ecdf_wei + F.col("rev_df_weight") * (1 - ecdf_wei))
    )

    # Rank while there is exactly one row per (luna_id, cat1). Ranking after
    # the cat2 join would let a cat1 that has several cat2 parents occupy
    # several of the top_n slots and push other categories out.
    by_score = Window.partitionBy("luna_id").orderBy(F.desc("score"))
    top_scored = (
        scored.withColumn("rank", F.row_number().over(by_score))
        .filter(F.col("rank") <= top_n)
        .drop("rank")
    )

    # cat1 -> cat2 lookup taken from the function's own input, not from a
    # notebook-level variable.
    cat1_to_cat2 = data.select("cat1", "cat2").distinct()
    top_scored = top_scored.join(cat1_to_cat2, on="cat1", how="left")

    # No sort before the aggregation: collect_set does not preserve input order.
    profile = (
        top_scored.filter(F.col("score") >= thre)
        .groupBy("luna_id")
        .agg(
            F.concat_ws(",", F.collect_set("cat1")).alias("cat1_list"),
            F.concat_ws(",", F.collect_set("cat2")).alias("cat2_list"),
        )
    )
    profile = profile.withColumnRenamed("cat1_list", col_names1)
    profile = profile.withColumnRenamed("cat2_list", col_names2)

    return profile

def calculate_days(start_date, end_date):
    """Count the days of an inclusive date range, split into weekend and weekday.

    Args:
        start_date: First day of the range (anything ``pd.date_range`` accepts).
        end_date: Last day of the range, inclusive.

    Returns:
        Tuple ``(total_days, weekend_days, weekday_days)`` of plain ints, where
        weekend days are those whose weekday index is 5 or 6 (Saturday/Sunday).
    """
    days = pd.date_range(start=start_date, end=end_date)
    n_total = len(days)
    # DatetimeIndex.weekday numbers Monday as 0, so 5 and 6 are the weekend.
    n_weekend = int((days.weekday >= 5).sum())
    return n_total, n_weekend, n_total - n_weekend

# 프로파일 : xdr

In [38]:
# DT = "2024-06-02"   (manual override from an earlier run, kept disabled)
# Anchor date of the analysis: two days before the run date DT.
current_date = datetime.strptime(DT, "%Y-%m-%d") - timedelta(days=2)
# First day of the 30-day window that ends on current_date (29 days back, inclusive).
# NOTE(review): the overview at the top of the notebook describes a 60-day long-term
# window, while this builds a 30-day one — confirm which is intended.
DT_threshold1 = (current_date - timedelta(days=29)).strftime("%Y-%m-%d")
# First day of the 7-day window that ends on current_date (6 days back, inclusive).
DT_threshold2 = (current_date - timedelta(days=6)).strftime("%Y-%m-%d")

print("DT : ", DT)
# Both lines print the same "DT_threshold" label: the first is DT_threshold1, the second DT_threshold2.
print("DT_threshold : ", DT_threshold1)
print("DT_threshold : ", DT_threshold2)

DT :  2024-06-26
DT_threshold :  2024-05-26
DT_threshold :  2024-06-18


In [39]:
# Day counts of the window [DT_threshold1, current_date]; they are passed as dt_cnt to xdr_extract_profile.
total_days, weekend_days, weekday_days = calculate_days(DT_threshold1, current_date.strftime("%Y-%m-%d"))

In [40]:
# Display the (total, weekend, weekday) day counts of the window.
total_days, weekend_days, weekday_days

(30, 9, 21)

In [41]:
# query_cat2 = f"""
#     select *
#     from adot_reco_dev.xdr_cat2_cnt
#     where dt >= '{DT_threshold1}'
# """

# Last day of the window. total_days / weekend_days / weekday_days are counted
# for [DT_threshold1, current_date], so the scan must stop at current_date too:
# any later partition would be included in the per-user active-day counts and
# push them above the dt_cnt denominator used by xdr_extract_profile.
DT_end = current_date.strftime("%Y-%m-%d")

query_cat1 = f"""
    select *
    from adot_reco_dev.xdr_cat1_cnt
    where dt >= '{DT_threshold1}'
      and dt <= '{DT_end}'
"""

In [42]:
# data_cat2 = bq_to_df(query_cat2)
# Run query_cat1 through skt.gcp.bq_to_df; later cells call Spark DataFrame methods
# (withColumn, filter, show) on the result.
data_cat1 = bq_to_df(query_cat1)

24/06/28 09:18:33 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.


In [43]:
# Raw cat2 code -> Korean display label. Codes missing from this map are rendered
# as "기타" by xdr_cat2_to_text.
cat2_set = {'information':'정보', 'life':'생활', 'location':'위치/이동', 'game':'게임', 'shopping':'쇼핑',
            'communication':'커뮤니케이션', 'finance':'금융', 'utility':'유틸리티', 'entertainment':'엔터테인먼트',
            'education':'교육', 'business':'회사/사업', 'leisure':'스포츠/레져/여행'}

In [44]:
# Raw cat1 code -> Korean display label. Several "*_etc" codes (and 'vr/ar') map to
# themselves or to an untranslated label. The key 'leisure' also exists in cat2_set
# with a different label.
cat1_set = {'references':'지식/위키', 'fashion_beauty':'스타일/패션/뷰티', 'art_design':'디자인', 'location_etc':'location_etc',
           'action':'액션게임', 'openmarket':'온라인 쇼핑몰', 'voip':'화상 채팅/회의', 'stock':'주식투자', 'blog':'블로그', 'casual':'캐주얼 게임',
           'bank_card':'은행', 'webservice':'웹서비스', 'information_etc':'기타 정보 커뮤니티', 'payment':'페이앱',
            'entertainment_etc':'entertainment_etc', 'cartoon':'만화', 'sns':'소셜미디어', 'religion':'종교',
           'carlife':'자동차/중고차/렌트', 'government':'정부운영', 'job':'구직', 'weather':'날씨/미세먼지', 'sktservice':'SKT서비스',
           'strategy':'전략게임', 'sharing':'공유 전기 자전거/킥보드', 'operation/management':'출퇴근 관리', 'itshopping':'IT/ 전자기기 쇼핑몰',
           'puzzle_quiz':'퍼즐/퀴즈 게임', 'messenger':'메신저', 'travel':'여행', 'mall_mart_dfs':'대형 쇼핑몰 앱',
           'b2b_solution':'전자상거래 비즈니스/마케팅 제휴', 'life_etc':'life_etc', 'lifestyle':'라이프스타일', 'simulation':'시뮬레이션 게임', 'food':'맛집/음식점, 배달',
           'hotel':'호텔/숙박', 'rhythmgame':'리듬게임', 'move/interior':'이사 및 인테리어', 'education_info':'학생/아이 케어 및 관리',
           'delivery_cargo':'물류/택배', 'productivity_tools':'공유 협업', 'mnd':'군인', 'ride_request':'공유차(택시앱)',
           'vr/ar':'vr/ar', 'leisure':'취미 활동', 'dating':'데이팅', 'assetmanagement':'자산 관리/투자', 'racing':'레이싱게임',
           'communication_etc':'communication_etc', 'telco':'통신사', 'cloudgaming':'클라우드 게임', 'portal':'포탈앱',
           'business_etc':'회사 관련', 'email':'메일', 'aiservice':'AI서비스', 'overseas direct purchase':'해외직구',
           'used':'중고거래', 'movie':'영화관', 'news':'온라인 뉴스', 'coupons_discounts':'쿠폰/포인트앱', 'video_broadcasting':'라이브 방송 플랫폼',
           'sportsgame':'스포츠게임', 'book':'온라인 서점/e 북', 'socialcommerce':'소셜커머스', 'culture and arts':'문화예술',
           'board':'보드게임', 'transport':'운송(공유차/비행기/버스)', 'smartdevice':'홈 IOT', 'map_navigation':'지도/네비', 'game_etc':'game_etc',
           'homeshopping':'홈쇼핑', 'p2p_webhard':'P2P_Webhard', 'photo':'사진', 'shopping_etc':'Shopping_etc', 'security':'보안',
           'pcs':'PC 쇼핑몰', 'english':'영어교육', 'fresh_delivery':'신선 식품/식재료 전문 스토어', 'finance_etc':'finance_etc', 'utility_etc':'utility_etc',
           'radio':'라디오', 'music':'음악', 'education_etc':'education_etc', 'insurance':'보험', 'roleplaying':'롤플레잉게임',
           'kids':'유아/아기', 'school':'대학교 홈페이지', 'onlinepayment':'온라인 결제', 'health':'헬스/비대면진료', 'svod':'영상 OTT', 'card':'카드게임',
           'companion animal':'동물 캠페인', 'realestates':'부동산', 'cryptocurrency':'암호화폐', 'sports':'스포츠',
           'adult':'성인관련', 'humor':'유머/코믹'}

# 'community' is not a key of cat1_set: it occurs under two cat2 parents,
# business ('커뮤니티/게시판') and communication ('대학교 관련 정보').
# xdr_cat1_to_text has a dedicated branch for it.

In [45]:
def xdr_cat2_to_text(xdr_cat2):
    """Return the Korean label of a raw cat2 code, or "기타" when it is not in cat2_set."""
    return cat2_set.get(xdr_cat2, "기타")

# Wrap the lookup as a string-returning Spark UDF and overwrite the "cat2" column in place.
# NOTE: this cell is not idempotent — on a second run the column already holds Korean
# labels, which are not keys of cat2_set, so every row would become "기타".
xdr_cat2_to_text_udf = udf(xdr_cat2_to_text, StringType())
data_cat1 = data_cat1.withColumn("cat2", xdr_cat2_to_text_udf(data_cat1["cat2"]))

In [46]:
def xdr_cat1_to_text(xdr_cat1, xdr_cat2):
    """Return the Korean label of a raw cat1 code.

    Args:
        xdr_cat1: Raw cat1 code.
        xdr_cat2: cat2 value of the same row, already converted to its Korean
            label (it is compared against "회사/사업").

    Returns:
        The label from cat1_set, the cat2-dependent label for 'community',
        or "기타" for any other code that is not in cat1_set.
    """
    # 'community' is not a key of cat1_set, so it must be resolved before the
    # dictionary lookup; checking membership first would send every community
    # row to "기타" and make this branch unreachable.
    if xdr_cat1 == "community":
        if xdr_cat2 == "회사/사업":
            return '커뮤니티/게시판'
        # Any other parent is treated as '커뮤니케이션'.
        return '대학교 관련 정보'
    return cat1_set.get(xdr_cat1, "기타")

# Wrap the lookup as a string-returning Spark UDF and overwrite the "cat1" column in place.
# The "cat2" column passed here must already contain Korean labels, because
# xdr_cat1_to_text compares it with "회사/사업"; run the cat2 conversion cell first.
# NOTE: not idempotent — a second run would look up already-translated labels.
xdr_cat1_to_text_udf = udf(xdr_cat1_to_text, StringType())
data_cat1 = data_cat1.withColumn("cat1", xdr_cat1_to_text_udf(data_cat1["cat1"], data_cat1["cat2"]))

In [47]:
# data_cat2.show(n=10)

In [35]:
# Preview ten rows of the input.
# NOTE(review): the saved output shows svc_mgmt_num / luna_id values — clear it before sharing if these are sensitive.
data_cat1.show(n=10)

+--------------------+--------------------+----+----+--------+--------+----------+----------+
|        svc_mgmt_num|             luna_id|cat2|cat1|cat1_cnt|second_s|is_weekend|        dt|
+--------------------+--------------------+----+----+--------+--------+----------+----------+
|dfc3313a5d6a4a9ae...|                null|life| job|       4|   13_18|         1|2024-05-26|
|596545417fca4b9c4...|                null|life| job|       2|    9_11|         1|2024-05-26|
|877b2b77661bb3879...|                null|life| job|       2|   21_24|         1|2024-05-26|
|796fde72c83867aab...|APL00000CW0IO667EO00|life| job|       2|   18_21|         1|2024-05-26|
|815445a95ccfba272...|                null|life| job|       4|   18_21|         1|2024-05-26|
|007b090b9ebc5c6e0...|APL00000DHUYR674EEBK|life| job|       2|   21_24|         1|2024-05-26|
|25bc24900d5b213a1...|                null|life| job|       2|   18_21|         1|2024-05-26|
|f4800c58a08184255...|                null|life| job|       

## 전체, 주말, 주중(출근, 퇴근, 일과시간) 별 cat1 list 추출

In [36]:
# def adot_extract_profile(data, thre=0.6, ecdf_wei = 0.6, dt_cnt = 60, col_names='adot_cat1_total')
# total_days, weekend_days, weekday_days

In [48]:
# Whole-window profile first; every time-segment profile is then left-joined onto it,
# so only users present in the whole-window profile appear in the result.
total_data = xdr_extract_profile(data_cat1, 0.6, 0.6, total_days , 'xdr_cat1_total', 'xdr_cat2_total')

_is_weekday = col('is_weekend')==0
# (column suffix, row filter, number of days in that segment)
_segment_specs = [
    ('weekend', col('is_weekend')==1, weekend_days),
    ('go_to_work', _is_weekday & (col('second_s')=="6_9"), weekday_days),
    ('leave_work', _is_weekday & ((col('second_s')=="18_21") | (col('second_s')=="21_24")), weekday_days),
    ('working_hour', _is_weekday & ((col('second_s')=="9_11") | (col('second_s')=="11_13") | (col('second_s')=="13_18")), weekday_days),
]

for _suffix, _row_filter, _n_days in _segment_specs:
    _segment_profile = xdr_extract_profile(
        data_cat1.filter(_row_filter), 0.5, 0.6, _n_days,
        f'xdr_cat1_{_suffix}', f'xdr_cat2_{_suffix}',
    )
    total_data = total_data.join(_segment_profile, on="luna_id", how="left")

In [49]:
# List the profile columns produced by the joins above.
total_data.columns

['luna_id',
 'xdr_cat1_total',
 'xdr_cat2_total',
 'xdr_cat1_weekend',
 'xdr_cat2_weekend',
 'xdr_cat1_go_to_work',
 'xdr_cat2_go_to_work',
 'xdr_cat1_leave_work',
 'xdr_cat2_leave_work',
 'xdr_cat1_working_hour',
 'xdr_cat2_working_hour']

In [31]:
# total_data.show(n=10)

## Profile text 화

In [50]:
# Load the profile templates through skt.gcp.bq_to_pandas; the next cell reads the
# columns `source_domain` and `template` from the result.
profile_template = bq_to_pandas("select * from adot_reco_dev.profile_template")

unsupported operand type(s) for /: 'NoneType' and 'int'




Downloading: 100%|[32m██████████[0m|


In [51]:
# Take the first template whose source_domain is "xtr"; raises IndexError if there is none.
# NOTE(review): every other identifier in this notebook says "xdr" — confirm that "xtr"
# is the value actually stored in profile_template.source_domain.
template = list(profile_template[profile_template['source_domain']=="xtr"].template)[0]

def profile_text(xdr_cat1):
    """Fill the `cat1_profile` placeholder of the module-level `template` with `xdr_cat1`."""
    return template.format(cat1_profile=xdr_cat1)

# Apply the template to the whole-window cat1 list and store it as "xdr_profile".
profile_text_udf = udf(profile_text, StringType())
total_data = total_data.withColumn("xdr_profile", profile_text_udf(total_data["xdr_cat1_total"]))

In [52]:
# Confirm that "xdr_profile" was appended.
total_data.columns

['luna_id',
 'xdr_cat1_total',
 'xdr_cat2_total',
 'xdr_cat1_weekend',
 'xdr_cat2_weekend',
 'xdr_cat1_go_to_work',
 'xdr_cat2_go_to_work',
 'xdr_cat1_leave_work',
 'xdr_cat2_leave_work',
 'xdr_cat1_working_hour',
 'xdr_cat2_working_hour',
 'xdr_profile']

### AIDP 저장

In [53]:
# Destination of the profile table.
# NOTE(review): the dataset name is hard-coded here although the inputs are read from
# adot_reco_dev; the table variable is called "partitioned" but the DDL below declares
# no partitioning — confirm both.
dest_dataset = "x1113099"
partitioned_dest_table = "user_retrieval_profile_xdr_text"

In [54]:
# Create the destination table when it does not exist yet. The column names must match
# total_data.columns exactly: the working-hour columns are 'xdr_cat1_working_hour' /
# 'xdr_cat2_working_hour' (singular), as produced when total_data is assembled.
get_bigquery_client().query(f"""
    CREATE TABLE IF NOT EXISTS {dest_dataset}.{partitioned_dest_table}
    (
        luna_id STRING,
        xdr_cat1_total STRING,
        xdr_cat2_total STRING,
        xdr_cat1_weekend STRING,
        xdr_cat2_weekend STRING,
        xdr_cat1_go_to_work STRING,
        xdr_cat2_go_to_work STRING,
        xdr_cat1_leave_work STRING,
        xdr_cat2_leave_work STRING,
        xdr_cat1_working_hour STRING,
        xdr_cat2_working_hour STRING,
        xdr_profile STRING
    )
""").result()

print(f"생성된 테이블 : {dest_dataset}.{partitioned_dest_table}")

생성된 테이블 : x1113099.user_retrieval_profile_xdr_text


In [55]:
# Write the profile DataFrame to the destination table via skt.gcp.df_to_bq_table
# in "overwrite" mode.
df_to_bq_table(df=total_data,
               dataset=dest_dataset,
               table_name=partitioned_dest_table,
               mode="overwrite")

24/06/28 11:28:16 WARN DAGScheduler: Broadcasting large task binary with size 1642.9 KiB
                                                                                

### ECDF 계산

In [8]:
# Scratch copy of the ECDF step of xdr_extract_profile, run directly on data_cat1
# (without the final dropDuplicates).
# Running-row window per cat1, ordered by the summed count.
windowSpec = Window.partitionBy("cat1").orderBy("cat1_cnt_sum").rowsBetween(Window.unboundedPreceding, 0)


# `sum` here is the Spark aggregate brought in by `from pyspark.sql.functions import *`.
data_cat1_sum = data_cat1.groupBy(["svc_mgmt_num","luna_id","cat1"]).agg(sum('cat1_cnt').alias('cat1_cnt_sum'))

# Number of rows per cat1.
count_by_group = data_cat1_sum.groupBy("cat1").agg(count("*").alias("count"))

# Attach the per-cat1 row count to every row.
df_with_count = data_cat1_sum.join(count_by_group, on="cat1", how="left")

# Running row count inside the window (sum of 1s).
df_with_cumsum = df_with_count.withColumn("cum_sum", spark_sum(lit(1)).over(windowSpec))

# ECDF = running row count / number of rows of that cat1.
df_with_ecdf = df_with_cumsum.withColumn("ecdf", col("cum_sum") / col("count"))

# Keep only the columns used afterwards.
result_df = df_with_ecdf.select('svc_mgmt_num','luna_id',"cat1", "cat1_cnt_sum", "ecdf")

In [9]:
# result_df.show(n=10)

In [10]:
# result_df.filter(col('cat1')=="apollo_calendar").orderBy(desc('ecdf')).show(n=10)

### IDF 기반 weight 계산

In [11]:
# Distinct active days per (luna_id, cat1), then df_weight = 1 / log(60 / active_days + 1e-8).
# NOTE: the denominator is hard-coded to 60.0 here, whereas xdr_extract_profile takes it
# from its dt_cnt argument.
data_cat1_dt_cnt = data_cat1.select(['luna_id','cat1','dt']).distinct().groupby(['luna_id','cat1']).agg(count("*").alias("luna_cat1_cnt"))
data_cat1_dt_cnt = data_cat1_dt_cnt.withColumn("df_weight", 1/log(lit(60.0) / col("luna_cat1_cnt") + 1.0e-8))

In [12]:
# data_cat1_dt_cnt.show(n=10)

In [13]:
# Disabled Python-UDF version of the logistic function:
# def logistic_function(x, k=1):
#     return 1 / (1 + math.exp(-k * x))

# # Register as a UDF
# logistic_function_udf = udf(lambda x: logistic_function(x), DoubleType())

# Apply the logistic function as a new column, using Spark column expressions
# instead of the UDF above.
# data_cat1_dt_cnt = data_cat1_dt_cnt.withColumn("rev_df_weight", logistic_function_udf(col("df_weight")))
data_cat1_dt_cnt = data_cat1_dt_cnt.withColumn("rev_df_weight", 1.0/(1.0 + exp(-col("df_weight"))))

In [14]:
# data_cat1_dt_cnt.show(n=10)

In [15]:
# Join ECDF and day-frequency weight per (luna_id, cat1); rows without luna_id are dropped.
merge_data = result_df.dropna(subset="luna_id").select(['luna_id','cat1','ecdf']).join(data_cat1_dt_cnt.select(['luna_id', 'cat1','rev_df_weight']),on=['luna_id','cat1'],how="left")
# Fixed 0.6 / 0.4 blend (the function version exposes this as ecdf_wei).
merge_data = merge_data.withColumn('score', col('ecdf')*0.6 + col('rev_df_weight')*0.4)
# Keep scores >= 0.6 and collect cat1 per luna_id. Unlike xdr_extract_profile this uses
# collect_list and applies no top-10 cut.
merge_data1 = merge_data.filter(col("score") >= 0.6).orderBy(desc("score")).groupBy("luna_id").agg(concat_ws(",",collect_list("cat1")).alias("cat1_list"))

In [16]:
# Pull at most 100 rows to the driver as a pandas DataFrame for inspection.
merge_data1_pd = merge_data1.limit(100).toPandas()

                                                                                

In [18]:
# Inspect the cat1 list of the first sampled user.
# NOTE(review): the saved output lists apollo_* values, which are not labels of cat1_set —
# the output looks like it came from a different input; confirm and re-run.
merge_data1_pd.loc[0,'cat1_list']

'apollo_news,apollo_call,apollo_music,apollo_routine,apollo_game'

In [19]:
# Inspect the cat1 list of the second sampled user.
merge_data1_pd.loc[1,'cat1_list']

'apollo_congestion'

In [20]:
# Show the first 20 sampled users with their cat1 lists.
merge_data1_pd.head(20)

Unnamed: 0,luna_id,cat1_list
0,APL00000BJ19IFQXK54W,"apollo_news,apollo_call,apollo_music,apollo_ro..."
1,APL00000BJ1GGZ9T2MM8,apollo_congestion
2,APL00000BJ3M81SBZEO0,"apollo_music,apollo_congestion"
3,APL00000BJ3T4RRPWT1C,"apollo_weather,apollo_mno,apollo_news,apollo_r..."
4,APL00000BJ450BY2BSOW,apollo_mno
5,APL00000BJ7880KK6QKG,apollo_mytv
6,APL00000BJB0ZL9A77K0,apollo_music
7,APL00000BJBSY57E9S00,apollo_mno
8,APL00000BJCAJNIZHBLS,"apollo_fortune,apollo_routine,apollo_congestio..."
9,APL00000BJEGDEVI6OLC,apollo_congestion
