In [None]:
from skt.gcp import (
    PROJECT_ID,
    _print_query_job_results,
    bq_insert_overwrite,
    bq_table_exists,
    bq_to_df,
    bq_to_pandas,
    df_to_bq_table,
    get_bigquery_client,
    get_max_part,
    load_bigquery_ipython_magic,
    load_query_result_to_partitions,
    load_query_result_to_table,
    pandas_to_bq,
    pandas_to_bq_table,
)

from skt.ye import (
    get_hdfs_conn,
    get_spark,
    hive_execute,
    hive_to_pandas,
    pandas_to_parquet,
    slack_send,
    get_secrets
)

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import (
    row_number, 
    col, 
    lit, 
    count, 
    log, 
    exp, 
    sum as spark_sum
)
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

In [None]:
import pandas as pd
from datetime import datetime, date, timedelta

In [None]:
# Echo the run parameters (presumably supplied by a parameters cell outside
# this view - TODO confirm) so they show up in the executed notebook.
print(
    f'current_dt: {current_dt}',
    f'state: {state}',
    f'long_duration: {long_duration}',
    sep='\n',
)


In [None]:
# Parse the run date and derive the look-back offsets (in days).
execution_dt = datetime.strptime(current_dt, '%Y-%m-%d')
execution_dt_one_ago = (execution_dt - timedelta(days=1)).strftime('%Y-%m-%d')

# Read the `long_duration` parameter - the name the preceding cell prints.
# The original read `log_duration`, which nothing visible defines, so a fresh
# kernel failed here. Assigning to a different name than the one being read
# also makes this cell safe to re-run (no repeated decrement).
# The "- 1" turns a window length into a day offset from the run date.
log_duration = int(long_duration) - 1
short_duration = 6


In [None]:
# Window start dates are offsets from the parsed run date. The original
# subtracted from `current_date`, which is not the run date: the only name
# like that in scope would be the Spark SQL function pulled in by the wildcard
# import, and subtracting a timedelta from it fails.
long_start_dt = (execution_dt - timedelta(days=log_duration)).strftime("%Y-%m-%d")
short_start_dt = (execution_dt - timedelta(days=short_duration)).strftime("%Y-%m-%d")

print("long_start_dt : ", long_start_dt)
print("short_start_dt : ", short_start_dt)

In [None]:
# BigQuery dataset name interpolated into the source query and the
# destination table references further down.
db_name = "adot_reco_dev"

In [None]:
def adot_extract_profile(data, thre=0.6, ecdf_wei=0.6, dt_cnt=60, col_names='cat1_profiles'):
    """Build one string of preferred cat1 categories per ``luna_id``.

    Every (luna_id, cat1) pair is scored by blending two signals:

    * ``ecdf`` - the empirical CDF of the pair's summed ``cat1_cnt`` among all
      rows of the same cat1.
    * ``rev_df_weight`` - ``sigmoid(1 / log(dt_cnt / active_days + 1e-8))``,
      where ``active_days`` is the number of distinct ``dt`` values on which
      the luna_id has a row for that cat1.

    ``score = ecdf * ecdf_wei + rev_df_weight * (1 - ecdf_wei)``. Categories
    with ``score >= thre`` are kept and joined with ", ".

    Args:
        data: Spark DataFrame with at least the columns ``svc_mgmt_num``,
            ``luna_id``, ``cat1``, ``cat1_cnt`` and ``dt``.
        thre: Minimum score a category needs to be kept.
        ecdf_wei: Weight of the ECDF term; the other term gets ``1 - ecdf_wei``.
        dt_cnt: Number of days used as the numerator of the day ratio.
        col_names: Name of the output column that holds the category string.

    Returns:
        Spark DataFrame with the columns ``luna_id`` and ``col_names``. The
        categories in the string are ordered by descending score. A luna_id
        with no category reaching ``thre`` does not appear in the result.
    """
    # ---- ECDF of summed usage within each cat1 ----
    # F.sum is spelled out so this no longer depends on the wildcard pyspark
    # import shadowing Python's builtin sum().
    usage_sum = data.groupBy("svc_mgmt_num", "luna_id", "cat1").agg(
        F.sum("cat1_cnt").alias("cat1_cnt_sum")
    )

    # cume_dist() is (#rows with value <= current) / (#rows in the partition),
    # i.e. the ECDF. The previous row-based running count gave tied sums
    # different, arbitrarily ordered values, so results changed between runs.
    ecdf_window = Window.partitionBy("cat1").orderBy("cat1_cnt_sum")
    ecdf_df = (
        usage_sum
        .withColumn("ecdf", F.cume_dist().over(ecdf_window))
        .select("svc_mgmt_num", "luna_id", "cat1", "cat1_cnt_sum", "ecdf")
        # NOTE(review): if one luna_id can occur under several svc_mgmt_num
        # values, there are several rows per (luna_id, cat1) here and the one
        # that survives is arbitrary - confirm whether that can happen.
        .dropDuplicates(["luna_id", "cat1"])
    )

    # ---- Day-frequency weight ----
    active_days = (
        data.select("luna_id", "cat1", "dt")
        .distinct()
        .groupBy("luna_id", "cat1")
        .agg(F.count("*").alias("luna_cat1_cnt"))
    )
    # The small epsilon keeps log() off exactly 0 when the ratio is 1.
    active_days = active_days.withColumn(
        "df_weight", 1 / F.log(F.lit(dt_cnt) / F.col("luna_cat1_cnt") + 1.0e-8)
    )
    # Squash the weight into (0, 1) with a sigmoid.
    active_days = active_days.withColumn(
        "rev_df_weight", 1.0 / (1.0 + F.exp(-F.col("df_weight")))
    )

    # ---- Combine both signals into one score ----
    scored = (
        ecdf_df.dropna(subset="luna_id")
        .select("luna_id", "cat1", "ecdf")
        .join(
            active_days.select("luna_id", "cat1", "rev_df_weight"),
            on=["luna_id", "cat1"],
            how="left",
        )
        .withColumn(
            "score",
            F.col("ecdf") * ecdf_wei + F.col("rev_df_weight") * (1 - ecdf_wei),
        )
    )

    # Sort inside the aggregation: a DataFrame-level orderBy before groupBy
    # does not survive the shuffle, and collect_set has no defined order.
    # Sorting (score, cat1) structs descending puts the best category first.
    ranked_categories = F.sort_array(
        F.collect_list(F.struct("score", "cat1")), asc=False
    )
    return (
        scored.filter(F.col("score") >= thre)
        .groupBy("luna_id")
        .agg(F.concat_ws(", ", ranked_categories.getField("cat1")).alias(col_names))
    )

In [None]:
def calculate_days(start_date, end_date):
    """Count the days in an inclusive date range, split by weekend/weekday.

    Args:
        start_date: First day of the range (anything ``pd.date_range`` accepts).
        end_date: Last day of the range, inclusive.

    Returns:
        Tuple ``(total_days, weekend_days, weekday_days)`` of ints, where
        weekend days are Saturdays and Sundays.
    """
    days = pd.date_range(start=start_date, end=end_date)
    n_total = len(days)
    # weekday() numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
    n_weekend = int((days.weekday >= 5).sum())
    return n_total, n_weekend, n_total - n_weekend

## **One Model Profile 생성 작업**

- 개요
    - One Model Profile 생성 작업
- 내용
    - item, cat1, cat2 level로 프로파일링 작업 수행
    - 단기 preference 정보
        - 7일 내 item 정보를 통해서 생성
        - 이때, item은 정규화 로직을 통해 추출된 단어를 기반으로 최종 단어들을 추출
    - 장기 preference 정보
        - 60일 내 cat1, cat2 정보를 통해서 생성
        - cat2가 더 상위 레벨임. 어떤 레벨로 장기 구성할지 선택 필요

In [12]:
# Day counts of the long-term window; later cells pass them to the
# profile extraction as the day-ratio numerator.
total_days, weekend_days, weekday_days = calculate_days(long_start_dt, current_dt)

log_format = (
    "\n"
    f"    total_days: {total_days},\n"
    f"    weekend_days: {weekend_days},\n"
    f"    weekday_days: {weekday_days}\n"
)
print(log_format)

In [14]:
# Usage rows for the long-term window. The window is closed on both ends:
# the day counts used for scoring are computed over
# [long_start_dt, current_dt], so rows dated after current_dt (e.g. when
# re-running for a past date) must not leak in.
query_cat1 = f"""
    SELECT *
    FROM {db_name}.adot_cat1_cnt
    where dt >= '{long_start_dt}'
      and dt <= '{current_dt}'
"""

In [15]:
# Run the query through bq_to_df; the following cells use the result as a
# Spark DataFrame (.show, .withColumn, .filter).
data_cat1 = bq_to_df(query_cat1)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/27 17:14:04 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
24/06/27 17:14:27 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.
----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 32771)
Traceback (most recent call last):
  File "/usr/lib/python3.8/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.8/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.8/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, 

In [16]:
# Preview a few rows without truncating long cell values.
data_cat1.show(n=3, truncate=False)

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+--------------------+----+------------------+--------+--------+----------+----------+
|        svc_mgmt_num|             luna_id|cat2|              cat1|cat1_cnt|second_s|is_weekend|        dt|
+--------------------+--------------------+----+------------------+--------+--------+----------+----------+
|0f037d5a2f42ba8ab...|APL00000D1QENHBVQPS0|    |      apollo_radio|       1|   13_18|         1|2024-06-22|
|98107bdf0577d2ac8...|APL00000CXD2M7KGYI2O|    |      apollo_alarm|       1|   21_24|         1|2024-06-22|
|                null|APL00000DOEYDJI0LDKW|    |      apollo_radio|       1|   11_13|         1|2024-06-22|
|52cc2bdc6157c97cc...|APL00000BLHBPJQJ4RNK|    |      apollo_radio|       1|    9_11|         0|2024-06-12|
|5204e1b27da2f33a1...|APL00000D6M8ZFJVWLJ4|    |      apollo_radio|       1|     0_6|         0|2024-05-27|
|2fd8fe1dadaca3d99...|APL00000D5310BS3WGSG|    |      apollo_radio|       1|   13_18|         0|2024-06-18|
|8e7d69711b4196406...|APL000

                                                                                

In [18]:
# Lookup table from raw cat1 identifiers ("apollo_*") to the display label
# that replaces them in the cat1 column. Identifiers missing from this table
# are given a fallback label by adot_domain_to_text.
adot_sevice_set = {
              "apollo_alarm":"알람", 
              "apollo_anniversary":"생일", 
              "apollo_calendar":"캘린더", 
              "apollo_calendar2":"특일정보", 
              "apollo_call":"전화", 
              "apollo_campaign":"캠페인", 
              "apollo_character":"캐릭터꾸미기",
              "apollo_chatgpt":"챗T", 
              "apollo_congestion":"혼잡도", 
              "apollo_cqa":"CQA", 
              "apollo_dailycostume":"오늘뭐입지", 
              "apollo_dictionary":"어학사전", 
              "apollo_english":"튜터", 
              "apollo_fortune":"운세",
              "apollo_friendsbitna":"프렌즈_길빛나", 
              "apollo_friendsharu":"프렌즈_강하루", 
              "apollo_friendsj":"프렌즈_육제이", 
              "apollo_game":"게임", 
              "apollo_glm":"감성대화", 
              "apollo_glmtosqa":"GLMtoSQA", 
              "apollo_keep":"keep",
              "apollo_liveqa":"LiveQA",
              "apollo_menu":"메뉴추천", 
              "apollo_message":"문자", 
              "apollo_mj":"맛집추천", 
              "apollo_mno":"T서비스", 
              "apollo_music":"음악", 
              "apollo_mytv":"TV", 
              "apollo_news":"뉴스", 
              "apollo_photo":"포토",
              "apollo_podcast":"팟캐스트", 
              "apollo_qfeed":"큐피드", 
              "apollo_quest":"퀘스트", 
              "apollo_radio":"라디오", 
              "apollo_recipe":"레시피", 
              "apollo_reward":"리워드", 
              "apollo_ring":"링", 
              "apollo_routine":"루틴",
              "apollo_samsungstock":"증권", 
              "apollo_sleep":"sleep", 
              "apollo_sports":"스포츠", 
              "apollo_sqa":"SQA", 
              "apollo_survey":"선호도 조사", 
              "apollo_time":"시간", 
              "apollo_tmap":"TMAP", 
              "apollo_tmembership":"T 멤버십",
              "apollo_tworld":"T 월드", 
              "apollo_video":"비디오", 
              "apollo_weather":"날씨", 
              "apollo_wordchain":"끝말잇기", 
              "apollo_friends":"프렌즈톡", 
              "apollo_media":"미디어", 
              "apollo_qa":"대표발화"
}

In [19]:
# Labels this cell itself produces. The cell overwrites data_cat1.cat1 in
# place, so on a second run the inputs are already labels; without this guard
# every one of them would be replaced by the fallback label.
_ADOT_SERVICE_LABELS = frozenset(adot_sevice_set.values()) | {"기타 서비스"}


def adot_domain_to_text(adot_cat1):
    """Translate a raw cat1 identifier into its display label.

    Args:
        adot_cat1: Raw identifier (a key of ``adot_sevice_set``), an
            already-translated label, or None.

    Returns:
        The mapped label; the input itself when it is already a label;
        otherwise the fallback label "기타 서비스" (also for None).
    """
    if adot_cat1 in _ADOT_SERVICE_LABELS:
        return adot_cat1
    return adot_sevice_set.get(adot_cat1, "기타 서비스")

adot_domain_to_text_udf = F.udf(adot_domain_to_text, F.StringType())
data_cat1 = data_cat1.withColumn("cat1", adot_domain_to_text_udf(data_cat1["cat1"]))

## 전체, 주말, 주중(출근, 퇴근, 일과시간) 별 cat1 list 추출

In [21]:
# Overall profile over the whole window. The output column has to be called
# "cat1_profiles": the template cell reads total_data["cat1_profiles"] and the
# destination table declares that column. The previous name ' v' was a typo.
total_data = adot_extract_profile(data_cat1, 0.5, 0.6, total_days, 'cat1_profiles')

# Time-sliced profiles, each left-joined onto the overall profile by luna_id.
# Tuples are (output column, row filter, number of days in the slice).
is_weekday = F.col('is_weekend') == 0
profile_segments = [
    ('adot_cat1_weekend', F.col('is_weekend') == 1, weekend_days),
    ('adot_cat1_go_to_work', is_weekday & (F.col('second_s') == "6_9"), weekday_days),
    ('adot_cat1_leave_work', is_weekday & F.col('second_s').isin("18_21", "21_24"), weekday_days),
    ('adot_cat1_working_hour', is_weekday & F.col('second_s').isin("9_11", "11_13", "13_18"), weekday_days),
]
for segment_column, segment_filter, segment_days in profile_segments:
    segment_profile = adot_extract_profile(
        data_cat1.filter(segment_filter), 0.4, 0.6, segment_days, segment_column
    )
    total_data = total_data.join(segment_profile, on="luna_id", how="left")

total_data = total_data.withColumn("source_domain", F.lit("adot"))

# 프로파일 테이블 저장

In [None]:
# Destination of the profile table. PROJECT_ID assigned here replaces the
# value imported from skt.gcp at the top of the notebook.
PROJECT_ID = "skt-datahub"
db_name = "adot_reco_dev"
partitioned_dest_table = "adotServiceProfile_adot"

In [None]:
# Look up the destination table so the next cell can create it when missing.
table_exists = bq_table_exists(
    project_id=PROJECT_ID,
    table=f'{db_name}.{partitioned_dest_table}',
)

In [None]:
# Create the date-partitioned destination table when it does not exist yet.
# The working-hours column is spelled "adot_cat1_working_hour" to match the
# DataFrame column produced earlier in this notebook; the DDL previously
# declared "adot_cat1_working_hours", which the written data never contains.
# NOTE(review): total_data as built in this notebook has no `dt` column and
# the write call passes no partition - confirm how `dt` is meant to be filled.
if not table_exists:
    get_bigquery_client().query(f"""
        CREATE TABLE IF NOT EXISTS {db_name}.{partitioned_dest_table}(
            luna_id STRING,
            cat1_profiles STRING,
            adot_cat1_weekend STRING,
            adot_cat1_go_to_work STRING,
            adot_cat1_leave_work STRING,
            adot_cat1_working_hour STRING,
            source_domain STRING,
            dt Date
        )
        PARTITION BY dt
    """).result()

    print(f"생성된 테이블 : {db_name}.{partitioned_dest_table}")

In [None]:
# Write the profile DataFrame to the destination table
# (mode="overwrite" presumably replaces what is there - TODO confirm).
df_to_bq_table(
    df=total_data,
    dataset=db_name,
    table_name=partitioned_dest_table,
    mode="overwrite",
)

# Template 입히기

In [2]:
# Load the template table through bq_to_pandas; the next cell filters it by
# its source_domain column.
profile_template = bq_to_pandas(
    "SELECT * FROM adot_reco_dev.profile_template"
)

unsupported operand type(s) for /: 'NoneType' and 'int'




Downloading: 100%|[32m██████████[0m|


In [23]:
# First template registered for the "adot" source domain
# (raises IndexError when there is none, as before).
template = list(profile_template[profile_template['source_domain']=="adot"].template)[0]

def profile_text(adot_cat1):
    """Fill the template's ``cat1_profile`` placeholder with ``adot_cat1``."""
    return template.format(cat1_profile=adot_cat1)

# F.StringType() matches the earlier UDF cell. The bare name StringType is
# never imported explicitly in this notebook (only DoubleType is), so it
# resolved only if the wildcard pyspark import happened to expose it.
profile_text_udf = F.udf(profile_text, F.StringType())
total_data = total_data.withColumn("profile_templates", profile_text_udf(total_data["cat1_profiles"]))

### Template 입힌 테이블 저장

In [26]:
# Destination of the templated profile table; this rebinds
# partitioned_dest_table, so the cells below target the new table.
db_name = "adot_reco_dev"
partitioned_dest_table = "adotServiceProfile_templated_adot"

In [None]:
# Look up the templated destination table so the next cell can create it
# when missing.
table_exists = bq_table_exists(
    project_id=PROJECT_ID,
    table=f'{db_name}.{partitioned_dest_table}',
)

In [27]:
# Create the date-partitioned templated table when it does not exist yet.
if not table_exists:
    create_templated_table_sql = f"""
        CREATE TABLE IF NOT EXISTS {db_name}.{partitioned_dest_table}(
            luna_id STRING,
            cat1_profiles STRING,
            source_domain STRING,
            profile_templates STRING,
            dt DATE
        )
        PARTITION BY dt
    """
    # .result() blocks until the DDL job has finished.
    get_bigquery_client().query(create_templated_table_sql).result()
    print(f"생성된 테이블 : {db_name}.{partitioned_dest_table}")

[Stage 4:=> (22 + 8) / 52][Stage 5:>   (0 + 0) / 52][Stage 6:>   (0 + 0) / 52]

생성된 테이블 : x1113099.user_retrieval_profile_adot_text


In [28]:
# Write the templated profiles to the destination table.
# NOTE(review): total_data still carries the time-slot columns
# (adot_cat1_weekend, ...) here, which the CREATE TABLE statement for this
# table does not list - confirm whether they should be dropped first.
df_to_bq_table(
    df=total_data,
    dataset=db_name,
    table_name=partitioned_dest_table,
    mode="overwrite",
)

                                                                                