In [None]:
from skt.gcp import (
    PROJECT_ID,
    bq_insert_overwrite,
    bq_to_df,
    bq_to_pandas,
    get_bigquery_client,
    bq_table_exists,
    get_max_part,
    get_bigquery_client,
    df_to_bq_table
)

from skt.ye import (
    get_hdfs_conn,
    get_spark,
)

In [None]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [None]:
from google.cloud.bigquery.job import QueryJobConfig

In [None]:
import pandas as pd
from datetime import datetime, date, timedelta

In [None]:
# Echo the run parameters. They are not defined anywhere in this notebook, so they are
# presumably injected by the scheduler / parameterisation layer - TODO confirm.
print(
    f'current_dt: {current_dt}',
    f'state: {state}',
    f'log_duration: {log_duration}',
    sep='\n',
)

In [None]:
# Derive the run date and the day before it.
# `current_dt` is expected as a 'YYYY-MM-DD' string; strptime raises ValueError otherwise.
execution_dt = datetime.strptime(current_dt, '%Y-%m-%d')
execution_dt_one_ago = execution_dt + timedelta(days=-1)
# datetime's format-spec support delegates to strftime, so this yields 'YYYY-MM-DD'.
lag_current_dt = f"{execution_dt_one_ago:%Y-%m-%d}"

In [None]:
# Turn the requested window length into a day offset from execution_dt: subtracting
# (N - 1) days gives a window that spans N calendar days when both ends are included.
# NOTE(review): this rebinds `log_duration` from its own previous value, so re-running
# the cell decrements it again. Run it exactly once per kernel session.
log_duration = int(log_duration) - 1
# Day offset for the short window (presumably a 7-day inclusive window - TODO confirm).
short_duration = 6

In [None]:
# First day ('YYYY-MM-DD') of the long and the short look-back windows, both ending on
# execution_dt.
long_start_dt, short_start_dt = (
    (execution_dt - timedelta(days=offset_days)).strftime("%Y-%m-%d")
    for offset_days in (log_duration, short_duration)
)

print("long_start_dt : ", long_start_dt)
print("short_start_dt : ", short_start_dt)

In [None]:
# BigQuery dataset that holds both the source table and the destination tables.
db_name = 'adot_reco_dev'
# Name of the source table with per-user category usage rows.
table_nm = 'tmbr_cat1_cnt'

In [None]:
# BigQuery client handle.
# NOTE(review): the cells visible in this notebook never read `bq_client`; the
# CREATE TABLE cells call get_bigquery_client() again instead.
bq_client = get_bigquery_client()

# 전처리 함수

In [None]:
def tmbr_extract_profile(data, thre=0.6, freq_wei=0.6, dt_cnt=60):
    """Build per-``luna_id`` category profile strings from category usage rows.

    Scoring, per (svc_mgmt_num, luna_id, cat1):

    1. ``cat1_prop``: number of rows for the category divided by the largest such
       count among that user's categories (so the top category scores 1.0).
    2. ``rev_df_weight``: sigmoid of ``1 / log(dt_cnt / n_days + 1e-8)``, where
       ``n_days`` is the number of distinct ``dt`` values on which the category
       appears. The epsilon keeps the logarithm away from exactly 0 when
       ``n_days == dt_cnt``.
    3. ``score = cat1_prop * freq_wei + rev_df_weight * (1 - freq_wei)``.

    The scored rows are joined with the distinct (cat1, cat2) pairs found in
    ``data``, cut to the 10 highest-scoring rows per (svc_mgmt_num, luna_id),
    filtered to ``score >= thre`` and finally collapsed to one row per ``luna_id``.

    Args:
        data: Spark DataFrame providing the columns ``svc_mgmt_num``, ``luna_id``,
            ``cat1``, ``cat2`` and ``dt``.
        thre: Minimum score a category needs to be part of the profile.
        freq_wei: Weight of the frequency term; the day-coverage term gets
            ``1 - freq_wei``.
        dt_cnt: Number of days covered by ``data``; numerator of the day-coverage
            ratio.

    Returns:
        Spark DataFrame with the columns ``luna_id``, ``cat1_profiles`` and
        ``cat2_profiles``; the profile columns are ", "-joined sets of the
        selected categories.
    """
    user_keys = ["svc_mgmt_num", "luna_id"]
    cat_keys = user_keys + ["cat1"]

    # --- Frequency-based proportion ---------------------------------------------
    data_freq = (
        data.select(cat_keys)
        .na.drop("any", subset="cat1")
        .groupby(*cat_keys)
        .agg(F.count("*").alias("cat1_cnt"))
    )
    # Must be the Spark aggregate F.max: the bare name `max` is Python's builtin here,
    # which returns a plain str for a str argument and has no `.alias`.
    data_freq_user = data_freq.groupby(user_keys).agg(F.max("cat1_cnt").alias("cat1_max_cnt"))
    data_freq = data_freq.join(data_freq_user, on=user_keys, how="left")
    data_freq = data_freq.withColumn("cat1_prop", F.col("cat1_cnt") / F.col("cat1_max_cnt"))

    result_df = data_freq.select(*cat_keys, "cat1_prop").dropDuplicates(cat_keys)

    # --- Day-coverage weight ----------------------------------------------------
    # Count the distinct days on which each (user, cat1) combination appears.
    data_dt_cnt = (
        data.select(cat_keys + ["dt"])
        .distinct()
        .groupby(cat_keys)
        .agg(F.count("*").alias("svc_mgmt_num_cat1_cnt"))
    )
    data_dt_cnt = data_dt_cnt.withColumn(
        "df_weight",
        1 / F.log(F.lit(dt_cnt) / F.col("svc_mgmt_num_cat1_cnt") + 1.0e-8),
    )
    # Squash the unbounded weight into (0, 1) with a sigmoid.
    data_dt_cnt = data_dt_cnt.withColumn("rev_df_weight", 1.0 / (1.0 + F.exp(-F.col("df_weight"))))

    # --- Merge both signals into one score ---------------------------------------
    merge_data = (
        result_df.dropna(subset=user_keys)
        .select(cat_keys + ["cat1_prop"])
        .join(data_dt_cnt.select(cat_keys + ["rev_df_weight"]), on=cat_keys, how="left")
    )
    merge_data = merge_data.withColumn(
        "score",
        F.col("cat1_prop") * freq_wei + F.col("rev_df_weight") * (1 - freq_wei),
    )

    # Attach every cat2 observed together with each cat1.
    data_cat1_distinct = data.select(["cat1", "cat2"]).distinct()
    merge_data = merge_data.join(data_cat1_distinct, on="cat1", how="left")

    # NOTE(review): the rank is computed after the cat2 join, so the top-10 cut counts
    # (cat1, cat2) rows rather than distinct cat1 values - confirm this is intended.
    window_spec = Window.partitionBy(user_keys).orderBy(F.desc("score"))
    merge_data = (
        merge_data.withColumn("rank", F.row_number().over(window_spec))
        .filter(F.col("rank") <= 10)
        .drop("rank")
    )

    # NOTE(review): the final grouping is by luna_id only, so svc_mgmt_num is not part of
    # the result. The element order inside the collected sets is presumably not
    # controlled by the preceding orderBy - verify before relying on it.
    merge_data = (
        merge_data.filter(F.col("score") >= thre)
        .orderBy(F.desc("score"))
        .groupBy("luna_id")
        .agg(
            F.concat_ws(", ", F.collect_set("cat1")).alias("cat1_profiles"),
            F.concat_ws(", ", F.collect_set("cat2")).alias("cat2_profiles"),
        )
    )
    return merge_data

In [None]:
def calculate_days(start_date, end_date):
    """Count calendar, weekend and weekday days in the inclusive date range.

    Args:
        start_date: First day of the range; anything ``pd.date_range`` accepts
            (e.g. a 'YYYY-MM-DD' string).
        end_date: Last day of the range (inclusive).

    Returns:
        Tuple ``(total_days, weekend_days, weekday_days)`` of plain ``int`` values.
        A range whose end precedes its start is empty and yields ``(0, 0, 0)``.
    """
    date_range = pd.date_range(start=start_date, end=end_date)
    total_days = len(date_range)
    # DatetimeIndex.weekday numbers Monday as 0 and Sunday as 6, so >= 5 selects
    # Saturday and Sunday. The comparison is vectorised over the whole index.
    weekend_days = int((date_range.weekday >= 5).sum())
    weekday_days = total_days - weekend_days
    return total_days, weekend_days, weekday_days

# 프로파일링

In [None]:
# Day counts for the long look-back window (both end dates included).
day_counts = calculate_days(long_start_dt, current_dt)
total_days, weekend_days, weekday_days = day_counts

log_format = f"""
    total_days: {total_days},
    weekend_days: {weekend_days},
    weekday_days: {weekday_days}
"""
print(log_format)

In [None]:
# Source rows for the long look-back window. The table name comes from `table_nm` so the
# configured source table is defined in exactly one place.
# NOTE(review): there is no upper bound on dt. If the table can hold rows dated after
# current_dt (e.g. when re-running a past date), they are included - confirm whether
# a `dt <= current_dt` bound is wanted.
query_cat1 = f"""
    select *
    from {db_name}.{table_nm}
    where dt >= '{long_start_dt}'
"""

In [None]:
# Run the query and load the result; the returned object is later used with the Spark
# DataFrame API inside tmbr_extract_profile.
data_cat1 = bq_to_df(query_cat1)

In [None]:
# Score categories per user; the day-coverage ratio uses the actual length of the window.
total_data = tmbr_extract_profile(
    data_cat1,
    thre=0.6,
    freq_wei=0.6,
    dt_cnt=total_days,
)

In [None]:
# Preview a few profile rows without truncating the long profile strings.
total_data.show(3, False)
# printSchema() writes the schema to stdout itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
total_data.printSchema()

# 프로파일 테이블 저장

In [None]:
# Destination table for the raw (un-templated) profiles.
partitioned_dest_table = "adotServiceProfile_tmbr"

In [None]:
# Check whether the destination table is already present in the project.
table_exists = bq_table_exists(table=f'{db_name}.{partitioned_dest_table}', project_id = PROJECT_ID)

In [None]:
# Create the dt-partitioned destination table when it was not found.
if not table_exists:
    create_table_sql = f"""
        CREATE TABLE IF NOT EXISTS {db_name}.{partitioned_dest_table}(
            svc_mgmt_num STRING,
            luna_id STRING,
            cat1_profiles STRING,
            cat2_profiles STRING,
            source_domain STRING,
            dt Date
        )
        PARTITION BY dt
    """
    create_job = get_bigquery_client().query(create_table_sql)
    # Presumably blocks until the DDL job has finished - TODO confirm against the client API.
    create_job.result()

    print(f"생성된 테이블 : {db_name}.{partitioned_dest_table}")

In [None]:
# Write the profile DataFrame to the destination table in "overwrite" mode.
# NOTE(review): total_data as returned by tmbr_extract_profile carries only luna_id,
# cat1_profiles and cat2_profiles, while the CREATE TABLE statement for this table also
# declares svc_mgmt_num, source_domain and dt. Confirm how df_to_bq_table reconciles
# the two schemas and whether "overwrite" replaces the whole table or a single partition.
df_to_bq_table(df=total_data,
               dataset=db_name,
               table_name=partitioned_dest_table,
               mode="overwrite")

# Template 입히기

In [None]:
# Load the profile text templates; the result is used as a pandas DataFrame with at
# least the columns `source_domain` and `template`.
profile_template = bq_to_pandas(f"SELECT * FROM {db_name}.profile_template")

In [None]:
# Pick the template string registered for the "tmbr" source domain (first match wins).
tmbr_templates = profile_template.loc[profile_template['source_domain'] == "tmbr", "template"]
if tmbr_templates.empty:
    # IndexError is raised so that any handling of a missing template keyed on that
    # exception type keeps working; the message says what is actually missing.
    raise IndexError(f'no row with source_domain="tmbr" found in {db_name}.profile_template')
template = tmbr_templates.iloc[0]


def profile_text(cat1_profiles):
    """Render the tmbr template with ``cat1_profiles`` filled into ``{cat1_profile}``."""
    return template.format(cat1_profile=cat1_profiles)


# Apply the template row by row; the rendered text is stored in `profile_templates`.
profile_text_udf = F.udf(profile_text, StringType())
total_data = total_data.withColumn("profile_templates", profile_text_udf(total_data["cat1_profiles"]))

In [None]:
# Destination table for the templated profiles; rebinds the name used for the raw table.
partitioned_dest_table = "adotServiceProfile_templated_tmbr"

In [None]:
# Check whether the templated destination table is already present in the project.
table_exists = bq_table_exists(table=f'{db_name}.{partitioned_dest_table}', project_id = PROJECT_ID)

In [None]:
# Create the dt-partitioned table for templated profiles when it was not found.
if not table_exists:
    create_table_sql = f"""
        CREATE TABLE IF NOT EXISTS {db_name}.{partitioned_dest_table}(
            svc_mgmt_num STRING,
            luna_id STRING,
            cat1_profiles STRING,
            source_domain STRING,
            profile_templates STRING,
            dt DATE
        )
        PARTITION BY dt
    """
    create_job = get_bigquery_client().query(create_table_sql)
    # Presumably blocks until the DDL job has finished - TODO confirm against the client API.
    create_job.result()
    print(f"생성된 테이블 : {db_name}.{partitioned_dest_table}")

In [None]:
# Write the templated profiles to the destination table in "overwrite" mode.
# NOTE(review): at this point total_data holds luna_id, cat1_profiles, cat2_profiles and
# profile_templates, whereas the CREATE TABLE statement for this table declares
# svc_mgmt_num, source_domain and dt and has no cat2_profiles column. Confirm how
# df_to_bq_table handles the mismatch and what "overwrite" replaces.
df_to_bq_table(df=total_data,
               dataset=db_name,
               table_name=partitioned_dest_table,
               mode="overwrite")