In [None]:
from skt.gcp import (
    PROJECT_ID,
    bq_insert_overwrite,
    bq_to_df,
    bq_to_pandas,
    get_bigquery_client,
    bq_table_exists,
    get_max_part,
    load_query_result_to_table,
    pandas_to_bq,
    pandas_to_bq_table,
    load_bigquery_ipython_magic,
    get_bigquery_client,
    _print_query_job_results,
    load_query_result_to_partitions,
    df_to_bq_table
    
)

from skt.ye import (
    get_hdfs_conn,
    get_spark,
    hive_execute,
    hive_to_pandas,
    pandas_to_parquet,
    slack_send,
    get_secrets
)

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import (
    row_number, 
    col, 
    lit, 
    count, 
    log, 
    exp, 
    sum as spark_sum
)
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

In [None]:
import pandas as pd
from datetime import datetime, date, timedelta

In [None]:
# Echo the run parameters. None of them is assigned in a visible cell, so they are
# presumably injected by the notebook runner -- TODO confirm.
print(
    f'current_dt: {current_dt}',
    f'state: {state}',
    f'log_duration: {log_duration}',
    sep='\n',
)


In [None]:
# Parse the run date and derive the day offsets used for the look-back windows.
execution_dt = datetime.strptime(current_dt, '%Y-%m-%d')
execution_dt_one_ago = f"{execution_dt - timedelta(days=1):%Y-%m-%d}"
# NOTE(review): log_duration is rebound in place, so re-running this cell
# decrements it again; restart the kernel before re-executing.
log_duration = int(log_duration)
log_duration -= 1  # presumably so [start, execution_dt] spans log_duration days -- confirm
short_duration = 6


In [None]:
# First day (YYYY-MM-DD) of the long and the short look-back windows.
long_start_dt = f"{execution_dt - timedelta(days=log_duration):%Y-%m-%d}"
short_start_dt = f"{execution_dt - timedelta(days=short_duration):%Y-%m-%d}"

print(f"long_start_dt :  {long_start_dt}")
print(f"short_start_dt :  {short_start_dt}")

In [None]:
# Dataset name used to qualify the source table (`tmap_item_cnt`) in the item query.
db_name = "adot_reco_dev"

In [None]:
def extract_prefer_list(data, thre=0.7, col_names='tmap_total',
                        unique_thre=0.7, top_n=10, unique_top_n=5):
    """Build per-user preferred item and category lists from item-level logs.

    Two kinds of preferred items are extracted for every
    (``svc_mgmt_num``, ``luna_id``) pair:

    * frequent items: items whose count relative to the user's most frequent
      item (``item_prop``) is greater than ``thre``; at most ``top_n`` per user.
    * distinctive items: items whose blended score
      ``0.6 * sigmoid(idf_wei) + 0.4 * item_prop`` is greater than
      ``unique_thre``; at most ``unique_top_n`` per user, excluding items that
      are already in that user's frequent list.

    Args:
        data: Spark DataFrame with the columns ``svc_mgmt_num``, ``luna_id``,
            ``item``, ``cat1`` and ``cat2``. Rows with a null ``item`` are
            ignored when counting.
        thre: Lower bound (exclusive) on ``item_prop`` for frequent items.
        col_names: Prefix of the output column names.
        unique_thre: Lower bound (exclusive) on the blended score for
            distinctive items.
        top_n: Maximum number of frequent items kept per user.
        unique_top_n: Maximum number of distinctive items kept per user.

    Returns:
        Spark DataFrame keyed by ``luna_id`` with the string columns
        ``{col_names}_item_list``, ``{col_names}_item_list_unique``,
        ``{col_names}_cat1_list`` and ``{col_names}_cat2_list``; each is a
        ", "-joined set of distinct values. The ``_unique`` column is null for
        a ``luna_id`` without distinctive items (left join).
    """
    # All Spark functions are referenced through ``F`` so the function does not
    # depend on ``from pyspark.sql.functions import *`` shadowing builtins such
    # as ``max``.
    user_keys = ['svc_mgmt_num', 'luna_id']

    # item -> (cat1, cat2) lookup, captured before those columns are projected away.
    item_categories = data.select(['cat1', 'cat2', 'item']).distinct()

    # Per-user item counts, normalised by the count of the user's top item.
    item_counts = (
        data.select(user_keys + ['item'])
        .na.drop("any", subset=['item'])
        .groupBy(user_keys + ['item'])
        .agg(F.count("*").alias('item_cnt'))
    )
    user_max_counts = item_counts.groupBy(user_keys).agg(
        F.max('item_cnt').alias('item_max_cnt')
    )
    item_props = (
        item_counts.join(user_max_counts, on=user_keys, how="left")
        .withColumn('item_prop', F.col('item_cnt') / F.col('item_max_cnt'))
    )

    # Preferred places based on overall frequency.
    # ``item`` is a secondary sort key so that ties in the score are ranked
    # reproducibly across runs.
    frequent_window = Window.partitionBy(user_keys).orderBy(
        F.desc('item_prop'), F.asc('item')
    )
    frequent_items = (
        item_props.filter(F.col('item_prop') > thre)
        .withColumn("rank", F.row_number().over(frequent_window))
        .filter(F.col("rank") <= top_n)
        .drop("rank")
        .dropDuplicates(['luna_id', 'item'])
        .select(user_keys + ['item'])
    )

    # Places a user visits unusually often, weighting by IDF.
    # NOTE: ``count()`` is a Spark action and triggers a job here.
    user_cnt = item_props.select(user_keys).distinct().count()
    item_user_counts = (
        item_props.select(user_keys + ['item'])
        .distinct()
        .groupBy('item')
        .agg(F.count("*").alias('item_idf_cnt'))
    )
    distinctive_window = Window.partitionBy(user_keys).orderBy(
        F.desc('rev_item_prop'), F.asc('item')
    )
    distinctive_items = (
        item_props.join(item_user_counts, on='item', how="left")
        .withColumn('idf_wei', F.log(user_cnt / (1 + F.col("item_idf_cnt"))))
        .withColumn(
            'rev_item_prop',
            0.6 / (1.0 + F.exp(-F.col("idf_wei"))) + F.col('item_prop') * 0.4,
        )
        .filter(F.col('rev_item_prop') > unique_thre)
        .withColumn("rank", F.row_number().over(distinctive_window))
        .filter(F.col("rank") <= unique_top_n)
        .drop("rank")
        .dropDuplicates(['luna_id', 'item'])
    )

    # Drop distinctive items that already appear in the same user's frequent list.
    frequent_per_user = frequent_items.groupBy(user_keys).agg(
        F.collect_set("item").alias("item_values")
    )
    distinctive_items = (
        distinctive_items.join(frequent_per_user, on=user_keys)
        .filter(~F.array_contains(F.col("item_values"), F.col("item")))
        .select(user_keys + ['item'])
    )

    # Attach cat1 / cat2 to every selected item.
    selected_items = frequent_items.union(distinctive_items).join(
        item_categories, on="item", how="left"
    )

    def _joined_set(column, alias):
        # ", "-joined string of the distinct values of `column` within a group.
        return F.concat_ws(", ", F.collect_set(column)).alias(alias)

    # Merge everything into one row per luna_id.
    result = frequent_items.groupBy('luna_id').agg(
        _joined_set('item', f'{col_names}_item_list')
    )
    result = result.join(
        distinctive_items.groupBy('luna_id').agg(
            _joined_set('item', f'{col_names}_item_list_unique')
        ),
        on=['luna_id'],
        how="left",
    )
    result = result.join(
        selected_items.groupBy('luna_id').agg(
            _joined_set('cat1', f'{col_names}_cat1_list'),
            _joined_set('cat2', f'{col_names}_cat2_list'),
        ),
        on=['luna_id'],
        how="left",
    )

    return result

In [None]:
def calculate_days(start_date, end_date):
    """Count total, weekend and weekday days in an inclusive date range.

    Args:
        start_date: First day of the range; any value ``pd.date_range`` accepts.
        end_date: Last day of the range (inclusive).

    Returns:
        Tuple ``(total_days, weekend_days, weekday_days)`` of plain ``int``s.
        An empty range (``start_date`` after ``end_date``) yields ``(0, 0, 0)``.
    """
    date_range = pd.date_range(start=start_date, end=end_date)
    total_days = len(date_range)
    # weekday: Monday=0 ... Sunday=6, so >= 5 selects Saturday and Sunday.
    # The vectorised comparison also works on an empty index.
    weekend_days = int((date_range.weekday >= 5).sum())
    return total_days, weekend_days, total_days - weekend_days

In [12]:
# Day counts for the long window [long_start_dt, current_dt], logged for reference.
total_days, weekend_days, weekday_days = calculate_days(long_start_dt, current_dt)
log_format = (
    "\n"
    f"    total_days: {total_days},\n"
    f"    weekend_days: {weekend_days},\n"
    f"    weekday_days: {weekday_days}\n"
)
print(log_format)

In [14]:
# Load the item-level rows for the long look-back window.
# The window is bounded above by the run date so that a rerun for a past
# `current_dt` does not read partitions newer than that date.
query_item = f"""
    SELECT *
    FROM {db_name}.tmap_item_cnt
    where dt >= '{long_start_dt}'
      and dt <= '{current_dt}'
"""
data_item = bq_to_df(query_item)

## item 기준으로 주말, 평일로 나누고, item에 해당하는 cat1,cat2도 함께 저장하는 형태

In [None]:
# Preference lists over all days, then left-joined with the weekend-only and
# weekday-only lists (is_weekend == 1 / == 0) on luna_id.
total_data = (
    extract_prefer_list(data_item, 0.5, 'tmap_total')
    .join(
        extract_prefer_list(data_item.filter(col('is_weekend') == 1), 0.5, 'tmap_weekend'),
        on="luna_id",
        how="left",
    )
    .join(
        extract_prefer_list(data_item.filter(col('is_weekend') == 0), 0.5, 'tmap_weekday'),
        on="luna_id",
        how="left",
    )
)

In [None]:
# Derive the generic profile columns from the all-days ("tmap_total") lists and
# tag every row with its source domain.
total_data = (
    total_data
    .withColumn(
        "item_profiles",
        F.concat_ws(
            ", ",
            total_data["tmap_total_item_list"],
            total_data["tmap_total_item_list_unique"],
        ),
    )
    .withColumn("cat1_profiles", F.col("tmap_total_cat1_list"))
    .withColumn("cat2_profiles", F.col("tmap_total_cat2_list"))
    .withColumn("source_domain", F.lit("tmap"))
)


In [None]:
# Preview three untruncated rows and the schema.
total_data.show(3, False)
# printSchema() prints the schema itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" line).
total_data.printSchema()

# 프로파일 테이블 저장

In [None]:
# Destination of the profile table.
# NOTE(review): this rebinds the PROJECT_ID name imported from skt.gcp at the top of
# the notebook; only calls that receive project_id=PROJECT_ID explicitly see this
# value -- confirm that the other skt.gcp helpers target the same project.
PROJECT_ID = "skt-datahub"
# Same dataset value as assigned earlier in the notebook.
db_name = "adot_reco_dev"
partitioned_dest_table = "adotServiceProfile_tmap"

In [None]:
# Check whether the destination table is already present in PROJECT_ID.
table_exists = bq_table_exists(
    table=f"{db_name}.{partitioned_dest_table}",
    project_id=PROJECT_ID,
)

In [None]:
# Create the dt-partitioned destination table when the existence check found none.
# NOTE(review): total_data as built in the visible cells also carries
# tmap_total_item_list, tmap_total_cat1_list and tmap_total_cat2_list and has no
# `dt` column, so this schema does not match the DataFrame written afterwards --
# confirm which side is intended.
if not table_exists:
    # .result() blocks until the DDL job has finished.
    get_bigquery_client().query(f"""
        CREATE TABLE IF NOT EXISTS {db_name}.{partitioned_dest_table}(
            luna_id STRING,
            item_profiles STRING,
            cat1_profiles STRING,
            cat2_profiles STRING,
            tmap_total_item_list_unique STRING,
            tmap_weekend_item_list STRING,
            tmap_weekend_item_list_unique STRING,
            tmap_weekend_cat1_list STRING,
            tmap_weekend_cat2_list STRING,
            tmap_weekday_item_list STRING,
            tmap_weekday_item_list_unique STRING,
            tmap_weekday_cat1_list STRING,
            tmap_weekday_cat2_list STRING,
            source_domain STRING,
            dt DATE
        )
        PARTITION BY dt
    """).result()

    # Message text: "created table : <dataset>.<table>".
    print(f"생성된 테이블 : {db_name}.{partitioned_dest_table}")

In [None]:
# Write the profile DataFrame to the destination table.
# NOTE(review): mode="overwrite" is passed without any partition argument --
# presumably this replaces the whole table rather than one dt partition; confirm.
df_to_bq_table(
    df=total_data,
    dataset=db_name,
    table_name=partitioned_dest_table,
    mode="overwrite",
)

# Template 입히기

In [2]:
# Load every row of the template table; the tmap row is selected in a later cell.
# NOTE(review): the dataset is hard-coded here instead of using db_name.
profile_template = bq_to_pandas("SELECT * FROM adot_reco_dev.profile_template")

unsupported operand type(s) for /: 'NoneType' and 'int'




Downloading: 100%|[32m██████████[0m|


In [23]:
# Pick the template registered for the tmap domain and fail with a clear message
# (instead of a bare IndexError) when the template table has no such row.
tmap_templates = profile_template.loc[profile_template['source_domain'] == "tmap", 'template']
if tmap_templates.empty:
    raise ValueError("profile_template has no row with source_domain == 'tmap'")
template = tmap_templates.iloc[0]


def profile_text(tmap_cat1, tmap_item):
    """Render the tmap template for one user.

    Args:
        tmap_cat1: Value substituted for the ``cat1_profile`` placeholder.
        tmap_item: Value substituted for the ``item_profile`` placeholder.

    Returns:
        The template string with both placeholders filled in.
    """
    return template.format(cat1_profile=tmap_cat1, item_profile=tmap_item)


# The return type is given as a DDL string: StringType is never imported
# explicitly in this notebook, so referencing it relied on leftover kernel state.
profile_text_udf = F.udf(profile_text, "string")
total_data = total_data.withColumn(
    "profile_templates",
    profile_text_udf(total_data["cat1_profiles"], total_data["item_profiles"]),
)

### Template 입힌 테이블 저장

In [26]:
# Destination of the templated profile table (same dataset value as before).
db_name = "adot_reco_dev"
partitioned_dest_table = "adotServiceProfile_templated_tmap"

In [None]:
# Check whether the templated destination table is already present in PROJECT_ID.
table_exists = bq_table_exists(
    table=f"{db_name}.{partitioned_dest_table}",
    project_id=PROJECT_ID,
)

In [27]:
# Create the dt-partitioned templated table when the existence check found none.
# NOTE(review): total_data still carries all tmap_* list columns plus
# cat2_profiles at this point and has no `dt` column, so the DataFrame written
# afterwards is wider than this six-column schema -- confirm which side is intended.
if not table_exists:
    # .result() blocks until the DDL job has finished.
    get_bigquery_client().query(f"""
        CREATE TABLE IF NOT EXISTS {db_name}.{partitioned_dest_table}(
            luna_id STRING,
            cat1_profiles STRING,
            item_profiles STRING,
            source_domain STRING,
            profile_templates STRING,
            dt DATE
        )
        PARTITION BY dt
    """).result()
    # Message text: "created table : <dataset>.<table>".
    print(f"생성된 테이블 : {db_name}.{partitioned_dest_table}")

[Stage 4:=> (22 + 8) / 52][Stage 5:>   (0 + 0) / 52][Stage 6:>   (0 + 0) / 52]

생성된 테이블 : x1113099.user_retrieval_profile_adot_text


In [28]:
# Write the templated profile DataFrame to the destination table.
# NOTE(review): mode="overwrite" is passed without any partition argument --
# presumably this replaces the whole table rather than one dt partition; confirm.
df_to_bq_table(
    df=total_data,
    dataset=db_name,
    table_name=partitioned_dest_table,
    mode="overwrite",
)

                                                                                