In [None]:
from skt.gcp import (
    PROJECT_ID,
    bq_insert_overwrite,
    bq_to_df,
    bq_to_pandas,
    get_bigquery_client,
    bq_table_exists,
    get_max_part,
    get_bigquery_client,
    df_to_bq_table
)

In [None]:
from google.cloud.bigquery.job import QueryJobConfig

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import (
    row_number, 
    col, 
    lit, 
    count, 
    log, 
    exp, 
    sum as spark_sum
)
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, DateType

In [None]:
import pandas as pd
from datetime import datetime, date, timedelta

In [None]:
# Echo the run parameters. `current_dt`, `state` and `log_duration` are not
# assigned anywhere in the visible code — presumably injected by the notebook
# runner (e.g. a parameters cell); TODO confirm.
print(f'current_dt: {current_dt}')
print(f'state: {state}')
print(f'log_duration: {log_duration}')


In [None]:
# Parse the run date; strptime raises ValueError unless current_dt is 'YYYY-MM-DD'.
execution_dt = datetime.strptime(current_dt, '%Y-%m-%d')
# Day before the run date as 'YYYY-MM-DD'. Not referenced again in the visible code.
execution_dt_one_ago = (execution_dt - timedelta(days=1)).strftime('%Y-%m-%d')
# Convert to int and subtract one: the value is used as a day offset from the
# run date, so an inclusive window ending on the run date spans the originally
# requested number of days.
# NOTE(review): re-running this cell decrements log_duration again.
log_duration = int(log_duration) - 1
# Offset of 6 days, i.e. a 7-day inclusive window ending on the run date.
short_duration = 6


In [None]:
# First day ('YYYY-MM-DD') of the long and of the short look-back window,
# each obtained by stepping back from the run date by the matching offset.
long_start_dt, short_start_dt = (
    (execution_dt - timedelta(days=offset_days)).strftime("%Y-%m-%d")
    for offset_days in (log_duration, short_duration)
)

print("long_start_dt : ", long_start_dt)
print("short_start_dt : ", short_start_dt)

In [None]:
# BigQuery dataset name; used further down when building destination table ids.
db_name = "adot_reco_dev"

In [None]:
# BigQuery client used to run the temp-table queries in this notebook.
bq_client = get_bigquery_client()

In [None]:
def calculate_days(start_date, end_date):
    """Count total, weekend and weekday days in an inclusive date range.

    Args:
        start_date: First day of the range; any value ``pd.date_range`` accepts.
        end_date: Last day of the range (included in the count).

    Returns:
        Tuple ``(total_days, weekend_days, weekday_days)`` of plain ints,
        where weekend days are Saturdays and Sundays.
    """
    days = pd.date_range(start=start_date, end=end_date)
    n_total = len(days)
    # DatetimeIndex.weekday numbers Monday as 0, so 5 and 6 are Sat/Sun.
    n_weekend = int((days.weekday >= 5).sum())
    n_weekday = int(n_total - n_weekend)
    return n_total, n_weekend, n_weekday

In [12]:
# Report how many calendar / weekend / weekday days the long window covers,
# counting from long_start_dt through current_dt inclusive.
total_days, weekend_days, weekday_days = calculate_days(long_start_dt, current_dt)
log_format = f"""
    total_days: {total_days},
    weekend_days: {weekend_days},
    weekday_days: {weekday_days}
"""
print(log_format)

## ADOT 유저만 처리하는 where clause

In [None]:
# SQL predicate spliced into the WHERE clauses of later queries to keep only
# rows that carry a luna_id.
only_adot_where_clause  = f"""luna_id is not null"""

In [None]:
# Query that produces one row per input row with an item proportion score.
#   data              - non-null items from tmap_item_cnt inside the long window
#   count_by_group    - row count per (item, cat1, cat2) over all users
#   df_with_max_count - every data row joined to those counts on item, plus the
#                       largest joined count within (svc_mgmt_num, luna_id, item)
#   final SELECT      - item_prop = item_count / item_max_cnt
# The window is closed at current_dt so that a rerun for a past date does not
# pick up partitions newer than the run date.
# Assumes dt compares correctly against a 'YYYY-MM-DD' literal — TODO confirm
# the column type.
# NOTE(review): the join key is item only, while count_by_group is keyed by
# (item, cat1, cat2); an item with several category pairs yields several
# joined rows per input row — confirm this fan-out is intended.
query = f"""
WITH data AS (
  SELECT
    svc_mgmt_num,
    luna_id,
    cat1,
    cat2,
    item
  FROM adot_reco_dev.tmap_item_cnt
  WHERE dt >= '{long_start_dt}'
  AND dt <= '{current_dt}'
  AND item is not null
),

-- 전체 통계량 보는 곳
count_by_group AS (
  SELECT
    item,
    cat1,
    cat2,
    COUNT(*) AS item_count
  FROM data
  GROUP BY item, cat1, cat2
),

-- 전체 통계량 보는 곳 (item_max_cnt 이후부터는 전체 svc_mgmt_num 볼 필요 x)
df_with_max_count AS (
  SELECT *,
         MAX(item_count) OVER (PARTITION BY svc_mgmt_num, luna_id, item) AS item_max_cnt
  FROM (
    SELECT
      a.svc_mgmt_num,
      a.luna_id,
      a.item,
      a.cat1,
      a.cat2,
      b.item_count
    FROM data a
    LEFT JOIN count_by_group b
    ON a.item = b.item
  )
)

SELECT
  svc_mgmt_num,
  luna_id,
  item,
  cat1,
  cat2,
  item_max_cnt,
  item_count,
  (item_count / item_max_cnt) as item_prop
FROM df_with_max_count
"""

In [None]:
# Send the query result to <project>.temp_1d.item_prop_temp_table using the
# WRITE_TRUNCATE write disposition.
temp_db = 'temp_1d'
temp_table = f'{PROJECT_ID}.{temp_db}.item_prop_temp_table'
job_config = QueryJobConfig()
job_config.write_disposition = 'WRITE_TRUNCATE'
job_config.destination = temp_table

In [None]:
# 임시 테이블 생성
query_job = bq_client.query(query, job_config=job_config)
query_job.result() 

In [None]:
# Top-10 keyword query, read from item_prop_temp_table.
#   filtered_table - rows with item_prop > 0.7 and a non-null luna_id
#   ranked_data    - ROW_NUMBER per (svc_mgmt_num, luna_id) ordered by
#                    item_prop DESC, keep rank <= 10, then DISTINCT
# NOTE(review): DISTINCT is applied after the rank filter, so repeated
# (user, item) rows in the input can leave fewer than 10 distinct items per
# user — confirm whether de-duplication should happen before ranking.
query = f"""
WITH filtered_table AS (
    SELECT *
    FROM  {PROJECT_ID}.{temp_db}.item_prop_temp_table
    WHERE item_prop > 0.7
    AND {only_adot_where_clause}
),
ranked_data AS (
    SELECT distinct svc_mgmt_num,
                    luna_id,
                    cat1,
                    cat2,
                    item,
                    item_prop
    FROM (
        SELECT 
            svc_mgmt_num,
            luna_id,
            item_prop,
            item,
            cat1,
            cat2,
            ROW_NUMBER() OVER (PARTITION BY svc_mgmt_num, luna_id ORDER BY item_prop DESC) AS rank
        FROM 
            filtered_table
    )
    WHERE rank <=10
)
SELECT *
FROM  ranked_data
"""

In [None]:
# Send the query result to <project>.temp_1d.tmap10keyword_profile_temp_table
# using the WRITE_TRUNCATE write disposition.
temp_db = 'temp_1d'
temp_table = f'{PROJECT_ID}.{temp_db}.tmap10keyword_profile_temp_table'
job_config = QueryJobConfig()
job_config.write_disposition = 'WRITE_TRUNCATE'
job_config.destination = temp_table

In [None]:
# 임시 테이블 생성
query_job = bq_client.query(query, job_config=job_config)
query_job.result() 

In [None]:
# IDF-weighted top-5 keyword query, read from item_prop_temp_table.
#   data_item_user_cnt - number of distinct (svc_mgmt_num, luna_id) pairs
#   data_idf_cnt       - number of distinct users per item
#   data_idf_score     - idf_wei = LOG(user_cnt / (1 + item_idf_cnt));
#                        rev_item_prop = 0.6 / (1 + EXP(-idf_wei)) + 0.4 * item_prop,
#                        restricted to rows with a non-null luna_id
#   ranked_data        - rows with rev_item_prop > 0.7, ranked per user
#   top_keywords2      - distinct (user, item) with rank <= 5
#   item_per_group     - array of each user's items in the top-10 temp table
# The final SELECT keeps top-5 items that are not already in the user's top-10
# array. Because the join is an inner JOIN, users without any row in the
# top-10 temp table produce no output.
query = f"""
WITH data AS (
  SELECT
      svc_mgmt_num,
      luna_id,
      item,
      item_prop
  FROM {PROJECT_ID}.{temp_db}.item_prop_temp_table
  WHERE item is not null
),
-- 전체 통계량 이용
data_item_user_cnt AS (
  SELECT COUNT(*) AS user_cnt
  FROM (
    SELECT DISTINCT svc_mgmt_num, luna_id
    FROM data
  )
),

-- Calculate data_idf_cnt (전체 통계량 이용)
data_idf_cnt AS (
  SELECT  item, 
          COUNT(*) AS item_idf_cnt
  FROM (SELECT DISTINCT svc_mgmt_num, luna_id, item FROM data)
  GROUP BY item
),

-- Join and calculate idf_wei and rev_item_prop (전체 통계량 이용)
data_idf_score AS (
  SELECT 
    d.*,
    idf_table.item_idf_cnt,
    LOG((SELECT user_cnt FROM data_item_user_cnt) / (1 + idf_table.item_idf_cnt)) AS idf_wei,
    0.6 / (1.0 + EXP(-LOG((SELECT user_cnt FROM data_item_user_cnt) / (1 + idf_table.item_idf_cnt)))) + d.item_prop * 0.4 AS rev_item_prop
  FROM data d
  LEFT JOIN data_idf_cnt idf_table ON d.item = idf_table.item
  WHERE {only_adot_where_clause}
),

-- Filter and rank
ranked_data AS (
  SELECT 
    *,
    ROW_NUMBER() OVER (PARTITION BY svc_mgmt_num, luna_id ORDER BY rev_item_prop DESC) AS rank
  FROM data_idf_score
  WHERE rev_item_prop > 0.7
),

top_keywords2 AS (
  SELECT DISTINCT
    svc_mgmt_num,
    luna_id,
    item
  FROM ranked_data
  WHERE rank <= 5
),

-- Calculate item_per_group
item_per_group AS (
  SELECT 
    svc_mgmt_num, 
    luna_id, 
    ARRAY_AGG(DISTINCT item) AS item_values
  FROM {PROJECT_ID}.{temp_db}.tmap10keyword_profile_temp_table
  WHERE {only_adot_where_clause}
  GROUP BY svc_mgmt_num, luna_id
)

-- Final result
SELECT 
  t.svc_mgmt_num, 
  t.luna_id, 
  t.item
FROM top_keywords2 t
JOIN item_per_group i ON t.svc_mgmt_num = i.svc_mgmt_num AND t.luna_id = i.luna_id
WHERE t.item NOT IN UNNEST(i.item_values)
"""

In [None]:
# Send the query result to <project>.temp_1d.tmap5keyword2_profile_temp_table
# using the WRITE_TRUNCATE write disposition.
temp_db = 'temp_1d'
temp_table = f'{PROJECT_ID}.{temp_db}.tmap5keyword2_profile_temp_table'
job_config = QueryJobConfig()
job_config.write_disposition = 'WRITE_TRUNCATE'
job_config.destination = temp_table

In [None]:
# 임시 테이블 생성
query_job = bq_client.query(query, job_config=job_config)
query_job.result() 

In [None]:
# Final profile query: one row per (svc_mgmt_num, luna_id) in the top-10 temp table.
#   keywords_list_table - comma-joined top-10 items LEFT JOINed to the
#                         comma-joined top-5 items of the same user
#   cat_profiles_table  - comma-joined distinct cat1 / cat2 from the top-10 table
#   item_profiles_table - top10keywords and top5keywords concatenated, skipping
#                         whichever side is NULL or empty
# The output adds source_domain = 'tmap' and dt = current_dt parsed as a DATE.
# NOTE(review): JOIND_KEYWORD_TABLE is defined but never referenced by the
# final SELECT, and cat_profiles_table reads only the top-10 table — confirm
# whether categories of the top-5 keywords were meant to be included.
query =f"""
WITH JOIND_KEYWORD_TABLE AS (
    SELECT  A.*,
            B.cat1,
            B.cat2
    FROM (
        SELECT  svc_mgmt_num,
                luna_id,
                item
            
        FROM {PROJECT_ID}.{temp_db}.tmap10keyword_profile_temp_table

        UNION ALL

        SELECT  svc_mgmt_num,
                luna_id,
                item
        FROM {PROJECT_ID}.{temp_db}.tmap5keyword2_profile_temp_table
    ) AS A
    LEFT JOIN 
    (
        SELECT distinct cat2,
                        cat1,
                        item
        FROM adot_reco_dev.tmap_item_cnt
        WHERE dt >= '{long_start_dt}'
        AND item is not null
    ) AS B 
    ON A.item = B.item
),
keywords_list_table AS (

    SELECT A.svc_mgmt_num,
           A.luna_id,
           A.top10keywords,
           B.top5keywords
    FROM (
      SELECT  svc_mgmt_num,
              luna_id,
              STRING_AGG(DISTINCT item, ', ') AS top10keywords
      FROM {PROJECT_ID}.{temp_db}.tmap10keyword_profile_temp_table
      GROUP BY svc_mgmt_num, luna_id
    ) AS A
    LEFT JOIN (
      SELECT  svc_mgmt_num,
              luna_id,
              STRING_AGG(DISTINCT item, ', ') AS top5keywords

      FROM {PROJECT_ID}.{temp_db}.tmap5keyword2_profile_temp_table
      GROUP BY svc_mgmt_num, luna_id
    ) AS B
    ON A.svc_mgmt_num = B.svc_mgmt_num AND A.luna_id = B.luna_id
),
cat_profiles_table AS (
  SELECT
    svc_mgmt_num,
    luna_id,
    STRING_AGG(DISTINCT cat1, ', ') AS cat1_profiles,
    STRING_AGG(DISTINCT cat2, ', ') AS cat2_profiles
  FROM {PROJECT_ID}.{temp_db}.tmap10keyword_profile_temp_table
  GROUP BY svc_mgmt_num, luna_id
),

item_profiles_table AS (
  SELECT svc_mgmt_num,
         luna_id,
         CASE
            WHEN NULLIF(top10keywords, '') IS NOT NULL AND NULLIF(top5keywords, '') IS NOT NULL
              THEN CONCAT(top10keywords, ', ', top5keywords)
            WHEN NULLIF(top10keywords, '') IS NOT NULL
              THEN top10keywords
            WHEN NULLIF(top5keywords, '') IS NOT NULL
              THEN top5keywords
            ELSE ''
         END AS item_profiles
  FROM keywords_list_table
)

SELECT  A.svc_mgmt_num, 
        A.luna_id,
        A.item_profiles,
        B.cat1_profiles,
        B.cat2_profiles,
        'tmap' as source_domain,
        PARSE_DATE('%Y-%m-%d', '{current_dt}') as dt 
        

FROM item_profiles_table AS A

LEFT JOIN (
  SELECT * 
  FROM cat_profiles_table
  ) AS B
ON A.svc_mgmt_num = B.svc_mgmt_num AND A.luna_id = B.luna_id
"""  

# 프로파일 테이블 저장

In [None]:
# Destination of the profile table.
# NOTE(review): this rebinds PROJECT_ID, which was imported from skt.gcp and
# has already been interpolated into the queries and temp-table ids built
# earlier; if the imported value is not "skt-datahub" the two will disagree —
# confirm the override is needed.
PROJECT_ID = "skt-datahub"
db_name = "adot_reco_dev"
partitioned_dest_table = "adotServiceProfile_tmap"

In [None]:
# Ask whether the destination table already exists.
# NOTE(review): table_exists is not read anywhere in the visible code.
table_exists = bq_table_exists(table=f'{db_name}.{partitioned_dest_table}', project_id = PROJECT_ID)

In [None]:
# Run the profile query and write its result to the destination table with
# 'dt' as the partition column (helper semantics are not visible here;
# presumably it overwrites the matching dt partition — TODO confirm).
bq_insert_overwrite(sql=query, destination=f'{PROJECT_ID}.{db_name}.{partitioned_dest_table}', partition='dt')

# Template 입히기

In [None]:
# Load every profile template, then take the template text of the first row
# whose source_domain is "tmap" (IndexError when no such row exists).
profile_template = bq_to_pandas("SELECT * FROM adot_reco_dev.profile_template")
tmap_template = profile_template.loc[
    profile_template['source_domain'] == "tmap", 'template'
].iloc[0]

In [None]:
# Query that renders the tmap template per user for the run date.
# Rows with an empty cat1_profiles get an empty profile_templates; otherwise
# the literal placeholders {cat1_profile} and {item_profile} in the template
# are replaced with the user's cat1_profiles and item_profiles.
# NOTE(review): the template text is spliced into a single-quoted SQL literal;
# a template containing a single quote would break the statement.
# NOTE(review): the source table comes from the current value of
# partitioned_dest_table, so this cell must run while that variable still
# names the raw profile table.
query = f"""
SELECT  svc_mgmt_num,
        luna_id,
        CASE 
            WHEN cat1_profiles='' THEN ""
            ELSE    REGEXP_REPLACE(
                        REGEXP_REPLACE(
                            '{tmap_template}',
                            r'\\{{cat1_profile\\}}',
                            cat1_profiles
                        ),
                        r'\\{{item_profile\\}}',
                        item_profiles
                )
        END AS profile_templates,
        "tmap" as source_domain,
        dt

FROM (
    SELECT  svc_mgmt_num,
            luna_id,            
            COALESCE(item_profiles, '') as item_profiles,
            COALESCE(cat1_profiles, '') as cat1_profiles,
            COALESCE(cat2_profiles, '') as cat2_profiles,
            dt
    FROM {PROJECT_ID}.{db_name}.{partitioned_dest_table}
    WHERE dt = '{current_dt}' AND {only_adot_where_clause}
)
"""

In [26]:
# Destination of the templated profile table.
# NOTE(review): partitioned_dest_table is rebound here, so any f-string built
# after this point that interpolates it refers to the templated table, not the
# raw profile table.
db_name = "adot_reco_dev"
partitioned_dest_table = "adotServiceProfile_templated_tmap"

In [None]:
# Run the templated query and write its result to the templated destination
# table with 'dt' as the partition column.
bq_insert_overwrite(sql=query, destination=f'{PROJECT_ID}.{db_name}.{partitioned_dest_table}', partition='dt')

# Model 용 Template 입힌 테이블 저장

In [None]:
# Template text of the first row whose source_domain is "tmap_onemodel"
# (IndexError when no such row exists).
tmap_model_template = profile_template.loc[
    profile_template['source_domain'] == "tmap_onemodel", 'template'
].iloc[0]

In [None]:
# Query that renders the "tmap_onemodel" template per user for the run date.
# Rows with an empty cat1_profiles get an empty profile_templates; otherwise
# the literal placeholders {cat1_profile} and {item_profile} in the template
# are replaced with the user's cat1_profiles and item_profiles.
#
# The input must be the raw profile table, which carries item_profiles and
# cat1_profiles. By this point partitioned_dest_table has been rebound to the
# templated table, whose generating query emits only profile_templates, so the
# source table is named explicitly rather than interpolated from that variable.
profile_source_table = "adotServiceProfile_tmap"
query = f"""
SELECT  svc_mgmt_num,
        luna_id,
        CASE 
            WHEN cat1_profiles='' THEN ""
            ELSE    REGEXP_REPLACE(
                        REGEXP_REPLACE(
                            '{tmap_model_template}',
                            r'\\{{cat1_profile\\}}',
                            cat1_profiles
                        ),
                        r'\\{{item_profile\\}}',
                        item_profiles
                )
        END AS profile_templates,
        "tmap" as source_domain,
        dt

FROM (
    SELECT  svc_mgmt_num,
            luna_id,            
            COALESCE(item_profiles, '') as item_profiles,
            COALESCE(cat1_profiles, '') as cat1_profiles,
            COALESCE(cat2_profiles, '') as cat2_profiles,
            dt
    FROM {PROJECT_ID}.{db_name}.{profile_source_table}
    WHERE dt = '{current_dt}' AND {only_adot_where_clause}
)
"""

In [None]:
# Destination of the model-template profile table.
db_name = "adot_reco_dev"
partitioned_dest_table = "adotServiceProfile_templated_tmap_model"

In [None]:
# Run the model-template query and write its result to the model-template
# destination table with 'dt' as the partition column.
bq_insert_overwrite(sql=query, destination=f'{PROJECT_ID}.{db_name}.{partitioned_dest_table}', partition='dt')