In [1]:
from skt.gcp import (
    PROJECT_ID,
    bq_insert_overwrite,
    bq_to_df,
    bq_to_pandas,
    get_bigquery_client,
    bq_table_exists,
    get_max_part,
    load_query_result_to_table,
    pandas_to_bq,
    pandas_to_bq_table,
    load_bigquery_ipython_magic,
    get_bigquery_client,
    _print_query_job_results,
    load_query_result_to_partitions
    
)

from skt.ye import (
    get_hdfs_conn,
    get_spark,
    hive_execute,
    hive_to_pandas,
    pandas_to_parquet,
    slack_send,
    get_secrets
)

In [2]:
from google.cloud.bigquery.job import QueryJobConfig

In [3]:
from datetime import datetime, timedelta

# Variables

In [4]:
long_duration = 60
short_duration = 7
end_dt = '2024-06-22'
start_dt = (datetime.strptime(end_dt, '%Y-%m-%d') - timedelta(days=long_duration)).strftime('%Y-%m-%d')

In [5]:
db_name = 'adot_reco_dev'
table_nm = 'tdeal_cat1_cnt'
project_id = 'skt-datahub'

# PREPROCESSING

In [6]:
tdeal_temp_cat_list = {
    "여행/티켓": [],
    "건강식품": [],
    "스포츠/레저": [],
    "출산/육아": [],
    "반려동물용품": [],
    "화장품/미용": [],
    "디지털/가전": []
}

In [7]:
tdeal_where_clauses = []
for key, vals in tdeal_temp_cat_list.items():
    tdeal_where_clauses.append(
        f"TRIM(LOWER(cat2)) = '{key}'"
    )
    tdeal_where_clauses.append("OR")
tdeal_where_clause = ' '.join(tdeal_where_clauses[:-1])

# BIGQUERY CONFIG JOB

In [8]:
bq_client = get_bigquery_client()

# SCORING LOGIC

In [9]:
# 상대 비교 프로파일 추출 |로직
cat1_query = f"""
WITH params AS (
  SELECT
    {long_duration/2} AS longterm_decay_param, 
    {short_duration/2} AS shortterm_decay_param
),

BUY_TABLE AS (
    SELECT distinct svc_mgmt_num, 
                    luna_id,
                    cat2, 
                    cat1, 
                    cat3, 
                    is_weekend, 
                    dt
    
    FROM {db_name}.{table_nm}
    WHERE dt <= '{end_dt}' 
    AND dt >= '{start_dt}'
    AND cat3='buy'
    AND ( {tdeal_where_clause} )
),

USER_PURCHASE_TABLE AS (

    SELECT  *
           
    FROM (
        SELECT  svc_mgmt_num,
                luna_id,
                cat2,
                cat1,
                ARRAY_AGG(dt ORDER BY dt asc) as purchase_date
        FROM BUY_TABLE
        group by svc_mgmt_num, luna_id, cat1, cat2
    )
),
unnested_purchases AS (
  SELECT
    svc_mgmt_num,
    luna_id,
    cat2,
    cat1,
    purchase_date
    
  FROM USER_PURCHASE_TABLE, UNNEST(purchase_date) AS purchase_date
),

purchase_metrics AS (
    SELECT  svc_mgmt_num,
            luna_id,
            cat1,
            cat2,
            COUNT(*) AS frequency,
            ARRAY_AGG(DATE_DIFF(PARSE_DATE('%Y-%m-%d', '{end_dt}'), purchase_date, DAY) ORDER BY DATE_DIFF(PARSE_DATE('%Y-%m-%d', '{end_dt}'), purchase_date, DAY) DESC) AS days_since_purchases
           
  FROM unnested_purchases
  GROUP BY svc_mgmt_num, luna_id, cat1, cat2
),

interest_score AS (
  SELECT
    svc_mgmt_num,
    luna_id,
    cat1,
    cat2,
    frequency,
    (
      SELECT SUM(EXP(-1 * day / longterm_decay_param))  -- Exponential decay with 60-day half-life
      FROM UNNEST(days_since_purchases) day
    ) AS long_term_recency_score,
    (
      SELECT SUM(EXP(-1 * day / shortterm_decay_param))  
      FROM UNNEST(days_since_purchases) day
    ) AS short_term_recency_score,
  FROM purchase_metrics, params
)

SELECT *
FROM interest_score
"""

In [10]:
job_config = QueryJobConfig()
temp_table = f'{project_id}.adot_reco_dev.jh_tdeal_eda_temp'
job_config.destination = temp_table
job_config.write_disposition = 'WRITE_TRUNCATE'

In [11]:
# 절대 비교 프로파일 추출 로직
query_job = bq_client.query(cat1_query, job_config=job_config)
query_job.result() 

<google.cloud.bigquery.table.RowIterator at 0x7fea20bb7d00>

# Percentile tables for cat1

In [12]:
query = f"""
SELECT  cat1,
        cat2,
        APPROX_QUANTILES(long_term_recency_score, 4) as long_percentile_list,
        APPROX_QUANTILES(short_term_recency_score, 4) as short_percentile_list,
FROM {temp_table}
GROUP BY cat1, cat2
"""

In [13]:
percentile_table_cat1 = bq_to_pandas(query)

query: 
SELECT  cat1,
        cat2,
        APPROX_QUANTILES(long_term_recency_score, 4) as long_percentile_list,
        APPROX_QUANTILES(short_term_recency_score, 4) as short_percentile_list,
FROM skt-datahub.adot_reco_dev.jh_tdeal_eda_temp
GROUP BY cat1, cat2

destination: skt-datahub._775c5ccab1096b3cccd7ac34a5db11c0a354fb07.anon454403d51be3aeac263c50c89ab164d6954fe8e7981447785ec7bc72f29d5fe6
total_rows: 113
slot_secs: 0.052

Downloading: 100%|[32m██████████[0m|


In [14]:
import math 
day = 1
longterm_decay_param = 30
math.exp(-1 * (day / longterm_decay_param))

0.9672161004820059

In [15]:
long_quantile_tables_cat1 = percentile_table_cat1.set_index('cat1').to_dict()['long_percentile_list']
short_quantile_tables_cat1 = percentile_table_cat1.set_index('cat1').to_dict()['short_percentile_list']

In [16]:
query = f"""
SELECT distinct cat2, 
                cat1, 
                cat3, 
                dt

FROM {db_name}.{table_nm}
WHERE dt <= '{end_dt}' 
AND dt >= '{start_dt}'
AND cat3='buy'
AND ( {tdeal_where_clause} )
"""

In [17]:
cat_df = bq_to_pandas(query)

unsupported operand type(s) for /: 'NoneType' and 'int'
Downloading: 100%|[32m██████████[0m|


In [18]:
cat_dict = cat_df.set_index("cat1").to_dict()['cat2']

In [19]:
cat1_list = ["스포츠/레저"]    

In [20]:
long_percentile_cat1_conditions = " OR ".join([f"(cat1 = '{key}' AND long_term_recency_score>={values[3]})" for key, values in long_quantile_tables_cat1.items() if cat_dict[key] in cat1_list])
short_percentile_cat1_conditions = " OR ".join([f"(cat1 = '{key}' AND long_term_recency_score>={values[3]})" for key, values in short_quantile_tables_cat1.items() if cat_dict[key] in cat1_list])

In [21]:
cat1_query = f"""
    SELECT  *,
            "long" as pref
    FROM {temp_table}
    WHERE ({long_percentile_cat1_conditions})
    
    UNION ALL
    SELECT  *,
            "short" as pref
    FROM {temp_table}
    WHERE ({short_percentile_cat1_conditions})
"""

In [22]:
temp_table = f'{project_id}.adot_reco_dev.jh_tdeal_cat1_temp'

In [23]:
job_config = QueryJobConfig()
job_config.destination = temp_table
job_config.write_disposition = 'WRITE_TRUNCATE'

In [24]:
# 절대 비교 프로파일 추출 로직
query_job = bq_client.query(cat1_query, job_config=job_config)
query_job.result() 

<google.cloud.bigquery.table.RowIterator at 0x7fea20ade040>

In [25]:
#cat1_table = bq_to_pandas(query)

# CAT2

In [26]:
# 상대 비교 프로파일 추출 로직
cat2_query = f"""
WITH params AS (
  SELECT
    {long_duration/2} AS longterm_decay_param, 
    {short_duration/2} AS shortterm_decay_param
),

BUY_TABLE AS (
    SELECT distinct svc_mgmt_num, 
                    luna_id,
                    cat2, 
                    is_weekend, 
                    dt
    
    FROM {db_name}.{table_nm}
    WHERE dt <= '{end_dt}' 
    AND dt >= '{start_dt}'
    AND cat3='buy'
    AND ( {tdeal_where_clause} )
),

USER_PURCHASE_TABLE AS (

    SELECT  *
           
    FROM (
        SELECT  svc_mgmt_num,
                luna_id,
                cat2,
                ARRAY_AGG(dt ORDER BY dt asc) as purchase_date
        FROM BUY_TABLE
        group by svc_mgmt_num, luna_id, cat2
    )
),
unnested_purchases AS (
  SELECT
    svc_mgmt_num,
    luna_id,
    cat2,
    purchase_date
    
  FROM USER_PURCHASE_TABLE, UNNEST(purchase_date) AS purchase_date
),

purchase_metrics AS (
    SELECT  svc_mgmt_num,
            luna_id,
            cat2,
            COUNT(*) AS frequency,
            ARRAY_AGG(DATE_DIFF(PARSE_DATE('%Y-%m-%d', '{end_dt}'), purchase_date, DAY) ORDER BY DATE_DIFF(PARSE_DATE('%Y-%m-%d', '{end_dt}'), purchase_date, DAY) DESC) AS days_since_purchases
           
  FROM unnested_purchases
  GROUP BY svc_mgmt_num, luna_id, cat2
),

interest_score AS (
  SELECT
    svc_mgmt_num,
    luna_id,
    cat2,
    frequency,
    (
      SELECT SUM(EXP(-1 * day / longterm_decay_param))  -- Exponential decay with 60-day half-life
      FROM UNNEST(days_since_purchases) day
    ) AS long_term_recency_score,
    (
      SELECT SUM(EXP(-1 * day / shortterm_decay_param))  
      FROM UNNEST(days_since_purchases) day
    ) AS short_term_recency_score,
  FROM purchase_metrics, params
)

SELECT *
FROM interest_score
"""

In [27]:
temp_table = f'{project_id}.adot_reco_dev.jh_tdeal_eda_temp'
job_config = QueryJobConfig()
job_config.destination = temp_table
job_config.write_disposition = 'WRITE_TRUNCATE'

In [28]:
# 절대 비교 프로파일 추출 로직
query_job = bq_client.query(cat2_query, job_config=job_config)
query_job.result() 

<google.cloud.bigquery.table.RowIterator at 0x7fea20c1c7f0>

In [29]:
query = f"""
SELECT  cat2,
        APPROX_QUANTILES(long_term_recency_score, 4) as long_percentile_list,
        APPROX_QUANTILES(short_term_recency_score, 4) as short_percentile_list,
FROM {temp_table}
GROUP BY cat2
"""

In [30]:
percentile_table_cat2 = bq_to_pandas(query)

query: 
SELECT  cat2,
        APPROX_QUANTILES(long_term_recency_score, 4) as long_percentile_list,
        APPROX_QUANTILES(short_term_recency_score, 4) as short_percentile_list,
FROM skt-datahub.adot_reco_dev.jh_tdeal_eda_temp
GROUP BY cat2

destination: skt-datahub._775c5ccab1096b3cccd7ac34a5db11c0a354fb07.anon84e02edae9baa20190cb0062c7dd6025e833918657cb0247fceacd2b4fe87158
total_rows: 7
slot_secs: 0.036

Downloading: 100%|[32m██████████[0m|


In [31]:
long_quantile_tables_cat2 = percentile_table_cat2.set_index('cat2').to_dict()['long_percentile_list']
short_quantile_tables_cat2 = percentile_table_cat2.set_index('cat2').to_dict()['short_percentile_list']

In [32]:
#cat2list = [cat2 for cat2 in set(cat_dict.values()) if cat2 not in cat1_list]
cat2list = list(set(cat_dict.values()))

In [33]:
user_state_list = ["반려동물용품", "출산/육아"]

In [34]:
long_percentile_conditions = " OR ".join([f"(cat2 = '{key}' AND long_term_recency_score>={values[3]})" if key not in user_state_list else f"(cat2 = '{key}' AND long_term_recency_score>={values[1]})"  for key, values in long_quantile_tables_cat2.items()])
short_percentile_conditions = " OR ".join([f"(cat2 = '{key}' AND short_term_recency_score>={values[3]})" if key not in user_state_list else f"(cat2 = '{key}' AND short_term_recency_score>={values[1]})"  for key, values in short_quantile_tables_cat2.items()])

In [35]:
cat2_query = f"""
    SELECT  *,
            "long" as pref
    FROM {temp_table}
    WHERE ({long_percentile_conditions})
    
    UNION ALL
    SELECT  *,
            "short" as pref
    FROM {temp_table}
    WHERE ({short_percentile_conditions})
"""

In [36]:
temp_table = f'{project_id}.adot_reco_dev.jh_tdeal_cat2_temp'

In [37]:
job_config = QueryJobConfig()
job_config.destination = temp_table
job_config.write_disposition = 'WRITE_TRUNCATE'

In [38]:
# 절대 비교 프로파일 추출 로직
query_job = bq_client.query(cat2_query, job_config=job_config)
query_job.result() 

<google.cloud.bigquery.table.RowIterator at 0x7fea20fa7fa0>

In [40]:
query = f"""
WITH User_state_tables AS(
    SELECT  svc_mgmt_num,
            luna_id,
            ARRAY_AGG(user_state) as user_state,
    FROM (
        SELECT distinct svc_mgmt_num,
                        luna_id,
                        CASE
                            WHEN cat2 = "출산/육아" THEN '육아/아기'
                            ELSE '반려동물'
                        END AS user_state

        FROM skt-datahub.adot_reco_dev.jh_tdeal_cat2_temp
        WHERE cat2 in ("출산/육아", "반려동물용품")
    )
    GROUP BY svc_mgmt_num, luna_id
),

Long_User_preference_tables AS (
        SELECT  svc_mgmt_num,
                luna_id,
                ARRAY_AGG(long_prefered_domain) as long_prefered_domain,
        FROM (
            SELECT distinct  svc_mgmt_num,
                             luna_id,
                             cat2 as long_prefered_domain

            FROM skt-datahub.adot_reco_dev.jh_tdeal_cat2_temp
            WHERE cat2 not in ("출산/육아", "반려동물용품") and pref='long'

            UNION ALL

            SELECT distinct  svc_mgmt_num,
                             luna_id,
                             cat1 as long_prefered_domain

            FROM skt-datahub.adot_reco_dev.jh_tdeal_cat1_temp
            WHERE pref='long'
        ) 
        GROUP BY svc_mgmt_num, luna_id
),

Short_User_preference_tables AS (

    SELECT *
    FROM (
        SELECT  svc_mgmt_num,
                luna_id,
                ARRAY_AGG(short_prefered_domain) as short_prefered_domain,
        FROM (
            SELECT distinct  svc_mgmt_num,
                             luna_id,
                             cat2 as short_prefered_domain

            FROM skt-datahub.adot_reco_dev.jh_tdeal_cat2_temp
            WHERE cat2 not in ("출산/육아", "반려동물용품") and pref='short'

            UNION ALL

            SELECT distinct  svc_mgmt_num,
                             luna_id,
                             cat1 as short_prefered_domain

            FROM skt-datahub.adot_reco_dev.jh_tdeal_cat1_temp
            WHERE pref='short'
        ) 
        GROUP BY svc_mgmt_num, luna_id
    )
)

SELECT  A.*,
        B.user_state
FROM (
    SELECT 
      COALESCE(t1.svc_mgmt_num, t2.svc_mgmt_num) AS svc_mgmt_num,
      COALESCE(t1.luna_id, t2.luna_id) AS luna_id,
      t1.long_prefered_domain,
      t2.short_prefered_domain
    FROM Long_User_preference_tables t1
    FULL OUTER JOIN Short_User_preference_tables t2
    ON t1.svc_mgmt_num = t2.svc_mgmt_num
) AS A

LEFT JOIN (
    SELECT *
    FROM User_state_tables
)AS B
ON A.svc_mgmt_num = B.svc_mgmt_num



"""

In [146]:
#bq_insert_overwrite(sql=query, destination=f'{PROJECT_ID}.adot_reco_dev.temp_jh_tdeal_profile')

In [41]:
test = bq_to_pandas(query)

query: 
WITH User_state_tables AS(
    SELECT  svc_mgmt_num,
            luna_id,
            ARRAY_AGG(user_state) as user_state,
    FROM (
        SELECT distinct svc_mgmt_num,
                        luna_id,
                        CASE
                            WHEN cat2 = "출산/육아" THEN '육아/아기'
                            ELSE '반려동물'
                        END AS user_state

        FROM skt-datahub.adot_reco_dev.jh_tdeal_cat2_temp
        WHERE cat2 in ("출산/육아", "반려동물용품")
    )
    GROUP BY svc_mgmt_num, luna_id
),

Long_User_preference_tables AS (
        SELECT  svc_mgmt_num,
                luna_id,
                ARRAY_AGG(long_prefered_domain) as long_prefered_domain,
        FROM (
            SELECT distinct  svc_mgmt_num,
                             luna_id,
                             cat2 as long_prefered_domain

            FROM skt-datahub.adot_reco_dev.jh_tdeal_cat2_temp
            WHERE cat2 not in ("출산/육아", "반려동물용품") and pref='long'

            UNION ALL

 

In [43]:
test.iloc[0].svc_mgmt_num

'30f5259c4d7d983b358ebaeb689c73f891a938691d8ab9e36e3822afe2ca8523'

In [154]:
#test.loc[test.svc_mgmt_num=='6528f758031544ec584ccb544c78c792b1aeb20be438543eb2de094c7e7fa260']

In [139]:
test.svc_mgmt_num.nunique()

49392

In [142]:
test.iloc[2].svc_mgmt_num

'4f68215c5dbc9183a3a7a0668384da4231b72915764bf21d8ed0d90b9c486cbc'

In [137]:
import pandas as pd
pd.set_option('display.max_rows', 500)
test.head(200)

Unnamed: 0,svc_mgmt_num,luna_id,long_prefered_domain,short_prefered_domain,user_state
0,c51349437f64adefe4886c48698fe7153bf907a1f7a5de...,APL00000CSWTZ0KLOJY8,"[골프, 스포츠/레저]","[골프, 스포츠액세서리, 건강식품, 스포츠/레저]",[]
1,29c00497f53ec2f721faf993b6672d5a0b3e178d608751...,,"[골프, 스포츠액세서리, 스포츠/레저, 화장품/미용]","[골프, 스포츠액세서리, 스포츠/레저, 화장품/미용]",[]
2,4f68215c5dbc9183a3a7a0668384da4231b72915764bf2...,APL00000D1MNRYE7BRPC,"[골프, 건강식품, 스포츠/레저]","[골프, 헬스, 건강식품, 스포츠/레저]",[]
3,dcfb12112f1043bd335ebac7e40b3ee8502691c785dd80...,APL00000D2IQKCUD0OAO,"[골프, 스포츠/레저]","[골프, 스포츠/레저]",[]
4,a789e2d2c7eeb3957bd74a038bc8a5b71fca03537300e7...,,"[골프, 기타스포츠용품, 스포츠/레저]","[골프, 기타스포츠용품]",[]
5,7533f7c6ea56e8ede43ea8259409898f2afdf74c7946a5...,APL00000DLODND3WREV4,"[골프, 스포츠/레저]","[골프, 스포츠/레저]",[]
6,a25ef61e470f1148c094998db46bd939859dc833f4a96c...,,"[골프, 스포츠/레저]","[골프, 스포츠/레저]",[]
7,0d304c4af4f074d7c77d53796569fd1511a2949270b36f...,,"[골프, 스포츠/레저]","[골프, 스포츠/레저]",[]
8,a6932e456ad79d491af61e129dc0a8771b0b8b92681d70...,,"[골프, 스포츠/레저]",[골프],[]
9,9d891f8c359c0eea02b4b2f8f3df5ce0f4bbcfad22f0f6...,,[골프],[골프],[]


# drop temp tables

In [203]:
table_names = [f"{project_id}.adot_reco_dev.jh_tdeal_cat2_temp", f"{project_id}.adot_reco_dev.jh_tdeal_cat1_temp", f"{project_id}.adot_reco_dev.jh_tdeal_item_temp", f"{project_id}.adot_reco_dev.jh_tdeal_eda_temp"]

In [204]:
for tb in table_names:
    query = f"""
    Drop table if exists {tb}
    """
    bq_client.query(query).result()