In [1]:
from skt.gcp import (
    PROJECT_ID,
    bq_insert_overwrite,
    bq_to_df,
    bq_to_pandas,
    get_bigquery_client,
    bq_table_exists,
    get_max_part,
    load_query_result_to_table,
    pandas_to_bq,
    pandas_to_bq_table,
    load_bigquery_ipython_magic,
    get_bigquery_client,
    _print_query_job_results,
    load_query_result_to_partitions
    
)

from skt.ye import (
    get_hdfs_conn,
    get_spark,
    hive_execute,
    hive_to_pandas,
    pandas_to_parquet,
    slack_send,
    get_secrets
)

In [2]:
from datetime import datetime, timedelta

In [91]:
current_dt = '2024-07-08'
ttl = 30
state = 'stg'

In [92]:
execution_dt = datetime.strptime(current_dt, '%Y-%m-%d')
execution_dt_one_ago = (execution_dt - timedelta(days=1))
lag_current_dt= execution_dt_one_ago.strftime('%Y-%m-%d')
ttl = int(ttl)
print(f'execution_dt: {current_dt}')
print(f"state: {state}")
print(f"ttl: {ttl}")

execution_dt: 2024-07-08
state: stg
ttl: 30


In [106]:
""" no partition data in comm (snapshot) """
PROJECT_ID = "skt-datahub"
db_name  = "comm"
tmbr_meta_tbl= "mp_taxonomies_brand" 
tmbr_meta_map_tbl = "mp_taxonomies_brandMapp"

In [126]:
temp_db = 'temp_1d'

In [107]:
save_db_name = 'adot_reco_dev'

In [108]:
bq_clinet = get_bigquery_client()

In [159]:
## Meta table 전처리 로직: 중복 product 처리 (제거 x -> 메타 합산)
query = f"""

WITH META_TABLE AS (
    SELECT  brand_id,
            count(distinct brand_name) over(partition by COALESCE(TRIM(LOWER(brand_name)), '')) as brand_cnt,
            COALESCE(TRIM(LOWER(brand_name)), '') as brand_name,
            del_yn,
            display_name
            prm_brand_name,
            categories,
            COALESCE(TRIM(LOWER(category_large_name)), '') as category_large_name,
            COALESCE(TRIM(LOWER(category_medium_name)), '') as category_medium_name,
            COALESCE(TRIM(LOWER(category_small_name)), '') as category_small_name,
            COALESCE(TRIM(LOWER(description)), '') as description

    FROM    {db_name}.{tmbr_meta_tbl}
    WHERE brand_name is not null and brand_name !='' and LOWER(brand_name) NOT LIKE '%test%'
),

DUPLICATED_BRAND_TABLE AS(
    SELECT  brand_name, 
            ARRAY_AGG(category_large_name) as category_large_names,
            ARRAY_AGG(category_medium_name) as category_medium_names,
            ARRAY_AGG(category_small_name) as category_small_names,
            ARRAY_AGG(description) as descriptions,

    FROM (
        SELECT  brand_name,
                category_large_name,
                category_medium_name,
                category_small_name, 
                description

        FROM META_TABLE
        WHERE brand_name !='' AND brand_cnt > 1 
    )
    GROUP BY brand_name
),

NON_DUPLICATED_BRAND_TABLE AS(
    SELECT  brand_name,
            category_large_name,
            category_medium_name,
            category_small_name, 
            description

    FROM META_TABLE
    WHERE brand_name !='' AND brand_cnt = 1
),

MERGED_DUPLICATE_BRAND_TABLE AS(
    SELECT  brand_name,
            ARRAY_TO_STRING(category_large_names, ',') as category_large_name,
            ARRAY_TO_STRING(category_medium_names, ',') as category_medium_name,
            ARRAY_TO_STRING(category_small_names, ',') as category_small_name,
            ARRAY_TO_STRING(descriptions, ',') as description,
    FROM(
        SELECT  brand_name,
                ARRAY(
                    SELECT category_large_name
                    FROM UNNEST(category_large_names) AS category_large_name 
                    WHERE category_large_name != ''
                  ) AS category_large_names,
                ARRAY(
                    SELECT category_medium_name 
                    FROM UNNEST(category_medium_names) AS category_medium_name 
                    WHERE category_medium_name != ''
                  ) AS category_medium_names,

                ARRAY(
                    SELECT category_small_name
                    FROM UNNEST(category_small_names) AS category_small_name
                    WHERE category_small_name != ''
                  ) AS category_small_names,
                ARRAY(
                    SELECT description
                    FROM UNNEST(descriptions) AS description
                    WHERE description != ''
                  ) AS descriptions     

        FROM DUPLICATED_BRAND_TABLE
    )
)
SELECT  *,
        PARSE_DATE('%Y-%m-%d', '{current_dt}') as dt 
FROM (
    SELECT *,

    FROM MERGED_DUPLICATE_BRAND_TABLE

    UNION ALL

    SELECT *
    FROM NON_DUPLICATED_BRAND_TABLE
) AS A
"""


In [160]:
from google.cloud.bigquery.job import QueryJobConfig

In [161]:
job_config = QueryJobConfig()
temp_table = f'{PROJECT_ID}.{temp_db}.tmbr_temp_table'
job_config.destination = temp_table
job_config.write_disposition = 'WRITE_TRUNCATE'

In [162]:
# 임시 테이블 생성
query_job = bq_clinet.query(query, job_config=job_config)
query_job.result() 

<google.cloud.bigquery.table.RowIterator at 0x7f0abf0047c0>

In [176]:
query = f"""
WITH META_TABLE AS (
    SELECT  *
    FROM {temp_table}
    WHERE dt = '{current_dt}'
),

SUB_WORD_TABLE AS (

    SELECT  brand_word,
            brand_name,
            category_large_name,
            category_medium_name,
            category_small_name,
            description,
            count(brand_word) over(partition by brand_word) as word_cnt,
            dt
    FROM (
        SELECT  category_large_name,
                category_medium_name,
                category_small_name,
                description,
                brand_name,
                brand_word,
                dt
        FROM META_TABLE,
        UNNEST(SPLIT(brand_name, ' ')) AS brand_word
    )
),
tmbr_etymology_table AS (
    SELECT  distinct A.*,
            B.del_yn
    FROM (

        SELECT distinct brand_name,
               category_large_name,
               category_medium_name,
               category_small_name,
               description,
               CASE 
                   WHEN category_medium_name !='' OR category_small_name !='' THEN 'N'
                   ELSE 'Y'
               END AS is_empty,
               dt

        FROM SUB_WORD_TABLE
        WHERE brand_name = brand_word
    )AS A
    LEFT JOIN  (
        SELECT  distinct COALESCE(TRIM(LOWER(brand_name)), '') as brand_name,
                del_yn

        FROM  {db_name}.{tmbr_meta_tbl}
        WHERE brand_name is not null and brand_name !=''
    ) AS B
    ON A.brand_name = B.brand_name
),

derivative_table AS(
    SELECT  brand_word,
            ARRAY_TO_STRING(ARRAY_AGG(brand_name), '||') as derivative_brands
    FROM(
        SELECT  distinct brand_word,
                         brand_name
        FROM SUB_WORD_TABLE
        WHERE brand_word in (SELECT distinct brand_name FROM tmbr_etymology_table) 
        AND brand_name != brand_word AND brand_word !=''
    )
    GROUP BY brand_word
)

SELECT distinct *
FROM (
    SELECT  A.*,
            DENSE_RANK() OVER (ORDER BY A.brand_name) AS new_brand_id,
            B.derivative_brands

    FROM tmbr_etymology_table AS A

    LEFT JOIN (
        SELECT  brand_word,
                derivative_brands
        FROM derivative_table
    ) AS B
    ON A.brand_name = B.brand_word
)
"""

In [177]:
job_config = QueryJobConfig()
temp_etymology_table = f'{PROJECT_ID}.{temp_db}.tmbr_etymology_table'
job_config.destination = temp_etymology_table
job_config.write_disposition = 'WRITE_TRUNCATE'

In [178]:
query_job = bq_clinet.query(query, job_config=job_config)
query_job.result() 

<google.cloud.bigquery.table.RowIterator at 0x7f0abed5eeb0>

## Category mapping

In [179]:
query = f"""
WITH META_TABLE AS (
    SELECT distinct *
    FROM {temp_table}
    WHERE dt = '{current_dt}'
),
TEMP_TABLE AS (
    SELECT  brand_name,
            category_large_name,
            CASE
                WHEN category_medium_name = '' AND category_small_name = '' THEN ''
                WHEN category_medium_name = '' THEN category_small_name
                WHEN category_small_name = '' THEN category_medium_name
                WHEN category_medium_name = category_small_name THEN category_medium_name
                WHEN STRPOS(category_medium_name, category_small_name) > 0 THEN category_small_name
                ELSE CONCAT(category_medium_name, ',', category_small_name)
            END AS categories,
            description,
            dt
    FROM  META_TABLE
)

SELECT *
FROM TEMP_TABLE 
WHERE categories !=''
"""

In [180]:
categories_mapping_df = bq_to_pandas(query)

unsupported operand type(s) for /: 'NoneType' and 'int'
Downloading: 100%|[32m██████████[0m|


In [181]:
categories_mapping = categories_mapping_df.set_index('brand_name').to_dict()['categories']

In [182]:
import json
categories_mapping_json = json.dumps(categories_mapping, ensure_ascii=False)

In [199]:
query = f"""
CREATE TEMP FUNCTION get_categories(brand_name STRING)
RETURNS STRING
LANGUAGE js AS '''
  var root_product_mapping_json_set = {categories_mapping_json};
  return (brand_name in root_product_mapping_json_set) ? root_product_mapping_json_set[brand_name] : '';
''';

WITH META_TABLE AS (
    SELECT distinct *
    FROM {temp_table}
    WHERE dt = '{current_dt}'
),

ARRAY_TABLE AS (
    SELECT *
    FROM (
        SELECT  *,
                CASE
                    WHEN category_medium_name = '' AND category_small_name = '' THEN ''
                    WHEN category_medium_name = '' THEN category_small_name
                    WHEN category_small_name = '' THEN category_medium_name
                    WHEN category_medium_name = category_small_name THEN category_medium_name
                    WHEN STRPOS(category_medium_name, category_small_name) > 0 THEN category_small_name
                    ELSE CONCAT(category_medium_name, ',', category_small_name)
                END AS categories,
                SPLIT(derivative_brands, '||') as derivative_brand_array
        FROM {PROJECT_ID}.{temp_db}.tmbr_etymology_table
    )
),
exploded_data AS (
  SELECT 
    brand_name,
    description,
    is_empty,
    del_yn,
    categories,
    derivative_brand_element
  FROM ARRAY_TABLE,
  UNNEST(derivative_brand_array) AS derivative_brand_element
),

MERGED_TABLE AS (
    SELECT distinct brand_name,
                    CASE
                        WHEN categories = '' AND categories_from_derivative = '' THEN ''
                        WHEN categories = '' AND categories_from_derivative != '' THEN categories_from_derivative
                        ELSE categories
                    END AS categories,
                    description

    FROM (
        SELECT brand_name,
               description,
               categories,
               get_categories(derivative_brand_element) AS categories_from_derivative,
               del_yn
        FROM exploded_data
    )
    WHERE categories_from_derivative!='' OR categories!=''
)

SELECT *
FROM (
    SELECT  brand_name,
            CASE
                WHEN category_medium_name = '' AND category_small_name = '' THEN ''
                WHEN category_medium_name = '' THEN category_small_name
                WHEN category_small_name = '' THEN category_medium_name
                WHEN category_medium_name = category_small_name THEN category_medium_name
                WHEN STRPOS(category_medium_name, category_small_name) > 0 THEN category_small_name
                ELSE CONCAT(category_medium_name, ',', category_small_name)
            END AS categories,
            description,
            PARSE_DATE('%Y-%m-%d', '{current_dt}') as dt 
    FROM  META_TABLE
    WHERE brand_name not in (SELECT distinct brand_name FROM MERGED_TABLE)
)
WHERE categories!=''

UNION ALL
SELECT  *,
        PARSE_DATE('%Y-%m-%d', '{current_dt}') as dt 
FROM MERGED_TABLE
WHERE categories !=''
"""

In [200]:
job_config = QueryJobConfig()
temp_meta_preprocessed_table = f'{PROJECT_ID}.{temp_db}.tmbr_meta_preprocessed_table'
job_config.destination = temp_meta_preprocessed_table
job_config.write_disposition = 'WRITE_TRUNCATE'

In [201]:
query_job = bq_clinet.query(query, job_config=job_config)
query_job.result() 

<google.cloud.bigquery.table.RowIterator at 0x7f0abeeeda90>

In [192]:
# ttl_query = f"""
# ALTER TABLE
#   {PROJECT_ID}.{save_db_name}.tmbr_preprocessed_meta_table
# SET
#   OPTIONS(partition_expiration_days={ttl})
# """
# bq_clinet.query(ttl_query).result()

# AGGREGATE

In [208]:
query = f"""
WITH category_split_table AS (
  SELECT 
    brand_name,
    SPLIT(categories, ',') as category_array
  FROM {PROJECT_ID}.{temp_db}.tmbr_meta_preprocessed_table
),

exploded_tables AS (
  SELECT 
    brand_name,
    category
  FROM category_split_table, UNNEST(category_array) AS category
),

categories_count_table AS (
  SELECT 
    brand_name,
    category,
    COUNT(*) as category_count
  FROM exploded_tables
  GROUP BY brand_name, category
),

ranked_items AS (
  SELECT 
    brand_name,
    category,
    category_count,
    ROW_NUMBER() OVER (PARTITION BY brand_name ORDER BY category_count DESC) as rank
  FROM categories_count_table
)

SELECT  A.*,
        B.brand_id,
        PARSE_DATE('%Y-%m-%d', '{current_dt}') as dt 
FROM (
    SELECT brand_name,
           ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category), ',') AS categories
    FROM ranked_items
    WHERE rank = 1 or category_count > 1
    GROUP BY brand_name 
) AS A

LEFT JOIN (
    SELECT  *
    FROM  {db_name}.{tmbr_meta_tbl}
) AS B
ON A.brand_name = TRIM(LOWER(B.brand_name))
"""

In [209]:
bq_insert_overwrite(sql=query, destination=f'{PROJECT_ID}.{save_db_name}.tmbr_meta_mapping_tbl', partition='dt')

query: 
WITH category_split_table AS (
  SELECT 
    brand_name,
    SPLIT(categories, ',') as category_array
  FROM skt-datahub.temp_1d.tmbr_meta_preprocessed_table
),

exploded_tables AS (
  SELECT 
    brand_name,
    category
  FROM category_split_table, UNNEST(category_array) AS category
),

categories_count_table AS (
  SELECT 
    brand_name,
    category,
    COUNT(*) as category_count
  FROM exploded_tables
  GROUP BY brand_name, category
),

ranked_items AS (
  SELECT 
    brand_name,
    category,
    category_count,
    ROW_NUMBER() OVER (PARTITION BY brand_name ORDER BY category_count DESC) as rank
  FROM categories_count_table
)

SELECT  A.*,
        B.brand_id,
        PARSE_DATE('%Y-%m-%d', '2024-07-08') as dt 
FROM (
    SELECT brand_name,
           ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category), ',') AS categories
    FROM ranked_items
    WHERE rank = 1 or category_count > 1
    GROUP BY brand_name 
) AS A

LEFT JOIN (
    SELECT  *
    FROM  comm.mp_taxonomies_brand


## LOG JOIN

In [7]:
table = "recgpt_log_sequence_lag_daily_prd"

In [8]:
start_dt ='2024-05-01'

In [9]:
query = f"""
WITH LOG_MERGED_TABLE AS(
    SELECT  A.*,
            COALESCE(B.categories, '') as categories
    FROM (
    SELECT distinct COALESCE(REPLACE(LOWER(item),' ', ''), '') as item
    FROM adot_reco.recgpt_log_sequence_lag_daily_prd
    WHERE dt > '{start_dt}' and type ='tmbr'
    ) AS A

    LEFT JOIN (
    SELECT  COALESCE(REPLACE(LOWER(brand_name),' ', ''), '') as brand_name,
            categories, 
            description,
            dt
    FROM {PROJECT_ID}.{save_db_name}.tmbr_preprocessed_meta_table
    ) AS B

    ON A.item = b.brand_name
),

NO_CATEGORY_TABLE AS (
    SELECT  A.*,
            COALESCE(B.categories, '') as categories
    FROM (
        SELECT distinct item
        FROM LOG_MERGED_TABLE
        WHERE categories ='' and item !=''
    )AS A
    LEFT JOIN (
        SELECT distinct brand_name, categories
        FROM {PROJECT_ID}.{save_db_name}.tmbr_preprocessed_meta_table
    ) AS B
    ON A.item LIKE CONCAT('%', B.brand_name, '%') 
        
)

SELECT *
FROM LOG_MERGED_TABLE
WHERE categories !='' and item !=''

UNION ALL

SELECT * 
FROM NO_CATEGORY_TABLE

"""

In [10]:
test = bq_to_pandas(query)

query: 
WITH LOG_MERGED_TABLE AS(
    SELECT  A.*,
            COALESCE(B.categories, '') as categories
    FROM (
    SELECT distinct COALESCE(REPLACE(LOWER(item),' ', ''), '') as item
    FROM adot_reco.recgpt_log_sequence_lag_daily_prd
    WHERE dt > '2024-05-01' and type ='tmbr'
    ) AS A

    LEFT JOIN (
    SELECT  COALESCE(REPLACE(LOWER(brand_name),' ', ''), '') as brand_name,
            categories, 
            description,
            dt
    FROM skt-datahub.adot_reco_dev.tmbr_preprocessed_meta_table
    ) AS B

    ON A.item = b.brand_name
),

NO_CATEGORY_TABLE AS (
    SELECT  A.*,
            COALESCE(B.categories, '') as categories
    FROM (
        SELECT distinct item
        FROM LOG_MERGED_TABLE
        WHERE categories ='' and item !=''
    )AS A
    LEFT JOIN (
        SELECT distinct brand_name, categories
        FROM skt-datahub.adot_reco_dev.tmbr_preprocessed_meta_table
    ) AS B
    ON A.item LIKE CONCAT('%', B.brand_name, '%') 
        
)

SELECT *
FROM LOG

In [14]:
import pandas as pd
pd.set_option('display.max_rows', 500) 

In [15]:
test.loc[test.categories!='']

Unnamed: 0,item,categories
4,할리스커피,카페
5,개별(10/40)베이커리,베이커리
6,cu(0day),편의점
7,공차코리아,카페
8,아침고요수목원가족동물원,테마파크
12,sk텔레콤(주),통신
17,타이드스퀘어투어비스,여행
18,부천아쿠아리움,테마파크
19,cu(0day),베이커리
20,도미노피자,피자


In [12]:
test.loc[test.categories=='']

Unnamed: 0,item,categories
0,더플레이스,
1,안경매니져,
2,skm&s,
3,나뚜루,
9,포도뮤지엄,
10,아웃백스테이크,
11,룰루메딕,
13,배달의민족,
14,t스마트오더,
15,sk브로드밴드btv케이블,
