In [1]:
from skt.gcp import (
    PROJECT_ID,
    bq_insert_overwrite,
    bq_to_df,
    bq_to_pandas,
    get_bigquery_client,
    bq_table_exists,
    get_max_part,
    load_query_result_to_table,
    pandas_to_bq,
    pandas_to_bq_table,
    load_bigquery_ipython_magic,
    get_bigquery_client,
    _print_query_job_results,
    load_query_result_to_partitions
    
)

from skt.ye import (
    get_hdfs_conn,
    get_spark,
    hive_execute,
    hive_to_pandas,
    pandas_to_parquet,
    slack_send,
    get_secrets
)

In [None]:
from datetime import datetime, timedelta

In [None]:
# `current_dt` is not assigned in any cell visible here; presumably it is injected
# as a 'YYYY-MM-DD' run-date parameter before this cell runs — TODO confirm.
execution_dt = datetime.strptime(current_dt, '%Y-%m-%d')
# Step back one day and render the result in the same 'YYYY-MM-DD' format.
execution_dt_one_ago = execution_dt + timedelta(days=-1)
lag_current_dt = f'{execution_dt_one_ago:%Y-%m-%d}'
# NOTE: the label reads "execution_dt" but the value printed is the one-day-lagged date.
print(f'execution_dt: {lag_current_dt}')

In [95]:
""" no partition data in comm (snapshot) """
# Source location of the brand taxonomy tables.
# NOTE: this rebinds the PROJECT_ID name that was imported from skt.gcp above.
PROJECT_ID = "skt-datahub"
db_name = "comm"
# Brand metadata table and its mapping table (the mapping table name is not
# referenced by any cell visible here).
tmbr_meta_tbl = "mp_taxonomies_brand"
tmbr_meta_map_tbl = "mp_taxonomies_brandMapp"

In [None]:
# Dataset name for saved outputs.
# NOTE(review): no cell visible here reads this variable — confirm whether the
# final load was meant to target it.
save_db_name = "adot_reco_dev"

In [None]:
# Shared BigQuery client for the query jobs submitted below.
bq_client = get_bigquery_client()
# Backward-compatible alias: later cells still use the original misspelled name.
bq_clinet = bq_client

In [283]:
## Meta table preprocessing: duplicated products are not removed; their metadata is merged.
#
# Steps performed by the query:
#   1. META_TABLE normalises text columns with TRIM(LOWER(...)) and counts how many
#      rows share each normalised brand name (brand_cnt).
#   2. Brands with brand_cnt > 1 are grouped; their category/description values are
#      aggregated, empty strings are filtered out, and the rest are comma-joined.
#   3. Brands with brand_cnt = 1 pass through unchanged.
#   4. Both sets are unioned, given a dense-rank id (new_brand_id) ordered by
#      brand_name, and stamped with dt = current_dt.
#
# Fixes relative to the previous version:
#   * brand_cnt used COUNT(DISTINCT brand_name) over the normalised-name partition,
#     i.e. it counted distinct raw spellings. Rows with exactly the same raw name
#     therefore got brand_cnt = 1 and were emitted as separate, unmerged rows.
#     COUNT(*) counts every row sharing the normalised name.
#   * ARRAY_AGG now uses DISTINCT so a merged field does not repeat the same value
#     (e.g. "eat,eat").
#
# NOTE(review): `display_name` used to be followed directly by `prm_brand_name`
# without a comma, which BigQuery parses as a column alias. The alias is now written
# explicitly (same result); neither name is read downstream. Confirm whether two
# separate columns were intended.
query = f"""

WITH META_TABLE AS (
    SELECT  brand_id,
            COUNT(*) over(partition by COALESCE(TRIM(LOWER(brand_name)), '')) as brand_cnt,
            COALESCE(TRIM(LOWER(brand_name)), '') as brand_name,
            del_yn,
            display_name as prm_brand_name,
            categories,
            COALESCE(TRIM(LOWER(category_large_name)), '') as category_large_name,
            COALESCE(TRIM(LOWER(category_medium_name)), '') as category_medium_name,
            COALESCE(TRIM(LOWER(category_small_name)), '') as category_small_name,
            COALESCE(TRIM(LOWER(description)), '') as description

    FROM    {db_name}.{tmbr_meta_tbl}
    WHERE brand_name is not null and brand_name !=''
),

DUPLICATED_BRAND_TABLE AS(
    SELECT  brand_name,
            ARRAY_AGG(DISTINCT category_large_name) as category_large_names,
            ARRAY_AGG(DISTINCT category_medium_name) as category_medium_names,
            ARRAY_AGG(DISTINCT category_small_name) as category_small_names,
            ARRAY_AGG(DISTINCT description) as descriptions,

    FROM (
        SELECT  brand_name,
                category_large_name,
                category_medium_name,
                category_small_name,
                description

        FROM META_TABLE
        WHERE brand_name !='' AND brand_cnt > 1
    )
    GROUP BY brand_name
),

NON_DUPLICATED_BRAND_TABLE AS(
    SELECT  brand_name,
            category_large_name,
            category_medium_name,
            category_small_name,
            description

    FROM META_TABLE
    WHERE brand_name !='' AND brand_cnt = 1
),

MERGED_DUPLICATE_BRAND_TABLE AS(
    SELECT  brand_name,
            ARRAY_TO_STRING(category_large_names, ',') as category_large_name,
            ARRAY_TO_STRING(category_medium_names, ',') as category_medium_name,
            ARRAY_TO_STRING(category_small_names, ',') as category_small_name,
            ARRAY_TO_STRING(descriptions, ',') as description,
    FROM(
        SELECT  brand_name,
                ARRAY(
                    SELECT category_large_name
                    FROM UNNEST(category_large_names) AS category_large_name
                    WHERE category_large_name != ''
                  ) AS category_large_names,
                ARRAY(
                    SELECT category_medium_name
                    FROM UNNEST(category_medium_names) AS category_medium_name
                    WHERE category_medium_name != ''
                  ) AS category_medium_names,

                ARRAY(
                    SELECT category_small_name
                    FROM UNNEST(category_small_names) AS category_small_name
                    WHERE category_small_name != ''
                  ) AS category_small_names,
                ARRAY(
                    SELECT description
                    FROM UNNEST(descriptions) AS description
                    WHERE description != ''
                  ) AS descriptions

        FROM DUPLICATED_BRAND_TABLE
    )
)
SELECT  *,
        DENSE_RANK() OVER (ORDER BY brand_name) AS new_brand_id,
        PARSE_DATE('%Y-%m-%d', '{current_dt}') as dt
FROM (
    SELECT *,

    FROM MERGED_DUPLICATE_BRAND_TABLE

    UNION ALL

    SELECT *
    FROM NON_DUPLICATED_BRAND_TABLE
) AS A
"""


In [285]:
from google.cloud.bigquery.job import QueryJobConfig

In [286]:
# Fully qualified name of the scratch table that receives the preprocessing result.
temp_table = f'{PROJECT_ID}.temp_1d.tmbr_temp_table'
# Write the query result into that table, replacing whatever it held before.
job_config = QueryJobConfig(
    destination=temp_table,
    write_disposition='WRITE_TRUNCATE',
)

In [287]:
# Create the temporary table: submit the query with the destination-table job
# config, then wait for the job to finish.
query_job = bq_clinet.query(query=query, job_config=job_config)
query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x7f797eefc340>

In [318]:
# Proper-noun (root brand) extraction logic.
#
# What the query does:
#   * META_TABLE: rows of the temporary table whose dt equals current_dt.
#   * SUB_WORD_TABLE: every brand_name split on single spaces, one row per word
#     (brand_word). word_cnt is computed here but not read by the later CTEs.
#   * etymology_table: rows where the brand name equals its word, i.e. brand names
#     that contain no space.
#   * derivative_table: rows of other brand names that contain a word equal to one
#     of those space-free brand names.
#   * Result: distinct (root_brand_name, brand_name, new_brand_id) triples,
#     excluding words that are '-', NULL or empty.
# A brand name containing several matching words yields several rows.
query = f"""
WITH META_TABLE AS (
    SELECT  *
    FROM {temp_table}
    WHERE dt = '{current_dt}'
),

SUB_WORD_TABLE AS (

    SELECT  brand_word,
            brand_name,
            new_brand_id,
            count(brand_word) over(partition by brand_word) as word_cnt
    FROM (
        SELECT  new_brand_id,
                brand_name,
                brand_word 
        FROM META_TABLE,
        UNNEST(SPLIT(brand_name, ' ')) AS brand_word
    )
),

etymology_table AS(
    SELECT * 
    FROM SUB_WORD_TABLE
    WHERE brand_name = brand_word
),

derivative_table AS(
    SELECT * 
    FROM SUB_WORD_TABLE
    WHERE brand_word in (SELECT brand_name FROM etymology_table) 
    AND brand_name != brand_word
)

SELECT distinct brand_word as root_brand_name,
                brand_name, 
                new_brand_id
                
FROM derivative_table
WHERE brand_word !='-' and brand_word is not null and brand_word !=''
"""

In [320]:
# Run the root-brand extraction query and keep its result for the mapping built
# next (the query selects root_brand_name, brand_name and new_brand_id).
represent_product_df = bq_to_pandas(query)

query: 
WITH META_TABLE AS (
    SELECT  *
    FROM skt-datahub.temp_1d.tmbr_temp_table
    WHERE dt = '2024-07-04'
),

SUB_WORD_TABLE AS (

    SELECT  brand_word,
            brand_name,
            new_brand_id,
            count(brand_word) over(partition by brand_word) as word_cnt
    FROM (
        SELECT  new_brand_id,
                brand_name,
                brand_word 
        FROM META_TABLE,
        UNNEST(SPLIT(brand_name, ' ')) AS brand_word
    )
),

etymology_table AS(
    SELECT * 
    FROM SUB_WORD_TABLE
    WHERE brand_name = brand_word
),

derivative_table AS(
    SELECT * 
    FROM SUB_WORD_TABLE
    WHERE brand_word in (SELECT brand_name FROM etymology_table) 
    AND brand_name != brand_word
)

SELECT distinct brand_word as root_brand_name,
                brand_name, 
                new_brand_id
                
FROM derivative_table
WHERE brand_word !='-' and brand_word is not null and brand_word !=''

destination: skt-datahub._775c5ccab1096b3cccd7ac34a5db11

In [321]:
# Build a {brand_name: root_brand_name} lookup. When a brand_name occurs on more
# than one row, the last row wins.
root_product_mapping = (
    represent_product_df
    .set_index('brand_name')['root_brand_name']
    .to_dict()
)

In [322]:
import json
# Serialise the brand -> root mapping to JSON text; later cells interpolate it into
# the JavaScript UDF body. ensure_ascii=False keeps non-ASCII characters unescaped.
root_product_mapping_json = json.dumps(root_product_mapping, ensure_ascii=False)

In [316]:
# Build the staging-table query.
#
#   * convert_to_root: JavaScript UDF that maps a brand name to its root brand via
#     the embedded JSON mapping, returning the name unchanged when it has no entry.
#   * TEMP_TABLE: derives a single `category` per brand from the medium/small
#     category names (empty, one of the two, or both comma-joined).
#   * VALID_TABLE: brands that have a category.
#   * INVALID_TABLE: brands without a category, plus their converted root name.
#   * Result: valid brands as-is (rep_name = brand_name), unioned with
#     category-less brands that inherit the category of their root brand. The
#     INNER JOIN drops category-less brands whose root has no valid row.
#
# Fixes relative to the previous version:
#   * The UDF body is now a raw BigQuery string (r'''...'''). In a non-raw string
#     BigQuery processes backslash escapes before JavaScript sees the text, so JSON
#     escapes emitted by json.dumps (\" \\ \n) were corrupted and could break the
#     function whenever a brand name contained a quote, backslash or newline.
#   * `dt` is carried through to the final result, because the load cell writes this
#     query with partition='dt' and the column was previously dropped.
query = f"""
CREATE TEMP FUNCTION convert_to_root(brand_name STRING)
RETURNS STRING
LANGUAGE js AS r'''
  var root_product_mapping_json_set = {root_product_mapping_json};
  return (brand_name in root_product_mapping_json_set) ? root_product_mapping_json_set[brand_name] : brand_name;
''';

WITH TEMP_TABLE AS (
    SELECT *
    FROM (
        SELECT  brand_name,
                CASE
                    WHEN category_medium_name = '' AND category_small_name = '' THEN ''
                    WHEN category_medium_name = '' THEN category_small_name
                    WHEN category_small_name = '' THEN category_medium_name
                    WHEN category_medium_name = category_small_name THEN category_medium_name
                    WHEN STRPOS(category_medium_name, category_small_name)> 0 THEN category_small_name
                    ELSE CONCAT(category_medium_name, ',', category_small_name)
                END AS category,
                description,
                dt
        FROM {temp_table}
        )
),

VALID_TABLE AS (
    SELECT  brand_name,
            category,
            description,
            dt

    FROM TEMP_TABLE
    WHERE category !='' and category is not null
),

INVALID_TABLE AS (
    SELECT  *,
            convert_to_root(brand_name) as converted_brand_name,
    FROM  TEMP_TABLE
    WHERE category =''
)

SELECT  brand_name,
        category,
        brand_name as rep_name,
        description,
        dt
FROM    VALID_TABLE

UNION ALL

SELECT  A.brand_name,
        B.category,
        A.converted_brand_name as rep_name,
        A.description,
        A.dt

FROM INVALID_TABLE AS A
INNER JOIN
(
    SELECT *
    FROM VALID_TABLE
)AS B
ON A.converted_brand_name = B.brand_name
"""

In [None]:
# Load the staging table FIRST: `query` reads from `temp_table`, so the temporary
# table must still exist while the load runs. (Previously the table was dropped
# before the load, and the call was missing its closing parenthesis.)
# NOTE(review): the destination dataset is `db_name` ('comm', the source dataset),
# while `save_db_name` is defined earlier and never used — confirm the target.
# NOTE(review): partition='dt' presumably requires the query result to expose a
# `dt` column — confirm against the query being loaded.
bq_insert_overwrite(
    sql=query,
    destination=f'{PROJECT_ID}.{db_name}.tmbr_product_table_stg',
    partition='dt',
)

# Only now remove the temporary table; the trailing `;` suppresses the cell output.
drop_query = f"""Drop table if exists {temp_table}"""
bq_cl = get_bigquery_client()
bq_cl.query(drop_query).result();

In [None]:
# Ad-hoc check of the root-brand mapping: for each brand in the temporary table,
# return the name together with the name produced by convert_to_root.
#
# Fixes relative to the previous version:
#   * The filter used `meta_dt`, which no visible cell defines. The temporary table
#     is written with dt = current_dt, so `current_dt` is used instead.
#   * The UDF body is a raw BigQuery string (r'''...''') so JSON escapes in the
#     embedded mapping reach JavaScript unchanged.
#
# NOTE(review): the preceding cell issues DROP TABLE on `temp_table`, so this cell
# can only succeed while that table still exists.
query = f"""
CREATE TEMP FUNCTION convert_to_root(brand_name STRING)
RETURNS STRING
LANGUAGE js AS r'''
  var root_product_mapping_json_set = {root_product_mapping_json};
  return (brand_name in root_product_mapping_json_set) ? root_product_mapping_json_set[brand_name] : brand_name;
''';
SELECT  brand_name,
        convert_to_root(brand_name) as converted_brand_name,
        dt
FROM {temp_table}
WHERE dt = '{current_dt}'
"""
tts = bq_to_pandas(query)

In [255]:
# Show only the brands whose converted (root) name differs from the original name.
tts[tts['brand_name'].ne(tts['converted_brand_name'])]

Unnamed: 0,brand_name,converted_brand_name,category_large_name,category_medium_name,category_small_name,description,dt
9,에이닷 전화 가입하기,에이닷,,,,24년 5월 미션스탬프용 내부관리 브랜드,2024-07-04
25,에이닷 슬립,에이닷,PLAY,,,2023년 11월 0 day,2024-07-04
54,t1 base camp,t1,EAT,베이커리,,T1 BASE CAMP,2024-07-04
58,"필요한 정보, 메모를 에이닷 keep에 등록하기",에이닷,,,,24년 7월 미션스탬프용 내부관리 브랜드,2024-07-04
59,에이닷 캘린더에 일정 등록하기,에이닷,,,,24년 7월 미션스탬프용 내부관리 브랜드,2024-07-04
75,에이닷 전화 걸어보기,에이닷,,,,24년 5월 미션스탬프용 내부관리 브랜드,2024-07-04
77,test brand1,test,,,,Test Brand1Test Brand1Test Brand1Test Brand1,2024-07-04
81,미래소년 코난,코난,PLAY,,,미래소년 코난 0 day,2024-07-04
103,[수정금지] 드롭탑,드롭탑,,,,최고의 장소에서 즐기는 최고의 커피,2024-07-04
139,[수정금지] sk렌터카,sk렌터카,BUY,생활,교통,최상의 경험을 선사하는 No.1 렌터카,2024-07-04
