In [None]:
from skt.gcp import (
    PROJECT_ID,
    bq_insert_overwrite,
    bq_to_df,
    bq_to_pandas,
    get_bigquery_client,
    bq_table_exists,
    get_max_part,
    load_query_result_to_table,
    pandas_to_bq,
    pandas_to_bq_table,
    load_bigquery_ipython_magic,
    get_bigquery_client,
    _print_query_job_results,
    load_query_result_to_partitions
    
)

from skt.ye import (
    get_hdfs_conn,
    get_spark,
    hive_execute,
    hive_to_pandas,
    pandas_to_parquet,
    slack_send,
    get_secrets
)

In [None]:
from google.cloud.bigquery.job import QueryJobConfig

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import (
    row_number, 
    col, 
    lit, 
    count, 
    log, 
    exp, 
    sum as spark_sum
)
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

In [None]:
import pandas as pd
from datetime import datetime, date, timedelta

In [None]:
# Derive the dates surrounding the run date, as datetime objects and as
# 'YYYY-MM-DD' strings.
# NOTE(review): `current_dt` is not assigned in the cells visible here;
# presumably it is injected as a notebook parameter -- confirm.
execution_dt = datetime.strptime(current_dt, '%Y-%m-%d')
execution_dt_next = execution_dt + timedelta(days=1)
execution_dt_one_ago = (execution_dt - timedelta(days=1)).strftime('%Y-%m-%d')
current_dt_next = execution_dt_next.strftime('%Y-%m-%d')

In [None]:
# Echo the run parameters so they are recorded in the executed notebook output.
print(
    f'current_dt: {current_dt}',
    f'current_dt_next: {current_dt_next}',
    f'state: {state}',
    sep='\n',
)


In [None]:
# Target locations used by the queries below.
project_id = 'skt-datahub'   # GCP project name
db_name = 'adot_reco_dev'    # BigQuery dataset holding the profile tables
# NOTE(review): `table_nm` is not referenced by any cell visible here -- confirm it is still needed.
table_nm = 'tdeal_cat1_cnt'

In [None]:
# BigQuery client obtained through the skt.gcp helper.
# NOTE(review): `bq_client` is not referenced by any cell visible here -- confirm it is still needed.
bq_client = get_bigquery_client()

In [None]:
# Builds the "pivot" table: one row per (profile_templates, source_domain) with
# the array of distinct luna_ids carrying that profile, unioned over the
# xdr / tdeal / adot / tmap service-profile tables and stamped with
# dt = current_dt_next.
#
# Notes:
# * The partition filter interpolates `current_dt`. The name `current_date` must
#   not be used here: it is not a notebook variable, and because of
#   `from pyspark.sql.functions import *` it resolves to the pyspark function,
#   whose repr would be interpolated into the SQL instead of a date.
# * NOTE(review): xdr / tdeal / tmap read partition `current_dt` while adot reads
#   `current_dt_next` -- kept as in the original layout; confirm this is intended.
# * Every inner SELECT must expose luna_id, profile_templates and source_domain,
#   since the enclosing query aggregates luna_id and groups by the other two.
# * profile_id is ordered by (profile_templates, source_domain) so the numbering
#   is stable when the same template appears under several source domains.
pivot_query = f"""
SELECT  ROW_NUMBER() OVER (ORDER BY profile_templates, source_domain) AS profile_id,
        profile_templates,
        source_domain,
        luna_ids,
        PARSE_DATE('%Y-%m-%d', '{current_dt_next}') AS dt

FROM (

        SELECT  profile_templates,
                source_domain,
                ARRAY_AGG(luna_id) AS luna_ids
        FROM (
                SELECT  DISTINCT luna_id,
                                 profile_templates,
                                 source_domain
                FROM {db_name}.adotServiceProfile_xdr
                WHERE dt = '{current_dt}' AND luna_id IS NOT NULL AND profile_templates IS NOT NULL AND profile_templates != ''
        )
        GROUP BY profile_templates, source_domain

        UNION ALL
        SELECT  profile_templates,
                source_domain,
                ARRAY_AGG(luna_id) AS luna_ids
        FROM (
                SELECT  DISTINCT luna_id,
                                 profile_templates,
                                 source_domain
                FROM {db_name}.adotServiceProfile_tdeal
                WHERE dt = '{current_dt}' AND luna_id IS NOT NULL AND profile_templates IS NOT NULL AND profile_templates != ''
        )
        GROUP BY profile_templates, source_domain

        UNION ALL
        SELECT  profile_templates,
                source_domain,
                ARRAY_AGG(luna_id) AS luna_ids
        FROM (
                SELECT  DISTINCT luna_id,
                                 profile_templates,
                                 source_domain
                FROM {db_name}.adotServiceProfile_adot
                WHERE dt = '{current_dt_next}' AND luna_id IS NOT NULL AND profile_templates IS NOT NULL AND profile_templates != ''
        )
        GROUP BY profile_templates, source_domain

        UNION ALL
        SELECT  profile_templates,
                source_domain,
                ARRAY_AGG(luna_id) AS luna_ids
        FROM (
                SELECT  DISTINCT luna_id,
                                 profile_templates,
                                 source_domain
                FROM {db_name}.adotServiceProfile_tmap
                WHERE dt = '{current_dt}' AND luna_id IS NOT NULL AND profile_templates IS NOT NULL AND profile_templates != ''
        )
        GROUP BY profile_templates, source_domain
)

"""

In [None]:
# Save the pivot result to a BigQuery table first (written via
# bq_insert_overwrite with `dt` as the partition column).
bq_insert_overwrite(
    sql=pivot_query,
    destination=f'{PROJECT_ID}.{db_name}.adotServiceProfilesPivotTable',
    partition='dt',
)

In [None]:
# HDFS layout for this run: <root>/<state>/user_retriev/<current_dt_next>/data/profiles
hdfs_root_path = "/data/temp/ca_recsys"
current_root_path = f"{hdfs_root_path}/{state}/user_retriev/{current_dt_next}"
hdfs_data_path = f"{current_root_path}/data/profiles"

In [None]:
# Save the profiles to the Hadoop path used for inference.
#
# Read back the partition that was just written to adotServiceProfilesPivotTable
# instead of executing `pivot_query` a second time: re-running the query doubles
# the BigQuery cost and, because profile_id comes from ROW_NUMBER(), could assign
# ids that differ from the ones stored in the table.
profiles_table = bq_to_df(
    f"""
    SELECT  *
    FROM    `{PROJECT_ID}.{db_name}.adotServiceProfilesPivotTable`
    WHERE   dt = PARSE_DATE('%Y-%m-%d', '{current_dt_next}')
    """
)
profiles_table.write.mode("overwrite").parquet(hdfs_data_path)

In [None]:
# Inverts the pivot table: explode luna_ids so there is one row per
# (luna_id, profile), then collect each user's distinct profile templates and
# profile ids, stamped with dt = current_dt_next.
#
# Notes:
# * The table path is backtick-quoted; single quotes would make it a string
#   literal, which is not a valid FROM item.
# * The comma before UNNEST performs the correlated join that explodes the array.
# * Reads `{PROJECT_ID}.{db_name}.adotServiceProfilesPivotTable`, i.e. exactly the
#   destination the pivot result is written to earlier in this notebook.
#   NOTE(review): the previous text referenced `adotServicePivotProfiles` under
#   `project_id`; confirm no separate table of that name was intended.
# * Only the current_dt_next partition is read, because profile_id is numbered
#   per run and ids from other partitions are not comparable.
query = f"""
WITH explodedByLuna AS (
    SELECT  profile_id,
            profile_templates,
            luna_id
    FROM    `{PROJECT_ID}.{db_name}.adotServiceProfilesPivotTable`,
            UNNEST(luna_ids) AS luna_id
    WHERE   dt = PARSE_DATE('%Y-%m-%d', '{current_dt_next}')
)
SELECT  luna_id,
        ARRAY_AGG(DISTINCT profile_templates) AS total_profile_templates,
        ARRAY_AGG(DISTINCT profile_id) AS profile_ids,
        PARSE_DATE('%Y-%m-%d', '{current_dt_next}') AS dt

FROM explodedByLuna
GROUP BY luna_id
"""

In [None]:
# Write the per-user profile aggregation to its BigQuery table (via
# bq_insert_overwrite with `dt` as the partition column).
bq_insert_overwrite(
    sql=query,
    destination=f'{PROJECT_ID}.{db_name}.adotServiceUnionUserProfiles',
    partition='dt',
)