# * ETL_Conversion M2 to M1 (Geo)

## Parameter

In [1]:
import configparser
import datetime as dt
import pandas as pd
import numpy as np
import oracledb
import re
# import FN_Actual_MKS_Broadband as fn

# --- Database credentials ----------------------------------------------------
# Connection settings are read from an INI file kept outside this folder, with
# one section per database holding username / password / db / host / port keys.
config_path = '../../../my_config.ini'
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means nothing was loaded.
# Fail here with a clear message instead of a KeyError on the first lookup.
if not config.read(config_path):
    raise FileNotFoundError(f'Config file not found or unreadable: {config_path}')

TDMDBPR_user = config['TDMDBPR']['username']
TDMDBPR_pwd = config['TDMDBPR']['password']
TDMDBPR_db = config['TDMDBPR']['db']
TDMDBPR_host = config['TDMDBPR']['host']
TDMDBPR_port = config['TDMDBPR']['port']

AKPIPRD_user = config['AKPIPRD']['username']
AKPIPRD_pwd = config['AKPIPRD']['password']
AKPIPRD_db = config['AKPIPRD']['db']
AKPIPRD_host = config['AKPIPRD']['host']
AKPIPRD_port = config['AKPIPRD']['port']

# --- Run dates -----------------------------------------------------------------
# Today and the date seven days ahead, as date objects, 'YYYYMMDD' strings and
# 'YYYYMM' strings. yyyymm_next_week is compared with the fact table's month
# keys in the "Auto Process" cell.
curr_dt = dt.datetime.now().date()
next_week_dt = curr_dt + dt.timedelta(days=7)
str_curr_dt = curr_dt.strftime('%Y%m%d')
str_next_week_dt = next_week_dt.strftime('%Y%m%d')
yyyymm_curr = curr_dt.strftime('%Y%m')
yyyymm_next_week = next_week_dt.strftime('%Y%m')

### Latest Source Summary

In [2]:
# Connect : TDMDBPR
# Credentials are passed as keyword arguments instead of being embedded in a
# "user/password@host" string, so a password containing '/' or '@' still works
# and src_dsn never holds a secret.
src_dsn = f'{TDMDBPR_host}:{TDMDBPR_port}/{TDMDBPR_db}'

try:
    # Both context managers release their resource on exit, including when the
    # query raises, so no explicit close() calls are needed.
    with oracledb.connect(user=TDMDBPR_user, password=TDMDBPR_pwd, dsn=src_dsn) as src_conn:
        with src_conn.cursor() as src_cur:
            # Get : Actual Summary from "FCT_BB_SHARE_SUBS_CCAATT"
            # Monthly subscriber totals per ISP, restricted to CCAATT codes that
            # exist in DIM_MOOC_AREA with a REMARK other than 'Dummy'.
            src_cur.execute("""
                SELECT TM_KEY_MTH
                    , SUM(SUBS) TOTAL
                    , SUM(CASE WHEN ISP = 'TOL' THEN SUBS END) TOL
                    , SUM(CASE WHEN ISP = '3BB' THEN SUBS END) "3BB"
                    , SUM(CASE WHEN ISP = 'AIS' THEN SUBS END) AIS
                    , SUM(CASE WHEN ISP IN ('CAT', 'TOT') THEN SUBS END) NT
                FROM CORPNSBOX.FCT_BB_SHARE_SUBS_CCAATT A
                WHERE TM_KEY_MTH >= 202401
                --WHERE TM_KEY_MTH = (SELECT MAX(TM_KEY_MTH) FROM CORPNSBOX.FCT_BB_SHARE_SUBS_CCAATT NOLOCK)
                AND EXISTS (SELECT 1 FROM CDSAPPO.DIM_MOOC_AREA O
                            WHERE O.REMARK <> 'Dummy'
                            AND O.CCAATT = A.CCAATT)
                GROUP BY TM_KEY_MTH
                ORDER BY 1
            """)
            rows = src_cur.fetchall()
            src_columns = [x[0] for x in src_cur.description]

except oracledb.DatabaseError as e:
    print(f'\nError with Oracle : {e}')
    # Re-raise: continuing would leave chk_src_df undefined (or stale from an
    # earlier run) and the later cells would decide the ETL path on bad data.
    raise

print(f'\nCurrent Source Summary...')
chk_src_df = pd.DataFrame.from_records(rows, columns=src_columns)

# Display
# Format a copy so chk_src_df keeps its numeric values for the diff check.
tmp_src_df = chk_src_df.copy()
mod_col_list = tmp_src_df.iloc[:, 1:6].columns.tolist()
for col in mod_col_list:
    # NULL sums arrive as None/NaN; show them as 'nan' instead of failing in format().
    tmp_src_df[col] = tmp_src_df[col].apply(lambda x: 'nan' if pd.isna(x) else format(x, ',.0f'))
print(f'\n{tmp_src_df}')


Current Source Summary...

   TM_KEY_MTH      TOTAL        TOL        3BB        AIS         NT
0      202401  8,392,163  3,062,361  2,206,741  1,703,305  1,419,756
1      202402  8,399,105  3,065,282  2,197,017  1,716,028  1,420,779
2      202403  8,414,203  3,072,991  2,187,215  1,732,168  1,421,829
3      202404  8,422,602  3,073,168  2,177,393  1,749,183  1,422,858
4      202405  8,439,795  3,082,458  2,167,565  1,765,884  1,423,888


### Latest Fact Summary

In [3]:
# Connect : AKPIPRD
# Credentials are passed as keyword arguments instead of being embedded in a
# "user/password@host" string, so a password containing '/' or '@' still works
# and tgt_dsn never holds a secret.
tgt_dsn = f'{AKPIPRD_host}:{AKPIPRD_port}/{AKPIPRD_db}'

try:
    # Both context managers release their resource on exit, including when a
    # query raises, so no explicit close() calls are needed.
    with oracledb.connect(user=AKPIPRD_user, password=AKPIPRD_pwd, dsn=tgt_dsn) as tgt_conn:
        with tgt_conn.cursor() as tgt_cur:
            # Get : Actual Summary from "FCT_BROADBAND_MKS"
            # NOTE(review): NOLOCK after the table name is not an Oracle hint;
            # Oracle parses it as a table alias, so it has no locking effect.
            tgt_cur.execute("""
                SELECT TM_KEY_MTH
                    , SUM(CASE WHEN METRIC_CD IN ('VIN00025', 'VIN00026', 'VIN00027', 'VIN00028') THEN METRIC_VALUE END) TOTAL
                    , SUM(CASE WHEN METRIC_CD = 'VIN00025' THEN METRIC_VALUE END) TOL
                    , SUM(CASE WHEN METRIC_CD = 'VIN00026' THEN METRIC_VALUE END) "3BB"
                    , SUM(CASE WHEN METRIC_CD = 'VIN00027' THEN METRIC_VALUE END) AIS
                    , SUM(CASE WHEN METRIC_CD = 'VIN00028' THEN METRIC_VALUE END) NT
                    --, MAX(LOAD_DATE) LOAD_DATE
                FROM AUTOKPI.FCT_BROADBAND_MKS NOLOCK
                WHERE TM_KEY_MTH >= 202401 --AND TM_KEY_MTH <= 202404
                AND TM_KEY_DAY LIKE '%01'
                AND AREA_TYPE = 'P'
                AND REMARK IS NULL
                GROUP BY TM_KEY_MTH
                ORDER BY 1
            """)
            rows = tgt_cur.fetchall()
            tgt_columns = [x[0] for x in tgt_cur.description]

            # Get : MAX(TM_KEY_MTH)
            # Taken over the whole table, without the REMARK / AREA_TYPE filters
            # used by the summary query above.
            tgt_cur.execute("SELECT MAX(TM_KEY_MTH) FROM AUTOKPI.FCT_BROADBAND_MKS NOLOCK")
            max_mth_fct = tgt_cur.fetchone()

except oracledb.DatabaseError as e:
    print(f'\nError with Oracle : {e}')
    # Re-raise: continuing would leave chk_tgt_df / max_mth_fct undefined (or
    # stale from an earlier run) and the later cells would act on bad data.
    raise

print(f'\nCurrent Fact Summary...')
chk_tgt_df = pd.DataFrame.from_records(rows, columns=tgt_columns)

# Display
# Format a copy so chk_tgt_df keeps its numeric values for the diff check.
tmp_tgt_df = chk_tgt_df.copy()
mod_col_list = tmp_tgt_df.iloc[:, 1:6].columns.tolist()
for col in mod_col_list:
    # NULL sums arrive as None/NaN; show them as 'nan' instead of failing in format().
    tmp_tgt_df[col] = tmp_tgt_df[col].apply(lambda x: 'nan' if pd.isna(x) else format(x, ',.0f'))
print(f'\n{tmp_tgt_df}')

print(f'\nv_max_mth_fct: {max_mth_fct[0]}')


Current Fact Summary...

   TM_KEY_MTH      TOTAL        TOL        3BB        AIS         NT
0      202401  8,392,163  3,062,361  2,206,741  1,703,305  1,419,756
1      202402  8,399,105  3,065,282  2,197,017  1,716,028  1,420,779
2      202403  8,414,203  3,072,991  2,187,215  1,732,168  1,421,829
3      202404  8,422,602  3,073,168  2,177,393  1,749,183  1,422,858
4      202405  8,439,795  3,082,458  2,167,565  1,765,884  1,423,888

v_max_mth_fct: 202407


### Check Diff Summary

In [4]:
# Line up the newest row of each summary and keep only the fields whose values differ.
latest_src_row = chk_src_df.iloc[-1]
latest_tgt_row = chk_tgt_df.iloc[-1]
chk_diff_df = latest_src_row.compare(latest_tgt_row)

# 'self' holds the source-side value and 'other' the fact-side value.
chk_diff_df['diff'] = chk_diff_df['self'] - chk_diff_df['other']

# Render every column as a thousands-separated whole number for display.
mod_col_list = list(chk_diff_df.columns)
for col in mod_col_list:
    chk_diff_df[col] = chk_diff_df[col].apply(lambda x: format(x, ',.0f'))

chk_diff_df

Unnamed: 0,self,other,diff


### Input Parameter

In [6]:
# Manual Config
# Target table and the SQL script file names handed to the loader functions.
v_target_schema = 'AUTOKPI'
v_target_table = 'FCT_BROADBAND_MKS'
v_sql_rawdata_fact = 'Raw-FCT_BROADBAND_MKS.sql'
v_sql_mockup_fact = 'Mock-FCT_BROADBAND_MKS.sql'
v_sql_initial_fact = 'Initial-FCT_BROADBAND_MKS.sql'


# Auto Config
# chk_diff_df only has rows when the newest source row and the newest fact row
# differ, so any content in it means the fact table needs an update.
v_update_flag = 'Y' if chk_diff_df.size > 0 else 'N'

# Month keys are kept as plain ints (YYYYMM). A float such as 202405.0 would
# stringify as '202405.0' in the comparison done by the ETL cell and would be
# passed to the loader functions as a float.
v_mth_end_src = int(chk_src_df['TM_KEY_MTH'].max())
v_mth_end_fct = int(chk_tgt_df['TM_KEY_MTH'].max())
v_max_mth_fct = max_mth_fct[0]

# Split YYYYMM into year and month arithmetically, then step back one month,
# rolling January over to December of the previous year.
v_mth_end_fct_year, v_mth_end_fct_month = divmod(v_mth_end_fct, 100)
v_prev_mth_fct = v_mth_end_fct - 1 if v_mth_end_fct_month != 1 else (v_mth_end_fct_year - 1) * 100 + 12


# Show Auto Config
print(f'\nyyyymm_next_week: {yyyymm_next_week}')
print(f'\nv_update_flag: {v_update_flag}')
print(f'v_mth_end_src: {v_mth_end_src}')
print(f'v_mth_end_fct: {v_mth_end_fct}')
print(f'v_max_mth_fct: {v_max_mth_fct}')
print(f'v_prev_mth_fct: {v_prev_mth_fct}')


yyyymm_next_week: 202407

v_update_flag: N
v_mth_end_src: 202405.0
v_mth_end_fct: 202405.0
v_max_mth_fct: 202407
v_prev_mth_fct: 202404.0


## ETL Process...

### DB source to DB Target
    Delete -> Insert

    Source : CORPNSBOX.FCT_BB_SHARE_SUBS_CCAATT
             CDSAPPO.DIM_MOOC_AREA
             CDSAPPO.DIM_TIME
    
    Target : AUTOKPI.FCT_BROADBAND_MKS

In [7]:
''' Auto Process '''


def _as_yyyymm(value):
    """Return a month key (int, float such as 202405.0, or 'YYYYMM' string) as an int."""
    return int(float(value))


# NOTE(review): `fn` is expected to be the FN_Actual_MKS_Broadband module, but
# its import in the first cell is commented out, so any branch below that calls
# fn.* raises NameError on a fresh kernel. Confirm whether that is intentional
# before re-enabling it: these calls delete from and insert into the target table.

job_start_datetime = dt.datetime.now().strftime('%Y-%m-%d, %H:%M:%S')
print(f'\nJob Start... {job_start_datetime}')

# Month keys are compared numerically. Comparing the 'YYYYMM' string with
# str() of a float month ('202405.0') only gave the right answer by accident
# of lexicographic ordering.
next_week_mth = _as_yyyymm(yyyymm_next_week)

# Process flow
if v_update_flag == 'Y':
    # Source and fact differ for the newest month: reload from source.
    if next_week_mth > _as_yyyymm(v_mth_end_fct):
        print(f'\n*** Source update & mockup to next month Fact ***')
        fn.src_update_to_fact(v_mth_end_fct, v_target_schema, v_target_table, v_sql_rawdata_fact)
    else:
        print(f'\n*** Source update current month Fact ***')
        fn.src_update_to_fact(v_prev_mth_fct, v_target_schema, v_target_table, v_sql_rawdata_fact)

elif v_update_flag == 'N':
    # Source and fact agree: only extend the fact table when next week's month
    # is later than the newest month it already holds.
    if next_week_mth > _as_yyyymm(v_max_mth_fct):
        print(f'\n*** Last Fact mockup to next month ***')
        fn.mockup_to_fact(v_max_mth_fct, v_target_schema, v_target_table, v_sql_mockup_fact)
    else:
        print(f'\n*** Not update ***')

print(f'\nJob Done !!!')


Job Start... 2024-06-24, 15:32:33

*** Last Fact mockup to next month ***

Param input...

   -> mth_end_fct: 202405
   -> target_schema: AUTOKPI
   -> target_table: FCT_BROADBAND_MKS
   -> sql_mockup_fact: Mock-FCT_BROADBAND_MKS.sql
   -> v_param: {'mth_end_fct': 202405}

AKPIPRD : Connected

Processing...

   -> DELETE : "FCT_BROADBAND_MKS" : Done !

   -> INSERT : "FCT_BROADBAND_MKS" : Done !

AKPIPRD : Disconnected

Job Done !!!


In [8]:
''' Manual Process '''

# Everything below is intentionally commented out; as written, this cell only
# evaluates the string above.
# NOTE(review): the disabled statements appear to re-run the initial load for the
# month range v_initial_mth_start..v_initial_mth_end via fn.src_initial_to_fact.
# Confirm against FN_Actual_MKS_Broadband before uncommenting.

# # Input Period
# v_initial_mth_start = 202301
# v_initial_mth_end = 202312

# job_start_datetime = dt.datetime.now().strftime('%Y-%m-%d, %H:%M:%S')
# print(f'\nJob Start... {job_start_datetime}')

# print(f'\n*** Re-run initial data to Fact ***')
# fn.src_initial_to_fact(v_initial_mth_start, v_initial_mth_end, v_target_schema, v_target_table, v_sql_initial_fact)

# print(f'\nJob Done !!!')

' Manual Process '

In [9]:
''' Create Result DataFrame '''

# Connect : AKPIPRD
# Credentials are passed as keyword arguments instead of being embedded in a
# "user/password@host" string, so a password containing '/' or '@' still works
# and tgt_dsn never holds a secret.
tgt_dsn = f'{AKPIPRD_host}:{AKPIPRD_port}/{AKPIPRD_db}'
# print(f'\n{AKPIPRD_db} : Connected')

try:
    # Both context managers release their resource on exit, including when the
    # query raises, so no explicit close() calls are needed.
    with oracledb.connect(user=AKPIPRD_user, password=AKPIPRD_pwd, dsn=tgt_dsn) as tgt_conn:
        with tgt_conn.cursor() as tgt_cur:
            # Get : Result Data Summary
            # One row per month / area / area type / remark, with subscriber
            # sums, share percentages (AREA_TYPE 'P' only) and row counts.
            # NOTE(review): NOLOCK after the table name is not an Oracle hint;
            # Oracle parses it as a table alias, so it has no locking effect.
            tgt_cur.execute("""
                SELECT TM_KEY_MTH, AREA_NO, AREA_TYPE, COALESCE(REMARK, 'Actual') REMARK
                    -->> Subs
                    , SUM(CASE WHEN METRIC_CD IN ('VIN00025', 'VIN00026', 'VIN00027', 'VIN00028') THEN METRIC_VALUE END) TOTAL
                    , SUM(CASE WHEN METRIC_CD = 'VIN00024' THEN METRIC_VALUE END) "AIS & 3BB"
                    , SUM(CASE WHEN METRIC_CD = 'VIN00025' THEN METRIC_VALUE END) TOL
                    , SUM(CASE WHEN METRIC_CD = 'VIN00026' THEN METRIC_VALUE END) "3BB"
                    , SUM(CASE WHEN METRIC_CD = 'VIN00027' THEN METRIC_VALUE END) AIS
                    , SUM(CASE WHEN METRIC_CD = 'VIN00028' THEN METRIC_VALUE END) NT
                    -->> % MKS
                    , SUM(CASE WHEN AREA_TYPE = 'P' AND METRIC_CD IN ('VIN00020', 'VIN00021', 'VIN00022', 'VIN00023') THEN METRIC_VALUE END) "% TOTAL"
                    , SUM(CASE WHEN AREA_TYPE = 'P' AND METRIC_CD = 'VIN00019' THEN METRIC_VALUE END) "% AIS & 3BB"
                    , SUM(CASE WHEN AREA_TYPE = 'P' AND METRIC_CD = 'VIN00020' THEN METRIC_VALUE END) "% TOL"
                    , SUM(CASE WHEN AREA_TYPE = 'P' AND METRIC_CD = 'VIN00021' THEN METRIC_VALUE END) "% 3BB"
                    , SUM(CASE WHEN AREA_TYPE = 'P' AND METRIC_CD = 'VIN00022' THEN METRIC_VALUE END) "% AIS"
                    , SUM(CASE WHEN AREA_TYPE = 'P' AND METRIC_CD = 'VIN00023' THEN METRIC_VALUE END) "% NT"
                    -->> Count
                    , COUNT(DISTINCT METRIC_CD) CNT_METRIC, COUNT(1) ROW_CNT, MAX(LOAD_DATE) LOAD_DATE
                FROM AUTOKPI.FCT_BROADBAND_MKS NOLOCK
                WHERE TM_KEY_DAY LIKE '%01'
                GROUP BY TM_KEY_MTH, AREA_NO, AREA_TYPE, REMARK
                ORDER BY 1,2
            """)
            rows = tgt_cur.fetchall()
            result_columns = [x[0] for x in tgt_cur.description]

except oracledb.DatabaseError as e:
    print(f'\nError with Oracle : {e}')
    # Re-raise: continuing would leave chk_result_df undefined (or stale from
    # an earlier run) for the group-by and reconcile cells.
    raise

print(f'\nGet : Fact Summary...')
chk_result_df = pd.DataFrame.from_records(rows, columns=result_columns)
print(f'\n   -> chk_result_df : {chk_result_df.shape[0]} rows, {chk_result_df.shape[1]} columns')

# Display
# Format a copy so chk_result_df keeps its numeric values for aggregation.
tmp_result_df = chk_result_df.copy()
# tmp_result_df = tmp_result_df.replace(np.nan, None)
# tmp_result_df.iloc[:, 4:18] = tmp_result_df.iloc[:, 4:18].fillna(0)
mod_col_list = tmp_result_df.iloc[:, 4:18].columns.tolist()
for col in mod_col_list:
    # Columns whose name contains '%' keep two decimals; the rest are shown as
    # whole numbers. The spec is chosen once per column, not once per value.
    spec = ',.2f' if '%' in col else ',.0f'
    # NULL sums arrive as None/NaN; show them as 'nan' instead of failing in format().
    tmp_result_df[col] = tmp_result_df[col].apply(
        lambda x, spec=spec: 'nan' if pd.isna(x) else format(x, spec)
    )


Get : Fact Summary...

   -> chk_result_df : 133 rows, 19 columns


In [20]:
''' Group by '''

# Roll the per-area result rows up to one row per remark / month / metric count:
# how many rows carry an AREA_TYPE, the total ROW_CNT and the newest LOAD_DATE.
group_keys = ['REMARK', 'TM_KEY_MTH', 'CNT_METRIC']
summary_spec = {'AREA_TYPE': 'count', 'ROW_CNT': 'sum', 'LOAD_DATE': 'max'}

agg_df = (
    chk_result_df
    .groupby(group_keys)
    .agg(summary_spec)
    .reset_index()
)
agg_df

Unnamed: 0,REMARK,TM_KEY_MTH,CNT_METRIC,AREA_TYPE,ROW_CNT,LOAD_DATE
0,Actual,202301,10,7,1770,2024-06-24 16:21:04.396422
1,Actual,202302,10,7,1770,2024-06-24 16:21:04.396422
2,Actual,202303,10,7,1770,2024-06-24 16:21:04.396422
3,Actual,202304,10,7,1770,2024-06-24 16:21:04.396422
4,Actual,202305,10,7,1770,2024-06-24 16:21:04.396422
5,Actual,202306,10,7,1770,2024-06-24 16:21:04.396422
6,Actual,202307,10,7,1770,2024-06-24 16:21:04.396422
7,Actual,202308,10,7,1770,2024-06-24 16:21:04.396422
8,Actual,202309,10,7,1770,2024-06-24 16:21:04.396422
9,Actual,202310,10,7,1770,2024-06-24 16:21:04.396422


In [24]:
''' Reconcile '''

# Filter
# Newest month in the result set; only referenced by the disabled filter below.
max_mth = chk_result_df['TM_KEY_MTH'].max()
# rec_df = tmp_result_df.loc[tmp_result_df['TM_KEY_MTH']==max_mth]

# Keep the display-formatted rows whose AREA_TYPE is 'P' and renumber them from 0.
area_type_p_mask = tmp_result_df['AREA_TYPE'].eq('P')
rec_df = tmp_result_df.loc[area_type_p_mask].reset_index(drop=True)

rec_df#.tail(3)

Unnamed: 0,TM_KEY_MTH,AREA_NO,AREA_TYPE,REMARK,TOTAL,AIS & 3BB,TOL,3BB,AIS,NT,% TOTAL,% AIS & 3BB,% TOL,% 3BB,% AIS,% NT,CNT_METRIC,ROW_CNT,LOAD_DATE
0,202301,1,P,Actual,8444535,3878505,3156430,2365715,1512790,1409600,100.0,45.93,37.38,28.01,17.91,16.69,10,10,2024-06-24 16:21:04.396422
1,202302,1,P,Actual,8429617,3874882,3145220,2345964,1528918,1409515,100.0,45.97,37.31,27.83,18.14,16.72,10,10,2024-06-24 16:21:04.396422
2,202303,1,P,Actual,8379779,3870039,3099695,2325512,1544528,1410045,100.0,46.18,36.99,27.75,18.43,16.83,10,10,2024-06-24 16:21:04.396422
3,202304,1,P,Actual,8314605,3864993,3039333,2305321,1559671,1410279,100.0,46.48,36.55,27.73,18.76,16.96,10,10,2024-06-24 16:21:04.396422
4,202305,1,P,Actual,8342582,3873752,3057444,2295508,1578244,1411386,100.0,46.43,36.65,27.52,18.92,16.92,10,10,2024-06-24 16:21:04.396422
5,202306,1,P,Actual,8350456,3883755,3054270,2285609,1598146,1412430,100.0,46.51,36.58,27.37,19.14,16.91,10,10,2024-06-24 16:21:04.396422
6,202307,1,P,Actual,8358078,3896532,3048074,2275704,1620827,1413472,100.0,46.62,36.47,27.23,19.39,16.91,10,10,2024-06-24 16:21:04.396422
7,202308,1,P,Actual,8363498,3900542,3048406,2260877,1639665,1414550,100.0,46.64,36.45,27.03,19.61,16.91,10,10,2024-06-24 16:21:04.396422
8,202309,1,P,Actual,8364146,3902444,3046115,2246082,1656362,1415587,100.0,46.66,36.42,26.85,19.8,16.92,10,10,2024-06-24 16:21:04.396422
9,202310,1,P,Actual,8366741,3904385,3045734,2236231,1668154,1416622,100.0,46.67,36.4,26.73,19.94,16.93,10,10,2024-06-24 16:21:04.396422
