# * Adhoc : Target

## Parameter

In [53]:
import os
import glob
import configparser
import datetime as dt
import pandas as pd
import numpy as np
import xlrd
import oracledb
import re

# Database connection settings (including credentials) are read from an INI
# file that lives outside the notebook directory.
config_path = '../../my_config.ini'
config = configparser.ConfigParser()

# ConfigParser.read() does not raise when the file is missing or unreadable;
# it returns the list of files it managed to parse. Stop here with a clear
# message instead of hitting an opaque KeyError on the first lookup below.
if not config.read(config_path):
    raise FileNotFoundError(f'Config file not found or unreadable: {config_path}')

# TDMDBPR connection settings
TDMDBPR_user = config['TDMDBPR']['username']
TDMDBPR_pwd = config['TDMDBPR']['password']
TDMDBPR_db = config['TDMDBPR']['db']
TDMDBPR_host = config['TDMDBPR']['host']
TDMDBPR_port = config['TDMDBPR']['port']  # kept as the raw INI string

# AKPIPRD connection settings
AKPIPRD_user = config['AKPIPRD']['username']
AKPIPRD_pwd = config['AKPIPRD']['password']
AKPIPRD_db = config['AKPIPRD']['db']
AKPIPRD_host = config['AKPIPRD']['host']
AKPIPRD_port = config['AKPIPRD']['port']  # kept as the raw INI string

# Run date, as a date object and as a YYYYMMDD string
curr_dt = dt.datetime.now().date()
str_curr_dt = curr_dt.strftime('%Y%m%d')
curr_dt

datetime.date(2024, 6, 19)

## ETL Process...

### Step 1 : Import Data Source

In [360]:
# ---- Raw data ----

# Source workbook: Target_DTAC_Sales_Y2024, sheet "Rawdata".
# The path is written as a raw string because it contains backslashes:
# in a normal literal, sequences such as "\R" or "\P" are invalid escapes
# (SyntaxWarning on recent Python versions). The resulting text is unchanged.
# NOTE(review): absolute, machine-specific path - consider moving it to config.
src_file = r'C:\Ruz\Pentaho\Jobs\Input\Target\Source\Ref-K.Voraphon\DTAC_Sales_Target\Target_DTAC_Sales_Y2024.xlsx'
src_sheet = 'Rawdata'

# Read the sheet and rename the source columns to the key names used by the
# master-data frames, so the later merges can join on identical column names.
src_df = (
    pd.read_excel(src_file, sheet_name=src_sheet, index_col=None)
    .rename(columns={
        'AREA_CD': 'H_AREA_KEY',
        'DATA_MONTH': 'MONTH_SHORT',
        'DATA_YEAR': 'TM_KEY_YR',
        'METRIC_VALUES': 'MTH_VALUE',
    })
)

print(f'\nsrc_df : {src_df.shape[0]} rows, {src_df.shape[1]} columns')
src_df.tail(3)


src_df : 17292 rows, 11 columns


Unnamed: 0,DATA_DATE,MONTH_SHORT,TM_KEY_YR,METRIC_CD,METRIC_NAME,MTH_VALUE,COMP_CD,VERSION,H_AREA_KEY,ARE_DESC,AREA_TYPE
17289,20240523,MAY,2024,DB0R000100,Total Revenue : DTAC,92373340.0,DTAC,T,84Z,SURAT THANI,H
17290,20240523,MAY,2024,DB0R000100,Total Revenue : DTAC,70892380.0,DTAC,T,93X,"TRANG, SATUN, PHATTHALUNG",H
17291,20240523,MAY,2024,DB0R000100,Total Revenue : DTAC,,DTAC,T,,True corp,H


In [361]:
# ---- Master data ----

# DIM_TIME: time dimension; only the columns listed in dt_cols are loaded.
dt_file = '../CFW/data/dim_time.csv'
dt_cols = ['TM_KEY_YR', 'MONTH_SHORT', 'TM_KEY_MTH', 'TRUE_TM_KEY_WK', 'TM_KEY_DAY', 'DAYS_IN_MONTH']
dt_df = pd.read_csv(dt_file, usecols=dt_cols)

# Upper-case the month abbreviation so it can be joined against the source
# frame's MONTH_SHORT values (e.g. 'MAY'). The vectorised .str accessor leaves
# missing values as NaN, whereas calling x.upper() per element raises on them.
dt_df['MONTH_SHORT'] = dt_df['MONTH_SHORT'].str.upper()


# DIM_MOOC_AREA: area master; only the columns listed in mooc_cols are loaded.
mooc_file = '../CFW/data/dim_mooc_area.csv'
mooc_cols = ['ZONE_TYPE', 'ORGID_G', 'TDS_SGMD', 'ORGID_R', 'TDS_RGM_CODE', 'ORGID_H', 'HOP_HINT', 'TDS_PROVINCE', 'PROVINCE_ENG', 'PROVINCE_TH', 'ORGID_HH', 'D_CLUSTER', 'CCAATT', 'REMARK']
mooc_df = pd.read_csv(mooc_file, usecols=mooc_cols)


def _to_area_key(orgid):
    """Return ``orgid`` unchanged if it contains a letter, otherwise as an int.

    Purely numeric codes are converted with ``int`` (so leading zeros are
    dropped); codes containing any ASCII letter are kept as strings.
    """
    return orgid if re.search(r'[a-z]', orgid, re.I) else int(orgid)


# H level: one row per distinct (zone, G, H) combination, excluding rows
# flagged as 'Dummy' and the 'True Corp' entry.
# Rows without an ORGID_H are dropped explicitly: they cannot produce a join
# key, and re.search would raise TypeError on the missing value.
mooc_h_df = (
    mooc_df.loc[
        (mooc_df['REMARK'] != 'Dummy') & (mooc_df['HOP_HINT'] != 'True Corp'),
        ['ZONE_TYPE', 'ORGID_G', 'TDS_SGMD', 'ORGID_H', 'HOP_HINT'],
    ]
    .drop_duplicates()
    .dropna(subset=['ORGID_H'])
    .copy()  # independent frame, so adding a column below never touches mooc_df
)
mooc_h_df['H_AREA_KEY'] = mooc_h_df['ORGID_H'].apply(_to_area_key)

In [362]:
# ---- Example DataFrame ----
# Quick look at the H-level master: the last three rows whose ORGID_H starts
# with "0". The other frames (src_df, dt_df, mooc_df) can be inspected the
# same way with .tail(3).
starts_with_zero = mooc_h_df['ORGID_H'].str.contains('^0')
mooc_h_df.loc[starts_with_zero].tail(3)

Unnamed: 0,ZONE_TYPE,ORGID_G,TDS_SGMD,ORGID_H,HOP_HINT,H_AREA_KEY
310,BMA,GX3,Retail Management & Regional Management 3 (East),4,"SMP : Mueang Samut Prakan, Phra Pradaeng, Phra...",4
312,BMA,GX3,Retail Management & Regional Management 3 (East),3,"SMP : Bang Bo, Bang Sao Thong, Bang Phli",3
320,BMA,GX1,Deputy CGO & Regional Management 1 (BMA-West),5,"NTB : Pak Kret, Bang Bua Thong, Sai Noi",5


### Step 2 : Aggregate Data

In [363]:
# Metrics in scope:
#   DB1R000900  Prepaid Inflow M1 : DTAC
#   DB1S000101  Prepaid Gross Adds : DTAC
#   DB2R000500  Postpaid Inflow M1 : DTAC
#   DB2S000100  Postpaid Gross Adds : DTAC

# ---- Filter ----
metric_prefix_pattern = '^DB1R000900|^DB1S000101|^DB2R000500|^DB2S000100'
metric_suffix_pattern = '[0-9]$|[0-9]A[A-K]$'

# na=False treats a missing METRIC_CD as "no match". Without it str.contains
# returns NaN for those rows and the mask is no longer purely boolean.
metric_cd = src_df['METRIC_CD']
in_scope = (
    metric_cd.str.contains(metric_prefix_pattern, na=False)
    & metric_cd.str.contains(metric_suffix_pattern, na=False)
)

# ---- Data test ----
# NOTE(review): the selection is narrowed to a single month and a single
# metric; this looks like a temporary test slice - confirm before a full run.
test_month = 'MAY'
test_metric_cd = 'DB1S000101'
in_test_slice = (src_df['MONTH_SHORT'] == test_month) & (metric_cd == test_metric_cd)

# Build a fresh frame; src_df itself is left untouched.
raw_df = src_df.loc[in_scope & in_test_slice].reset_index(drop=True)
raw_df

Unnamed: 0,DATA_DATE,MONTH_SHORT,TM_KEY_YR,METRIC_CD,METRIC_NAME,MTH_VALUE,COMP_CD,VERSION,H_AREA_KEY,ARE_DESC,AREA_TYPE
0,20240523,MAY,2024,DB1S000101,Prepaid Gross Adds : DTAC,21592.000000,DTAC,T,50,"BKK : Bang Khun Thian, Chom Thong, Bang Bon",H
1,20240523,MAY,2024,DB1S000101,Prepaid Gross Adds : DTAC,3440.000000,DTAC,T,16,"BKK : Bangkok Yai, Bangkok Noi, Bang Phlat",H
2,20240523,MAY,2024,DB1S000101,Prepaid Gross Adds : DTAC,24290.000000,DTAC,T,40,"BKK : Taling Chan, Phasi Charoen, Thawi Wattha...",H
3,20240523,MAY,2024,DB1S000101,Prepaid Gross Adds : DTAC,5982.000000,DTAC,T,49,"BKK : Thon Buri, Khlong San, Rat Burana, Thung...",H
4,20240523,MAY,2024,DB1S000101,Prepaid Gross Adds : DTAC,11097.000000,DTAC,T,202,"NTB : Bang Kruai, Bang Yai",H
...,...,...,...,...,...,...,...,...,...,...,...
61,20240523,MAY,2024,DB1S000101,Prepaid Gross Adds : DTAC,16481.000000,DTAC,T,86X,"RANONG, CHUMPHON",H
62,20240523,MAY,2024,DB1S000101,Prepaid Gross Adds : DTAC,64803.666667,DTAC,T,90Z,SONGKHLA,H
63,20240523,MAY,2024,DB1S000101,Prepaid Gross Adds : DTAC,31355.333333,DTAC,T,84Z,SURAT THANI,H
64,20240523,MAY,2024,DB1S000101,Prepaid Gross Adds : DTAC,15149.666667,DTAC,T,93X,"TRANG, SATUN, PHATTHALUNG",H


In [364]:
# ---- Merge data ----

# Attach the H-level area attributes to every source row.
# validate='many_to_one' makes pandas raise MergeError when an H_AREA_KEY
# occurs more than once in mooc_h_df; without the check, duplicated keys
# would silently duplicate source rows and inflate every sum computed later.
merge_df1 = pd.merge(raw_df, mooc_h_df, how='left', on='H_AREA_KEY', validate='many_to_one')

# Join the time dimension on year + month. dt_df carries TM_KEY_DAY, so each
# monthly source row is repeated for every matching dt_df row of that month.
merge_df2 = pd.merge(merge_df1, dt_df, how='left', on=['TM_KEY_YR', 'MONTH_SHORT'])

# Spread the monthly value evenly over the days of the month.
# .assign returns a new frame, so merge_df2 is not modified as a side effect.
prep_agg_df = merge_df2.assign(DAY_VALUE=merge_df2['MTH_VALUE'] / merge_df2['DAYS_IN_MONTH'])
prep_agg_df.tail(3)

Unnamed: 0,DATA_DATE,MONTH_SHORT,TM_KEY_YR,METRIC_CD,METRIC_NAME,MTH_VALUE,COMP_CD,VERSION,H_AREA_KEY,ARE_DESC,...,ZONE_TYPE,ORGID_G,TDS_SGMD,ORGID_H,HOP_HINT,TM_KEY_DAY,DAYS_IN_MONTH,TRUE_TM_KEY_WK,TM_KEY_MTH,DAY_VALUE
2043,20240523,MAY,2024,DB1S000101,Prepaid Gross Adds : DTAC,320964.929749,DTAC,T,,True corp,...,,,,,,20240529,31,2024022,202405,10353.707411
2044,20240523,MAY,2024,DB1S000101,Prepaid Gross Adds : DTAC,320964.929749,DTAC,T,,True corp,...,,,,,,20240530,31,2024022,202405,10353.707411
2045,20240523,MAY,2024,DB1S000101,Prepaid Gross Adds : DTAC,320964.929749,DTAC,T,,True corp,...,,,,,,20240531,31,2024022,202405,10353.707411


In [371]:
# Display the column index of the merged frame to check which fields are
# present before aggregating.
prep_agg_df.columns

# Alternative kept for reference: the same names as a plain Python list.
# col_list = prep_agg_df.columns.values.tolist()
# col_list

Index(['DATA_DATE', 'MONTH_SHORT', 'TM_KEY_YR', 'METRIC_CD', 'METRIC_NAME',
       'MTH_VALUE', 'COMP_CD', 'VERSION', 'H_AREA_KEY', 'ARE_DESC',
       'AREA_TYPE', 'ZONE_TYPE', 'ORGID_G', 'TDS_SGMD', 'ORGID_H', 'HOP_HINT',
       'TM_KEY_DAY', 'DAYS_IN_MONTH', 'TRUE_TM_KEY_WK', 'TM_KEY_MTH',
       'DAY_VALUE'],
      dtype='object')

In [422]:
# ---- Aggregate P, G, H level ----
# Builds one frame per area level (P = nationwide, G, H) with an identical
# column layout, then stacks them into agg_all_df.

# Output column order shared by every level.
agg_cols = ['TM_KEY_YR', 'TM_KEY_MTH', 'TRUE_TM_KEY_WK', 'TM_KEY_DAY', 'METRIC_CD', 'METRIC_NAME', 'COMP_CD', 'VERSION', 'AREA_NO', 'AREA_TYPE', 'AREA_CD', 'AREA_NAME', 'DAY_VALUE', 'MTH_VALUE'] # , 'FREQUENCY', 'REMARK'

# Grouping keys common to all levels (time keys + metric identity).
agg_base_keys = ['TM_KEY_YR', 'TM_KEY_MTH', 'TRUE_TM_KEY_WK', 'TM_KEY_DAY', 'METRIC_CD', 'METRIC_NAME', 'COMP_CD', 'VERSION']


def aggregate_area_level(frame, area_no, area_type, area_cols=None, constant_area=None):
    """Sum MTH_VALUE and DAY_VALUE of ``frame`` for one area level.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input rows; must contain the base keys, MTH_VALUE and DAY_VALUE.
    area_no : int
        Value written to the AREA_NO column.
    area_type : str
        Value written to the AREA_TYPE column.
    area_cols : tuple of (str, str), optional
        (code column, name column) of ``frame`` to group by in addition to
        the base keys; they are renamed to AREA_CD / AREA_NAME in the result.
    constant_area : tuple of (str, str), optional
        (code, name) literals for AREA_CD / AREA_NAME, used when ``area_cols``
        is not given.

    Returns
    -------
    pandas.DataFrame
        Aggregated rows with the columns listed in ``agg_cols``, in that order.
    """
    group_keys = agg_base_keys + (list(area_cols) if area_cols else [])
    # groupby's default dropna=True excludes rows with NaN in any group key,
    # so rows that found no match in the area master are left out of the
    # levels that group by an area column.
    level_df = (
        frame.groupby(group_keys)
        .agg({'MTH_VALUE': 'sum', 'DAY_VALUE': 'sum'})
        .reset_index()
    )
    level_df['AREA_NO'] = area_no
    level_df['AREA_TYPE'] = area_type
    if area_cols:
        code_col, name_col = area_cols
        level_df = level_df.rename(columns={code_col: 'AREA_CD', name_col: 'AREA_NAME'})
    else:
        area_cd, area_name = constant_area
        level_df['AREA_CD'] = area_cd
        level_df['AREA_NAME'] = area_name
    return level_df.loc[:, agg_cols]


# P : Nationwide
# NOTE(review): this level has no area key, so it also sums rows whose
# ORGID_G / ORGID_H are NaN, which the G and H levels drop - confirm that the
# nationwide total is meant to include them.
agg_p_df = aggregate_area_level(prep_agg_df, 1, 'P', constant_area=('P', 'Nationwide'))

# G : grouped by ORGID_G / TDS_SGMD
agg_g_df = aggregate_area_level(prep_agg_df, 2, 'G', area_cols=('ORGID_G', 'TDS_SGMD'))

# H : grouped by ORGID_H / HOP_HINT
agg_h_df = aggregate_area_level(prep_agg_df, 3, 'H', area_cols=('ORGID_H', 'HOP_HINT'))

# Concat DataFrame
agg_all_df = pd.concat([agg_p_df, agg_g_df, agg_h_df], ignore_index=True)
agg_all_df['FREQUENCY'] = 'Daily'
agg_all_df['REMARK'] = '?'

# Preview a single day across all three levels.
preview_day = 20240501
agg_all_df.loc[agg_all_df['TM_KEY_DAY'] == preview_day]

Unnamed: 0,TM_KEY_YR,TM_KEY_MTH,TRUE_TM_KEY_WK,TM_KEY_DAY,METRIC_CD,METRIC_NAME,COMP_CD,VERSION,AREA_NO,AREA_TYPE,AREA_CD,AREA_NAME,DAY_VALUE,MTH_VALUE,FREQUENCY,REMARK
0,2024,202405,2024018,20240501,DB1S000101,Prepaid Gross Adds : DTAC,DTAC,T,1,P,P,Nationwide,47207.868702,1.463444e+06,Daily,?
31,2024,202405,2024018,20240501,DB1S000101,Prepaid Gross Adds : DTAC,DTAC,T,2,G,GX1,Deputy CGO & Regional Management 1 (BMA-West),3421.645161,1.060710e+05,Daily,?
32,2024,202405,2024018,20240501,DB1S000101,Prepaid Gross Adds : DTAC,DTAC,T,2,G,GX2,Regional Management 2 (BMA-East),4206.774194,1.304100e+05,Daily,?
33,2024,202405,2024018,20240501,DB1S000101,Prepaid Gross Adds : DTAC,DTAC,T,2,G,GX3,Retail Management & Regional Management 3 (East),5280.258065,1.636880e+05,Daily,?
34,2024,202405,2024018,20240501,DB1S000101,Prepaid Gross Adds : DTAC,DTAC,T,2,G,GX4,Regional Management 4 (North),7068.473118,2.191227e+05,Daily,?
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,2024,202405,2024018,20240501,DB1S000101,Prepaid Gross Adds : DTAC,DTAC,T,3,H,84Z,SURAT THANI,1011.462366,3.135533e+04,Daily,?
340,2024,202405,2024018,20240501,DB1S000101,Prepaid Gross Adds : DTAC,DTAC,T,3,H,86X,"RANONG, CHUMPHON",531.645161,1.648100e+04,Daily,?
341,2024,202405,2024018,20240501,DB1S000101,Prepaid Gross Adds : DTAC,DTAC,T,3,H,90Z,SONGKHLA,2090.440860,6.480367e+04,Daily,?
342,2024,202405,2024018,20240501,DB1S000101,Prepaid Gross Adds : DTAC,DTAC,T,3,H,93X,"TRANG, SATUN, PHATTHALUNG",488.698925,1.514967e+04,Daily,?


## Generate Output file

In [135]:
''' Create "mooc_h_df.xlsx" '''

# The export is currently disabled. Uncommenting the two lines below writes
# mooc_h_df to temp/mooc_h_df.xlsx (sheet "Data", without the index column)
# and prints a confirmation message.
# mooc_h_df.to_excel(f'temp/mooc_h_df.xlsx', sheet_name='Data', index=False)
# print(f'\n   -> Generate "mooc_h_df.xlsx" successfully')


   -> Generate "mooc_h_df.xlsx" successfully
