# * VINSIGHT : Data Monitoring
    Process required -> "ETL-AGG_PERF_NEWCO_SNAP"

## Parameter

In [125]:
import os
import glob
import configparser
import datetime as dt
import pandas as pd
import numpy as np
import xlrd
import oracledb
import re

config = configparser.ConfigParser()
config.read('../../my_config.ini')
config.sections()

TDMDBPR_user = config['TDMDBPR']['username']
TDMDBPR_pwd = config['TDMDBPR']['password']
TDMDBPR_db = config['TDMDBPR']['db']
TDMDBPR_host = config['TDMDBPR']['host']
TDMDBPR_port = config['TDMDBPR']['port']

AKPIPRD_user = config['AKPIPRD']['username']
AKPIPRD_pwd = config['AKPIPRD']['password']
AKPIPRD_db = config['AKPIPRD']['db']
AKPIPRD_host = config['AKPIPRD']['host']
AKPIPRD_port = config['AKPIPRD']['port']

curr_dt = dt.datetime.now().date()
str_curr_dt = curr_dt.strftime('%Y%m%d')
curr_dt

datetime.date(2024, 6, 17)

## Create DataFrame
    from "AGG_PERF_NEWCO_SNAP"

In [55]:
# Create DataFrame

dsn = f'{AKPIPRD_user}/{AKPIPRD_pwd}@{AKPIPRD_host}:{AKPIPRD_port}/{AKPIPRD_db}'
conn = oracledb.connect(dsn)
print(f'\n{AKPIPRD_db} : Connected\nProcessing...')
# print(f'\nProcessing...')
cur = conn.cursor()

query = """
    -->> Actual Monthly

    SELECT TM_KEY_MTH, CENTER, METRIC_GRP, PRODUCT_GRP, COMP_CD, METRIC_CD, METRIC_NAME, CHANNEL_CD, AGG_TYPE, UOM
    
        , CAST(SUM(CASE WHEN AREA_TYPE = 'P' THEN ACTUAL_TMP END) AS DECIMAL(18,2)) AS P_ACTUAL
        , CAST(SUM(CASE WHEN AREA_TYPE = 'G' THEN ACTUAL_TMP END) AS DECIMAL(18,2)) AS G_ACTUAL
        , CAST(SUM(CASE WHEN AREA_TYPE = 'H' THEN ACTUAL_TMP END) AS DECIMAL(18,2)) AS H_ACTUAL
        , CAST(SUM(CASE WHEN AREA_TYPE = 'HH' THEN ACTUAL_TMP END) AS DECIMAL(18,2)) AS HH_ACTUAL

        , CAST(SUM(CASE WHEN AREA_TYPE = 'P' THEN TARGET_TMP END) AS DECIMAL(18,2)) AS P_TARGET
        , CAST(SUM(CASE WHEN AREA_TYPE = 'G' THEN TARGET_TMP END) AS DECIMAL(18,2)) AS G_TARGET
        , CAST(SUM(CASE WHEN AREA_TYPE = 'H' THEN TARGET_TMP END) AS DECIMAL(18,2)) AS H_TARGET
        , CAST(SUM(CASE WHEN AREA_TYPE = 'HH' THEN TARGET_TMP END) AS DECIMAL(18,2)) AS HH_TARGET

        , MAX(ACTUAL_AS_OF) ACTUAL_AS_OF, MIN(TM_KEY_DAY) MIN_DAY, MAX(TM_KEY_DAY) MAX_DAY, MAX(PPN_TM) PPN_TM, MAX(LOAD_DATE) LOAD_DATE
        
    FROM (
        SELECT TM_KEY_YR, TM_KEY_QTR, TM_KEY_MTH, TM_KEY_WK, TM_KEY_DAY
            , CENTER, PRODUCT_GRP, COMP_CD, METRIC_GRP, METRIC_CD, METRIC_NAME, SEQ, ACTUAL_AS_OF, AGG_TYPE, RR_IND, GRY_IND, UOM, AREA_TYPE, AREA_CD, AREA_NAME
            , CASE 	WHEN AGG_TYPE = 'S' THEN ACTUAL_SNAP 
                    ELSE (CASE WHEN TM_KEY_DAY = MAX(TM_KEY_DAY) OVER(PARTITION BY METRIC_CD, TM_KEY_MTH) THEN ACTUAL_AGG END)
                    END ACTUAL_TMP
            , CASE 	WHEN AGG_TYPE = 'S' THEN TARGET_SNAP 
                    ELSE (CASE WHEN TM_KEY_DAY = MAX(TM_KEY_DAY) OVER(PARTITION BY METRIC_CD, TM_KEY_MTH) THEN TARGET_AGG END)
                    END TARGET_TMP
            , ACTUAL_SNAP, TARGET_SNAP, BASELINE_SNAP, ACTUAL_AGG, TARGET_AGG, BASELINE_AGG, PPN_TM, LOAD_DATE
            , CASE WHEN REGEXP_LIKE(METRIC_CD, '[0-9]A[A-K]$') THEN SUBSTR(METRIC_CD,-2) ELSE 'ALL' END CHANNEL_CD
        FROM AUTOKPI.AGG_PERF_NEWCO_SNAP NOLOCK
    ) TMP_MTH

    --WHERE CHANNEL_CD = 'ALL'
    GROUP BY TM_KEY_MTH, CENTER, METRIC_GRP, PRODUCT_GRP, COMP_CD, METRIC_CD, METRIC_NAME, CHANNEL_CD, AGG_TYPE, UOM
    ORDER BY TM_KEY_MTH, CENTER, METRIC_GRP, PRODUCT_GRP, COMP_CD, METRIC_CD
"""

try:
    execute_datetime = dt.datetime.now().strftime('%Y-%m-%d, %H:%M:%S')
    print(f'\n  -> Execute query... {execute_datetime}')
    cur.execute(query)
    rows = cur.fetchall()
    df = pd.DataFrame.from_records(rows, columns=[x[0] for x in cur.description])
    print(f'\n  -> DataFrame : {df.shape[0]} rows, {df.shape[1]} columns')


except oracledb.DatabaseError as e:
    print(f'Error with Oracle : {e}')


finally:
    cur.close()
    conn.close()
    print(f'\n{AKPIPRD_db} : Disconnected')



AKPIPRD : Connected
Processing...

  -> Execute query... 2024-06-17, 14:09:31

  -> DataFrame : 7198 rows, 23 columns

AKPIPRD : Disconnected


In [56]:
# Rawdata

df.tail(1)

Unnamed: 0,TM_KEY_MTH,CENTER,METRIC_GRP,PRODUCT_GRP,COMP_CD,METRIC_CD,METRIC_NAME,CHANNEL_CD,AGG_TYPE,UOM,...,HH_ACTUAL,P_TARGET,G_TARGET,H_TARGET,HH_TARGET,ACTUAL_AS_OF,MIN_DAY,MAX_DAY,PPN_TM,LOAD_DATE
7197,202406,Sales,Subs,TOL,True,TB3S000700GEO,TOL %NAD 30DPDB2 (Due Date),ALL,N,%,...,,,,,,,20240601,20240616,2024-06-17 11:10:20,2024-06-17 14:01:32.232071


## Reconcile

In [57]:
# Generate Temp files

# GROUP list
grp_list_df = df[['METRIC_GRP', 'PRODUCT_GRP']].drop_duplicates().reset_index(drop=True)
grp_list_df.dropna(how='all')
# grp_list_df.dropna(axis=1, how='all')
# grp_list_df.dropna(subset=['PRODUCT_GRP'])
grp_list_df.to_excel(f'temp/Metric_Grp_List.xlsx', sheet_name='Data', index=False)
print(f'\n -> Generate "Metric_Grp_List.xlsx" successfully')

# METRIC list
metric_list_df = df[['METRIC_GRP', 'PRODUCT_GRP', 'COMP_CD', 'METRIC_CD', 'METRIC_NAME']].drop_duplicates().reset_index(drop=True)
metric_list_df.dropna(how='all')
metric_list_df.to_excel(f'temp/Metric_Cd_List.xlsx', sheet_name='Data', index=False)
print(f'\n -> Generate "Metric_Cd_List.xlsx" successfully')



 -> Generate "Metric_Grp_List.xlsx" successfully

 -> Generate "Metric_Cd_List.xlsx" successfully


In [199]:
# Create Reconcile Data
rec_df = df

# Filter

# rec_df = rec_df[rec_df['TM_KEY_MTH']==202406]
# rec_df = rec_df[rec_df['CHANNEL_CD']=='ALL']

# rec_df = rec_df[rec_df['METRIC_NAME'].str.contains('Prepaid Topping|Prepaid Pay per Use')]
# rec_df = rec_df[rec_df['METRIC_NAME'].str.contains('Prepaid Inflow M2')]
# rec_df = rec_df[rec_df['METRIC_NAME'].str.contains('Prepaid Activation Subs')]
# rec_df = rec_df[rec_df['METRIC_NAME'].str.contains('Prepaid Usage Subs')]
# rec_df = rec_df[rec_df['METRIC_NAME'].str.contains('Prepaid Revenue Subs')]
# rec_df = rec_df[rec_df['METRIC_NAME'].str.contains('^Postpaid Revenue.*DTAC$')]
# rec_df = rec_df[rec_df['METRIC_NAME'].str.contains('Postpaid Gross Adds')]
# rec_df = rec_df[rec_df['METRIC_NAME'].str.contains('Postpaid Activation Subs')]
# rec_df = rec_df[rec_df['METRIC_NAME'].str.contains('Postpaid Active Subs B2C')]
# rec_df = rec_df[rec_df['METRIC_NAME'].str.contains('Postpaid Activation Subs B2B')]
# rec_df = rec_df[rec_df['METRIC_NAME'].str.contains('Postpaid Active Subs B2B')]
# rec_df = rec_df[rec_df['METRIC_NAME'].str.contains('Postpaid ARPU')]
# rec_df = rec_df[rec_df['METRIC_NAME'].str.contains('Total Inflow M1')]
# rec_df = rec_df[rec_df['METRIC_NAME'].str.contains('TOL Inflow M1 - Connected')]
# rec_df = rec_df[rec_df['METRIC_NAME'].str.contains('TOL Active Subs')]
# rec_df = rec_df[rec_df['METRIC_NAME'].str.contains('TVS Inflow M1')]
# rec_df = rec_df[rec_df['METRIC_NAME'].str.contains('Active Subs')]
rec_df = rec_df[rec_df['METRIC_NAME'].str.contains('Inflow M1')]
# rec_df = rec_df[rec_df['METRIC_NAME'].str.contains('Monthly Fee')]
rec_df = rec_df.reset_index(drop=True)

# rec_df
rec_df.tail(3)

Unnamed: 0,TM_KEY_MTH,CENTER,METRIC_GRP,PRODUCT_GRP,COMP_CD,METRIC_CD,METRIC_NAME,CHANNEL_CD,AGG_TYPE,UOM,...,HH_ACTUAL,P_TARGET,G_TARGET,H_TARGET,HH_TARGET,ACTUAL_AS_OF,MIN_DAY,MAX_DAY,PPN_TM,LOAD_DATE
813,202406,Sales,Sales,TOL,True,TB3R000600CORP,TOL Inflow M1 - Connected,ALL,S,baht,...,,13025569.05,9735009.06,9735009.05,,20240612.0,20240601,20240616,2024-06-17 11:10:20,2024-06-17 14:01:32.232071
814,202406,Sales,Sales,TVS,True,TB4R001000,TVS Inflow M1,ALL,S,baht,...,4320.0,,,,,20240614.0,20240601,20240616,2024-06-17 11:10:20,2024-06-17 14:01:32.232071
815,202406,Sales,Sales,TVS,True,TB4R001000CORP,TVS Inflow M1,ALL,S,baht,...,4320.0,,,,,20240614.0,20240601,20240616,2024-06-17 11:10:20,2024-06-17 14:01:32.232071


In [200]:
# Group by

agg_df = rec_df.groupby(['COMP_CD', 'METRIC_CD', 'METRIC_NAME', 'CHANNEL_CD', 'UOM']).agg({'P_ACTUAL': 'sum', 'P_TARGET': 'sum', 'LOAD_DATE': 'max'}).reset_index()
    #.sort_values(['COMP_CD', 'METRIC_CD']).reset_index(drop=True)

# df['DecimalCol'] = df['DecimalCol'].apply(lambda x: round(x, 2))
# agg_df[['P_ACTUAL', 'P_TARGET']] = agg_df[['P_ACTUAL', 'P_TARGET']].apply(lambda x: round(x, 2))
# agg_df = agg_df.astype({'P_ACTUAL': int, 'P_TARGET': int, 'LOAD_DATE': 'datetime64[ns]'})

# Create Temp File
# agg_df.to_excel(f'temp/Temp.xlsx', sheet_name='Data', index=False)
# print(f'\n -> Generate "Temp.xlsx" successfully')

agg_df

Unnamed: 0,COMP_CD,METRIC_CD,METRIC_NAME,CHANNEL_CD,UOM,P_ACTUAL,P_TARGET,LOAD_DATE
0,ALL,B0R00010001,Total Inflow M1,ALL,baht,2.753604e+09,0.00,2024-06-17 14:01:32.232071
1,ALL,B0R00010001AA,Total Inflow M1 : Account Executive,AA,baht,3.101801e+06,0.00,2024-06-17 14:01:32.232071
2,ALL,B0R00010001AB,Total Inflow M1 : B2B,AB,baht,6.707842e+07,0.00,2024-06-17 14:01:32.232071
3,ALL,B0R00010001AC,Total Inflow M1 : Branded Retail,AC,baht,4.404917e+08,0.00,2024-06-17 14:01:32.232071
4,ALL,B0R00010001AD,Total Inflow M1 : Contact Center,AD,baht,1.862469e+08,0.00,2024-06-17 14:01:32.232071
...,...,...,...,...,...,...,...,...
131,TRUE,TB4R001000AH,TVS Inflow M1 : Others,AH,baht,0.000000e+00,0.00,2024-06-17 14:01:32.232071
132,TRUE,TB4R001000AJ,TVS Inflow M1 : Retail Sales,AJ,baht,3.083971e+04,20981.30,2024-06-17 14:01:32.232071
133,TRUE,TB4R001000CORP,TVS Inflow M1,ALL,baht,4.323915e+06,8323925.19,2024-06-17 14:01:32.232071
134,TRUE,TB4R001000GEO,TVS Inflow M1 (Geo),ALL,baht,3.307580e+06,0.00,2024-06-17 14:01:32.232071


In [201]:
# Add columns

tmp_df = agg_df
tmp_df = tmp_df[tmp_df['METRIC_CD'].str.contains('B0R00010001')]
tmp_df['TMP_CD'] = tmp_df['METRIC_CD'].replace(r'AA|AB|AC|AD|AE|AF|AG|AH|AI|AJ|AK', '', regex=True)
tmp_df['TMP_NAME'] = tmp_df['METRIC_NAME'].replace(r' : Account Executive| : B2B| : Branded Retail| : Contact Center| : Direct Sales| : Key Account| : Modern Trade| : Others| : Own Digital| : Retail Sales| : Wholesales', '', regex=True)
tmp_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_df['TMP_CD'] = tmp_df['METRIC_CD'].replace(r'AA|AB|AC|AD|AE|AF|AG|AH|AI|AJ|AK', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_df['TMP_NAME'] = tmp_df['METRIC_NAME'].replace(r' : Account Executive| : B2B| : Branded Retail| : Contact Center| : Direct Sales| : Key Account| : Modern Trade| : Others| : Own Digital| : Retail Sales| : Wholesales', '', regex=True)


Unnamed: 0,COMP_CD,METRIC_CD,METRIC_NAME,CHANNEL_CD,UOM,P_ACTUAL,P_TARGET,LOAD_DATE,TMP_CD,TMP_NAME
0,ALL,B0R00010001,Total Inflow M1,ALL,baht,2753604000.0,0.0,2024-06-17 14:01:32.232071,B0R00010001,Total Inflow M1
1,ALL,B0R00010001AA,Total Inflow M1 : Account Executive,AA,baht,3101801.0,0.0,2024-06-17 14:01:32.232071,B0R00010001,Total Inflow M1
2,ALL,B0R00010001AB,Total Inflow M1 : B2B,AB,baht,67078420.0,0.0,2024-06-17 14:01:32.232071,B0R00010001,Total Inflow M1
3,ALL,B0R00010001AC,Total Inflow M1 : Branded Retail,AC,baht,440491700.0,0.0,2024-06-17 14:01:32.232071,B0R00010001,Total Inflow M1
4,ALL,B0R00010001AD,Total Inflow M1 : Contact Center,AD,baht,186246900.0,0.0,2024-06-17 14:01:32.232071,B0R00010001,Total Inflow M1
5,ALL,B0R00010001AE,Total Inflow M1 : Direct Sales,AE,baht,245626600.0,0.0,2024-06-17 14:01:32.232071,B0R00010001,Total Inflow M1
6,ALL,B0R00010001AF,Total Inflow M1 : Key Account,AF,baht,118613600.0,0.0,2024-06-17 14:01:32.232071,B0R00010001,Total Inflow M1
7,ALL,B0R00010001AG,Total Inflow M1 : Modern Trade,AG,baht,235873800.0,0.0,2024-06-17 14:01:32.232071,B0R00010001,Total Inflow M1
8,ALL,B0R00010001AH,Total Inflow M1 : Others,AH,baht,47245180.0,0.0,2024-06-17 14:01:32.232071,B0R00010001,Total Inflow M1
9,ALL,B0R00010001AI,Total Inflow M1 : Own Digital,AI,baht,22792860.0,0.0,2024-06-17 14:01:32.232071,B0R00010001,Total Inflow M1


In [206]:
# Pivot Table

# # Actual & Target
# pv_target = pd.pivot_table(x, values=['P_ACTUAL', 'P_TARGET'], 
#                    index=['COMP_CD', 'TMP_CD', 'TMP_NAME'], 
#                    columns='CHANNEL_CD', 
#                    aggfunc='sum', fill_value=0)

# Actual
pv_actual = pd.pivot_table(tmp_df, values='P_ACTUAL', index=['COMP_CD', 'TMP_CD', 'TMP_NAME'], columns='CHANNEL_CD', aggfunc='sum', fill_value=0)
pv_actual['VERION'] = 'A'

# Target
pv_target = pd.pivot_table(tmp_df, values='P_TARGET', index=['COMP_CD', 'TMP_CD', 'TMP_NAME'], columns='CHANNEL_CD', aggfunc='sum', fill_value=0)
pv_target['VERION'] = 'T'

# Concat Dataframe
pd.concat([pv_actual, pv_target]).reset_index()

CHANNEL_CD,COMP_CD,TMP_CD,TMP_NAME,AA,AB,AC,AD,AE,AF,AG,AH,AI,AJ,AK,ALL,VERION
0,ALL,B0R00010001,Total Inflow M1,3101800.92,67078415.33,440491700.0,186246900.0,245626600.0,118613600.0,235873800.0,47245183.33,22792855.9,868401500.0,86315373.76,2753604000.0,A
1,ALL,B0R00010001CORP,Total Inflow M1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2753604000.0,A
2,DTAC,DB0R00010001,Total Inflow M1 : DTAC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1138094000.0,A
3,TRUE,TB0R00010001,Total Inflow M1 : TRUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1615510000.0,A
4,ALL,B0R00010001,Total Inflow M1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,T
5,ALL,B0R00010001CORP,Total Inflow M1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,T
6,DTAC,DB0R00010001,Total Inflow M1 : DTAC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,T
7,TRUE,TB0R00010001,Total Inflow M1 : TRUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1744819000.0,T


## Generate Output file

In [45]:
# # to Excel file

# op_dir = 'temp'
# op_file = 'VINSIGHT Data Monitoring.xlsx'

# df.to_excel(f'{op_dir}/{op_file}', sheet_name='Data', index=False)
# print(f'\n  -> Generate "{op_file}" successfully')


 -> Generate "Metric_List.xlsx" successfully


In [44]:
# # to CSV file

# op_dir = 'temp'
# op_file = 'VINSIGHT Data Monitoring.csv'

# df.to_csv(f'{op_dir}/{op_file}', index=False, encoding='utf-8')
# print(f'\n  -> Generate "{op_file}" successfully')


 -> Generate "Metric_List.csv" successfully
