# Work with Pandas

### Reference: https://realpython.com/pandas-dataframe/

In [None]:
import pandas as pd
import numpy as np
import datetime

## Sample Dataframe

In [None]:
# Number of rows used by the sample data in this section.
n = 5

# list(range(n)) builds [0, 1, ..., n-1] directly; a comprehension that only
# passes each element through adds nothing.
data = list(range(n))
data

In [None]:
# Rebind `data` to a list of n dicts, one per row: the row number, a timestamp
# and a constant author name.
# datetime.datetime.now() is called once per dict and, without a tz argument,
# returns a naive (timezone-less) local time.
data = [{'row_num': x, 'timestamp': datetime.datetime.now(), 'added_by': 'Ruzlim'} for x in range(n)]
data

In [None]:
# Number of synthetic rows to generate: a random integer in [0, 1000)
# (randint's upper bound is exclusive, so 0 rows is possible).
rand_n = np.random.randint(0, 1_000)

# One dict per row: a sequential name plus a random amount in [1000, 10000)
# formatted as a currency string with thousands separator and two decimals.
rand_data = [
    {
        'Name': f'Name-{idx}',
        'Salary': f'${np.random.randint(1_000, 10_000):,.2f}',
    }
    for idx in range(rand_n)
]

rand_n
# rand_data

In [None]:
# Build the sample DataFrame from `data`.
sdf = pd.DataFrame(data)
sdf

In [None]:
# Build a DataFrame from the random name/salary dicts; the dict keys
# ('Name', 'Salary') become the columns.
rand_df = pd.DataFrame(rand_data)
rand_df

In [None]:
# Row labels (the index) of sdf.
sdf.index

In [None]:
# Column labels of sdf.
sdf.columns

In [None]:
# Distinct values found in the 'added_by' column.
sdf['added_by'].unique()

In [None]:
# Data type of each column.
sdf.dtypes

In [None]:
# Boolean frame with the same shape as sdf: True where a value is missing.
sdf.isnull()

In [None]:
# Top-level function form of the missing-value check; gives the same boolean
# frame as sdf.isnull() / sdf.isna().
pd.isna(sdf)

In [None]:
# astype returns a new frame with 'row_num' cast to float32; the result is
# bound to sdf_ and sdf itself keeps its original dtypes.
sdf_ = sdf.astype(dtype={'row_num': np.float32})
sdf_.dtypes

In [None]:
# Number of axes (2 for a DataFrame).
sdf.ndim

In [None]:
# (number of rows, number of columns).
sdf.shape

In [None]:
# Total number of cells (rows x columns).
sdf.size

In [None]:
# Memory in bytes per column, with the index included by default. Object
# columns are measured shallowly unless deep=True is passed.
sdf.memory_usage()

In [None]:
# Summary statistics for the columns describe() selects by default; object
# columns such as 'added_by' are left out unless include= is given.
sdf.describe()

In [None]:
# head(n) returns the first n rows (n defaults to 5 when omitted).
sdf.head(3)

In [None]:
# Last 5 rows (tail's default n).
sdf.tail()

## loc

In [None]:
# Label-based lookup: the row whose index label is 2, returned as a Series.
sdf.loc[2]

In [None]:
# Label-based slice: .loc includes the end label, so :2 selects labels up to
# and including 2, restricted to the two named columns.
sdf.loc[:2, ['row_num', 'timestamp']]

## iloc

In [None]:
# Position-based slice: end positions are exclusive, so this selects
# rows 0-1 and columns 0-1.
sdf.iloc[:2, 0:2]

In [None]:
# Position-based selection with an explicit column list: rows 0-1,
# columns at positions 0 and 2.
sdf.iloc[:2, [0, 2]]

## Filter

In [None]:
# Equal : ==
# The comparison yields a boolean Series; indexing with it keeps the rows
# where it is True.
sdf[sdf['row_num'] == 3]

In [None]:
# Not : ~ inverts a boolean mask, so this keeps the rows where row_num is
# not 3 (the same rows as sdf['row_num'] != 3).
sdf[~(sdf['row_num'] == 3)]

In [None]:
# AND : &
# Each comparison needs its own parentheses because & binds more tightly
# than >= and <.
sdf[(sdf['row_num'] >= 2) & (sdf['row_num'] < 4)]

In [None]:
# OR : |
# Keeps rows where at least one of the two conditions is True.
sdf[(sdf['row_num'] == 1) | (sdf['row_num'] == 3)]

In [None]:
# XOR : ^
# Keeps rows where exactly one of the two conditions is True. row_num cannot
# be 1 and 3 at the same time, so here the result matches what | would give.
sdf[(sdf['row_num'] == 1) ^ (sdf['row_num'] == 3)]

In [None]:
# where: keeps the original values where cond is True and substitutes `other`
# (0 here) where it is False; the result has the same shape as sdf.
sdf.where(cond = sdf['row_num'] >= 3, other = 0)

## Filling missing data

In [None]:
# With no `other` given, where() puts missing values wherever the condition
# is False, which produces a frame with gaps for the fill examples below.
sdf_ = sdf.where(cond = sdf['row_num'] == 2)
sdf_

In [None]:
# Replace every missing value with 0. A new frame is returned; sdf_ is not
# modified.
sdf_.fillna(value=0)

In [None]:
# Forward fill: carry the last valid value down into the gaps that follow it.
# fillna(method='ffill') is deprecated since pandas 2.1; DataFrame.ffill() is
# the direct replacement and returns the same result.
sdf_.ffill()

In [None]:
# Backward fill: pull the next valid value up into the gaps that precede it.
# fillna(method='bfill') is deprecated since pandas 2.1; DataFrame.bfill() is
# the direct replacement and returns the same result.
sdf_.bfill()

In [None]:
# Drop every row that contains at least one missing value (how='any' is the
# default). A new frame is returned.
sdf_.dropna()

## Iterating

In [None]:
# Iterate column by column as (label, Series) pairs.
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# pairs and also works on older versions.
for col_label, col in sdf.items():
    print(col_label, col, sep='\n', end='\n\n')

In [None]:
# Iterate row by row over the two selected columns. itertuples() yields one
# namedtuple per row whose first field is the index label.
for row in sdf.loc[:, ['row_num', 'added_by']].itertuples():
    print(row)

## Concat Dataframe

In [None]:
# 3x2 frame of uniform random floats in [0, 1); with no labels given, rows and
# columns get default integer labels.
rdf1 = pd.DataFrame(np.random.rand(3, 2))
rdf1

In [None]:
# Second 3x2 frame of uniform random floats in [0, 1), with default integer
# row and column labels.
rdf2 = pd.DataFrame(np.random.rand(3, 2))
rdf2

In [None]:
# Stack the two frames vertically (axis=0 is the default). The original row
# labels are kept, so labels 0-2 appear twice; pass ignore_index=True to get a
# fresh 0..n-1 index instead.
rdf_concat = pd.concat([rdf1, rdf2])
rdf_concat

## Merge Dataframe

In [None]:
# Left-hand sample for the merge examples: two keys with one value column.
df_a = pd.DataFrame({'key': ['one', 'two'], 'a_val': [1, 2]})
df_a

In [None]:
# Right-hand sample for the merge examples. It has one key ('three') with no
# counterpart in df_a, so an inner join drops that row.
# The key was previously misspelled as 'tree'.
df_b = pd.DataFrame({'key': ['one', 'two', 'three'], 'b_val': [3, 4, 5]})
df_b

In [None]:
# Inner join on 'key' (merge's default how='inner'): only keys present in both
# frames are kept. Columns from df_a come first.
pd.merge(df_a, df_b, on='key')

In [None]:
# Same inner join with the operands swapped: the matched rows are the same,
# but the columns from df_b now come first.
pd.merge(df_b, df_a, on='key')

## Import Data source

In [57]:
# `data` is rebound here: it now holds the CSV path instead of the sample rows.
# Machine-specific absolute paths tried earlier:
# data = "/Users/ruzlim/Code/Jupyter//Raw_Agg_Performance.csv"
# data = "c/Users/Narut4/coding/Jupyter/Raw_Agg_Performance.csv"
# Relative path: the file is resolved against the current working directory.
data = "Raw_Agg_Performance.csv"

# Load the CSV into a DataFrame and preview the first 5 rows.
df = pd.read_csv(data)
df.head()

Unnamed: 0,TM_KEY_MTH,TM_KEY_WK,CENTER,METRIC_GRP,COMP_CD,PRODUCT_GRP,METRIC_CD,METRIC_NAME,DIMENSION_KPI_FLAG,BG_FLAG,AGG_TYPE,RR_IND,GRY_IND,UOM,PERIOD,ACTUAL_AGG,TARGET_AGG,PPN_TM
0,202303,2023009,Customer Service Experience & Retention,Experience,DTAC,All Services,DSER02401,Abandoned Call Rate : DTAC,N,N,N,0,RYG,%,M,,,2023-05-22 17:50:18
1,202303,2023009,Customer Service Experience & Retention,Experience,DTAC,All Services,DSER02402,Handled Call Rate : DTAC,N,N,N,0,GYR,%,M,,,2023-05-22 17:50:18
2,202303,2023009,Customer Service Experience & Retention,Experience,DTAC,All Services,DSER02501,Call Center CSAT IVR (Top2Boxes) : DTAC,N,N,N,0,GYR,%,M,,,2023-05-22 17:50:18
3,202303,2023009,Customer Service Experience & Retention,Experience,DTAC,Postpaid,DSER21101,Postpaid Complaint : DTAC,Y,N,N,0,RYG,%,M,,,2023-05-22 17:50:18
4,202303,2023009,Customer Service Experience & Retention,Experience,DTAC,Prepaid,DSER22101,Prepaid Complaint : DTAC,Y,N,N,0,RYG,%,M,,,2023-05-22 17:50:18


## Analyze Data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns.
df.describe()

In [None]:
# (number of rows, number of columns) of the loaded data.
df.shape

In [None]:
# Data type pandas inferred for each CSV column.
df.dtypes

In [None]:
# Mean of each column; numeric_only=True restricts the calculation to numeric
# columns so the text columns are skipped.
df.mean(numeric_only=True)

In [None]:
# Count the null values in each column.
# isnull() gives a boolean frame and sum() adds up the True values per column.
# This is the vectorized equivalent of applying a Python-level sum to every
# column.

df.isnull().sum()

In [None]:
# Fill null values example

# Work on a real copy. Plain assignment (df_tmp = df) only binds a second name
# to the same object, so filling df_tmp would also overwrite the nulls in df.
df_tmp = df.copy()

# Replace missing targets with the column mean. The filled column is assigned
# back instead of calling fillna(inplace=True) on the selected column: that
# chained form is deprecated and no longer updates the frame under
# Copy-on-Write.
df_tmp['TARGET_AGG'] = df_tmp['TARGET_AGG'].fillna(df_tmp['TARGET_AGG'].mean())

# Null count per column after the fill.
df_tmp.isnull().sum()

# df_tmp

In [None]:
# Distinct values found in the CENTER column.
df['CENTER'].unique()

In [58]:
# Earlier variant of the filter:
# df_bg = df[(df['CENTER'] == 'Sales & Revenue') & (df['BG_FLAG'] == 'Y')]

# Keep rows flagged BG_FLAG == 'Y' or whose metric is the TDG revenue metric.
# The second condition must compare the METRIC_NAME column. Comparing the
# literal string 'METRIC_NAME' with the metric name is always False, which
# silently reduced the filter to the BG_FLAG test alone.
df_bg = df[(df['BG_FLAG'] == 'Y') | (df['METRIC_NAME'] == 'TDG Revenue (Performance View)')]
df_bg.head()

Unnamed: 0,TM_KEY_MTH,TM_KEY_WK,CENTER,METRIC_GRP,COMP_CD,PRODUCT_GRP,METRIC_CD,METRIC_NAME,DIMENSION_KPI_FLAG,BG_FLAG,AGG_TYPE,RR_IND,GRY_IND,UOM,PERIOD,ACTUAL_AGG,TARGET_AGG,PPN_TM
133,202303,2023009,New S-Curve,Revenue,TRUE,TDG,TNSC00142,TDG Revenue (Performance View),Y,Y,N,0,GYR,baht,M,,,2023-05-22 17:50:18
137,202303,2023009,Sales & Revenue,Revenue,ALL,Postpaid,B2R000100,Postpaid Revenue,Y,Y,S,0,GYR,baht,M,1157226000.0,1165612000.0,2023-05-22 17:50:18
165,202303,2023009,Sales & Revenue,Revenue,ALL,Prepaid,B1R000100,Prepaid Revenue,Y,Y,S,1,GYR,baht,M,669901000.0,626219300.0,2023-05-22 17:50:18
286,202303,2023009,Sales & Revenue,Revenue,TRUE,TOL,TB3R000100,TOL Revenue,Y,Y,S,0,GYR,baht,M,189713200.0,205173400.0,2023-05-22 17:50:18
292,202303,2023009,Sales & Revenue,Revenue,TRUE,TVS,TB4R000100,TVS Revenue,Y,Y,S,0,GYR,baht,M,42430650.0,47555950.0,2023-05-22 17:50:18


In [None]:
# Summary statistics for the numeric columns of the filtered rows.
df_bg.describe()

In [None]:
# (number of rows, number of columns) after filtering.
df_bg.shape

In [59]:
# Disabled sort. Note that sort_values returns a new frame, so its result would
# have to be assigned (e.g. df_bg = df_bg.sort_values(...)) to take effect.
# df_bg.sort_values(by=['TM_KEY_MTH', 'TM_KEY_WK', 'CENTER', 'METRIC_GRP', 'COMP_CD', 'PRODUCT_GRP', 'METRIC_CD'], ascending=True)
# Preview the first 5 rows of the filtered frame.
df_bg.head()

Unnamed: 0,TM_KEY_MTH,TM_KEY_WK,CENTER,METRIC_GRP,COMP_CD,PRODUCT_GRP,METRIC_CD,METRIC_NAME,DIMENSION_KPI_FLAG,BG_FLAG,AGG_TYPE,RR_IND,GRY_IND,UOM,PERIOD,ACTUAL_AGG,TARGET_AGG,PPN_TM
133,202303,2023009,New S-Curve,Revenue,TRUE,TDG,TNSC00142,TDG Revenue (Performance View),Y,Y,N,0,GYR,baht,M,,,2023-05-22 17:50:18
137,202303,2023009,Sales & Revenue,Revenue,ALL,Postpaid,B2R000100,Postpaid Revenue,Y,Y,S,0,GYR,baht,M,1157226000.0,1165612000.0,2023-05-22 17:50:18
165,202303,2023009,Sales & Revenue,Revenue,ALL,Prepaid,B1R000100,Prepaid Revenue,Y,Y,S,1,GYR,baht,M,669901000.0,626219300.0,2023-05-22 17:50:18
286,202303,2023009,Sales & Revenue,Revenue,TRUE,TOL,TB3R000100,TOL Revenue,Y,Y,S,0,GYR,baht,M,189713200.0,205173400.0,2023-05-22 17:50:18
292,202303,2023009,Sales & Revenue,Revenue,TRUE,TVS,TB4R000100,TVS Revenue,Y,Y,S,0,GYR,baht,M,42430650.0,47555950.0,2023-05-22 17:50:18


In [61]:
# Alternative aggregations kept for reference:
# df_bg_agg_1 = pd.DataFrame(df_bg.groupby(['TM_KEY_MTH', 'TM_KEY_WK', 'METRIC_CD', 'METRIC_NAME'])['ACTUAL_AGG'].sum().reset_index())
# df_bg_agg_1

# df_bg_agg_2 = pd.DataFrame(df_bg.groupby(['TM_KEY_MTH', 'TM_KEY_WK', 'METRIC_CD', 'METRIC_NAME'])['ACTUAL_AGG'].agg(['sum', 'count']).reset_index())
# df_bg_agg_2

# Total actual and target per month, week and metric. reset_index() turns the
# group keys back into ordinary columns.
# NOTE: sum() skips NaN, so a group whose values are all missing comes out as
# 0.0 rather than NaN.
group_keys = ['TM_KEY_MTH', 'TM_KEY_WK', 'METRIC_CD', 'METRIC_NAME']
totals = df_bg.groupby(group_keys).agg({'ACTUAL_AGG': 'sum', 'TARGET_AGG': 'sum'})
df_bg_agg = pd.DataFrame(totals.reset_index())

# Express both amounts in millions.
for amount_col in ('ACTUAL_AGG', 'TARGET_AGG'):
    df_bg_agg[amount_col] = df_bg_agg[amount_col] / 1000000

df_bg_agg.head()

Unnamed: 0,TM_KEY_MTH,TM_KEY_WK,METRIC_CD,METRIC_NAME,ACTUAL_AGG,TARGET_AGG
0,202303,2023009,B1R000100,Prepaid Revenue,669.901008,626.219335
1,202303,2023009,B2R000100,Postpaid Revenue,1157.22611,1165.611851
2,202303,2023009,TB3R000100,TOL Revenue,189.713166,205.173356
3,202303,2023009,TB4R000100,TVS Revenue,42.430646,47.555946
4,202303,2023009,TNSC00142,TDG Revenue (Performance View),0.0,0.0


In [None]:
# Wide view: one row per (month, week), one column per metric name, holding
# the actual amounts.
# DataFrame.pivot() raises ValueError when an (index, column) pair occurs more
# than once. With TM_KEY_MTH alone as the index that happens as soon as a
# month has several TM_KEY_WK rows for the same metric.
# NOTE(review): this is presumably why the cell was marked "Can't run" -
# confirm against the full CSV.
# pivot_table() tolerates duplicates by aggregating them. 'sum' matches the
# aggregation used to build df_bg_agg, and keeping TM_KEY_WK in the index
# avoids adding weeks together.
# type(df_bg_agg)
df_bg_pivot = df_bg_agg.pivot_table(
    index=['TM_KEY_MTH', 'TM_KEY_WK'],
    columns='METRIC_NAME',
    values='ACTUAL_AGG',
    aggfunc='sum',
)
df_bg_pivot