In [1]:
import pandas as pd
import numpy as np
import os
import re
import datetime
from sklearn.preprocessing import LabelEncoder

In [2]:
def getDate(filename):
    match_str = re.search(r'\d{4}-\d{2}-\d{2}',filename)
    try:
        return datetime.datetime.strptime(match_str.group(), '%Y-%m-%d').date()
    except ValueError:
        return None

In [3]:
def cleanColumn(name):
    name = name.upper()
    clean_name = re.sub(r'[^a-zA-Z0-9_]', '', name)
    clean_name = clean_name.replace('\n', ' ')
    if clean_name == 'SHELTERNFI':
        clean_name = 'SHELTER'
    if clean_name == 'PROTECTIONCP':
        clean_name = 'PC_CP'
    if clean_name == 'PROTECTIONGBV':
        clean_name = 'PC_GBV'
    if clean_name == 'PROTECTIONMA':
        clean_name = 'PC_MA'
    if clean_name == 'TOTALREACHED':
        clean_name = 'PEOPLEREACHED'
    return clean_name

In [4]:
#using csv and xlsx
csv_dir = 'ukraine_data_excel'


In [5]:
#using xlsx
sheet1_df = []
sheet2_df = []
sheet3_df = []

In [6]:
#using xlsx
for filename in os.listdir(csv_dir):
  if filename.endswith('.xlsx'):
    file_path = os.path.join(csv_dir,filename)
    excel_df = pd.ExcelFile(file_path)
    files_date = getDate(filename)
    for sheet in excel_df.sheet_names:
      df = pd.read_excel(excel_df,sheet_name=sheet)
      df.columns = [cleanColumn(col) for col in df.columns]
      df['DATE'] = files_date
      if sheet == 'Num_of_Orgs_by_Oblast':
        sheet1_df.append(df)
      elif sheet == 'People_Reached_by_Oblast':
        sheet2_df.append(df)
      elif sheet == 'UDE_Inputs':
        sheet3_df.append(df)

In [7]:
combined_sheet1 = pd.concat(sheet1_df, keys=[str(df['DATE'].iloc[0]) for df in sheet1_df])
res_sheet1 = combined_sheet1.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()
merged_sheet1 = pd.merge(res_sheet1, combined_sheet1[['ADMIN1_ID', 'DATE', 'OBLAST']], on=['ADMIN1_ID', 'DATE'], how='left')
merged_sheet1.iloc[26]

ADMIN1_ID                                                       UA05
DATE                                                      2022-11-25
OBLAST_x                                                   Vinnytska
CCCM                                                             0.0
CCS                                                              0.0
ETC                                                              0.0
EDUCATION                                                        0.0
FSL                                                              0.0
HEALTH                                                           0.0
LOGISTICS                                                        0.0
MPC                                                              0.0
NUTRITION                                                        0.0
SHELTER                                                          0.0
PROTECTIONTOTAL                                                  0.0
PC_CP                             

In [63]:
combined_sheet2 = pd.concat(sheet2_df, keys=[str(df['DATE'].iloc[0]) for df in sheet2_df])
res_sheet2 = combined_sheet2.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()
merged_sheet2 = pd.merge(res_sheet2, combined_sheet2[['ADMIN1_ID', 'DATE', 'OBLAST']], on=['ADMIN1_ID', 'DATE'], how='left')

In [64]:
combined_sheet3 = pd.concat(sheet3_df, keys=[str(df['DATE'].iloc[0]) for df in sheet3_df])
res_sheet3 = combined_sheet3.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()
merged_sheet3 = pd.merge(res_sheet3, combined_sheet3[['ADMIN1_ID', 'DATE', 'OBLAST']], on=['ADMIN1_ID', 'DATE'], how='left')

In [65]:
merged_sheet1.shape

(750, 22)

In [66]:
merged_sheet3.shape

(750, 7)

In [67]:
merged_sheet1.iloc[26]

ADMIN1_ID                                                       UA05
DATE                                                      2022-11-25
OBLAST_x                                                   Vinnytska
CCCM                                                             0.0
CCS                                                              0.0
ETC                                                              0.0
EDUCATION                                                        0.0
FSL                                                              0.0
HEALTH                                                           0.0
LOGISTICS                                                        0.0
MPC                                                              0.0
NUTRITION                                                        0.0
SHELTER                                                          0.0
PROTECTIONTOTAL                                                  0.0
PC_CP                             

In [42]:
#Num_of_Orgs_by_Oblast
merged_sheet1.iloc[20:35]


Unnamed: 0,ADMIN1_ID,DATE,OBLAST_x,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,LOGISTICS,...,SHELTER,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,WASH,TOTAL,CLUSTERLIST,PC_PC,OBLAST_y
20,UA05,2022-09-22,Vinnytska,2.0,1.0,0.0,4.0,20.0,13.0,0.0,...,7.0,0.0,4.0,3.0,3.0,9.0,58.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
21,UA05,2022-09-29,VinnytskaVinnytska,6.0,2.0,0.0,20.0,68.0,48.0,0.0,...,46.0,0.0,36.0,14.0,8.0,30.0,214.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
22,UA05,2022-09-29,VinnytskaVinnytska,6.0,2.0,0.0,20.0,68.0,48.0,0.0,...,46.0,0.0,36.0,14.0,8.0,30.0,214.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
23,UA05,2022-10-13,Vinnytska,3.0,1.0,0.0,11.0,35.0,24.0,0.0,...,23.0,0.0,25.0,16.0,5.0,13.0,123.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
24,UA05,2022-10-27,Vinnytska,3.0,1.0,0.0,13.0,36.0,24.0,0.0,...,23.0,0.0,25.0,10.0,5.0,13.0,119.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
25,UA05,2022-11-10,Vinnytska,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
26,UA05,2022-11-25,Vinnytska,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
27,UA05,2022-12-08,Vinnytska,3.0,1.0,0.0,13.0,36.0,27.0,0.0,...,24.0,0.0,26.0,11.0,7.0,13.0,125.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
28,UA05,2022-12-22,Vinnytska,2.0,1.0,0.0,13.0,36.0,33.0,0.0,...,24.0,0.0,29.0,11.0,8.0,13.0,130.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
29,UA05,2023-01-05,Vinnytska,3.0,1.0,1.0,17.0,39.0,33.0,0.0,...,25.0,0.0,33.0,10.0,8.0,14.0,138.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska


In [43]:
merged_sheet3.iloc[20:35]

Unnamed: 0,ADMIN1_ID,DATE,OBLAST_x,PEOPLEREACHED,NUMBEROFORGANISATIONS,CLUSTERSPRESENT,OBLAST_y
20,UA05,2022-09-22,Vinnytska,395394,58,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
21,UA05,2022-09-29,VinnytskaVinnytska,790788,214,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
22,UA05,2022-09-29,VinnytskaVinnytska,790788,214,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
23,UA05,2022-10-13,Vinnytska,395394,123,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
24,UA05,2022-10-27,Vinnytska,395394,119,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
25,UA05,2022-11-10,Vinnytska,449220,125,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
26,UA05,2022-11-25,Vinnytska,449220,125,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
27,UA05,2022-12-08,Vinnytska,449220,125,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
28,UA05,2022-12-22,Vinnytska,449220,130,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
29,UA05,2023-01-05,Vinnytska,449220,138,"Camp Coordination & Camp Management,Coordinati...",Vinnytska


In [68]:
merged_sheet1['OBLAST'].value_counts()

KeyError: 'OBLAST'

In [None]:
sum(merged_sheet3['PEOPLEREACHED'].value_counts())

750

In [None]:
# replace zero values in the empty tuple to NAN for future linear interpolation
#NOTE : THIS IS HARD CODED NOT OPTIMAL
merged_sheet1.iloc[25] = merged_sheet1.iloc[25].replace(0,np.nan)
merged_sheet1.iloc[26] = merged_sheet1.iloc[26].replace(0,np.nan)

merged_sheet3.iloc[25] = merged_sheet3.iloc[25].replace(merged_sheet3.iloc[25]['PEOPLEREACHED'],np.nan)
merged_sheet3.iloc[26] = merged_sheet3.iloc[26].replace(merged_sheet3.iloc[26]['PEOPLEREACHED'],np.nan)


In [None]:
merged_sheet1['PEOPLE_REACHED'] = merged_sheet3['PEOPLEREACHED']

In [18]:
#check for null values
merged_sheet1.isna().any().any()

False

In [19]:
# check for null values in specific row
merged_sheet1.iloc[26].isna().any()

False

In [20]:
#filled null values using linear interpolation
interpolated_df = merged_sheet1.interpolate(method='linear')

In [21]:
interpolated_df.isna().any().any()

False

In [22]:
# rounded all int and float dtypes since interpolation caused floats
interpolated_df = interpolated_df.applymap(lambda x: round(x, 1) if isinstance(x, (int, float)) else x)

In [23]:
interpolated_df.iloc[20:35]

Unnamed: 0,ADMIN1_ID,DATE,OBLAST_x,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,LOGISTICS,...,SHELTER,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,WASH,TOTAL,CLUSTERLIST,PC_PC,OBLAST_y
20,UA05,2022-09-22,Vinnytska,2.0,1.0,0.0,4.0,20.0,13.0,0.0,...,7.0,0.0,4.0,3.0,3.0,9.0,58.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
21,UA05,2022-09-29,VinnytskaVinnytska,6.0,2.0,0.0,20.0,68.0,48.0,0.0,...,46.0,0.0,36.0,14.0,8.0,30.0,214.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
22,UA05,2022-09-29,VinnytskaVinnytska,6.0,2.0,0.0,20.0,68.0,48.0,0.0,...,46.0,0.0,36.0,14.0,8.0,30.0,214.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
23,UA05,2022-10-13,Vinnytska,3.0,1.0,0.0,11.0,35.0,24.0,0.0,...,23.0,0.0,25.0,16.0,5.0,13.0,123.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
24,UA05,2022-10-27,Vinnytska,3.0,1.0,0.0,13.0,36.0,24.0,0.0,...,23.0,0.0,25.0,10.0,5.0,13.0,119.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
25,UA05,2022-11-10,Vinnytska,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
26,UA05,2022-11-25,Vinnytska,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
27,UA05,2022-12-08,Vinnytska,3.0,1.0,0.0,13.0,36.0,27.0,0.0,...,24.0,0.0,26.0,11.0,7.0,13.0,125.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
28,UA05,2022-12-22,Vinnytska,2.0,1.0,0.0,13.0,36.0,33.0,0.0,...,24.0,0.0,29.0,11.0,8.0,13.0,130.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
29,UA05,2023-01-05,Vinnytska,3.0,1.0,1.0,17.0,39.0,33.0,0.0,...,25.0,0.0,33.0,10.0,8.0,14.0,138.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska


In [24]:
interpolated_df.sort_values(by=['DATE'])

Unnamed: 0,ADMIN1_ID,DATE,OBLAST_x,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,LOGISTICS,...,SHELTER,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,WASH,TOTAL,CLUSTERLIST,PC_PC,OBLAST_y
0,UA05,2022-04-28,Vinnytska,2.0,1.0,1.0,5.0,17.0,16.0,0.0,...,6.0,16.0,2.0,2.0,2.0,7.0,42.0,"Camp Coordination & Camp Management,Coordinati...",11.0,Vinnytska
690,UA74,2022-04-28,Chernihivska,1.0,0.0,0.0,1.0,17.0,12.0,0.0,...,2.0,9.0,1.0,0.0,1.0,10.0,36.0,"Camp Coordination & Camp Management,Education,...",8.0,Chernihivska
60,UA12,2022-04-28,Dnipropetrovska,1.0,1.0,1.0,3.0,31.0,19.0,0.0,...,10.0,21.0,5.0,4.0,2.0,15.0,58.0,"Camp Coordination & Camp Management,Coordinati...",14.0,Dnipropetrovska
660,UA73,2022-04-28,Chernivetska,2.0,1.0,0.0,3.0,22.0,9.0,0.0,...,6.0,17.0,5.0,2.0,2.0,11.0,50.0,"Camp Coordination & Camp Management,Coordinati...",12.0,Chernivetska
90,UA14,2022-04-28,Donetska,0.0,1.0,0.0,2.0,48.0,26.0,0.0,...,9.0,29.0,7.0,4.0,1.0,18.0,75.0,"Coordination and Common Services,Education,Foo...",21.0,Donetska
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,UA46,2023-01-05,Lvivska,4.0,1.0,1.0,23.0,62.0,52.0,1.0,...,27.0,0.0,41.0,20.0,8.0,23.0,194.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Lvivska
569,UA63,2023-01-05,Kharkivska,3.0,0.0,1.0,13.0,111.0,49.0,0.0,...,33.0,0.0,26.0,7.0,10.0,25.0,175.0,"Education,Food Security and Livelihoods,Health...",0.0,Kharkivska
719,UA74,2023-01-05,Chernihivska,0.0,0.0,0.0,18.0,56.0,29.0,0.0,...,33.0,0.0,16.0,7.0,9.0,17.0,149.0,"Education,Food Security and Livelihoods,Health...",0.0,Chernihivska
89,UA12,2023-01-05,Dnipropetrovska,3.0,1.0,1.0,19.0,113.0,52.0,1.0,...,51.0,0.0,40.0,12.0,9.0,32.0,244.0,"Coordination and Common Services,Emergency Tel...",0.0,Dnipropetrovska


In [25]:
interpolated_df.describe()

Unnamed: 0,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,LOGISTICS,MPC,NUTRITION,SHELTER,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,WASH,TOTAL,PC_PC
count,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0
mean,0.9,0.378667,0.192,6.154667,34.037333,19.542667,0.233333,13.658667,1.602667,12.025333,5.958667,11.010667,5.525333,4.124,12.654667,77.093333,1.804
std,1.283301,0.537584,0.420363,5.377853,27.633568,14.631574,0.465307,9.108061,1.544876,9.817461,9.210034,9.955168,4.456138,2.712644,8.763893,53.184133,4.586243
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,2.0,18.0,10.0,0.0,9.0,0.0,6.0,0.0,4.0,2.0,2.0,7.0,42.0,0.0
50%,0.0,0.0,0.0,5.0,28.0,16.0,0.0,12.0,1.0,10.0,0.0,8.0,4.5,4.0,12.0,66.0,0.0
75%,1.0,1.0,0.0,9.0,44.0,25.0,0.0,18.0,2.0,16.0,13.0,16.0,8.0,5.0,16.0,100.0,0.0
max,8.0,2.0,2.0,32.0,202.0,100.0,2.0,56.0,10.0,80.0,42.0,52.0,30.0,16.0,58.0,364.0,28.0


In [2]:
# create label encoder instance
label_encoder = LabelEncoder()

#fit label encoder to transform Oblast column
encoded_oblast = label_encoder.fit_transform(interpolated_df['OBLAST'])

#create copy
df_encoded = interpolated_df.copy()

#Add column and assign values
df_encoded['OBLAST_ENCODED'] = encoded_oblast

NameError: name 'LabelEncoder' is not defined

In [27]:
# create label encoder instance
label_encoder = LabelEncoder()

#fit label encoder to transform Date column
encoded_oblast = label_encoder.fit_transform(df_encoded['DATE'])

#create copy
df_encode = df_encoded.copy()

#Add column and assign values
df_encode['DATE_ENCODED'] = encoded_oblast

NameError: name 'df_encoded' is not defined

In [28]:
df_encode.sort_values(by=['DATE'])

NameError: name 'df_encode' is not defined

In [79]:
df_encoded.columns

Index(['ADMIN1_ID', 'DATE', 'CCCM', 'CCS', 'ETC', 'EDUCATION', 'FSL', 'HEALTH',
       'MPC', 'NUTRITION', 'PROTECTIONTOTAL', 'PC_CP', 'PC_GBV', 'PC_MA',
       'PC_PC', 'SHELTER', 'WASH', 'TOTAL', 'LOGISTICS', 'OBLAST',
       'PEOPLE_REACHED', 'OBLAST_ENCODED'],
      dtype='object')

In [65]:
merged_sheet1.shape

(750, 21)

In [66]:
#People_Reached_by_Oblast
merged_sheet2.head(10)

Unnamed: 0,ADMIN1_ID,DATE,FSL,GENERALPROTECTION,CP,MINEACTION,PROTECTIONTOTAL,HEALTH,MPC,SHELTER,WASH,PEOPLEREACHED,OBLAST
0,UA05,2022-04-28,7900.0,11200.0,100.0,500.0,11900.0,4600.0,91200.0,15800.0,2300.0,91200.0,Vinnytska
1,UA05,2022-05-06,14300.0,9600.0,100.0,500.0,14300.0,4600.0,113000.0,16200.0,2300.0,113000.0,Vinnytska
2,UA05,2022-05-12,15500.0,11400.0,200.0,500.0,16300.0,9900.0,138800.0,17500.0,3700.0,138800.0,Vinnytska
3,UA05,2022-05-19,17500.0,0.0,500.0,500.0,20300.0,23200.0,140600.0,17500.0,3700.0,140600.0,Vinnytska
4,UA05,2022-05-26,17600.0,0.0,4400.0,700.0,25600.0,23300.0,216600.0,21700.0,3800.0,216600.0,Vinnytska
5,UA05,2022-06-02,19000.0,0.0,21500.0,700.0,48000.0,23300.0,236000.0,33300.0,3800.0,236000.0,Vinnytska
6,UA05,2022-06-09,19000.0,0.0,21500.0,700.0,21000.0,23300.0,240100.0,40500.0,23800.0,240100.0,Vinnytska
7,UA05,2022-07-07,39200.0,0.0,46500.0,11800.0,94900.0,27500.0,214600.0,59400.0,42700.0,214600.0,Vinnytska
8,UA05,2022-07-14,39200.0,0.0,35700.0,11800.0,75700.0,27500.0,216000.0,60700.0,42700.0,216000.0,Vinnytska
9,UA05,2022-07-21,39200.0,0.0,35700.0,99800.0,174700.0,39000.0,217200.0,62600.0,42700.0,217200.0,Vinnytska


In [67]:
#UDE_Inputs
merged_sheet3.head()

Unnamed: 0,ADMIN1_ID,DATE,PEOPLEREACHED,NUMBEROFORGANISATIONS,OBLAST
0,UA05,2022-04-28,91200.0,42,Vinnytska
1,UA05,2022-05-06,113000.0,45,Vinnytska
2,UA05,2022-05-12,138800.0,49,Vinnytska
3,UA05,2022-05-19,140600.0,50,Vinnytska
4,UA05,2022-05-26,216600.0,48,Vinnytska


In [68]:
# copy_df = pd.DataFrame(np.nan, index=merged_sheet1.index,columns=merged_sheet1.columns)
# copy_df['DATE'] = merged_sheet1['DATE']
# copy_df['OBLAST'] = merged_sheet1['OBLAST']
# copy_df.head()

In [69]:
# # Create an Excel writer object
# excel_writer1 = pd.ExcelWriter('Data_ Round 33 - Ukraine 5W - 2022-11-10.xlsx', engine='xlsxwriter')
# excel_writer2 = pd.ExcelWriter('Data_ Round 34 - Ukraine 5W - 2022-11-25.xlsx', engine='xlsxwriter')
# # Write each DataFrame to a separate sheet in the Excel file
# copy_df.to_excel(excel_writer1, sheet_name='Num_of_Orgs_by_Oblast', index=False)
# copy_df.to_excel(excel_writer2, sheet_name='Num_of_Orgs_by_Oblast', index=False)

In [70]:
len(merged_sheet1.columns)
merged_sheet1.columns

Index(['ADMIN1_ID', 'DATE', 'CCCM', 'CCS', 'ETC', 'EDUCATION', 'FSL', 'HEALTH',
       'MPC', 'NUTRITION', 'PROTECTIONTOTAL', 'PC_CP', 'PC_GBV', 'PC_MA',
       'PC_PC', 'SHELTER', 'WASH', 'TOTAL', 'LOGISTICS', 'OBLAST',
       'PEOPLE_REACHED'],
      dtype='object')

In [71]:
merged_sheet1.shape

(750, 21)

In [72]:
merged_sheet3.shape

(750, 5)

In [73]:
merged_sheet1.DATE.value_counts()

2022-09-29    50
2022-04-28    25
2022-08-11    25
2022-12-22    25
2022-12-08    25
2022-11-25    25
2022-11-10    25
2022-10-27    25
2022-10-13    25
2022-09-22    25
2022-09-15    25
2022-09-08    25
2022-08-25    25
2022-08-18    25
2022-08-04    25
2022-05-06    25
2022-07-28    25
2022-07-21    25
2022-07-14    25
2022-07-07    25
2022-06-30    25
2022-06-23    25
2022-06-16    25
2022-06-09    25
2022-06-02    25
2022-05-26    25
2022-05-19    25
2022-05-12    25
2023-01-05    25
Name: DATE, dtype: int64