In [70]:
import pandas as pd
import numpy as np
import os
import re
import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

In [2]:
def getDate(filename):
    match_str = re.search(r'\d{4}-\d{2}-\d{2}',filename)
    try:
        return datetime.datetime.strptime(match_str.group(), '%Y-%m-%d').date()
    except ValueError:
        return None

In [3]:
def cleanColumn(name):
    name = name.upper()
    clean_name = re.sub(r'[^a-zA-Z0-9_]', '', name)
    clean_name = clean_name.replace('\n', ' ')
    if clean_name == 'SHELTERNFI':
        clean_name = 'SHELTER'
    if clean_name == 'PROTECTIONCP':
        clean_name = 'PC_CP'
    if clean_name == 'PROTECTIONGBV':
        clean_name = 'PC_GBV'
    if clean_name == 'PROTECTIONMA':
        clean_name = 'PC_MA'
    if clean_name == 'TOTALREACHED':
        clean_name = 'PEOPLEREACHED'
    return clean_name

In [4]:
#using csv and xlsx
csv_dir = 'ukraine_data_excel'


In [5]:
#using xlsx
sheet1_df = []
sheet2_df = []
sheet3_df = []

In [6]:
#using xlsx
for filename in os.listdir(csv_dir):
  if filename.endswith('.xlsx'):
    file_path = os.path.join(csv_dir,filename)
    excel_df = pd.ExcelFile(file_path)
    files_date = getDate(filename)
    for sheet in excel_df.sheet_names:
      df = pd.read_excel(excel_df,sheet_name=sheet)
      df.columns = [cleanColumn(col) for col in df.columns]
      df['DATE'] = files_date
      if sheet == 'Num_of_Orgs_by_Oblast':
        sheet1_df.append(df)
      elif sheet == 'People_Reached_by_Oblast':
        sheet2_df.append(df)
      elif sheet == 'UDE_Inputs':
        sheet3_df.append(df)

In [7]:
combined_sheet1 = pd.concat(sheet1_df, keys=[str(df['DATE'].iloc[0]) for df in sheet1_df])
res_sheet1 = combined_sheet1.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()
merged_sheet1 = pd.merge(res_sheet1, combined_sheet1[['ADMIN1_ID', 'DATE', 'OBLAST']], on=['ADMIN1_ID', 'DATE'], how='left')

In [8]:
combined_sheet2 = pd.concat(sheet2_df, keys=[str(df['DATE'].iloc[0]) for df in sheet2_df])
res_sheet2 = combined_sheet2.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()
merged_sheet2 = pd.merge(res_sheet2, combined_sheet2[['ADMIN1_ID', 'DATE', 'OBLAST']], on=['ADMIN1_ID', 'DATE'], how='left')

In [9]:
combined_sheet3 = pd.concat(sheet3_df, keys=[str(df['DATE'].iloc[0]) for df in sheet3_df])
res_sheet3 = combined_sheet3.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()
merged_sheet3 = pd.merge(res_sheet3, combined_sheet3[['ADMIN1_ID', 'DATE', 'OBLAST']], on=['ADMIN1_ID', 'DATE'], how='left')

In [10]:
merged_sheet1.shape

(750, 22)

In [11]:
merged_sheet3.shape

(750, 7)

In [12]:
merged_sheet1.iloc[26]

ADMIN1_ID                                                       UA05
DATE                                                      2022-11-25
OBLAST_x                                                   Vinnytska
CCCM                                                             0.0
CCS                                                              0.0
ETC                                                              0.0
EDUCATION                                                        0.0
FSL                                                              0.0
HEALTH                                                           0.0
MPC                                                              0.0
NUTRITION                                                        0.0
PROTECTIONTOTAL                                                  0.0
PC_CP                                                            0.0
PC_GBV                                                           0.0
PC_MA                             

In [13]:
#Num_of_Orgs_by_Oblast
merged_sheet1.iloc[20:35]


Unnamed: 0,ADMIN1_ID,DATE,OBLAST_x,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,...,PC_CP,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,CLUSTERLIST,LOGISTICS,OBLAST_y
20,UA05,2022-09-22,Vinnytska,2.0,1.0,0.0,4.0,20.0,13.0,6.0,...,4.0,3.0,3.0,0.0,7.0,9.0,58.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
21,UA05,2022-09-29,VinnytskaVinnytska,6.0,2.0,0.0,20.0,68.0,48.0,34.0,...,36.0,14.0,8.0,0.0,46.0,30.0,214.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
22,UA05,2022-09-29,VinnytskaVinnytska,6.0,2.0,0.0,20.0,68.0,48.0,34.0,...,36.0,14.0,8.0,0.0,46.0,30.0,214.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
23,UA05,2022-10-13,Vinnytska,3.0,1.0,0.0,11.0,35.0,24.0,17.0,...,25.0,16.0,5.0,0.0,23.0,13.0,123.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
24,UA05,2022-10-27,Vinnytska,3.0,1.0,0.0,13.0,36.0,24.0,17.0,...,25.0,10.0,5.0,0.0,23.0,13.0,119.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
25,UA05,2022-11-10,Vinnytska,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
26,UA05,2022-11-25,Vinnytska,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
27,UA05,2022-12-08,Vinnytska,3.0,1.0,0.0,13.0,36.0,27.0,18.0,...,26.0,11.0,7.0,0.0,24.0,13.0,125.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
28,UA05,2022-12-22,Vinnytska,2.0,1.0,0.0,13.0,36.0,33.0,18.0,...,29.0,11.0,8.0,0.0,24.0,13.0,130.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska
29,UA05,2023-01-05,Vinnytska,3.0,1.0,1.0,17.0,39.0,33.0,19.0,...,33.0,10.0,8.0,0.0,25.0,14.0,138.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska


In [14]:
merged_sheet3.iloc[20:35]

Unnamed: 0,ADMIN1_ID,DATE,OBLAST_x,PEOPLEREACHED,NUMBEROFORGANISATIONS,CLUSTERSPRESENT,OBLAST_y
20,UA05,2022-09-22,Vinnytska,395394,58,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
21,UA05,2022-09-29,VinnytskaVinnytska,790788,214,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
22,UA05,2022-09-29,VinnytskaVinnytska,790788,214,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
23,UA05,2022-10-13,Vinnytska,395394,123,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
24,UA05,2022-10-27,Vinnytska,395394,119,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
25,UA05,2022-11-10,Vinnytska,449220,125,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
26,UA05,2022-11-25,Vinnytska,449220,125,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
27,UA05,2022-12-08,Vinnytska,449220,125,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
28,UA05,2022-12-22,Vinnytska,449220,130,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
29,UA05,2023-01-05,Vinnytska,449220,138,"Camp Coordination & Camp Management,Coordinati...",Vinnytska


In [15]:
merged_sheet1['OBLAST_y'].value_counts()

OBLAST_y
Vinnytska           30
Odeska              30
Chernihivska        30
Chernivetska        30
Cherkaska           30
Khmelnytska         30
Khersonska          30
Kharkivska          30
Ternopilska         30
Sumska              30
Rivnenska           30
Poltavska           30
Mykolaivska         30
Volynska            30
Lvivska             30
Luhanska            30
Kirovohradska       30
Kyivska             30
Ivano-Frankivska    30
Zaporizka           30
Zakarpatska         30
Zhytomyrska         30
Donetska            30
Dnipropetrovska     30
Kyiv                30
Name: count, dtype: int64

In [16]:
sum(merged_sheet3['PEOPLEREACHED'].value_counts())

750

In [17]:
# replace zero values in the empty tuple to NAN for future linear interpolation
#NOTE : THIS IS HARD CODED NOT OPTIMAL
merged_sheet1.iloc[25] = merged_sheet1.iloc[25].replace(0,np.nan)
merged_sheet1.iloc[26] = merged_sheet1.iloc[26].replace(0,np.nan)

merged_sheet3.iloc[25] = merged_sheet3.iloc[25].replace(merged_sheet3.iloc[25]['PEOPLEREACHED'],np.nan)
merged_sheet3.iloc[26] = merged_sheet3.iloc[26].replace(merged_sheet3.iloc[26]['PEOPLEREACHED'],np.nan)


In [18]:
merged_sheet1['PEOPLE_REACHED'] = merged_sheet3['PEOPLEREACHED']

In [19]:
#check for null values
merged_sheet1.isna().any().any()

True

In [20]:
# check for null values in specific row
merged_sheet1.iloc[26].isna().any()

True

In [21]:
#filled null values using linear interpolation
interpolated_df = merged_sheet1.interpolate(method='linear')

  interpolated_df = merged_sheet1.interpolate(method='linear')


In [22]:
interpolated_df.isna().any().any()

False

In [23]:
# rounded all int and float dtypes since interpolation caused floats
interpolated_df = interpolated_df.applymap(lambda x: round(x, 1) if isinstance(x, (int, float)) else x)

  interpolated_df = interpolated_df.applymap(lambda x: round(x, 1) if isinstance(x, (int, float)) else x)


In [24]:
interpolated_df.iloc[20:35]

Unnamed: 0,ADMIN1_ID,DATE,OBLAST_x,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,...,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,CLUSTERLIST,LOGISTICS,OBLAST_y,PEOPLE_REACHED
20,UA05,2022-09-22,Vinnytska,2.0,1.0,0.0,4.0,20.0,13.0,6.0,...,3.0,3.0,0.0,7.0,9.0,58.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska,395394.0
21,UA05,2022-09-29,VinnytskaVinnytska,6.0,2.0,0.0,20.0,68.0,48.0,34.0,...,14.0,8.0,0.0,46.0,30.0,214.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska,790788.0
22,UA05,2022-09-29,VinnytskaVinnytska,6.0,2.0,0.0,20.0,68.0,48.0,34.0,...,14.0,8.0,0.0,46.0,30.0,214.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska,790788.0
23,UA05,2022-10-13,Vinnytska,3.0,1.0,0.0,11.0,35.0,24.0,17.0,...,16.0,5.0,0.0,23.0,13.0,123.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska,395394.0
24,UA05,2022-10-27,Vinnytska,3.0,1.0,0.0,13.0,36.0,24.0,17.0,...,10.0,5.0,0.0,23.0,13.0,119.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska,395394.0
25,UA05,2022-11-10,Vinnytska,3.0,1.0,0.0,13.0,36.0,25.0,17.3,...,10.3,5.7,0.0,23.3,13.0,121.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska,413336.0
26,UA05,2022-11-25,Vinnytska,3.0,1.0,0.0,13.0,36.0,26.0,17.7,...,10.7,6.3,0.0,23.7,13.0,123.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska,431278.0
27,UA05,2022-12-08,Vinnytska,3.0,1.0,0.0,13.0,36.0,27.0,18.0,...,11.0,7.0,0.0,24.0,13.0,125.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska,449220.0
28,UA05,2022-12-22,Vinnytska,2.0,1.0,0.0,13.0,36.0,33.0,18.0,...,11.0,8.0,0.0,24.0,13.0,130.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska,449220.0
29,UA05,2023-01-05,Vinnytska,3.0,1.0,1.0,17.0,39.0,33.0,19.0,...,10.0,8.0,0.0,25.0,14.0,138.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska,449220.0


In [25]:
interpolated_df.sort_values(by=['DATE'])

Unnamed: 0,ADMIN1_ID,DATE,OBLAST_x,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,...,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,CLUSTERLIST,LOGISTICS,OBLAST_y,PEOPLE_REACHED
0,UA05,2022-04-28,Vinnytska,2.0,1.0,1.0,5.0,17.0,16.0,10.0,...,2.0,2.0,11.0,6.0,7.0,42.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska,91200.0
690,UA74,2022-04-28,Chernihivska,1.0,0.0,0.0,1.0,17.0,12.0,6.0,...,0.0,1.0,8.0,2.0,10.0,36.0,"Camp Coordination & Camp Management,Education,...",0.0,Chernihivska,171600.0
60,UA12,2022-04-28,Dnipropetrovska,1.0,1.0,1.0,3.0,31.0,19.0,12.0,...,4.0,2.0,14.0,10.0,15.0,58.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Dnipropetrovska,231800.0
660,UA73,2022-04-28,Chernivetska,2.0,1.0,0.0,3.0,22.0,9.0,10.0,...,2.0,2.0,12.0,6.0,11.0,50.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Chernivetska,31200.0
90,UA14,2022-04-28,Donetska,0.0,1.0,0.0,2.0,48.0,26.0,12.0,...,4.0,1.0,21.0,9.0,18.0,75.0,"Coordination and Common Services,Education,Foo...",0.0,Donetska,122100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,UA46,2023-01-05,Lvivska,4.0,1.0,1.0,23.0,62.0,52.0,26.0,...,20.0,8.0,0.0,27.0,23.0,194.0,"Camp Coordination & Camp Management,Coordinati...",1.0,Lvivska,957877.0
569,UA63,2023-01-05,Kharkivska,3.0,0.0,1.0,13.0,111.0,49.0,24.0,...,7.0,10.0,0.0,33.0,25.0,175.0,"Education,Food Security and Livelihoods,Health...",0.0,Kharkivska,2070173.0
719,UA74,2023-01-05,Chernihivska,0.0,0.0,0.0,18.0,56.0,29.0,20.0,...,7.0,9.0,0.0,33.0,17.0,149.0,"Education,Food Security and Livelihoods,Health...",0.0,Chernihivska,477499.0
89,UA12,2023-01-05,Dnipropetrovska,3.0,1.0,1.0,19.0,113.0,52.0,33.0,...,12.0,9.0,0.0,51.0,32.0,244.0,"Coordination and Common Services,Emergency Tel...",1.0,Dnipropetrovska,847604.0


In [26]:
interpolated_df.describe()

Unnamed: 0,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS,PEOPLE_REACHED
count,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0
mean,0.908,0.381333,0.192,6.189333,34.133333,19.610667,13.705333,1.602667,5.958667,11.078667,5.553333,4.14,1.804,12.088,12.689333,77.418667,0.233333,470911.5
std,1.287016,0.53818,0.420363,5.37997,27.577556,14.599793,9.082728,1.544876,9.210034,9.96683,4.454329,2.705994,4.586243,9.815506,8.739412,53.084492,0.465307,437962.7
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13800.0
25%,0.0,0.0,0.0,2.0,18.0,10.0,9.0,0.0,0.0,4.0,2.0,2.0,0.0,6.0,7.0,42.0,0.0,217977.5
50%,0.0,0.0,0.0,5.0,29.0,16.0,12.0,1.0,0.0,8.0,5.0,4.0,0.0,10.0,12.0,66.0,0.0,383711.0
75%,1.0,1.0,0.0,9.0,44.0,25.0,18.0,2.0,13.0,16.0,8.0,5.525,0.0,16.0,16.0,101.0,0.0,583976.0
max,8.0,2.0,2.0,32.0,202.0,100.0,56.0,10.0,42.0,52.0,30.0,16.0,28.0,80.0,58.0,364.0,2.0,4140346.0


In [27]:
# create label encoder instance
label_encoder = LabelEncoder()

#fit label encoder to transform Oblast column
encoded_oblast = label_encoder.fit_transform(interpolated_df['OBLAST_y'])

#create copy
df_encoded = interpolated_df.copy()

#Add column and assign values
df_encoded['OBLAST_ENCODED'] = encoded_oblast

In [28]:
# create label encoder instance
label_encoder = LabelEncoder()

#fit label encoder to transform Date column
encoded_oblast = label_encoder.fit_transform(df_encoded['DATE'])

#create copy
df_encode = df_encoded.copy()

#Add column and assign values
df_encode['DATE_ENCODED'] = encoded_oblast

In [29]:
df_encode.sort_values(by=['DATE'])

Unnamed: 0,ADMIN1_ID,DATE,OBLAST_x,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,...,PC_PC,SHELTER,WASH,TOTAL,CLUSTERLIST,LOGISTICS,OBLAST_y,PEOPLE_REACHED,OBLAST_ENCODED,DATE_ENCODED
0,UA05,2022-04-28,Vinnytska,2.0,1.0,1.0,5.0,17.0,16.0,10.0,...,11.0,6.0,7.0,42.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Vinnytska,91200.0,20,0
690,UA74,2022-04-28,Chernihivska,1.0,0.0,0.0,1.0,17.0,12.0,6.0,...,8.0,2.0,10.0,36.0,"Camp Coordination & Camp Management,Education,...",0.0,Chernihivska,171600.0,1,0
60,UA12,2022-04-28,Dnipropetrovska,1.0,1.0,1.0,3.0,31.0,19.0,12.0,...,14.0,10.0,15.0,58.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Dnipropetrovska,231800.0,3,0
660,UA73,2022-04-28,Chernivetska,2.0,1.0,0.0,3.0,22.0,9.0,10.0,...,12.0,6.0,11.0,50.0,"Camp Coordination & Camp Management,Coordinati...",0.0,Chernivetska,31200.0,2,0
90,UA14,2022-04-28,Donetska,0.0,1.0,0.0,2.0,48.0,26.0,12.0,...,21.0,9.0,18.0,75.0,"Coordination and Common Services,Education,Foo...",0.0,Donetska,122100.0,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,UA46,2023-01-05,Lvivska,4.0,1.0,1.0,23.0,62.0,52.0,26.0,...,0.0,27.0,23.0,194.0,"Camp Coordination & Camp Management,Coordinati...",1.0,Lvivska,957877.0,13,28
569,UA63,2023-01-05,Kharkivska,3.0,0.0,1.0,13.0,111.0,49.0,24.0,...,0.0,33.0,25.0,175.0,"Education,Food Security and Livelihoods,Health...",0.0,Kharkivska,2070173.0,6,28
719,UA74,2023-01-05,Chernihivska,0.0,0.0,0.0,18.0,56.0,29.0,20.0,...,0.0,33.0,17.0,149.0,"Education,Food Security and Livelihoods,Health...",0.0,Chernihivska,477499.0,1,28
89,UA12,2023-01-05,Dnipropetrovska,3.0,1.0,1.0,19.0,113.0,52.0,33.0,...,0.0,51.0,32.0,244.0,"Coordination and Common Services,Emergency Tel...",1.0,Dnipropetrovska,847604.0,3,28


In [30]:
df_encoded.columns

Index(['ADMIN1_ID', 'DATE', 'OBLAST_x', 'CCCM', 'CCS', 'ETC', 'EDUCATION',
       'FSL', 'HEALTH', 'MPC', 'NUTRITION', 'PROTECTIONTOTAL', 'PC_CP',
       'PC_GBV', 'PC_MA', 'PC_PC', 'SHELTER', 'WASH', 'TOTAL', 'CLUSTERLIST',
       'LOGISTICS', 'OBLAST_y', 'PEOPLE_REACHED', 'OBLAST_ENCODED'],
      dtype='object')

In [31]:
merged_sheet1.shape

(750, 23)

In [32]:
#People_Reached_by_Oblast
merged_sheet2.head(10)

Unnamed: 0,ADMIN1_ID,DATE,OBLAST_x,EDUCATION,FSL,GENERALPROTECTION,CP,GBV,MINEACTION,PROTECTIONTOTAL,HEALTH,MPC,NUTRITION,SHELTER,WASH,PEOPLEREACHED,ADM1_ID,OBLAST_y
0,UA05,2022-04-28,Vinnytska,2800.0,7900.0,11200.0,100.0,100.0,500.0,11900.0,4600.0,91200.0,0,15800.0,2300.0,91200.0,0,Vinnytska
1,UA05,2022-05-06,Vinnytska,2800.0,14300.0,9600.0,100.0,4100.0,500.0,14300.0,4600.0,113000.0,0,16200.0,2300.0,113000.0,0,Vinnytska
2,UA05,2022-05-12,Vinnytska,4900.0,15500.0,11400.0,200.0,4100.0,500.0,16300.0,9900.0,138800.0,0,17500.0,3700.0,138800.0,0,Vinnytska
3,UA05,2022-05-19,Vinnytska,9000.0,17500.0,0.0,500.0,6100.0,500.0,20300.0,23200.0,140600.0,0,17500.0,3700.0,140600.0,0,Vinnytska
4,UA05,2022-05-26,Vinnytska,11500.0,17600.0,0.0,4400.0,6200.0,700.0,25600.0,23300.0,216600.0,0,21700.0,3800.0,216600.0,0,Vinnytska
5,UA05,2022-06-02,Vinnytska,11500.0,19000.0,0.0,21500.0,6200.0,700.0,48000.0,23300.0,236000.0,0,33300.0,3800.0,236000.0,0,Vinnytska
6,UA05,2022-06-09,Vinnytska,11500.0,19000.0,0.0,21500.0,6200.0,700.0,21000.0,23300.0,240100.0,0,40500.0,23800.0,240100.0,0,Vinnytska
7,UA05,2022-07-07,Vinnytska,16800.0,39200.0,0.0,46500.0,6900.0,11800.0,94900.0,27500.0,214600.0,0,59400.0,42700.0,214600.0,0,Vinnytska
8,UA05,2022-07-14,Vinnytska,16800.0,39200.0,0.0,35700.0,7200.0,11800.0,75700.0,27500.0,216000.0,0,60700.0,42700.0,216000.0,0,Vinnytska
9,UA05,2022-07-21,Vinnytska,16800.0,39200.0,0.0,35700.0,7200.0,99800.0,174700.0,39000.0,217200.0,-,62600.0,42700.0,217200.0,0,Vinnytska


In [33]:
#UDE_Inputs
merged_sheet3.head()

Unnamed: 0,ADMIN1_ID,DATE,OBLAST_x,PEOPLEREACHED,NUMBEROFORGANISATIONS,CLUSTERSPRESENT,OBLAST_y
0,UA05,2022-04-28,Vinnytska,91200.0,42,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
1,UA05,2022-05-06,Vinnytska,113000.0,45,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
2,UA05,2022-05-12,Vinnytska,138800.0,49,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
3,UA05,2022-05-19,Vinnytska,140600.0,50,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
4,UA05,2022-05-26,Vinnytska,216600.0,48,"Coordination and Common Services,Emergency Tel...",Vinnytska


In [34]:
# copy_df = pd.DataFrame(np.nan, index=merged_sheet1.index,columns=merged_sheet1.columns)
# copy_df['DATE'] = merged_sheet1['DATE']
# copy_df['OBLAST'] = merged_sheet1['OBLAST']
# copy_df.head()

In [35]:
# # Create an Excel writer object
# excel_writer1 = pd.ExcelWriter('Data_ Round 33 - Ukraine 5W - 2022-11-10.xlsx', engine='xlsxwriter')
# excel_writer2 = pd.ExcelWriter('Data_ Round 34 - Ukraine 5W - 2022-11-25.xlsx', engine='xlsxwriter')
# # Write each DataFrame to a separate sheet in the Excel file
# copy_df.to_excel(excel_writer1, sheet_name='Num_of_Orgs_by_Oblast', index=False)
# copy_df.to_excel(excel_writer2, sheet_name='Num_of_Orgs_by_Oblast', index=False)

In [36]:
len(merged_sheet1.columns)
merged_sheet1.columns

Index(['ADMIN1_ID', 'DATE', 'OBLAST_x', 'CCCM', 'CCS', 'ETC', 'EDUCATION',
       'FSL', 'HEALTH', 'MPC', 'NUTRITION', 'PROTECTIONTOTAL', 'PC_CP',
       'PC_GBV', 'PC_MA', 'PC_PC', 'SHELTER', 'WASH', 'TOTAL', 'CLUSTERLIST',
       'LOGISTICS', 'OBLAST_y', 'PEOPLE_REACHED'],
      dtype='object')

In [37]:
merged_sheet1.shape

(750, 23)

In [38]:
merged_sheet3.shape

(750, 7)

In [39]:
merged_sheet1.DATE.value_counts()

DATE
2022-09-29    50
2022-04-28    25
2022-08-11    25
2022-12-22    25
2022-12-08    25
2022-11-25    25
2022-11-10    25
2022-10-27    25
2022-10-13    25
2022-09-22    25
2022-09-15    25
2022-09-08    25
2022-08-25    25
2022-08-18    25
2022-08-04    25
2022-05-06    25
2022-07-28    25
2022-07-21    25
2022-07-14    25
2022-07-07    25
2022-06-30    25
2022-06-23    25
2022-06-16    25
2022-06-09    25
2022-06-02    25
2022-05-26    25
2022-05-19    25
2022-05-12    25
2023-01-05    25
Name: count, dtype: int64

In [40]:
df_encode.drop(columns = ['ADMIN1_ID', 'DATE', 'OBLAST_y', 'OBLAST_x', 'CLUSTERLIST'], inplace = True)

In [41]:
df_encode.head()

Unnamed: 0,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS,PEOPLE_REACHED,OBLAST_ENCODED,DATE_ENCODED
0,2.0,1.0,1.0,5.0,17.0,16.0,10.0,0.0,16.0,2.0,2.0,2.0,11.0,6.0,7.0,42.0,0.0,91200.0,20,0
1,2.0,1.0,1.0,5.0,20.0,17.0,10.0,0.0,0.0,2.0,3.0,2.0,11.0,6.0,8.0,45.0,0.0,113000.0,20,1
2,2.0,1.0,1.0,6.0,20.0,18.0,10.0,0.0,19.0,4.0,4.0,2.0,14.0,6.0,9.0,49.0,0.0,138800.0,20,2
3,2.0,1.0,1.0,6.0,20.0,20.0,10.0,0.0,21.0,6.0,4.0,2.0,14.0,6.0,9.0,50.0,0.0,140600.0,20,3
4,0.0,1.0,1.0,6.0,20.0,20.0,9.0,0.0,21.0,6.0,4.0,4.0,12.0,7.0,9.0,48.0,0.0,216600.0,20,4


In [42]:
print(df_encode.dtypes)

CCCM               float64
CCS                float64
ETC                float64
EDUCATION          float64
FSL                float64
HEALTH             float64
MPC                float64
NUTRITION          float64
PROTECTIONTOTAL    float64
PC_CP              float64
PC_GBV             float64
PC_MA              float64
PC_PC              float64
SHELTER            float64
WASH               float64
TOTAL              float64
LOGISTICS          float64
PEOPLE_REACHED     float64
OBLAST_ENCODED       int32
DATE_ENCODED         int32
dtype: object


Decision Trees

Modeling for PC_PC

In [43]:
y = df_encode['PC_PC']
X = df_encode.drop(columns = ['PC_PC'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 1234)

In [44]:
selector = SelectKBest(f_classif, k=4)
selector.fit(X, y)
filter = selector.get_support()
pc_pc_top_4_features = X.columns[filter]

print(pc_pc_top_4_features)


Index(['CCS', 'PROTECTIONTOTAL', 'PC_MA', 'DATE_ENCODED'], dtype='object')


In [45]:
y = df_encode['PC_PC']
X = df_encode[pc_pc_top_4_features]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 1234)

In [46]:
depth_range = [2**n for n in range(2,5)]
leaf_range = [25*2**n for n in range(0,3)]

param_grid={'max_depth': depth_range, 'min_samples_leaf': leaf_range}


In [47]:
print('start')

pc_pc_model = DecisionTreeRegressor()

grid = GridSearchCV(pc_pc_model, param_grid, cv = 5)

grid_search = grid.fit(X_train, y_train)

print("finished")

start
finished


In [48]:
print("The best hyperparameters are: ")
print(grid_search.best_params_)
best_max = grid_search.best_params_['max_depth']
best_min = grid_search.best_params_['min_samples_leaf']

The best hyperparameters are: 
{'max_depth': 4, 'min_samples_leaf': 25}


In [49]:
pc_pc_model = DecisionTreeRegressor(max_depth = best_max, min_samples_leaf = best_min)

pc_pc_model.fit(X_train, y_train)

prediction = pc_pc_model.predict(X_test)

knr_score = mean_squared_error(y_test, prediction, squared = False)
knr_r2 = r2_score(y_test, prediction)

print('The rmse and r2 for PC_PC is: ')
print(knr_score)
print(knr_r2)

The rmse and r2 for PC_PC is: 
0.9337181295561384
0.953562854447449


Modeling for Shelter

In [50]:
y = df_encode['SHELTER']
X = df_encode.drop(columns = ['SHELTER'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 1234)

In [51]:
selector = SelectKBest(f_classif, k=4)
selector.fit(X, y)
filter = selector.get_support()
shelter_top_4_features = X.columns[filter]

print(shelter_top_4_features)

Index(['FSL', 'MPC', 'WASH', 'TOTAL'], dtype='object')


In [52]:
y = df_encode['SHELTER']
X = df_encode[shelter_top_4_features]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 1234)

In [53]:
depth_range = [2**n for n in range(2,5)]
leaf_range = [25*2**n for n in range(0,3)]

param_grid={'max_depth': depth_range, 'min_samples_leaf': leaf_range}

In [54]:
print('start')

shelter_model = DecisionTreeRegressor()

grid = GridSearchCV(shelter_model, param_grid, cv = 5)

grid_search = grid.fit(X_train, y_train)

print("finished")

start
finished


In [55]:
print("The best hyperparameters are: ")
print(grid_search.best_params_)
best_max = grid_search.best_params_['max_depth']
best_min = grid_search.best_params_['min_samples_leaf']

The best hyperparameters are: 
{'max_depth': 8, 'min_samples_leaf': 25}


In [56]:
shelter_model = DecisionTreeRegressor(max_depth = best_max, min_samples_leaf = best_min)

shelter_model.fit(X_train, y_train)

prediction = shelter_model.predict(X_test)

shelter_rmse = mean_squared_error(y_test, prediction, squared = False)
shelter_r2 = r2_score(y_test, prediction)

print('The rmse and r2 for shelter is: ')
print(shelter_rmse)
print(shelter_r2)

The rmse and r2 for shelter is: 
4.127127200088797
0.8572689669834755


Modeling for Wash

In [57]:
y = df_encode['WASH']
X = df_encode.drop(columns = ['WASH'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 1234)

In [58]:
selector = SelectKBest(f_classif, k=4)
selector.fit(X, y)
filter = selector.get_support()
wash_top_4_features = X.columns[filter]

print(wash_top_4_features)

Index(['FSL', 'MPC', 'SHELTER', 'TOTAL'], dtype='object')


In [59]:
y = df_encode['WASH']
X = df_encode[wash_top_4_features]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 1234)

In [60]:
depth_range = [2**n for n in range(2,5)]
leaf_range = [25*2**n for n in range(0,3)]

param_grid={'max_depth': depth_range, 'min_samples_leaf': leaf_range}

In [61]:
print('start')

wash_model = DecisionTreeRegressor()

grid = GridSearchCV(wash_model, param_grid, cv = 5)

grid_search = grid.fit(X_train, y_train)

print("finished")

start
finished


In [62]:
print("The best hyperparameters are: ")
print(grid_search.best_params_)
best_max = grid_search.best_params_['max_depth']
best_min = grid_search.best_params_['min_samples_leaf']

The best hyperparameters are: 
{'max_depth': 8, 'min_samples_leaf': 25}


In [63]:
wash_model = DecisionTreeRegressor(max_depth = best_max, min_samples_leaf = best_min)

wash_model.fit(X_train, y_train)

prediction = wash_model.predict(X_test)

wash_rmse = mean_squared_error(y_test, prediction, squared = False)
wash_r2 = r2_score(y_test, prediction)

print('The rmse and r2 for wash is: ')
print(wash_rmse)
print(wash_r2)

The rmse and r2 for wash is: 
3.095807399875726
0.8626227449230174


KNN

PC_PC

In [64]:
y = df_encode['PC_PC']
X = df_encode[pc_pc_top_4_features]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 1234)

In [65]:
pc_pc_model = KNeighborsRegressor(n_neighbors = 3)

pc_pc_model.fit(X_train, y_train)

prediction = pc_pc_model.predict(X_test)

knr_score = mean_squared_error(y_test, prediction, squared = False)
knr_r2 = r2_score(y_test, prediction)

print('The rmse and r2 for PC_PC is: ')
print(knr_score)
print(knr_r2)

The rmse and r2 for PC_PC is: 
1.632993161855452
0.8579626157604682


SHELTER

In [66]:
y = df_encode['SHELTER']
X = df_encode[shelter_top_4_features]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 1234)

In [67]:
shelter_model = KNeighborsRegressor(n_neighbors = 3)

shelter_model.fit(X_train, y_train)

prediction = shelter_model.predict(X_test)

knr_score = mean_squared_error(y_test, prediction, squared = False)
knr_r2 = r2_score(y_test, prediction)

print('The rmse and r2 for shelter is: ')
print(knr_score)
print(knr_r2)

The rmse and r2 for shelter is: 
3.585627689251343
0.892265925191005


WASH

In [68]:
y = df_encode['WASH']
X = df_encode[wash_top_4_features]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 1234)

In [69]:
wash_model = KNeighborsRegressor(n_neighbors = 3)

wash_model.fit(X_train, y_train)

prediction = wash_model.predict(X_test)

knr_score = mean_squared_error(y_test, prediction, squared = False)
knr_r2 = r2_score(y_test, prediction)

print('The rmse and r2 for shelter is: ')
print(knr_score)
print(knr_r2)

The rmse and r2 for shelter is: 
2.32379000772445
0.9225964772796771


LINEAR REGRESSION

PC_PC

In [71]:
y = df_encode['PC_PC']
X = df_encode[pc_pc_top_4_features]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 1234)

In [72]:
pc_pc_model = LinearRegression()

# Fit the model to the training data 
pc_pc_model.fit(X_train, y_train)

#  Make predictions on the test data 
prediction = pc_pc_model.predict(X_test)

knr_score = mean_squared_error(y_test, prediction, squared = False)
knr_r2 = r2_score(y_test, prediction)

print('The rmse and r2 for pc_pc is: ')
print(knr_score)
print(knr_r2)

The rmse and r2 for pc_pc is: 
3.426006295040495
0.37481255679770886


SHELTER

In [73]:
y = df_encode['SHELTER']
X = df_encode[shelter_top_4_features]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 1234)

In [74]:
shelter_model = LinearRegression()

shelter_model.fit(X_train, y_train)

prediction = shelter_model.predict(X_test)

knr_score = mean_squared_error(y_test, prediction, squared = False)
knr_r2 = r2_score(y_test, prediction)

print('The rmse and r2 for shelter is: ')
print(knr_score)
print(knr_r2)

The rmse and r2 for shelter is: 
4.4973167841567605
0.8305156274547325


WASH

In [75]:
y = df_encode['WASH']
X = df_encode[wash_top_4_features]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 1234)

In [76]:
wash_model = LinearRegression()

wash_model.fit(X_train, y_train)

prediction = wash_model.predict(X_test)

knr_score = mean_squared_error(y_test, prediction, squared = False)
knr_r2 = r2_score(y_test, prediction)

print('The rmse and r2 for shelter is: ')
print(knr_score)
print(knr_r2)

The rmse and r2 for shelter is: 
3.0528120559475393
0.8664121061650667
