In [76]:
import pandas as pd
import numpy as np
import os
import re
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import pickle

In [77]:
def getDate(filename):
    match_str = re.search(r'\d{4}-\d{2}-\d{2}',filename)
    try:
        return datetime.datetime.strptime(match_str.group(), '%Y-%m-%d').date()
    except ValueError:
        return None

In [78]:
def cleanColumn(name):
    name = name.upper()
    clean_name = re.sub(r'[^a-zA-Z0-9_]', '', name)
    clean_name = clean_name.replace('\n', ' ')
    if clean_name == 'SHELTERNFI':
        clean_name = 'SHELTER'
    if clean_name == 'PROTECTIONCP':
        clean_name = 'PC_CP'
    if clean_name == 'PROTECTIONGBV':
        clean_name = 'PC_GBV'
    if clean_name == 'PROTECTIONMA':
        clean_name = 'PC_MA'
    if clean_name == 'TOTALREACHED':
        clean_name = 'PEOPLEREACHED'
    return clean_name

In [79]:
#using csv and xlsx
csv_dir = 'ukraine_data_excel'


In [80]:
#using xlsx
sheet1_df = []
sheet2_df = []
sheet3_df = []

In [81]:
#using xlsx
for filename in os.listdir(csv_dir):
  if filename.endswith('.xlsx'):
    file_path = os.path.join(csv_dir,filename)
    excel_df = pd.ExcelFile(file_path)
    files_date = getDate(filename)
    for sheet in excel_df.sheet_names:
      df = pd.read_excel(excel_df,sheet_name=sheet)
      df.columns = [cleanColumn(col) for col in df.columns]
      df['DATE'] = files_date
      if sheet == 'Num_of_Orgs_by_Oblast':
        sheet1_df.append(df)
      elif sheet == 'People_Reached_by_Oblast':
        sheet2_df.append(df)
      elif sheet == 'UDE_Inputs':
        sheet3_df.append(df)

In [82]:
combined_sheet1 = pd.concat(sheet1_df, keys=[str(df['DATE'].iloc[0]) for df in sheet1_df])
res_sheet1 = combined_sheet1.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()
merged_sheet1 = pd.merge(res_sheet1, combined_sheet1[['ADMIN1_ID', 'DATE', 'OBLAST']], on=['ADMIN1_ID', 'DATE'], how='left')

  res_sheet1 = combined_sheet1.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()


In [83]:
combined_sheet2 = pd.concat(sheet2_df, keys=[str(df['DATE'].iloc[0]) for df in sheet2_df])
res_sheet2 = combined_sheet2.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()
columns_to_merge = ['ADMIN1_ID', 'DATE']
merged_sheet2 = pd.merge(combined_sheet2[combined_sheet2.columns.to_list()], res_sheet2[columns_to_merge] , on=['ADMIN1_ID', 'DATE'], how='right')

  res_sheet2 = combined_sheet2.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()


In [84]:
merged_sheet2.head()

Unnamed: 0,OBLAST,ADM1_ID,EDUCATION,FSL,CP,GBV,MINEACTION,PROTECTIONTOTAL,HEALTH,MPC,NUTRITION,SHELTER,WASH,PEOPLEREACHED,DATE,ADMIN1_ID,GENERALPROTECTION
0,Vinnytska,,2800.0,7900.0,100.0,100.0,500.0,11900.0,4600.0,91200.0,,15800.0,2300.0,91200.0,2022-04-28,UA05,11200.0
1,Vinnytska,,2800.0,14300.0,100.0,4100.0,500.0,14300.0,4600.0,113000.0,,16200.0,2300.0,113000.0,2022-05-06,UA05,9600.0
2,Vinnytska,,4900.0,15500.0,200.0,4100.0,500.0,16300.0,9900.0,138800.0,,17500.0,3700.0,138800.0,2022-05-12,UA05,11400.0
3,Vinnytska,,9000.0,17500.0,500.0,6100.0,500.0,20300.0,23200.0,140600.0,,17500.0,3700.0,140600.0,2022-05-19,UA05,
4,Vinnytska,,11500.0,17600.0,4400.0,6200.0,700.0,25600.0,23300.0,216600.0,,21700.0,3800.0,216600.0,2022-05-26,UA05,


In [85]:
merged_sheet2.shape

(675, 17)

In [86]:
merged_sheet2.fillna(0,inplace=True)
merged_sheet2.drop(columns=['ADM1_ID'],inplace=True)

In [87]:
merged_sheet1.isna().any().any()

False

In [88]:
combined_sheet3 = pd.concat(sheet3_df, keys=[str(df['DATE'].iloc[0]) for df in sheet3_df])
res_sheet3 = combined_sheet3.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()
merged_sheet3 = pd.merge(res_sheet3, combined_sheet3[['ADMIN1_ID', 'DATE', 'OBLAST']], on=['ADMIN1_ID', 'DATE'], how='left')

  res_sheet3 = combined_sheet3.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()


In [89]:
#Num_of_Orgs_by_Oblast
merged_sheet1.iloc[20:35]

Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,LOGISTICS,MPC,NUTRITION,SHELTER,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,WASH,TOTAL,PC_PC,OBLAST
20,UA05,2022-09-22,2.0,1.0,0.0,4.0,20.0,13.0,0.0,6.0,0.0,7.0,0.0,4.0,3.0,3.0,9.0,58.0,0.0,Vinnytska
21,UA05,2022-09-29,6.0,2.0,0.0,20.0,68.0,48.0,0.0,34.0,0.0,46.0,0.0,36.0,14.0,8.0,30.0,214.0,0.0,Vinnytska
22,UA05,2022-09-29,6.0,2.0,0.0,20.0,68.0,48.0,0.0,34.0,0.0,46.0,0.0,36.0,14.0,8.0,30.0,214.0,0.0,Vinnytska
23,UA05,2022-10-13,3.0,1.0,0.0,11.0,35.0,24.0,0.0,17.0,0.0,23.0,0.0,25.0,16.0,5.0,13.0,123.0,0.0,Vinnytska
24,UA05,2022-10-27,3.0,1.0,0.0,13.0,36.0,24.0,0.0,17.0,0.0,23.0,0.0,25.0,10.0,5.0,13.0,119.0,0.0,Vinnytska
25,UA05,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Vinnytska
26,UA05,2022-11-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Vinnytska
27,UA05,2022-12-08,3.0,1.0,0.0,13.0,36.0,27.0,0.0,18.0,0.0,24.0,0.0,26.0,11.0,7.0,13.0,125.0,0.0,Vinnytska
28,UA05,2022-12-22,2.0,1.0,0.0,13.0,36.0,33.0,0.0,18.0,0.0,24.0,0.0,29.0,11.0,8.0,13.0,130.0,0.0,Vinnytska
29,UA05,2023-01-05,3.0,1.0,1.0,17.0,39.0,33.0,0.0,19.0,0.0,25.0,0.0,33.0,10.0,8.0,14.0,138.0,0.0,Vinnytska


In [90]:
#People_Reached_by_Oblast
merged_sheet2.iloc[10:25]

Unnamed: 0,OBLAST,EDUCATION,FSL,CP,GBV,MINEACTION,PROTECTIONTOTAL,HEALTH,MPC,NUTRITION,SHELTER,WASH,PEOPLEREACHED,DATE,ADMIN1_ID,GENERALPROTECTION
10,Vinnytska,16793,39158.0,51426.0,9326.0,99772.0,197230.0,38960.0,227756.0,0,65765.0,391243.0,391243.0,2022-07-28,UA05,0.0
11,Vinnytska,16801,39158.0,51426.0,9326.0,102279.0,199737.0,38960.0,229081.0,0,74073.0,391243.0,391243.0,2022-08-04,UA05,0.0
12,Vinnytska,16812,39158.0,67831.0,21467.0,102669.0,229031.0,38960.0,238079.0,0,84680.0,395394.0,395394.0,2022-08-11,UA05,0.0
13,Vinnytska,16824,39158.0,67831.0,21467.0,102669.0,229031.0,43295.0,238270.0,0,84955.0,395394.0,395394.0,2022-08-18,UA05,0.0
14,Vinnytska,18424,39158.0,67831.0,21467.0,102669.0,229802.0,43295.0,231601.0,0,85408.0,395394.0,395394.0,2022-08-25,UA05,0.0
15,Vinnytska,18439,39158.0,67831.0,21467.0,105178.0,246444.0,88760.0,244098.0,0,103967.0,395394.0,395394.0,2022-09-08,UA05,0.0
16,Vinnytska,18442,39158.0,107050.0,19299.0,105178.0,275036.0,88760.0,231215.0,0,105146.0,395394.0,395394.0,2022-09-15,UA05,0.0
17,Vinnytska,18442,39158.0,107050.0,19299.0,105178.0,275036.0,88760.0,232815.0,0,105223.0,395394.0,395394.0,2022-09-22,UA05,0.0
18,Vinnytska,18505,39158.0,107050.0,19299.0,105178.0,275036.0,88760.0,243513.0,0,107109.0,395394.0,395394.0,2022-09-29,UA05,0.0
19,Vinnytska,18505,39158.0,107050.0,19299.0,105178.0,275036.0,88760.0,243513.0,0,107109.0,395394.0,395394.0,2022-09-29,UA05,0.0


In [91]:
merged_sheet3.iloc[20:35]

Unnamed: 0,ADMIN1_ID,DATE,PEOPLEREACHED,NUMBEROFORGANISATIONS,OBLAST
20,UA05,2022-09-22,395394,58,Vinnytska
21,UA05,2022-09-29,790788,214,Vinnytska
22,UA05,2022-09-29,790788,214,Vinnytska
23,UA05,2022-10-13,395394,123,Vinnytska
24,UA05,2022-10-27,395394,119,Vinnytska
25,UA05,2022-11-10,449220,125,Vinnytska
26,UA05,2022-11-25,449220,125,Vinnytska
27,UA05,2022-12-08,449220,125,Vinnytska
28,UA05,2022-12-22,449220,130,Vinnytska
29,UA05,2023-01-05,449220,138,Vinnytska


In [92]:
merged_sheet1['OBLAST'].value_counts()

Vinnytska           30
Odeska              30
Chernihivska        30
Chernivetska        30
Cherkaska           30
Khmelnytska         30
Khersonska          30
Kharkivska          30
Ternopilska         30
Sumska              30
Rivnenska           30
Poltavska           30
Mykolaivska         30
Volynska            30
Lvivska             30
Luhanska            30
Kirovohradska       30
Kyivska             30
Ivano-Frankivska    30
Zaporizka           30
Zakarpatska         30
Zhytomyrska         30
Donetska            30
Dnipropetrovska     30
Kyiv                30
Name: OBLAST, dtype: int64

In [93]:
sum(merged_sheet3['PEOPLEREACHED'].value_counts())

750

In [94]:
merged_sheet1['DATE'].dtype

dtype('O')

In [95]:
########### mergedsheet 1 prep for interpolation

In [96]:
# convert date column to string type
merged_sheet1['DATE'] = merged_sheet1['DATE'].astype(str)
#  the specific date you want to find
specific_date_1006 = '2022-10-06'
specific_date_1125 = '2022-11-25'
specific_date_1110 = '2022-11-10'
# filter the df for rows with the specific date
filtered_values1006 = merged_sheet1.loc[merged_sheet1['DATE'] == specific_date_1006]
filtered_values1110 = merged_sheet1.loc[merged_sheet1['DATE'] == specific_date_1110]
filtered_values1125 = merged_sheet1.loc[merged_sheet1['DATE'] == specific_date_1125]

filtered_values1110

Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,LOGISTICS,MPC,NUTRITION,SHELTER,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,WASH,TOTAL,PC_PC,OBLAST
25,UA05,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Vinnytska
55,UA07,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Volynska
85,UA12,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Dnipropetrovska
115,UA14,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Donetska
145,UA18,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Zhytomyrska
175,UA21,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Zakarpatska
205,UA23,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Zaporizka
235,UA26,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Ivano-Frankivska
265,UA32,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Kyivska
295,UA35,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Kirovohradska


In [97]:
# replace zero values in the empty tuple to NAN for future linear interpolation of ORGS
filtered_values1006= filtered_values1006.replace(0,np.nan)
filtered_values1110= filtered_values1110.replace(0,np.nan)
filtered_values1125= filtered_values1125.replace(0,np.nan)

In [98]:
#overwrite the old values
merged_sheet1.loc[merged_sheet1['DATE'] == specific_date_1006] = filtered_values1006
merged_sheet1.loc[merged_sheet1['DATE'] == specific_date_1110] = filtered_values1110
merged_sheet1.loc[merged_sheet1['DATE'] == specific_date_1125] = filtered_values1125

In [99]:
########### mergedsheet 2 prep for interpolation

In [100]:
# Convert date column to string type
merged_sheet2['DATE'] = merged_sheet2['DATE'].astype(str)
# filter the df for rows with the specific date
filtered_values1006_m2 = merged_sheet2.loc[merged_sheet2['DATE'] == specific_date_1006]
filtered_values1110_m2 = merged_sheet2.loc[merged_sheet2['DATE'] == specific_date_1110]
filtered_values1125_m2 = merged_sheet2.loc[merged_sheet2['DATE'] == specific_date_1125]
filtered_values1006_m2

Unnamed: 0,OBLAST,EDUCATION,FSL,CP,GBV,MINEACTION,PROTECTIONTOTAL,HEALTH,MPC,NUTRITION,SHELTER,WASH,PEOPLEREACHED,DATE,ADMIN1_ID,GENERALPROTECTION


In [101]:
# replace zero values in the empty tuple to NAN for future linear interpolation of People
filtered_values1006_m2 = filtered_values1006_m2.replace(0,np.nan)
filtered_values1110_m2 = filtered_values1110_m2.replace(0,np.nan)
filtered_values1125_m2 = filtered_values1125_m2.replace(0,np.nan)

In [102]:
#overwrite the old values
merged_sheet2.loc[merged_sheet2['DATE'] == specific_date_1006] = filtered_values1006_m2
merged_sheet2.loc[merged_sheet2['DATE'] == specific_date_1110] = filtered_values1110_m2
merged_sheet2.loc[merged_sheet2['DATE'] == specific_date_1125] = filtered_values1125_m2

In [103]:
#check for null values
merged_sheet1.isna().any().any()

True

In [104]:
# check for null values in specific row
merged_sheet1.iloc[26].isna().any()

True

In [105]:
#replace dashed in education and nutrition to 0, maybe nan
merged_sheet2 = merged_sheet2.replace('-',0)
e =merged_sheet2[merged_sheet2['EDUCATION']=='-']
n =merged_sheet2[merged_sheet2['NUTRITION']=='-']
e,n

(Empty DataFrame
 Columns: [OBLAST, EDUCATION, FSL, CP, GBV, MINEACTION, PROTECTIONTOTAL, HEALTH, MPC, NUTRITION, SHELTER, WASH, PEOPLEREACHED, DATE, ADMIN1_ID, GENERALPROTECTION]
 Index: [],
 Empty DataFrame
 Columns: [OBLAST, EDUCATION, FSL, CP, GBV, MINEACTION, PROTECTIONTOTAL, HEALTH, MPC, NUTRITION, SHELTER, WASH, PEOPLEREACHED, DATE, ADMIN1_ID, GENERALPROTECTION]
 Index: [])

In [106]:
#filled null values using linear interpolation
num_of_org_interpolated_df = merged_sheet1.interpolate(method='linear')

In [107]:
num_of_org_interpolated_df.isna().any().any()

False

In [108]:
# rounded all int and float dtypes since interpolation caused floats
num_of_org_interpolated_df = num_of_org_interpolated_df.applymap(lambda x: round(x, 1) if isinstance(x, (int, float)) else x)

In [109]:
num_of_org_interpolated_df.loc[num_of_org_interpolated_df['DATE']==specific_date_1110]

Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,LOGISTICS,MPC,NUTRITION,SHELTER,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,WASH,TOTAL,PC_PC,OBLAST
25,UA05,2022-11-10,3.0,1.0,0.0,13.0,36.0,25.0,0.0,17.3,0.0,23.3,0.0,25.3,10.3,5.7,13.0,121.0,0.0,Vinnytska
55,UA07,2022-11-10,1.0,0.0,0.0,7.7,20.0,11.0,0.0,14.3,1.0,13.0,0.0,23.0,11.3,6.3,8.0,83.3,0.0,Volynska
85,UA12,2022-11-10,4.0,1.0,1.0,15.3,107.7,39.3,1.0,25.0,4.0,46.7,0.0,33.0,12.3,8.0,30.0,216.7,0.0,Dnipropetrovska
115,UA14,2022-11-10,0.0,1.0,0.0,8.7,83.7,40.3,0.0,29.3,3.0,21.3,0.0,25.7,11.3,8.3,29.0,138.3,0.0,Donetska
145,UA18,2022-11-10,0.0,0.0,0.0,10.3,31.3,17.3,0.0,17.0,0.0,10.0,0.0,14.0,7.3,5.7,9.0,89.0,0.0,Zhytomyrska
175,UA21,2022-11-10,3.0,1.0,0.0,16.3,34.3,16.3,0.0,19.0,1.0,17.0,0.0,28.7,11.7,7.7,15.0,116.3,0.0,Zakarpatska
205,UA23,2022-11-10,3.0,0.0,0.0,11.0,94.0,25.7,0.0,26.0,3.0,27.0,0.0,23.3,13.3,5.7,23.0,144.0,0.0,Zaporizka
235,UA26,2022-11-10,3.0,0.0,0.0,16.3,37.0,15.7,0.0,17.3,1.0,15.7,0.0,27.7,10.3,8.3,15.3,110.3,0.0,Ivano-Frankivska
265,UA32,2022-11-10,1.0,0.0,0.0,13.3,61.0,20.7,0.0,24.3,2.0,23.3,0.0,21.0,9.0,7.3,17.0,141.7,0.0,Kyivska
295,UA35,2022-11-10,3.0,1.0,0.0,12.7,39.7,15.7,1.0,16.0,2.0,14.7,0.0,13.7,5.0,6.3,11.0,109.0,0.0,Kirovohradska


In [110]:
num_of_org_interpolated_df.sort_values(by=['DATE'])

Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,LOGISTICS,MPC,NUTRITION,SHELTER,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,WASH,TOTAL,PC_PC,OBLAST
0,UA05,2022-04-28,2.0,1.0,1.0,5.0,17.0,16.0,0.0,10.0,0.0,6.0,16.0,2.0,2.0,2.0,7.0,42.0,11.0,Vinnytska
690,UA74,2022-04-28,1.0,0.0,0.0,1.0,17.0,12.0,0.0,6.0,2.0,2.0,9.0,1.0,0.0,1.0,10.0,36.0,8.0,Chernihivska
60,UA12,2022-04-28,1.0,1.0,1.0,3.0,31.0,19.0,0.0,12.0,2.0,10.0,21.0,5.0,4.0,2.0,15.0,58.0,14.0,Dnipropetrovska
660,UA73,2022-04-28,2.0,1.0,0.0,3.0,22.0,9.0,0.0,10.0,3.0,6.0,17.0,5.0,2.0,2.0,11.0,50.0,12.0,Chernivetska
90,UA14,2022-04-28,0.0,1.0,0.0,2.0,48.0,26.0,0.0,12.0,2.0,9.0,29.0,7.0,4.0,1.0,18.0,75.0,21.0,Donetska
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,UA46,2023-01-05,4.0,1.0,1.0,23.0,62.0,52.0,1.0,26.0,5.0,27.0,0.0,41.0,20.0,8.0,23.0,194.0,0.0,Lvivska
569,UA63,2023-01-05,3.0,0.0,1.0,13.0,111.0,49.0,0.0,24.0,3.0,33.0,0.0,26.0,7.0,10.0,25.0,175.0,0.0,Kharkivska
719,UA74,2023-01-05,0.0,0.0,0.0,18.0,56.0,29.0,0.0,20.0,3.0,33.0,0.0,16.0,7.0,9.0,17.0,149.0,0.0,Chernihivska
89,UA12,2023-01-05,3.0,1.0,1.0,19.0,113.0,52.0,1.0,33.0,4.0,51.0,0.0,40.0,12.0,9.0,32.0,244.0,0.0,Dnipropetrovska


In [111]:
num_of_org_interpolated_df.describe()

Unnamed: 0,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,LOGISTICS,MPC,NUTRITION,SHELTER,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,WASH,TOTAL,PC_PC
count,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0
mean,1.004,0.405333,0.206667,6.906667,37.310667,21.125333,0.252,14.957333,1.725333,13.284,5.958667,12.408,6.150667,4.582667,13.702667,85.046667,1.804
std,1.310993,0.542922,0.428782,5.333751,27.023168,13.969423,0.475532,8.484305,1.526542,9.633194,9.210034,9.950024,4.399605,2.564004,8.254102,50.612416,4.586243
min,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,14.0,0.0
25%,0.0,0.0,0.0,3.0,20.0,11.0,0.0,9.0,1.0,6.25,0.0,5.0,3.0,3.0,8.0,49.0,0.0
50%,1.0,0.0,0.0,6.0,31.0,17.0,0.0,13.0,1.0,11.0,0.0,9.5,5.0,4.0,13.0,73.0,0.0
75%,2.0,1.0,0.0,10.0,45.0,26.825,0.0,18.0,3.0,17.0,13.0,18.0,8.0,6.7,16.75,108.0,0.0
max,8.0,2.0,2.0,32.0,202.0,100.0,2.0,56.0,10.0,80.0,42.0,52.0,30.0,16.0,58.0,364.0,28.0


In [112]:
#filled null values using linear interpolation for merged sheet2
people_reached_interpolated_df = merged_sheet2.interpolate(method='linear')
people_reached_interpolated_df.fillna(method='ffill',inplace=True)

In [113]:
#check for null
people_reached_interpolated_df.isna().any().any()

False

In [114]:
# rounded all int and float dtypes since interpolation caused floats
people_reached_interpolated_df = people_reached_interpolated_df.applymap(lambda x: round(x, 1) if isinstance(x, (int, float)) else x)

In [115]:
people_reached_interpolated_df.loc[people_reached_interpolated_df['DATE']==specific_date_1125]

Unnamed: 0,OBLAST,EDUCATION,FSL,CP,GBV,MINEACTION,PROTECTIONTOTAL,HEALTH,MPC,NUTRITION,SHELTER,WASH,PEOPLEREACHED,DATE,ADMIN1_ID,GENERALPROTECTION
23,Vinnytska,19172.3,39158.0,131268.0,20120.0,107388.0,228971.0,105527.0,260786.3,0.0,150959.3,431278.0,431278.0,2022-11-25,UA05,0.0
50,Volynska,6099.7,48442.0,117463.3,24004.7,68398.3,166848.3,284201.3,158911.3,396.0,26462.0,157799.0,292047.0,2022-11-25,UA07,0.0
77,Dnipropetrovska,51344.7,768589.0,90438.7,26215.7,313910.3,324489.7,549308.3,447314.3,22773.0,289301.7,775382.0,804355.0,2022-11-25,UA12,0.0
104,Donetska,38916.7,444975.0,172301.7,11220.3,230246.7,425259.0,655598.7,125237.7,12467.0,219541.0,616792.7,655598.7,2022-11-25,UA14,0.0
131,Zhytomyrska,19828.3,29487.0,144915.0,19051.0,99233.3,167760.0,361190.0,168676.7,0.0,29663.7,365963.3,372856.0,2022-11-25,UA18,0.0
158,Zakarpatska,62461.3,81575.0,152583.3,27812.3,75460.7,452974.7,291196.0,249368.7,0.0,105656.7,208017.0,452974.7,2022-11-25,UA21,0.0
185,Zaporizka,9047.3,584573.0,94540.0,12893.0,150531.3,206109.0,150747.0,243850.7,8080.0,78324.3,103080.7,584573.0,2022-11-25,UA23,0.0
212,Ivano-Frankivska,47233.7,113680.0,173289.7,26016.3,88800.0,261799.7,227739.7,151646.0,0.0,101167.0,190192.7,302635.3,2022-11-25,UA26,0.0
239,Kyivska,11944.7,282016.0,85651.0,12283.3,287355.3,396295.0,738844.7,139756.0,7709.0,41635.3,57351.3,738844.7,2022-11-25,UA32,0.0
266,Kirovohradska,26659.0,93035.0,29324.0,1953.0,72881.3,41276.7,563398.7,125677.7,3960.0,56457.3,17572.7,563398.7,2022-11-25,UA35,0.0


In [116]:
#NUM OF ORGS
# create label encoder instance
label_encoder = LabelEncoder()
#fit label encoder to transform Oblast column
encoded_oblast = label_encoder.fit_transform(num_of_org_interpolated_df['OBLAST'])
#create copy
num_of_org_df_encoded = num_of_org_interpolated_df.copy()
#Add column and assign values
num_of_org_df_encoded['OBLAST_ENCODED'] = encoded_oblast

In [117]:
#NUM OF ORGS
# create label encoder instance
label_encoder = LabelEncoder()
#fit label encoder to transform Date column
encoded_oblast = label_encoder.fit_transform(num_of_org_df_encoded['DATE'])
#create copy
num_of_org_df_encode = num_of_org_df_encoded.copy()
#Add column and assign values
num_of_org_df_encode['DATE_ENCODED'] = encoded_oblast

In [118]:
#PEOPLE REACHED
# create label encoder instance
label_encoder = LabelEncoder()
#fit label encoder to transform Oblast column
encoded_oblast = label_encoder.fit_transform(people_reached_interpolated_df['OBLAST'])
#create copy
people_reached_df_encoded = people_reached_interpolated_df.copy()
#Add column and assign values
people_reached_df_encoded['OBLAST_ENCODED'] = encoded_oblast

In [119]:
#PEOPLE REACHED
# create label encoder instance
label_encoder = LabelEncoder()
#fit label encoder to transform Date column
encoded_oblast = label_encoder.fit_transform(people_reached_df_encoded['DATE'])
#create copy
people_reached_df_encode = people_reached_df_encoded.copy()
#Add column and assign values
people_reached_df_encode['DATE_ENCODED'] = encoded_oblast

In [120]:
#changing the name of the df
people_reached = people_reached_df_encode
num_of_org = num_of_org_df_encode

In [121]:
people_reached.sort_values(by=['DATE'])

Unnamed: 0,OBLAST,EDUCATION,FSL,CP,GBV,MINEACTION,PROTECTIONTOTAL,HEALTH,MPC,NUTRITION,SHELTER,WASH,PEOPLEREACHED,DATE,ADMIN1_ID,GENERALPROTECTION,OBLAST_ENCODED,DATE_ENCODED
0,Vinnytska,2800.0,7900.0,100.0,100.0,500.0,11900.0,4600.0,91200.0,0.0,15800.0,2300.0,91200.0,2022-04-28,UA05,11200.0,20,0
297,Lvivska,13800.0,374000.0,500.0,0.0,100.0,38600.0,33300.0,60300.0,300.0,34700.0,3100.0,374000.0,2022-04-28,UA46,37900.0,13,0
378,Poltavska,1700.0,242200.0,2500.0,100.0,200.0,10800.0,1000.0,1700.0,0.0,400.0,300.0,242200.0,2022-04-28,UA53,8000.0,16,0
621,Chernihivska,0.0,171600.0,0.0,0.0,0.0,900.0,38000.0,2100.0,0.0,200.0,25000.0,171600.0,2022-04-28,UA74,900.0,1,0
54,Dnipropetrovska,400.0,231800.0,400.0,2200.0,3700.0,24900.0,34200.0,23500.0,13300.0,18800.0,4000.0,231800.0,2022-04-28,UA12,18600.0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53,Volynska,9301.0,48442.0,156594.0,24265.0,69061.0,197390.0,290618.0,178808.0,396.0,29013.0,89246.0,290618.0,2023-01-05,UA07,0.0,21,25
296,Luhanska,1628.0,95665.0,30051.0,6942.0,120298.0,54648.0,325102.0,31431.0,5627.0,19377.0,151824.0,325102.0,2023-01-05,UA44,0.0,12,25
377,Odeska,30339.0,489739.0,30113.0,9424.0,184449.0,130637.0,412864.0,346180.0,9333.0,88215.0,159352.0,489739.0,2023-01-05,UA51,0.0,15,25
431,Rivnenska,21059.0,135289.0,179968.0,17690.0,77063.0,237598.0,351300.0,166146.0,396.0,41267.0,968.0,351300.0,2023-01-05,UA56,0.0,17,25


RANDOM FOREST - PC_PC

In [122]:
num_of_org.drop(columns = ['ADMIN1_ID', 'DATE', 'OBLAST'], inplace = True)

In [123]:
y = num_of_org['PC_PC']
X = num_of_org.drop(columns = ['PC_PC'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05, random_state = 1234)

In [124]:
param_grid = {'n_estimators' : [10,20,30,40,50,60,70,80,90,100]}

In [125]:
print('start')
model = RandomForestRegressor()

grid = GridSearchCV(model, param_grid, cv = 5)

grid_search = grid.fit(X_train, y_train)

print('finish')

start
finish


In [126]:
print("The best hyperparameters are: ")
print(grid_search.best_params_)
best_n = grid_search.best_params_['n_estimators']

The best hyperparameters are: 
{'n_estimators': 30}


In [127]:
pc_pc_model = RandomForestRegressor(criterion = 'squared_error', n_estimators = best_n)

pc_pc_model.fit(X_train, y_train)

prediction = pc_pc_model.predict(X_test)

knr_score = mean_squared_error(y_test, prediction, squared = False)
knr_r2 = r2_score(y_test, prediction)

print('The rmse and r2 for PC_PC is: ')
print(knr_score)
print(knr_r2)

print('Running Cross-Validation...')

The rmse and r2 for PC_PC is: 
0.14337208778404387
0.9990574977861041
Running Cross-Validation...


RANDOM FOREST - SHELTER

In [128]:
y = num_of_org['SHELTER']
X = num_of_org.drop(columns = ['SHELTER'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 1234)

In [129]:
print('start')
model = RandomForestRegressor()

grid = GridSearchCV(model, param_grid, cv = 5)

grid_search = grid.fit(X_train, y_train)

print('finish')

start
finish


In [130]:
print("The best hyperparameters are: ")
print(grid_search.best_params_)
best_n = grid_search.best_params_['n_estimators']

The best hyperparameters are: 
{'n_estimators': 100}


In [131]:
shelter_model = RandomForestRegressor(criterion = 'squared_error', n_estimators = best_n)

shelter_model.fit(X_train, y_train)

prediction = shelter_model.predict(X_test)

knr_score = mean_squared_error(y_test, prediction, squared = False)
knr_r2 = r2_score(y_test, prediction)

print('The rmse and r2 for shelter is: ')
print(knr_score)
print(knr_r2)

The rmse and r2 for shelter is: 
2.1046408941511454
0.9423673421938991


RANDOM FOREST - WASH

In [132]:
y = num_of_org['WASH']
X = num_of_org.drop(columns = ['WASH'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 1234)

In [133]:
print('start')
model = RandomForestRegressor()

grid = GridSearchCV(model, param_grid, cv = 5)

grid_search = grid.fit(X_train, y_train)

print('finish')

start
finish


In [134]:
print("The best hyperparameters are: ")
print(grid_search.best_params_)
best_n = grid_search.best_params_['n_estimators']

The best hyperparameters are: 
{'n_estimators': 90}


In [135]:
wash_model = RandomForestRegressor(criterion = 'squared_error', n_estimators = best_n)

wash_model.fit(X_train, y_train)

prediction = wash_model.predict(X_test)

knr_score = mean_squared_error(y_test, prediction, squared = False)
knr_r2 = r2_score(y_test, prediction)

print('The rmse and r2 for wash is: ')
print(knr_score)
print(knr_r2)

The rmse and r2 for wash is: 
1.5970307067145895
0.9592188276976424


GBR - PC_PC

In [136]:
y = num_of_org['PC_PC']
X = num_of_org.drop(columns = ['PC_PC'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05, random_state = 1234)

In [137]:
depth_range = [2**n for n in range(2,5)]

In [138]:
param_grid = {'n_estimators' : [10,20,30,40,50,60,70,80,90,100]}

In [139]:
#setting up hyperparameters
param_grid = {'max_depth' : depth_range, 'n_estimators' : [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}

In [140]:
#finding the best hyperparameters
print('start')

model = GradientBoostingRegressor()

grid = GridSearchCV(model, param_grid, cv = 2)

grid_search = grid.fit(X_train, y_train)
print('finish')

start
finish


In [141]:
#saving best hyperparameters
print(grid_search.best_params_)

best_max = grid_search.best_params_['max_depth']

best_n = grid_search.best_params_['n_estimators']


{'max_depth': 4, 'n_estimators': 80}


In [142]:
pc_pc_model = GradientBoostingRegressor(max_depth = best_max, n_estimators = best_n)

pc_pc_model.fit(X_train, y_train)

prediction = pc_pc_model.predict(X_test)

m_score = mean_squared_error(y_test, prediction, squared = False)
m_r2 = r2_score(y_test, prediction)

print('The rmse and r2 for PC_PC is: ')
print(m_score)
print(m_r2)

The rmse and r2 for PC_PC is: 
0.2667430714063942
0.9967375831228861


GBR - SHELTER

In [143]:
y = num_of_org['SHELTER']
X = num_of_org.drop(columns = ['SHELTER'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 1234)

In [144]:
#finding the best hyperparameters
print('start')

model = GradientBoostingRegressor()

grid = GridSearchCV(model, param_grid, cv = 2)

grid_search = grid.fit(X_train, y_train)
print('finish')

start
finish


In [145]:
#saving best hyperparameters
print(grid_search.best_params_)

best_max = grid_search.best_params_['max_depth']

best_n = grid_search.best_params_['n_estimators']


{'max_depth': 4, 'n_estimators': 100}


In [146]:
shelter_model = GradientBoostingRegressor(max_depth = best_max, n_estimators = best_n)

shelter_model.fit(X_train, y_train)

prediction = shelter_model.predict(X_test)

m_score = mean_squared_error(y_test, prediction, squared = False)
m_r2 = r2_score(y_test, prediction)

print('The rmse and r2 for shelter is: ')
print(m_score)
print(m_r2)

The rmse and r2 for shelter is: 
1.7024168095079766
0.9634854109757648


GBR - WASH

In [147]:
y = num_of_org['WASH']
X = num_of_org.drop(columns = ['WASH'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 1234)

In [148]:
#finding the best hyperparameters
print('start')

model = GradientBoostingRegressor()

grid = GridSearchCV(model, param_grid, cv = 2)

grid_search = grid.fit(X_train, y_train)
print('finish')

start
finish


In [149]:
#saving best hyperparameters
print(grid_search.best_params_)

best_max = grid_search.best_params_['max_depth']

best_n = grid_search.best_params_['n_estimators']

{'max_depth': 4, 'n_estimators': 100}


In [150]:
wash_model = GradientBoostingRegressor(max_depth = best_max, n_estimators = best_n)

wash_model.fit(X_train, y_train)

prediction = wash_model.predict(X_test)

m_score = mean_squared_error(y_test, prediction, squared = False)
m_r2 = r2_score(y_test, prediction)

print('The rmse and r2 for wash is: ')
print(m_score)
print(m_r2)

The rmse and r2 for wash is: 
1.0750471405206719
0.9812083512338134
