In [96]:
import pandas as pd
import numpy as np
import os
import re
import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [97]:
def getDate(filename):
    match_str = re.search(r'\d{4}-\d{2}-\d{2}',filename)
    try:
        return datetime.datetime.strptime(match_str.group(), '%Y-%m-%d').date()
    except ValueError:
        return None

In [98]:
def cleanColumn(name):
    name = name.upper()
    clean_name = re.sub(r'[^a-zA-Z0-9_]', '', name)
    clean_name = clean_name.replace('\n', ' ')
    if clean_name == 'SHELTERNFI':
        clean_name = 'SHELTER'
    if clean_name == 'PROTECTIONCP':
        clean_name = 'PC_CP'
    if clean_name == 'PROTECTIONGBV':
        clean_name = 'PC_GBV'
    if clean_name == 'PROTECTIONMA':
        clean_name = 'PC_MA'
    if clean_name == 'TOTALREACHED':
        clean_name = 'PEOPLEREACHED'
    return clean_name

In [99]:
#using csv and xlsx
csv_dir = 'ukraine_data_excel'


In [100]:
#using xlsx
sheet1_df = []
sheet2_df = []
sheet3_df = []

In [101]:
#using xlsx
for filename in os.listdir(csv_dir):
  if filename.endswith('.xlsx'):
    file_path = os.path.join(csv_dir,filename)
    excel_df = pd.ExcelFile(file_path)
    files_date = getDate(filename)
    for sheet in excel_df.sheet_names:
      df = pd.read_excel(excel_df,sheet_name=sheet)
      df.columns = [cleanColumn(col) for col in df.columns]
      df['DATE'] = files_date
      if sheet == 'Num_of_Orgs_by_Oblast':
        sheet1_df.append(df)
      elif sheet == 'People_Reached_by_Oblast':
        sheet2_df.append(df)
      elif sheet == 'UDE_Inputs':
        sheet3_df.append(df)

In [102]:
combined_sheet1 = pd.concat(sheet1_df, keys=[str(df['DATE'].iloc[0]) for df in sheet1_df])
res_sheet1 = combined_sheet1.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()
merged_sheet1 = pd.merge(res_sheet1, combined_sheet1[['ADMIN1_ID', 'DATE', 'OBLAST']], on=['ADMIN1_ID', 'DATE'], how='left')

  res_sheet1 = combined_sheet1.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()


In [103]:
combined_sheet2 = pd.concat(sheet2_df, keys=[str(df['DATE'].iloc[0]) for df in sheet2_df])
res_sheet2 = combined_sheet2.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()
merged_sheet2 = pd.merge(res_sheet2, combined_sheet2[['ADMIN1_ID', 'DATE', 'OBLAST']], on=['ADMIN1_ID', 'DATE'], how='left')

  res_sheet2 = combined_sheet2.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()


In [104]:
combined_sheet3 = pd.concat(sheet3_df, keys=[str(df['DATE'].iloc[0]) for df in sheet3_df])
res_sheet3 = combined_sheet3.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()
merged_sheet3 = pd.merge(res_sheet3, combined_sheet3[['ADMIN1_ID', 'DATE', 'OBLAST']], on=['ADMIN1_ID', 'DATE'], how='left')

  res_sheet3 = combined_sheet3.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()


In [105]:
merged_sheet1.shape

(750, 20)

In [106]:
merged_sheet3.shape

(750, 5)

In [107]:
merged_sheet1.iloc[26]

ADMIN1_ID                UA05
DATE               2022-11-25
CCCM                      0.0
CCS                       0.0
ETC                       0.0
EDUCATION                 0.0
FSL                       0.0
HEALTH                    0.0
MPC                       0.0
NUTRITION                 0.0
PROTECTIONTOTAL           0.0
PC_CP                     0.0
PC_GBV                    0.0
PC_MA                     0.0
PC_PC                     0.0
SHELTER                   0.0
WASH                      0.0
TOTAL                     0.0
LOGISTICS                 0.0
OBLAST              Vinnytska
Name: 26, dtype: object

In [108]:
#Num_of_Orgs_by_Oblast
merged_sheet1.iloc[20:35]


Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS,OBLAST
20,UA05,2022-09-22,2.0,1.0,0.0,4.0,20.0,13.0,6.0,0.0,0.0,4.0,3.0,3.0,0.0,7.0,9.0,58.0,0.0,Vinnytska
21,UA05,2022-09-29,6.0,2.0,0.0,20.0,68.0,48.0,34.0,0.0,0.0,36.0,14.0,8.0,0.0,46.0,30.0,214.0,0.0,Vinnytska
22,UA05,2022-09-29,6.0,2.0,0.0,20.0,68.0,48.0,34.0,0.0,0.0,36.0,14.0,8.0,0.0,46.0,30.0,214.0,0.0,Vinnytska
23,UA05,2022-10-13,3.0,1.0,0.0,11.0,35.0,24.0,17.0,0.0,0.0,25.0,16.0,5.0,0.0,23.0,13.0,123.0,0.0,Vinnytska
24,UA05,2022-10-27,3.0,1.0,0.0,13.0,36.0,24.0,17.0,0.0,0.0,25.0,10.0,5.0,0.0,23.0,13.0,119.0,0.0,Vinnytska
25,UA05,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Vinnytska
26,UA05,2022-11-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Vinnytska
27,UA05,2022-12-08,3.0,1.0,0.0,13.0,36.0,27.0,18.0,0.0,0.0,26.0,11.0,7.0,0.0,24.0,13.0,125.0,0.0,Vinnytska
28,UA05,2022-12-22,2.0,1.0,0.0,13.0,36.0,33.0,18.0,0.0,0.0,29.0,11.0,8.0,0.0,24.0,13.0,130.0,0.0,Vinnytska
29,UA05,2023-01-05,3.0,1.0,1.0,17.0,39.0,33.0,19.0,0.0,0.0,33.0,10.0,8.0,0.0,25.0,14.0,138.0,0.0,Vinnytska


In [109]:
merged_sheet3.iloc[20:35]

Unnamed: 0,ADMIN1_ID,DATE,PEOPLEREACHED,NUMBEROFORGANISATIONS,OBLAST
20,UA05,2022-09-22,395394,58,Vinnytska
21,UA05,2022-09-29,790788,214,Vinnytska
22,UA05,2022-09-29,790788,214,Vinnytska
23,UA05,2022-10-13,395394,123,Vinnytska
24,UA05,2022-10-27,395394,119,Vinnytska
25,UA05,2022-11-10,449220,125,Vinnytska
26,UA05,2022-11-25,449220,125,Vinnytska
27,UA05,2022-12-08,449220,125,Vinnytska
28,UA05,2022-12-22,449220,130,Vinnytska
29,UA05,2023-01-05,449220,138,Vinnytska


In [110]:
merged_sheet1['OBLAST'].value_counts()

Vinnytska           30
Odeska              30
Chernihivska        30
Chernivetska        30
Cherkaska           30
Khmelnytska         30
Khersonska          30
Kharkivska          30
Ternopilska         30
Sumska              30
Rivnenska           30
Poltavska           30
Mykolaivska         30
Volynska            30
Lvivska             30
Luhanska            30
Kirovohradska       30
Kyivska             30
Ivano-Frankivska    30
Zaporizka           30
Zakarpatska         30
Zhytomyrska         30
Donetska            30
Dnipropetrovska     30
Kyiv                30
Name: OBLAST, dtype: int64

In [111]:
sum(merged_sheet3['PEOPLEREACHED'].value_counts())

750

In [112]:
# replace zero values in the empty tuple to NAN for future linear interpolation
#NOTE : THIS IS HARD CODED NOT OPTIMAL
merged_sheet1.iloc[25] = merged_sheet1.iloc[25].replace(0,np.nan)
merged_sheet1.iloc[26] = merged_sheet1.iloc[26].replace(0,np.nan)

merged_sheet3.iloc[25] = merged_sheet3.iloc[25].replace(merged_sheet3.iloc[25]['PEOPLEREACHED'],np.nan)
merged_sheet3.iloc[26] = merged_sheet3.iloc[26].replace(merged_sheet3.iloc[26]['PEOPLEREACHED'],np.nan)


In [113]:
merged_sheet1['PEOPLE_REACHED'] = merged_sheet3['PEOPLEREACHED']

In [114]:
#check for null values
merged_sheet1.isna().any().any()

True

In [115]:
# check for null values in specific row
merged_sheet1.iloc[26].isna().any()

True

In [116]:
#filled null values using linear interpolation
interpolated_df = merged_sheet1.interpolate(method='linear')

In [117]:
interpolated_df.isna().any().any()

False

In [118]:
# rounded all int and float dtypes since interpolation caused floats
interpolated_df = interpolated_df.applymap(lambda x: round(x, 1) if isinstance(x, (int, float)) else x)

In [119]:
interpolated_df.iloc[20:35]

Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,...,PC_CP,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS,OBLAST,PEOPLE_REACHED
20,UA05,2022-09-22,2.0,1.0,0.0,4.0,20.0,13.0,6.0,0.0,...,4.0,3.0,3.0,0.0,7.0,9.0,58.0,0.0,Vinnytska,395394.0
21,UA05,2022-09-29,6.0,2.0,0.0,20.0,68.0,48.0,34.0,0.0,...,36.0,14.0,8.0,0.0,46.0,30.0,214.0,0.0,Vinnytska,790788.0
22,UA05,2022-09-29,6.0,2.0,0.0,20.0,68.0,48.0,34.0,0.0,...,36.0,14.0,8.0,0.0,46.0,30.0,214.0,0.0,Vinnytska,790788.0
23,UA05,2022-10-13,3.0,1.0,0.0,11.0,35.0,24.0,17.0,0.0,...,25.0,16.0,5.0,0.0,23.0,13.0,123.0,0.0,Vinnytska,395394.0
24,UA05,2022-10-27,3.0,1.0,0.0,13.0,36.0,24.0,17.0,0.0,...,25.0,10.0,5.0,0.0,23.0,13.0,119.0,0.0,Vinnytska,395394.0
25,UA05,2022-11-10,3.0,1.0,0.0,13.0,36.0,25.0,17.3,0.0,...,25.3,10.3,5.7,0.0,23.3,13.0,121.0,0.0,Vinnytska,413336.0
26,UA05,2022-11-25,3.0,1.0,0.0,13.0,36.0,26.0,17.7,0.0,...,25.7,10.7,6.3,0.0,23.7,13.0,123.0,0.0,Vinnytska,431278.0
27,UA05,2022-12-08,3.0,1.0,0.0,13.0,36.0,27.0,18.0,0.0,...,26.0,11.0,7.0,0.0,24.0,13.0,125.0,0.0,Vinnytska,449220.0
28,UA05,2022-12-22,2.0,1.0,0.0,13.0,36.0,33.0,18.0,0.0,...,29.0,11.0,8.0,0.0,24.0,13.0,130.0,0.0,Vinnytska,449220.0
29,UA05,2023-01-05,3.0,1.0,1.0,17.0,39.0,33.0,19.0,0.0,...,33.0,10.0,8.0,0.0,25.0,14.0,138.0,0.0,Vinnytska,449220.0


In [120]:
interpolated_df.sort_values(by=['DATE'])

Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,...,PC_CP,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS,OBLAST,PEOPLE_REACHED
0,UA05,2022-04-28,2.0,1.0,1.0,5.0,17.0,16.0,10.0,0.0,...,2.0,2.0,2.0,11.0,6.0,7.0,42.0,0.0,Vinnytska,91200.0
690,UA74,2022-04-28,1.0,0.0,0.0,1.0,17.0,12.0,6.0,2.0,...,1.0,0.0,1.0,8.0,2.0,10.0,36.0,0.0,Chernihivska,171600.0
60,UA12,2022-04-28,1.0,1.0,1.0,3.0,31.0,19.0,12.0,2.0,...,5.0,4.0,2.0,14.0,10.0,15.0,58.0,0.0,Dnipropetrovska,231800.0
660,UA73,2022-04-28,2.0,1.0,0.0,3.0,22.0,9.0,10.0,3.0,...,5.0,2.0,2.0,12.0,6.0,11.0,50.0,0.0,Chernivetska,31200.0
90,UA14,2022-04-28,0.0,1.0,0.0,2.0,48.0,26.0,12.0,2.0,...,7.0,4.0,1.0,21.0,9.0,18.0,75.0,0.0,Donetska,122100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,UA46,2023-01-05,4.0,1.0,1.0,23.0,62.0,52.0,26.0,5.0,...,41.0,20.0,8.0,0.0,27.0,23.0,194.0,1.0,Lvivska,957877.0
569,UA63,2023-01-05,3.0,0.0,1.0,13.0,111.0,49.0,24.0,3.0,...,26.0,7.0,10.0,0.0,33.0,25.0,175.0,0.0,Kharkivska,2070173.0
719,UA74,2023-01-05,0.0,0.0,0.0,18.0,56.0,29.0,20.0,3.0,...,16.0,7.0,9.0,0.0,33.0,17.0,149.0,0.0,Chernihivska,477499.0
89,UA12,2023-01-05,3.0,1.0,1.0,19.0,113.0,52.0,33.0,4.0,...,40.0,12.0,9.0,0.0,51.0,32.0,244.0,1.0,Dnipropetrovska,847604.0


In [121]:
interpolated_df.describe()

Unnamed: 0,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS,PEOPLE_REACHED
count,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0
mean,0.908,0.381333,0.192,6.189333,34.133333,19.610667,13.705333,1.602667,5.958667,11.078667,5.553333,4.14,1.804,12.088,12.689333,77.418667,0.233333,470911.5
std,1.287016,0.53818,0.420363,5.37997,27.577556,14.599793,9.082728,1.544876,9.210034,9.96683,4.454329,2.705994,4.586243,9.815506,8.739412,53.084492,0.465307,437962.7
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13800.0
25%,0.0,0.0,0.0,2.0,18.0,10.0,9.0,0.0,0.0,4.0,2.0,2.0,0.0,6.0,7.0,42.0,0.0,217977.5
50%,0.0,0.0,0.0,5.0,29.0,16.0,12.0,1.0,0.0,8.0,5.0,4.0,0.0,10.0,12.0,66.0,0.0,383711.0
75%,1.0,1.0,0.0,9.0,44.0,25.0,18.0,2.0,13.0,16.0,8.0,5.525,0.0,16.0,16.0,101.0,0.0,583976.0
max,8.0,2.0,2.0,32.0,202.0,100.0,56.0,10.0,42.0,52.0,30.0,16.0,28.0,80.0,58.0,364.0,2.0,4140346.0


In [122]:
# create label encoder instance
label_encoder = LabelEncoder()

#fit label encoder to transform Oblast column
encoded_oblast = label_encoder.fit_transform(interpolated_df['OBLAST'])

#create copy
df_encoded = interpolated_df.copy()

#Add column and assign values
df_encoded['OBLAST_ENCODED'] = encoded_oblast

In [123]:
# create label encoder instance
label_encoder = LabelEncoder()

#fit label encoder to transform Date column
encoded_oblast = label_encoder.fit_transform(df_encoded['DATE'])

#create copy
df_encode = df_encoded.copy()

#Add column and assign values
df_encode['DATE_ENCODED'] = encoded_oblast

In [124]:
df_encode.sort_values(by=['DATE'])

Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,...,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS,OBLAST,PEOPLE_REACHED,OBLAST_ENCODED,DATE_ENCODED
0,UA05,2022-04-28,2.0,1.0,1.0,5.0,17.0,16.0,10.0,0.0,...,2.0,11.0,6.0,7.0,42.0,0.0,Vinnytska,91200.0,20,0
690,UA74,2022-04-28,1.0,0.0,0.0,1.0,17.0,12.0,6.0,2.0,...,1.0,8.0,2.0,10.0,36.0,0.0,Chernihivska,171600.0,1,0
60,UA12,2022-04-28,1.0,1.0,1.0,3.0,31.0,19.0,12.0,2.0,...,2.0,14.0,10.0,15.0,58.0,0.0,Dnipropetrovska,231800.0,3,0
660,UA73,2022-04-28,2.0,1.0,0.0,3.0,22.0,9.0,10.0,3.0,...,2.0,12.0,6.0,11.0,50.0,0.0,Chernivetska,31200.0,2,0
90,UA14,2022-04-28,0.0,1.0,0.0,2.0,48.0,26.0,12.0,2.0,...,1.0,21.0,9.0,18.0,75.0,0.0,Donetska,122100.0,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,UA46,2023-01-05,4.0,1.0,1.0,23.0,62.0,52.0,26.0,5.0,...,8.0,0.0,27.0,23.0,194.0,1.0,Lvivska,957877.0,13,28
569,UA63,2023-01-05,3.0,0.0,1.0,13.0,111.0,49.0,24.0,3.0,...,10.0,0.0,33.0,25.0,175.0,0.0,Kharkivska,2070173.0,6,28
719,UA74,2023-01-05,0.0,0.0,0.0,18.0,56.0,29.0,20.0,3.0,...,9.0,0.0,33.0,17.0,149.0,0.0,Chernihivska,477499.0,1,28
89,UA12,2023-01-05,3.0,1.0,1.0,19.0,113.0,52.0,33.0,4.0,...,9.0,0.0,51.0,32.0,244.0,1.0,Dnipropetrovska,847604.0,3,28


In [125]:
df_encoded.columns

Index(['ADMIN1_ID', 'DATE', 'CCCM', 'CCS', 'ETC', 'EDUCATION', 'FSL', 'HEALTH',
       'MPC', 'NUTRITION', 'PROTECTIONTOTAL', 'PC_CP', 'PC_GBV', 'PC_MA',
       'PC_PC', 'SHELTER', 'WASH', 'TOTAL', 'LOGISTICS', 'OBLAST',
       'PEOPLE_REACHED', 'OBLAST_ENCODED'],
      dtype='object')

In [126]:
merged_sheet1.shape

(750, 21)

In [127]:
#People_Reached_by_Oblast
merged_sheet2.head(10)

Unnamed: 0,ADMIN1_ID,DATE,FSL,GENERALPROTECTION,CP,MINEACTION,PROTECTIONTOTAL,HEALTH,MPC,SHELTER,WASH,PEOPLEREACHED,OBLAST
0,UA05,2022-04-28,7900.0,11200.0,100.0,500.0,11900.0,4600.0,91200.0,15800.0,2300.0,91200.0,Vinnytska
1,UA05,2022-05-06,14300.0,9600.0,100.0,500.0,14300.0,4600.0,113000.0,16200.0,2300.0,113000.0,Vinnytska
2,UA05,2022-05-12,15500.0,11400.0,200.0,500.0,16300.0,9900.0,138800.0,17500.0,3700.0,138800.0,Vinnytska
3,UA05,2022-05-19,17500.0,0.0,500.0,500.0,20300.0,23200.0,140600.0,17500.0,3700.0,140600.0,Vinnytska
4,UA05,2022-05-26,17600.0,0.0,4400.0,700.0,25600.0,23300.0,216600.0,21700.0,3800.0,216600.0,Vinnytska
5,UA05,2022-06-02,19000.0,0.0,21500.0,700.0,48000.0,23300.0,236000.0,33300.0,3800.0,236000.0,Vinnytska
6,UA05,2022-06-09,19000.0,0.0,21500.0,700.0,21000.0,23300.0,240100.0,40500.0,23800.0,240100.0,Vinnytska
7,UA05,2022-07-07,39200.0,0.0,46500.0,11800.0,94900.0,27500.0,214600.0,59400.0,42700.0,214600.0,Vinnytska
8,UA05,2022-07-14,39200.0,0.0,35700.0,11800.0,75700.0,27500.0,216000.0,60700.0,42700.0,216000.0,Vinnytska
9,UA05,2022-07-21,39200.0,0.0,35700.0,99800.0,174700.0,39000.0,217200.0,62600.0,42700.0,217200.0,Vinnytska


In [128]:
#UDE_Inputs
merged_sheet3.head()

Unnamed: 0,ADMIN1_ID,DATE,PEOPLEREACHED,NUMBEROFORGANISATIONS,OBLAST
0,UA05,2022-04-28,91200.0,42,Vinnytska
1,UA05,2022-05-06,113000.0,45,Vinnytska
2,UA05,2022-05-12,138800.0,49,Vinnytska
3,UA05,2022-05-19,140600.0,50,Vinnytska
4,UA05,2022-05-26,216600.0,48,Vinnytska


In [129]:
# copy_df = pd.DataFrame(np.nan, index=merged_sheet1.index,columns=merged_sheet1.columns)
# copy_df['DATE'] = merged_sheet1['DATE']
# copy_df['OBLAST'] = merged_sheet1['OBLAST']
# copy_df.head()

In [130]:
# # Create an Excel writer object
# excel_writer1 = pd.ExcelWriter('Data_ Round 33 - Ukraine 5W - 2022-11-10.xlsx', engine='xlsxwriter')
# excel_writer2 = pd.ExcelWriter('Data_ Round 34 - Ukraine 5W - 2022-11-25.xlsx', engine='xlsxwriter')
# # Write each DataFrame to a separate sheet in the Excel file
# copy_df.to_excel(excel_writer1, sheet_name='Num_of_Orgs_by_Oblast', index=False)
# copy_df.to_excel(excel_writer2, sheet_name='Num_of_Orgs_by_Oblast', index=False)

In [131]:
len(merged_sheet1.columns)
merged_sheet1.columns

Index(['ADMIN1_ID', 'DATE', 'CCCM', 'CCS', 'ETC', 'EDUCATION', 'FSL', 'HEALTH',
       'MPC', 'NUTRITION', 'PROTECTIONTOTAL', 'PC_CP', 'PC_GBV', 'PC_MA',
       'PC_PC', 'SHELTER', 'WASH', 'TOTAL', 'LOGISTICS', 'OBLAST',
       'PEOPLE_REACHED'],
      dtype='object')

In [132]:
merged_sheet1.shape

(750, 21)

In [133]:
merged_sheet3.shape

(750, 5)

In [134]:
merged_sheet1.DATE.value_counts()

2022-09-29    50
2022-04-28    25
2022-08-11    25
2022-12-22    25
2022-12-08    25
2022-11-25    25
2022-11-10    25
2022-10-27    25
2022-10-13    25
2022-09-22    25
2022-09-15    25
2022-09-08    25
2022-08-25    25
2022-08-18    25
2022-08-04    25
2022-05-06    25
2022-07-28    25
2022-07-21    25
2022-07-14    25
2022-07-07    25
2022-06-30    25
2022-06-23    25
2022-06-16    25
2022-06-09    25
2022-06-02    25
2022-05-26    25
2022-05-19    25
2022-05-12    25
2023-01-05    25
Name: DATE, dtype: int64

In [135]:
df_encode.head()

Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,...,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS,OBLAST,PEOPLE_REACHED,OBLAST_ENCODED,DATE_ENCODED
0,UA05,2022-04-28,2.0,1.0,1.0,5.0,17.0,16.0,10.0,0.0,...,2.0,11.0,6.0,7.0,42.0,0.0,Vinnytska,91200.0,20,0
1,UA05,2022-05-06,2.0,1.0,1.0,5.0,20.0,17.0,10.0,0.0,...,2.0,11.0,6.0,8.0,45.0,0.0,Vinnytska,113000.0,20,1
2,UA05,2022-05-12,2.0,1.0,1.0,6.0,20.0,18.0,10.0,0.0,...,2.0,14.0,6.0,9.0,49.0,0.0,Vinnytska,138800.0,20,2
3,UA05,2022-05-19,2.0,1.0,1.0,6.0,20.0,20.0,10.0,0.0,...,2.0,14.0,6.0,9.0,50.0,0.0,Vinnytska,140600.0,20,3
4,UA05,2022-05-26,0.0,1.0,1.0,6.0,20.0,20.0,9.0,0.0,...,4.0,12.0,7.0,9.0,48.0,0.0,Vinnytska,216600.0,20,4


In [136]:
#Education, FSL, Health, Logistics
column_corr = ['EDUCATION', 'FSL', 'HEALTH', 'LOGISTICS']
df_encode.corr()[column_corr].sort_values(by = 'EDUCATION', ascending=False)

  df_encode.corr()[column_corr].sort_values(by = 'EDUCATION', ascending=False)


Unnamed: 0,EDUCATION,FSL,HEALTH,LOGISTICS
EDUCATION,1.0,0.671308,0.710664,0.338595
PC_CP,0.875966,0.708309,0.709541,0.357334
TOTAL,0.873112,0.909779,0.841754,0.403699
PC_GBV,0.86402,0.639221,0.739086,0.438138
PC_MA,0.818421,0.704889,0.681301,0.283645
SHELTER,0.78555,0.847987,0.683844,0.357396
MPC,0.779182,0.856869,0.729453,0.233004
HEALTH,0.710664,0.790626,1.0,0.435736
WASH,0.684146,0.924303,0.791537,0.236182
FSL,0.671308,1.0,0.790626,0.255188


In [137]:
#split data into features and label
#label is education

In [138]:
y_education = df_encode['EDUCATION']
X_education = df_encode.drop(columns = ['ADMIN1_ID', 'DATE', 'EDUCATION', 'OBLAST'], axis= 1)

In [139]:
X_education.shape

(750, 19)

In [140]:
X_train, X_test, y_train, y_test = train_test_split(X_education, y_education, test_size=0.25, random_state=1234)

In [141]:
#creating linear regression model
education_model = LinearRegression()

In [142]:
#fitting the model
education_model.fit(X_train, y_train)

In [143]:
education_model.score(X_test, y_test)

0.9024718361710508

In [144]:
#make predictions on the model
education_predict = education_model.predict(X_test)

In [145]:
#evaluate linear regression model
lr_rmse = mean_squared_error(y_test, education_predict, squared=False)
lr_r2 = r2_score(y_test, education_predict)
#end solution
print('[LR] Root Mean Squared Error: {0}'.format(lr_rmse))
print('[LR] R2: {0}'.format(lr_r2))

[LR] Root Mean Squared Error: 1.675140387769516
[LR] R2: 0.9024718361710508


In [146]:
# evaluate the model
mse = mean_squared_error(y_test, education_predict)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 2.806095318736604


In [148]:
# Plot the results
#plt.scatter(X_test, y_test, color='black')
#plt.plot(X_test, education_predict, color='blue', linewidth=3)
#plt.xlabel('X')
#plt.ylabel('y')
#plt.title('Linear Regression Model')
#plt.show()

In [159]:
y_FSL = df_encode['FSL']
X_FSL = df_encode.drop(columns = ['ADMIN1_ID', 'DATE', 'FSL', 'OBLAST'], axis= 1)

In [160]:
X_FSL.shape

(750, 19)

In [161]:
X_train, X_test, y_train, y_test = train_test_split(X_FSL, y_FSL, test_size=0.25, random_state=1234)

In [162]:
#creating linear regression model
FSL_model = LinearRegression()

In [163]:
FSL_model.fit(X_train, y_train)

In [164]:
FSL_model.score(X_test, y_test)

0.9560139112701658

In [166]:
#make predictions on the model
FSL_predict = FSL_model.predict(X_test)

In [167]:
#evaluate linear regression model
lr_rmse = mean_squared_error(y_test, FSL_predict, squared=False)
lr_r2 = r2_score(y_test, FSL_predict)
#end solution
print('[LR] Root Mean Squared Error: {0}'.format(lr_rmse))
print('[LR] R2: {0}'.format(lr_r2))

[LR] Root Mean Squared Error: 5.528470258882991
[LR] R2: 0.9560139112701658


In [168]:
# evaluate the model
mse = mean_squared_error(y_test, FSL_predict)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 30.563983403353767


In [170]:
y_health = df_encode['HEALTH']
X_health = df_encode.drop(columns = ['ADMIN1_ID', 'DATE', 'HEALTH', 'OBLAST'], axis= 1)

In [171]:
X_health.shape

(750, 19)

In [172]:
X_train, X_test, y_train, y_test = train_test_split(X_health, y_health, test_size=0.25, random_state=1234)

In [173]:
#creating linear regression model
health_model = LinearRegression()

In [174]:
health_model.fit(X_train, y_train)

In [175]:
health_model.score(X_test, y_test)

0.8870532418601575

In [176]:
#make predictions on the model
health_predict = health_model.predict(X_test)

In [177]:
#evaluate linear regression model
lr_rmse = mean_squared_error(y_test, health_predict, squared=False)
lr_r2 = r2_score(y_test, health_predict)
#end solution
print('[LR] Root Mean Squared Error: {0}'.format(lr_rmse))
print('[LR] R2: {0}'.format(lr_r2))

[LR] Root Mean Squared Error: 5.344771464972745
[LR] R2: 0.8870532418601575


In [178]:
# evaluate the model
mse = mean_squared_error(y_test, health_predict)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 28.56658201278691


In [179]:
y_logistics = df_encode['LOGISTICS']
X_logistics = df_encode.drop(columns = ['ADMIN1_ID', 'DATE', 'LOGISTICS', 'OBLAST'], axis= 1)

In [180]:
X_logistics.shape

(750, 19)

In [181]:
X_train, X_test, y_train, y_test = train_test_split(X_logistics, y_logistics, test_size=0.25, random_state=1234)

In [182]:
#creating linear regression model
logistics_model = LinearRegression()

In [183]:
logistics_model.fit(X_train, y_train)

In [184]:
logistics_model.score(X_test, y_test)

0.6261800666017017

In [185]:
#make predictions on the model
logistics_predict = logistics_model.predict(X_test)

In [186]:
#evaluate linear regression model
lr_rmse = mean_squared_error(y_test, logistics_predict, squared=False)
lr_r2 = r2_score(y_test, logistics_predict)
#end solution
print('[LR] Root Mean Squared Error: {0}'.format(lr_rmse))
print('[LR] R2: {0}'.format(lr_r2))

[LR] Root Mean Squared Error: 0.2809695099445359
[LR] R2: 0.6261800666017017


In [187]:
# evaluate the model
mse = mean_squared_error(y_test, logistics_predict)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.07894386551847267
