In [158]:
import pandas as pd
import numpy as np
import os
import re
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
import pickle

In [159]:
def getDate(filename):
    """Extract the first ``YYYY-MM-DD`` date embedded in a file name.

    Parameters
    ----------
    filename : str
        File name (or path) expected to contain an ISO-style date, e.g.
        ``'Data_ Round 33 - Ukraine 5W - 2022-11-10.xlsx'``.

    Returns
    -------
    datetime.date or None
        The parsed date, or ``None`` when the name contains no date-shaped
        substring or the substring is not a real calendar date
        (e.g. ``'2022-13-40'``).
    """
    match_str = re.search(r'\d{4}-\d{2}-\d{2}', filename)
    if match_str is None:
        # re.search returns None when nothing matches; calling .group() on it
        # would raise AttributeError, which the ValueError handler below
        # does not catch.
        return None
    try:
        return datetime.datetime.strptime(match_str.group(), '%Y-%m-%d').date()
    except ValueError:
        # Digits matched the pattern but do not form a valid date.
        return None

In [160]:
# Canonical names for headers that are spelled differently across workbooks.
# Keys are the already upper-cased, punctuation-stripped form of the header.
_COLUMN_ALIASES = {
    'SHELTERNFI': 'SHELTER',
    'PROTECTIONCP': 'PC_CP',
    'PROTECTIONGBV': 'PC_GBV',
    'PROTECTIONMA': 'PC_MA',
    'TOTALREACHED': 'PEOPLEREACHED',
}


def cleanColumn(name):
    """Normalise a raw spreadsheet header into a canonical column name.

    The header is upper-cased and every character other than ASCII letters,
    digits and underscore is removed (this includes spaces, slashes and
    embedded newlines). A few known variants are then mapped onto a single
    canonical name via ``_COLUMN_ALIASES``.

    Parameters
    ----------
    name : object
        Raw column label. Usually a string; other label types (for example a
        numeric header cell) are converted with ``str()`` first instead of
        failing on ``.upper()``.

    Returns
    -------
    str
        The cleaned, possibly aliased, column name.
    """
    clean_name = re.sub(r'[^a-zA-Z0-9_]', '', str(name).upper())
    return _COLUMN_ALIASES.get(clean_name, clean_name)

In [161]:
# Directory that holds the source workbooks. Despite the name, the loader
# further down only picks up files ending in '.xlsx'.
csv_dir = 'ukraine_data_excel'


In [162]:
# Accumulators for the three sheets of interest: the loader appends one
# DataFrame per workbook to the matching list.
sheet1_df, sheet2_df, sheet3_df = [], [], []

In [163]:
# Load every .xlsx workbook found in csv_dir and collect the three sheets of
# interest, tagging each frame with the date parsed from the file name.
_sheet_targets = {
    'Num_of_Orgs_by_Oblast': sheet1_df,
    'People_Reached_by_Oblast': sheet2_df,
    'UDE_Inputs': sheet3_df,
}

# Start from empty accumulators so re-running this cell does not append the
# same workbooks a second time.
for _bucket in _sheet_targets.values():
    _bucket.clear()

# sorted(): os.listdir returns names in arbitrary order; sorting makes the
# load order (and therefore the concatenation order) reproducible.
for filename in sorted(os.listdir(csv_dir)):
    if not filename.endswith('.xlsx'):
        continue
    file_path = os.path.join(csv_dir, filename)
    files_date = getDate(filename)
    # The context manager closes the workbook handle once its sheets are read.
    with pd.ExcelFile(file_path) as excel_df:
        for sheet in excel_df.sheet_names:
            target = _sheet_targets.get(sheet)
            if target is None:
                # Not one of the three sheets we keep; skip parsing it.
                # ("is None" rather than truthiness: the target lists start empty.)
                continue
            df = pd.read_excel(excel_df, sheet_name=sheet)
            df.columns = [cleanColumn(col) for col in df.columns]
            df['DATE'] = files_date
            target.append(df)

In [164]:
# Stack the per-workbook Num_of_Orgs frames; the outer index level is the
# snapshot date (as a string) of each workbook.
combined_sheet1 = pd.concat(sheet1_df, keys=[str(df['DATE'].iloc[0]) for df in sheet1_df])

# Sum the numeric columns per (oblast id, date). numeric_only=True is passed
# explicitly: relying on the default emits a FutureWarning on pandas 1.5 and,
# from pandas 2.0, would try to "sum" the text column OBLAST as well.
res_sheet1 = (
    combined_sheet1
    .groupby(['ADMIN1_ID', 'DATE'])
    .sum(numeric_only=True)
    .reset_index()
)

# Re-attach the oblast name. The lookup is de-duplicated so that a repeated
# (ADMIN1_ID, DATE, OBLAST) row cannot multiply the aggregated rows.
_oblast_lookup1 = combined_sheet1[['ADMIN1_ID', 'DATE', 'OBLAST']].drop_duplicates()
merged_sheet1 = pd.merge(res_sheet1, _oblast_lookup1, on=['ADMIN1_ID', 'DATE'], how='left')

  res_sheet1 = combined_sheet1.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()


In [165]:
# Stack the per-workbook People_Reached frames; the outer index level is the
# snapshot date (as a string) of each workbook.
combined_sheet2 = pd.concat(sheet2_df, keys=[str(df['DATE'].iloc[0]) for df in sheet2_df])
columns_to_merge = ['ADMIN1_ID', 'DATE']

# numeric_only=True is passed explicitly: relying on the default emits a
# FutureWarning on pandas 1.5 and changes meaning from pandas 2.0.
res_sheet2 = combined_sheet2.groupby(columns_to_merge).sum(numeric_only=True).reset_index()

# Only the sorted (ADMIN1_ID, DATE) pairs of the aggregate are used: the right
# join keeps the original (un-summed) rows, ordered by those pairs.
# NOTE(review): the result carries an ADM1_ID column (dropped in a later cell),
# which suggests some workbooks label the key ADM1_ID rather than ADMIN1_ID.
# Rows from such sheets would have a NaN ADMIN1_ID and be excluded here,
# because groupby drops NaN keys - confirm whether that loss is intended.
merged_sheet2 = pd.merge(combined_sheet2, res_sheet2[columns_to_merge], on=columns_to_merge, how='right')

  res_sheet2 = combined_sheet2.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()


In [166]:
# Preview the first rows of the merged People_Reached frame.
merged_sheet2.head()

Unnamed: 0,OBLAST,ADMIN1_ID,EDUCATION,FSL,GENERALPROTECTION,CP,GBV,MINEACTION,PROTECTIONTOTAL,HEALTH,MPC,NUTRITION,SHELTER,WASH,PEOPLEREACHED,DATE,ADM1_ID
0,Vinnytska,UA05,2800.0,7900.0,11200.0,100.0,100.0,500.0,11900.0,4600.0,91200.0,,15800.0,2300.0,91200.0,2022-04-28,
1,Vinnytska,UA05,2800.0,14300.0,9600.0,100.0,4100.0,500.0,14300.0,4600.0,113000.0,,16200.0,2300.0,113000.0,2022-05-06,
2,Vinnytska,UA05,4900.0,15500.0,11400.0,200.0,4100.0,500.0,16300.0,9900.0,138800.0,,17500.0,3700.0,138800.0,2022-05-12,
3,Vinnytska,UA05,9000.0,17500.0,,500.0,6100.0,500.0,20300.0,23200.0,140600.0,,17500.0,3700.0,140600.0,2022-05-19,
4,Vinnytska,UA05,11500.0,17600.0,,4400.0,6200.0,700.0,25600.0,23300.0,216600.0,,21700.0,3800.0,216600.0,2022-05-26,


In [167]:
# (rows, columns) of the merged People_Reached frame.
merged_sheet2.shape

(675, 17)

In [168]:
# Treat missing cells as zero and discard the stray ADM1_ID column.
# The frame is rebound instead of mutated in place, and errors='ignore' lets
# the cell be re-run: with an in-place drop the second run raised KeyError
# because the column was already gone.
merged_sheet2 = merged_sheet2.fillna(0).drop(columns=['ADM1_ID'], errors='ignore')

In [169]:
# Sanity check: True if any cell of the merged Num_of_Orgs frame is NaN.
merged_sheet1.isna().any().any()

False

In [None]:
# Stack the per-workbook UDE_Inputs frames; the outer index level is the
# snapshot date (as a string) of each workbook.
combined_sheet3 = pd.concat(sheet3_df, keys=[str(df['DATE'].iloc[0]) for df in sheet3_df])

# Sum the numeric columns per (oblast id, date). numeric_only=True is passed
# explicitly: relying on the default emits a FutureWarning on pandas 1.5 and,
# from pandas 2.0, would try to "sum" text columns such as OBLAST as well.
res_sheet3 = (
    combined_sheet3
    .groupby(['ADMIN1_ID', 'DATE'])
    .sum(numeric_only=True)
    .reset_index()
)

# Re-attach the oblast name. The lookup is de-duplicated so that a repeated
# (ADMIN1_ID, DATE, OBLAST) row cannot multiply the aggregated rows.
_oblast_lookup3 = combined_sheet3[['ADMIN1_ID', 'DATE', 'OBLAST']].drop_duplicates()
merged_sheet3 = pd.merge(res_sheet3, _oblast_lookup3, on=['ADMIN1_ID', 'DATE'], how='left')

In [None]:
# Num_of_Orgs_by_Oblast: inspect rows 20-34 (positional slice, end exclusive).
merged_sheet1.iloc[20:35]

In [None]:
# People_Reached_by_Oblast: inspect rows 10-24 (positional slice, end exclusive).
merged_sheet2.iloc[10:25]

In [None]:
# UDE_Inputs: inspect rows 20-34 (positional slice, end exclusive).
merged_sheet3.iloc[20:35]

In [None]:
# Number of rows per oblast name in the merged Num_of_Orgs frame.
merged_sheet1['OBLAST'].value_counts()

In [175]:
# Total number of non-null PEOPLEREACHED entries: value_counts() excludes NaN
# by default, so summing the per-value counts gives the non-null row count.
sum(merged_sheet3['PEOPLEREACHED'].value_counts())

750

In [176]:
# dtype of the DATE column (object here, as it holds the values returned by getDate).
merged_sheet1['DATE'].dtype

dtype('O')

In [177]:
########### mergedsheet 1 prep for interpolation

In [178]:
# Snapshot dates whose rows are pulled out below so their zero placeholders
# can be blanked before interpolation.
specific_date_1006, specific_date_1110, specific_date_1125 = (
    '2022-10-06',
    '2022-11-10',
    '2022-11-25',
)

# Dates are compared as plain 'YYYY-MM-DD' strings from here on.
merged_sheet1['DATE'] = merged_sheet1['DATE'].astype(str)

# One sub-frame per snapshot date (original row labels are kept).
filtered_values1006 = merged_sheet1[merged_sheet1['DATE'].eq(specific_date_1006)]
filtered_values1110 = merged_sheet1[merged_sheet1['DATE'].eq(specific_date_1110)]
filtered_values1125 = merged_sheet1[merged_sheet1['DATE'].eq(specific_date_1125)]

filtered_values1110

Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS,OBLAST
25,UA05,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Vinnytska
55,UA07,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Volynska
85,UA12,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Dnipropetrovska
115,UA14,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Donetska
145,UA18,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Zhytomyrska
175,UA21,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Zakarpatska
205,UA23,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Zaporizka
235,UA26,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Ivano-Frankivska
265,UA32,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Kyivska
295,UA35,2022-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Kirovohradska


In [179]:
# Turn the zeros of the three extracted snapshots into NaN so that the later
# linear interpolation of organisation counts treats them as missing.
filtered_values1006, filtered_values1110, filtered_values1125 = (
    snapshot.replace(0, np.nan)
    for snapshot in (filtered_values1006, filtered_values1110, filtered_values1125)
)



In [180]:
# Write the blanked snapshots back over the matching rows of merged_sheet1.
for gap_date, blanked_rows in (
    (specific_date_1006, filtered_values1006),
    (specific_date_1110, filtered_values1110),
    (specific_date_1125, filtered_values1125),
):
    merged_sheet1.loc[merged_sheet1['DATE'] == gap_date] = blanked_rows

In [181]:
########### mergedsheet 2 prep for interpolation

In [182]:
# Dates are compared as plain 'YYYY-MM-DD' strings, as for the organisations frame.
merged_sheet2['DATE'] = merged_sheet2['DATE'].astype(str)

# One sub-frame per snapshot date (original row labels are kept).
filtered_values1006_m2 = merged_sheet2[merged_sheet2['DATE'].eq(specific_date_1006)]
filtered_values1110_m2 = merged_sheet2[merged_sheet2['DATE'].eq(specific_date_1110)]
filtered_values1125_m2 = merged_sheet2[merged_sheet2['DATE'].eq(specific_date_1125)]

filtered_values1006_m2

Unnamed: 0,OBLAST,ADMIN1_ID,EDUCATION,FSL,GENERALPROTECTION,CP,GBV,MINEACTION,PROTECTIONTOTAL,HEALTH,MPC,NUTRITION,SHELTER,WASH,PEOPLEREACHED,DATE
19,Vinnytska,UA05,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,2022-10-06
46,Volynska,UA07,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,2022-10-06
73,Dnipropetrovska,UA12,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,2022-10-06
100,Donetska,UA14,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,2022-10-06
127,Zhytomyrska,UA18,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,2022-10-06
154,Zakarpatska,UA21,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,2022-10-06
181,Zaporizka,UA23,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,2022-10-06
208,Ivano-Frankivska,UA26,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,2022-10-06
235,Kyivska,UA32,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,2022-10-06
262,Kirovohradska,UA35,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,2022-10-06


In [183]:
# Turn the zeros of the three extracted snapshots into NaN so that the later
# linear interpolation of people-reached figures treats them as missing.
filtered_values1006_m2, filtered_values1110_m2, filtered_values1125_m2 = (
    snapshot.replace(0, np.nan)
    for snapshot in (filtered_values1006_m2, filtered_values1110_m2, filtered_values1125_m2)
)

In [184]:
# Write the blanked snapshots back over the matching rows of merged_sheet2.
for gap_date, blanked_rows in (
    (specific_date_1006, filtered_values1006_m2),
    (specific_date_1110, filtered_values1110_m2),
    (specific_date_1125, filtered_values1125_m2),
):
    merged_sheet2.loc[merged_sheet2['DATE'] == gap_date] = blanked_rows

In [185]:
# Check for null values: expected to be True now that the selected snapshot
# rows had their zeros replaced with NaN.
merged_sheet1.isna().any().any()

True

In [186]:
# Check for null values in one specific row (position 26).
# NOTE(review): presumably one of the blanked snapshot rows - confirm if the row order changes.
merged_sheet1.iloc[26].isna().any()

True

In [187]:
# Spot-check a single People_Reached row by position (a blanked 2022-10-06 row in the saved output).
merged_sheet2.iloc[640]

OBLAST               Chernihivska
ADMIN1_ID                    UA74
EDUCATION                     NaN
FSL                           NaN
GENERALPROTECTION             NaN
CP                            NaN
GBV                           NaN
MINEACTION                    NaN
PROTECTIONTOTAL               NaN
HEALTH                        NaN
MPC                           NaN
NUTRITION                     NaN
SHELTER                       NaN
WASH                          NaN
PEOPLEREACHED                 NaN
DATE                   2022-10-06
Name: 640, dtype: object

In [188]:
# Fill the blanked snapshot rows by linear interpolation.
# * Only numeric columns are interpolated; the text columns (ADMIN1_ID, DATE,
#   OBLAST) are left alone. Calling DataFrame.interpolate on a frame that
#   includes object columns is deprecated in recent pandas.
# * Interpolation runs separately for each ADMIN1_ID, so a gap at the start
#   or end of one oblast's series is never filled from the neighbouring
#   oblast's rows.
# * method='linear' treats rows as equally spaced, so values are interpolated
#   by row position within the oblast (rows are expected in date order), not
#   by elapsed days between snapshots.
org_value_cols = merged_sheet1.select_dtypes(include='number').columns
num_of_org_interpolated_df = merged_sheet1.copy()
num_of_org_interpolated_df[org_value_cols] = (
    merged_sheet1
    .groupby('ADMIN1_ID', sort=False)[org_value_cols]
    .transform(lambda values: values.interpolate(method='linear'))
)

In [189]:
# Confirm that interpolation left no NaN in the organisations frame.
num_of_org_interpolated_df.isna().any().any()

False

In [190]:
# Round to one decimal place, since interpolation introduces fractional values.
# DataFrame.round is vectorised and only touches numeric columns; it replaces
# the element-wise DataFrame.applymap call, which is deprecated in recent pandas.
num_of_org_interpolated_df = num_of_org_interpolated_df.round(1)

In [191]:
# Show the interpolated organisation counts for the 2022-11-10 snapshot.
num_of_org_interpolated_df.loc[num_of_org_interpolated_df['DATE']==specific_date_1110]

Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS,OBLAST
25,UA05,2022-11-10,3.0,1.0,0.0,13.0,36.0,25.0,17.3,0.0,0.0,25.3,10.3,5.7,0.0,23.3,13.0,121.0,0.0,Vinnytska
55,UA07,2022-11-10,1.0,0.0,0.0,7.7,20.0,11.0,14.3,1.0,0.0,23.0,11.3,6.3,0.0,13.0,8.0,83.3,0.0,Volynska
85,UA12,2022-11-10,4.0,1.0,1.0,15.3,107.7,39.3,25.0,4.0,0.0,33.0,12.3,8.0,0.0,46.7,30.0,216.7,1.0,Dnipropetrovska
115,UA14,2022-11-10,0.0,1.0,0.0,8.7,83.7,40.3,29.3,3.0,0.0,25.7,11.3,8.3,0.0,21.3,29.0,138.3,0.0,Donetska
145,UA18,2022-11-10,0.0,0.0,0.0,10.3,31.3,17.3,17.0,0.0,0.0,14.0,7.3,5.7,0.0,10.0,9.0,89.0,0.0,Zhytomyrska
175,UA21,2022-11-10,3.0,1.0,0.0,16.3,34.3,16.3,19.0,1.0,0.0,28.7,11.7,7.7,0.0,17.0,15.0,116.3,0.0,Zakarpatska
205,UA23,2022-11-10,3.0,0.0,0.0,11.0,94.0,25.7,26.0,3.0,0.0,23.3,13.3,5.7,0.0,27.0,23.0,144.0,0.0,Zaporizka
235,UA26,2022-11-10,3.0,0.0,0.0,16.3,37.0,15.7,17.3,1.0,0.0,27.7,10.3,8.3,0.0,15.7,15.3,110.3,0.0,Ivano-Frankivska
265,UA32,2022-11-10,1.0,0.0,0.0,13.3,61.0,20.7,24.3,2.0,0.0,21.0,9.0,7.3,0.0,23.3,17.0,141.7,0.0,Kyivska
295,UA35,2022-11-10,3.0,1.0,0.0,12.7,39.7,15.7,16.0,2.0,0.0,13.7,5.0,6.3,0.0,14.7,11.0,109.0,1.0,Kirovohradska


In [192]:
# Display a DATE-sorted view; the result is not assigned, so the frame itself keeps its order.
num_of_org_interpolated_df.sort_values(by=['DATE'])

Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS,OBLAST
0,UA05,2022-04-28,2.0,1.0,1.0,5.0,17.0,16.0,10.0,0.0,16.0,2.0,2.0,2.0,11.0,6.0,7.0,42.0,0.0,Vinnytska
690,UA74,2022-04-28,1.0,0.0,0.0,1.0,17.0,12.0,6.0,2.0,9.0,1.0,0.0,1.0,8.0,2.0,10.0,36.0,0.0,Chernihivska
60,UA12,2022-04-28,1.0,1.0,1.0,3.0,31.0,19.0,12.0,2.0,21.0,5.0,4.0,2.0,14.0,10.0,15.0,58.0,0.0,Dnipropetrovska
660,UA73,2022-04-28,2.0,1.0,0.0,3.0,22.0,9.0,10.0,3.0,17.0,5.0,2.0,2.0,12.0,6.0,11.0,50.0,0.0,Chernivetska
90,UA14,2022-04-28,0.0,1.0,0.0,2.0,48.0,26.0,12.0,2.0,29.0,7.0,4.0,1.0,21.0,9.0,18.0,75.0,0.0,Donetska
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,UA46,2023-01-05,4.0,1.0,1.0,23.0,62.0,52.0,26.0,5.0,0.0,41.0,20.0,8.0,0.0,27.0,23.0,194.0,1.0,Lvivska
569,UA63,2023-01-05,3.0,0.0,1.0,13.0,111.0,49.0,24.0,3.0,0.0,26.0,7.0,10.0,0.0,33.0,25.0,175.0,0.0,Kharkivska
719,UA74,2023-01-05,0.0,0.0,0.0,18.0,56.0,29.0,20.0,3.0,0.0,16.0,7.0,9.0,0.0,33.0,17.0,149.0,0.0,Chernihivska
89,UA12,2023-01-05,3.0,1.0,1.0,19.0,113.0,52.0,33.0,4.0,0.0,40.0,12.0,9.0,0.0,51.0,32.0,244.0,1.0,Dnipropetrovska


In [193]:
# Summary statistics of the numeric columns after interpolation and rounding.
num_of_org_interpolated_df.describe()

Unnamed: 0,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS
count,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0
mean,0.916667,0.378667,0.196,6.344667,34.328,19.677333,13.735333,1.602667,5.958667,11.522,5.724,4.284,1.804,12.256667,12.68,78.646,0.233333
std,1.113314,0.485379,0.39511,4.444422,21.160086,11.596443,6.02777,1.296804,9.210034,8.539077,3.702005,2.132039,4.586243,7.813827,6.241044,38.980829,0.423235
min,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,14.0,0.0
25%,0.0,0.0,0.0,3.0,20.0,11.0,9.0,1.0,0.0,5.0,3.0,3.0,0.0,6.0,8.0,49.0,0.0
50%,1.0,0.0,0.0,6.0,30.0,16.0,13.0,1.0,0.0,9.0,5.0,4.0,0.0,11.0,12.0,71.5,0.0
75%,1.0,1.0,0.0,9.0,43.225,24.65,17.7,3.0,13.0,17.0,8.0,6.0,0.0,16.0,16.0,102.0,0.0
max,4.0,1.0,1.0,23.0,113.0,71.0,35.0,5.0,42.0,41.0,20.0,10.0,28.0,51.0,32.0,244.0,1.0


In [194]:
# Fill the blanked snapshot rows of the People_Reached frame.
# Step 1: linear interpolation of the numeric columns, separately for each
#         ADMIN1_ID so one oblast's gap is never filled from another oblast.
#         Object columns are excluded (interpolating them is deprecated in
#         recent pandas and they were left untouched by it anyway).
# Step 2: forward-fill whatever is still missing, again within each oblast.
#         .ffill() replaces the deprecated fillna(method='ffill').
# NOTE(review): EDUCATION and NUTRITION do not appear in describe() below,
# i.e. they are not numeric dtype, so they are forward-filled rather than
# interpolated. Consider converting them with pd.to_numeric upstream.
people_value_cols = merged_sheet2.select_dtypes(include='number').columns
people_reached_interpolated_df = merged_sheet2.copy()
people_reached_interpolated_df[people_value_cols] = (
    merged_sheet2
    .groupby('ADMIN1_ID', sort=False)[people_value_cols]
    .transform(lambda values: values.interpolate(method='linear'))
)
_carried_forward = people_reached_interpolated_df.groupby('ADMIN1_ID', sort=False).ffill()
people_reached_interpolated_df[_carried_forward.columns] = _carried_forward

In [195]:
# Check for null: confirm that interpolation plus forward fill left no NaN.
people_reached_interpolated_df.isna().any().any()

False

In [196]:
# Round to one decimal place, since interpolation introduces fractional values.
# DataFrame.round is vectorised and replaces the element-wise
# DataFrame.applymap call, which is deprecated in recent pandas.
# NOTE(review): round() only affects numeric-dtype columns; object-dtype
# columns (EDUCATION and NUTRITION appear to be such) are left as they are.
people_reached_interpolated_df = people_reached_interpolated_df.round(1)

In [267]:
# Show the filled-in people-reached figures for the 2022-11-25 snapshot.
people_reached_interpolated_df.loc[people_reached_interpolated_df['DATE']==specific_date_1125]

Unnamed: 0,OBLAST,ADMIN1_ID,EDUCATION,FSL,GENERALPROTECTION,CP,GBV,MINEACTION,PROTECTIONTOTAL,HEALTH,MPC,NUTRITION,SHELTER,WASH,PEOPLEREACHED,DATE
23,Vinnytska,UA05,18519,39158.0,0.0,131268.0,20120.0,107388.0,228971.0,105527.0,260786.3,0.0,150959.3,431278.0,431278.0,2022-11-25
50,Volynska,UA07,1427,48442.0,0.0,117463.3,24004.7,68398.3,166848.3,284201.3,158911.3,396.0,26462.0,157799.0,292047.0,2022-11-25
77,Dnipropetrovska,UA12,43548,768589.0,0.0,90438.7,26215.7,313910.3,324489.7,549308.3,447314.3,22773.0,289301.7,775382.0,804355.0,2022-11-25
104,Donetska,UA14,20784,444975.0,0.0,172301.7,11220.3,230246.7,425259.0,655598.7,125237.7,12467.0,219541.0,616792.7,655598.7,2022-11-25
131,Zhytomyrska,UA18,18383,29487.0,0.0,144915.0,19051.0,99233.3,167760.0,361190.0,168676.7,0.0,29663.7,365963.3,372856.0,2022-11-25
158,Zakarpatska,UA21,59040,81575.0,0.0,152583.3,27812.3,75460.7,452974.7,291196.0,249368.7,0.0,105656.7,208017.0,452974.7,2022-11-25
185,Zaporizka,UA23,6608,584573.0,0.0,94540.0,12893.0,150531.3,206109.0,150747.0,243850.7,8080.0,78324.3,103080.7,584573.0,2022-11-25
212,Ivano-Frankivska,UA26,43161,113680.0,0.0,173289.7,26016.3,88800.0,261799.7,227739.7,151646.0,0.0,101167.0,190192.7,302635.3,2022-11-25
239,Kyivska,UA32,9626,282016.0,0.0,85651.0,12283.3,287355.3,396295.0,738844.7,139756.0,7709.0,41635.3,57351.3,738844.7,2022-11-25
266,Kirovohradska,UA35,18345,93035.0,0.0,29324.0,1953.0,72881.3,41276.7,563398.7,125677.7,3960.0,56457.3,17572.7,563398.7,2022-11-25


In [198]:
# NUM OF ORGS - add an integer code for each oblast name.
label_encoder = LabelEncoder()
encoded_oblast = label_encoder.fit_transform(num_of_org_interpolated_df['OBLAST'])

# assign() builds a new frame with the extra column appended, leaving the
# interpolated frame itself unmodified.
num_of_org_df_encoded = num_of_org_interpolated_df.assign(OBLAST_ENCODED=encoded_oblast)

In [199]:
# NUM OF ORGS - add an integer code for each snapshot date.
# A fresh encoder is fitted on the DATE strings; the variable is still called
# encoded_oblast, but here it holds the date codes.
label_encoder = LabelEncoder()
encoded_oblast = label_encoder.fit_transform(num_of_org_df_encoded['DATE'])

# assign() builds a new frame with the extra column appended.
num_of_org_df_encode = num_of_org_df_encoded.assign(DATE_ENCODED=encoded_oblast)

In [200]:
# PEOPLE REACHED - add an integer code for each oblast name.
label_encoder = LabelEncoder()
encoded_oblast = label_encoder.fit_transform(people_reached_interpolated_df['OBLAST'])

# assign() builds a new frame with the extra column appended, leaving the
# interpolated frame itself unmodified.
people_reached_df_encoded = people_reached_interpolated_df.assign(OBLAST_ENCODED=encoded_oblast)

In [201]:
# PEOPLE REACHED - add an integer code for each snapshot date.
# A fresh encoder is fitted on this frame's DATE strings; the variable is
# still called encoded_oblast, but here it holds the date codes.
# NOTE(review): the codes come from this frame's own set of dates, so a given
# date need not get the same code as in the organisations frame (the saved
# outputs show 2023-01-05 as 26 here and 29 there).
label_encoder = LabelEncoder()
encoded_oblast = label_encoder.fit_transform(people_reached_df_encoded['DATE'])

# assign() builds a new frame with the extra column appended.
people_reached_df_encode = people_reached_df_encoded.assign(DATE_ENCODED=encoded_oblast)

In [None]:
# Shorter names for the two encoded frames. These are aliases, not copies:
# each pair of names refers to the same DataFrame object.
people_reached, num_of_org = people_reached_df_encode, num_of_org_df_encode

In [202]:
# Display the encoded organisations frame ordered by DATE (result not assigned).
num_of_org.sort_values(by=['DATE'])

Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,...,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS,OBLAST,OBLAST_ENCODED,DATE_ENCODED
0,UA05,2022-04-28,2.0,1.0,1.0,5.0,17.0,16.0,10.0,0.0,...,2.0,2.0,11.0,6.0,7.0,42.0,0.0,Vinnytska,20,0
690,UA74,2022-04-28,1.0,0.0,0.0,1.0,17.0,12.0,6.0,2.0,...,0.0,1.0,8.0,2.0,10.0,36.0,0.0,Chernihivska,1,0
60,UA12,2022-04-28,1.0,1.0,1.0,3.0,31.0,19.0,12.0,2.0,...,4.0,2.0,14.0,10.0,15.0,58.0,0.0,Dnipropetrovska,3,0
660,UA73,2022-04-28,2.0,1.0,0.0,3.0,22.0,9.0,10.0,3.0,...,2.0,2.0,12.0,6.0,11.0,50.0,0.0,Chernivetska,2,0
90,UA14,2022-04-28,0.0,1.0,0.0,2.0,48.0,26.0,12.0,2.0,...,4.0,1.0,21.0,9.0,18.0,75.0,0.0,Donetska,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,UA46,2023-01-05,4.0,1.0,1.0,23.0,62.0,52.0,26.0,5.0,...,20.0,8.0,0.0,27.0,23.0,194.0,1.0,Lvivska,13,29
569,UA63,2023-01-05,3.0,0.0,1.0,13.0,111.0,49.0,24.0,3.0,...,7.0,10.0,0.0,33.0,25.0,175.0,0.0,Kharkivska,6,29
719,UA74,2023-01-05,0.0,0.0,0.0,18.0,56.0,29.0,20.0,3.0,...,7.0,9.0,0.0,33.0,17.0,149.0,0.0,Chernihivska,1,29
89,UA12,2023-01-05,3.0,1.0,1.0,19.0,113.0,52.0,33.0,4.0,...,12.0,9.0,0.0,51.0,32.0,244.0,1.0,Dnipropetrovska,3,29


In [203]:
# Display the encoded people-reached frame ordered by DATE (result not assigned).
people_reached.sort_values(by=['DATE'])

Unnamed: 0,OBLAST,ADMIN1_ID,EDUCATION,FSL,GENERALPROTECTION,CP,GBV,MINEACTION,PROTECTIONTOTAL,HEALTH,MPC,NUTRITION,SHELTER,WASH,PEOPLEREACHED,DATE,OBLAST_ENCODED,DATE_ENCODED
0,Vinnytska,UA05,2800.0,7900.0,11200.0,100.0,100.0,500.0,11900.0,4600.0,91200.0,0,15800.0,2300.0,91200.0,2022-04-28,20,0
297,Lvivska,UA46,13800.0,374000.0,37900.0,500.0,0.0,100.0,38600.0,33300.0,60300.0,300.0,34700.0,3100.0,374000.0,2022-04-28,13,0
378,Poltavska,UA53,1700.0,242200.0,8000.0,2500.0,100.0,200.0,10800.0,1000.0,1700.0,0,400.0,300.0,242200.0,2022-04-28,16,0
621,Chernihivska,UA74,0,171600.0,900.0,0.0,0.0,0.0,900.0,38000.0,2100.0,0,200.0,25000.0,171600.0,2022-04-28,1,0
54,Dnipropetrovska,UA12,400.0,231800.0,18600.0,400.0,2200.0,3700.0,24900.0,34200.0,23500.0,13300.0,18800.0,4000.0,231800.0,2022-04-28,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53,Volynska,UA07,9301,48442.0,0.0,156594.0,24265.0,69061.0,197390.0,290618.0,178808.0,396.0,29013.0,89246.0,290618.0,2023-01-05,21,26
296,Luhanska,UA44,1628,95665.0,0.0,30051.0,6942.0,120298.0,54648.0,325102.0,31431.0,5627.0,19377.0,151824.0,325102.0,2023-01-05,12,26
377,Odeska,UA51,30339,489739.0,0.0,30113.0,9424.0,184449.0,130637.0,412864.0,346180.0,9333.0,88215.0,159352.0,489739.0,2023-01-05,15,26
431,Rivnenska,UA56,21059,135289.0,0.0,179968.0,17690.0,77063.0,237598.0,351300.0,166146.0,396.0,41267.0,968.0,351300.0,2023-01-05,17,26


In [204]:
# Inspect rows 2-4 of the encoded organisations frame (positional slice, end exclusive).
num_of_org.iloc[2:5]

Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,...,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS,OBLAST,OBLAST_ENCODED,DATE_ENCODED
2,UA05,2022-05-12,2.0,1.0,1.0,6.0,20.0,18.0,10.0,0.0,...,4.0,2.0,14.0,6.0,9.0,49.0,0.0,Vinnytska,20,2
3,UA05,2022-05-19,2.0,1.0,1.0,6.0,20.0,20.0,10.0,0.0,...,4.0,2.0,14.0,6.0,9.0,50.0,0.0,Vinnytska,20,3
4,UA05,2022-05-26,0.0,1.0,1.0,6.0,20.0,20.0,9.0,0.0,...,4.0,4.0,12.0,7.0,9.0,48.0,0.0,Vinnytska,20,4


In [205]:
# Inspect rows 18-24 of the encoded people-reached frame; in the saved output
# this window covers the three filled-in snapshot dates for the first oblast.
people_reached.iloc[18:25]

Unnamed: 0,OBLAST,ADMIN1_ID,EDUCATION,FSL,GENERALPROTECTION,CP,GBV,MINEACTION,PROTECTIONTOTAL,HEALTH,MPC,NUTRITION,SHELTER,WASH,PEOPLEREACHED,DATE,OBLAST_ENCODED,DATE_ENCODED
18,Vinnytska,UA05,18505,39158.0,0.0,107050.0,19299.0,105178.0,275036.0,88760.0,243513.0,0,107109.0,395394.0,395394.0,2022-09-29,20,18
19,Vinnytska,UA05,18505,39158.0,0.0,107050.0,19299.0,105178.0,275036.0,89916.5,243637.5,0,107109.0,395394.0,395394.0,2022-10-06,20,19
20,Vinnytska,UA05,18519,39158.0,0.0,107050.0,19299.0,105178.0,275036.0,91073.0,243762.0,0,107109.0,395394.0,395394.0,2022-10-13,20,20
21,Vinnytska,UA05,18519,39158.0,0.0,107050.0,19622.0,105178.0,220783.0,91073.0,246287.0,0,141662.0,395394.0,395394.0,2022-10-27,20,21
22,Vinnytska,UA05,18519,39158.0,0.0,119159.0,19871.0,106283.0,224877.0,98300.0,253536.7,0,146310.7,413336.0,413336.0,2022-11-10,20,22
23,Vinnytska,UA05,18519,39158.0,0.0,131268.0,20120.0,107388.0,228971.0,105527.0,260786.3,0,150959.3,431278.0,431278.0,2022-11-25,20,23
24,Vinnytska,UA05,19499,39158.0,0.0,143377.0,20369.0,108493.0,233065.0,112754.0,268036.0,0,155608.0,449220.0,449220.0,2022-12-08,20,24


In [207]:
# Summary statistics of the numeric columns. Columns that are not numeric
# dtype are omitted by describe(); EDUCATION and NUTRITION are absent in the saved output.
people_reached.describe()

Unnamed: 0,FSL,GENERALPROTECTION,CP,GBV,MINEACTION,PROTECTIONTOTAL,HEALTH,MPC,SHELTER,WASH,PEOPLEREACHED,OBLAST_ENCODED,DATE_ENCODED
count,675.0,675.0,675.0,675.0,675.0,675.0,675.0,675.0,675.0,675.0,675.0,675.0,675.0
mean,322360.1,1079.851852,48951.50963,10075.06963,87349.311111,169513.685926,229081.474815,113779.737037,51623.983704,179315.7,442267.9,12.0,13.0
std,402087.8,4596.280587,64356.53692,9379.233431,81629.120822,172213.859271,180738.593741,91701.2334,62037.192128,212746.6,373000.8,7.21645,7.794657
min,7900.0,0.0,0.0,0.0,0.0,800.0,700.0,0.0,200.0,0.0,13800.0,0.0,0.0
25%,62808.0,0.0,3300.0,2150.0,1550.0,35550.0,85750.0,41983.5,12873.5,25000.0,219284.0,6.0,6.0
50%,129400.0,0.0,22900.0,7600.0,85592.0,133087.0,211600.0,101214.0,31700.0,102661.0,377200.0,12.0,13.0
75%,511900.0,0.0,73111.5,15446.35,117536.0,235802.5,345422.5,159354.5,69420.0,294335.0,545800.0,18.0,20.0
max,2070200.0,37900.0,372667.0,54959.0,352145.0,974403.0,761036.0,523050.0,505784.0,1595189.0,2070200.0,24.0,26.0


In [208]:
# List the column labels of the encoded people-reached frame.
people_reached.columns

Index(['OBLAST', 'ADMIN1_ID', 'EDUCATION', 'FSL', 'GENERALPROTECTION', 'CP',
       'GBV', 'MINEACTION', 'PROTECTIONTOTAL', 'HEALTH', 'MPC', 'NUTRITION',
       'SHELTER', 'WASH', 'PEOPLEREACHED', 'DATE', 'OBLAST_ENCODED',
       'DATE_ENCODED'],
      dtype='object')

In [209]:
################ Linear Regression CCCM #############

In [210]:
# corrs = df_encode.corr()['CCCM']
# corrs_cccm = corrs.sort_values(ascending =False)
# corrs_cccm

In [211]:
# columns_to_drop = ['NUTRITION', 'PC_MA', 'CCS', 'HEALTH',
#                    'LOGISTICS', 'ETC', 'PC_PC', 'PROTECTIONTOTAL','OBLAST','ADMIN1_ID','DATE']
# df_cccm = df_encode.copy()
# # Drop the specified columns from the copied DataFrame
# df_cccm.drop(columns=columns_to_drop, inplace=True, axis=1)
# df_cccm


In [212]:
# X = df_cccm.drop(columns=['CCCM'], axis=1)
# y = df_cccm['CCCM']

In [213]:
# X_train, X_test , y_train, y_test = train_test_split(X,y, test_size=.2, random_state=42)

In [214]:
# model_lr_CCCM = LinearRegression()
# model_lr_CCCM.fit(X_train,y_train)
# prediction = model_lr_CCCM.predict(X_test)

In [215]:
# features = ['EDUCATION','FSL','MPC','PC_CP','PC_GBV','SHELTER','WASH','TOTAL','PEOPLE_REACHED','OBLAST_ENCODED','DATE_ENCODED']

In [216]:
# print('Model Summary:\n')

# # Print intercept (alpha), Value of the model's prediction when all input features are set to zero. Can be considered as the baseline prediction value.
# print('Intercept:')
# print('alpha = ' , model_lr_CCCM.intercept_)

# # Print weights, Features with larger absolute weights have a stronger impact on the model's predictions
# print('\nWeights:')
# i = 0
# for w in model_lr_CCCM.coef_:
#     print('w_',i+1,'= ', w, ' [ weight of ', features[i],']')
#     i += 1

In [217]:
# #RMSE finds the average error, the differences between the predicted values and the actual values. high is bad, low is good
# print('\nModel Performance\n\nRMSE = %.2f' % np.sqrt(mean_squared_error(y_test, prediction)))

# #the coefficient of determination : 1 is perfect prediction
# #measure of the proportion of variability in the prediction
# print('R^2= % .2f' % r2_score(y_test,prediction))

In [218]:

# param_grid = {
#     'fit_intercept': [True, False],  # Whether to calculate intercept or not
#     'positive': [True, False]  ,
#     'copy_X': [True,False]
# }
# print('Running Grid Search...')
# # negative sign is used because GridSearchCV maximizes a scoring function, and we want to minimize the MSE.
# lr_grid_search = GridSearchCV(model_lr_CCCM, param_grid, cv=5 ,scoring='neg_mean_squared_error')
# lr_grid_search.fit(X,y)
# print('Done')


In [219]:
# # A lower MSE is desirable, as it indicates that the model's predictions are closer to the actual values
# print("Best Parameters: ", lr_grid_search.best_params_)
# print("Best  Mean Squared Error: ",-1 * lr_grid_search.best_score_)

In [220]:
# model_lr_CCCM_gs = LinearRegression(fit_intercept=False,positive=True)
# model_lr_CCCM_gs.fit(X_train,y_train)
# lr_prediction_gs = model_lr_CCCM_gs.predict(X_test)

In [221]:
# print('Model Summary:\n')

# # Print intercept (alpha)
# print('Intercept:')
# print('alpha = ' , model_lr_CCCM_gs.intercept_)

# # Print weights
# print('\nWeights:')
# i = 0
# for w in model_lr_CCCM_gs.coef_:
#     print('w_',i+1,'= ', w, ' [ weight of ', features[i],']')
#     i += 1

In [222]:
# lr_rmse = mean_squared_error(y_test, lr_prediction_gs,squared=False)
# lr_r2 = r2_score(y_test,lr_prediction_gs)

# print('[LR] Root Mean Squared Error: {0}'.format(lr_rmse))
# print('[LR] R2: {0}'.format(lr_r2))

In [223]:
###### LINEAR REGRESSION CCS #########

In [224]:
# corrs = df_encode.corr()['CCS']
# corrs_ccs = corrs.sort_values(ascending =False)
# corrs_ccs

In [225]:
# columns_to_drop = ['TOTAL','PC_CP', 'PC_MA', 'WASH','PC_GBV','CCCM','FSL','EDUCATION','SHELTER','MPC',
#                    'PC_PC', 'PROTECTIONTOTAL','OBLAST','ADMIN1_ID','DATE']
# df_ccs = df_encode.copy()
# # Drop the specified columns from the copied DataFrame
# df_ccs.drop(columns=columns_to_drop, inplace=True, axis=1)
# df_ccs

In [226]:
# #Multiple Linear Regression
# X = df_ccs.drop(columns=['CCS'], axis=1)
# y = df_ccs['CCS']

In [227]:
# X_train, X_test , y_train, y_test = train_test_split(X,y, test_size=.2, random_state=42)

In [228]:
# model_lr_CCS = LinearRegression()
# model_lr_CCS.fit(X_train,y_train)
# prediction = model_lr_CCS.predict(X_test)

In [229]:
# features =[	'ETC','HEALTH','NUTRITION','LOGISTICS','PEOPLE_REACHED','OBLAST_ENCODED','DATE_ENCODED']

In [230]:
# print('Model Summary:\n')

# # Print intercept (alpha), Value of the model's prediction when all input features are set to zero. Can be considered as the baseline prediction value.
# print('Intercept:')
# print('alpha = ' , model_lr_CCS.intercept_)

# # Print weights, Features with larger absolute weights have a stronger impact on the model's predictions
# print('\nWeights:')
# i = 0
# for w in model_lr_CCS.coef_:
#     print('w_',i+1,'= ', w, ' [ weight of ', features[i],']')
#     i += 1

In [231]:
# #RMSE finds the differences between the predicted values and the actual values.
# print('\nModel Performance\n\nRMSE = %.2f' % np.sqrt(mean_squared_error(y_test, prediction)))

# #the coefficient of determination : 1 is perfect prediction
# #measure of the proportion of variability in the prediction
# print('R^2= % .2f' % r2_score(y_test,prediction))

In [232]:

# param_grid = {
#     'fit_intercept': [True, False],  # Whether to calculate intercept or not
#     'positive': [True, False]  ,
#     'copy_X': [True,False]
# }
# print('Running Grid Search...')
# lr_grid_search = GridSearchCV(model_lr_CCS, param_grid, cv=5 ,scoring='neg_mean_squared_error')
# lr_grid_search.fit(X,y)
# print('Done')

In [233]:

# print("Best Parameters: ", lr_grid_search.best_params_)
# print("Best  Mean Squared Error: ",-1 * lr_grid_search.best_score_)


In [234]:
# model_lr_CCS_gs = LinearRegression(positive=True)
# model_lr_CCS_gs.fit(X_train,y_train)
# lr_prediction_gs = model_lr_CCS_gs.predict(X_test)

In [235]:
# print('Model Summary:\n')

# # Print intercept (alpha)
# print('Intercept:')
# print('alpha = ' , model_lr_CCS_gs.intercept_)
# # Print weights
# print('\nWeights:')
# i = 0
# for w in model_lr_CCS_gs.coef_:
#     print('w_',i+1,'= ', w, ' [ weight of ', features[i],']')
#     i += 1

In [236]:
# lr_rmse = mean_squared_error(y_test, lr_prediction_gs,squared=False)
# lr_r2 = r2_score(y_test,lr_prediction_gs)

# print('[LR] Root Mean Squared Error: {0}'.format(lr_rmse))
# print('[LR] R2: {0}'.format(lr_r2))

In [237]:
#### LINEAR REGRESSION ETC #####

In [238]:
# corrs = df_encode.corr()['ETC']
# corrs_etc = corrs.sort_values(ascending =False)
# corrs_etc

In [239]:
# columns_to_drop = ['TOTAL','PC_CP', 'PC_MA', 'WASH','PC_GBV','CCCM','FSL','EDUCATION','SHELTER','MPC',
#                    'PC_PC', 'PROTECTIONTOTAL','OBLAST','ADMIN1_ID','DATE','NUTRITION']
# df_etc = df_encode.copy()
# # Drop the specified columns from the copied DataFrame
# df_etc.drop(columns=columns_to_drop, inplace=True, axis=1)
# df_etc

In [240]:
# X = df_etc.drop(columns=['ETC'], axis=1)
# y = df_etc['ETC']

In [241]:
# X_train, X_test , y_train, y_test = train_test_split(X,y, test_size=.2, random_state=42)

In [242]:
# model_lr_ETC = LinearRegression()
# model_lr_ETC.fit(X_train,y_train)
# prediction = model_lr_ETC.predict(X_test)

In [243]:
# features =[	'CCS','HEALTH','LOGISTICS','PEOPLE_REACHED','OBLAST_ENCODED','DATE_ENCODED']

In [244]:
# print('Model Summary:\n')

# # Print intercept (alpha)
# print('Intercept:')
# print('alpha = ' , model_lr_ETC.intercept_)

# # Print weights
# print('\nWeights:')
# i = 0
# for w in model_lr_ETC.coef_:
#     print('w_',i+1,'= ', w, ' [ weight of ', features[i],']')
#     i += 1

In [245]:
# #RMSE finds the differences between the predicted values and the actual values.
# print('\nModel Performance\n\nRMSE = %.2f' % np.sqrt(mean_squared_error(y_test, prediction)))#
# #the coefficient of determination : 1 is perfect prediction
# #measure of the proportion of variability in the prediction
# print('R^2= % .2f' % r2_score(y_test,prediction))

In [246]:
# param_grid = {
#     'fit_intercept': [True, False],  # Whether to calculate intercept or not
#     'positive': [True, False]  ,
#     'copy_X': [True,False]
# }
# print('Running Grid Search...')
# lr_grid_search = GridSearchCV(model_lr_ETC, param_grid, cv=5 ,scoring='neg_mean_squared_error')
# lr_grid_search.fit(X,y)
# print('Done')

In [247]:
# print("Best Parameters: ", lr_grid_search.best_params_)
# print("Best  Mean Squared Error: ",-1 * lr_grid_search.best_score_)

In [248]:
# model_lr_ETC_gs = LinearRegression(fit_intercept=False, positive=True)
# model_lr_ETC_gs.fit(X_train,y_train)
# lr_prediction_gs = model_lr_ETC_gs.predict(X_test)

In [249]:
# print('Model Summary:\n')

# # Print intercept (alpha)
# print('Intercept:')
# print('alpha = ' , model_lr_ETC_gs.intercept_)

# # Print weights
# print('\nWeights:')
# i = 0
# for w in model_lr_ETC_gs.coef_:
#     print('w_',i+1,'= ', w, ' [ weight of ', features[i],']')
#     i += 1

In [250]:
# lr_rmse = mean_squared_error(y_test, lr_prediction_gs,squared=False)
# lr_r2 = r2_score(y_test,lr_prediction_gs)

# print('[LR] Root Mean Squared Error: {0}'.format(lr_rmse))
# print('[LR] R2: {0}'.format(lr_r2))

In [251]:
# LINEAR REGRESSION TOTAL #######

In [252]:
# corrs = df_encode.corr()['TOTAL']
# corrs_total = corrs.sort_values(ascending =False)
# corrs_total

In [253]:
# columns_to_drop = ['CCS','LOGISTICS', 'ETC', 
#                    'PC_PC', 'PROTECTIONTOTAL','OBLAST','ADMIN1_ID','DATE']
# df_total = df_encode.copy()
# # Drop the specified columns from the copied DataFrame
# df_total.drop(columns=columns_to_drop, inplace=True, axis=1)
# df_total

In [254]:
# X = df_total.drop(columns=['TOTAL'], axis=1)
# y = df_total['TOTAL']

In [255]:
# X_train, X_test , y_train, y_test = train_test_split(X,y, test_size=.2, random_state=42)

In [256]:
# model_lr_TOTAL = LinearRegression()
# model_lr_TOTAL.fit(X_train,y_train)
# prediction = model_lr_TOTAL.predict(X_test)

In [257]:
# features =[	'CCCM','HEALTH','PEOPLE_REACHED','OBLAST_ENCODED','DATE_ENCODED','EDUCATION','FSL','MPC','NUTRITION','PC_CP','PC_GBV','PC_MA','SHELTER','WASH']

In [258]:
# print('Model Summary:\n')

# # Print intercept (alpha), Value of the model's prediction when all input features are set to zero. Can be considered as the baseline prediction value.
# print('Intercept:')
# print('alpha = ' , model_lr_TOTAL.intercept_)

# # Print weights, Features with larger absolute weights have a stronger impact on the model's predictions
# print('\nWeights:')
# i = 0
# for w in model_lr_TOTAL.coef_:
#     print('w_',i+1,'= ', w, ' [ weight of ', features[i],']')
#     i += 1

In [259]:
# #RMSE finds the differences between the predicted values and the actual values.
# print('\nModel Performance\n\nRMSE = %.2f' % np.sqrt(mean_squared_error(y_test, prediction)))#
# #the coefficient of determination : 1 is perfect prediction
# #measure of the proportion of variability in the prediction
# print('R^2= % .2f' % r2_score(y_test,prediction))

In [260]:
# param_grid = {
#     'fit_intercept': [True, False],  # Whether to calculate intercept or not
#     'positive': [True, False]  ,
#     'copy_X': [True,False]
# }
# print('Running Grid Search...')
# lr_grid_search = GridSearchCV(model_lr_TOTAL, param_grid, cv=5 ,scoring='neg_mean_squared_error')
# lr_grid_search.fit(X,y)
# print('Done')

In [261]:
# print("Best Parameters: ", lr_grid_search.best_params_)
# print("Best  Mean Squared Error: ",-1 * lr_grid_search.best_score_)

In [262]:
# model_lr_TOTAL_gs = LinearRegression(fit_intercept=False,positive=True)
# model_lr_TOTAL_gs.fit(X_train,y_train)
# lr_prediction_gs = model_lr_TOTAL_gs.predict(X_test)

In [263]:
# print('Model Summary:\n')

# # Print intercept (alpha)
# print('Intercept:')
# print('alpha = ' , model_lr_TOTAL_gs.intercept_)

# # Print weights
# print('\nWeights:')
# i = 0
# for w in model_lr_TOTAL_gs.coef_:
#     print('w_',i+1,'= ', w, ' [ weight of ', features[i],']')
#     i += 1

In [264]:
# lr_rmse = mean_squared_error(y_test, lr_prediction_gs,squared=False)
# lr_r2 = r2_score(y_test,lr_prediction_gs)

# print('[LR] Root Mean Squared Error: {0}'.format(lr_rmse))
# print('[LR] R2: {0}'.format(lr_r2))

In [265]:
# copy_df = pd.DataFrame(np.nan, index=merged_sheet1.index,columns=merged_sheet1.columns)
# copy_df['DATE'] = merged_sheet1['DATE']
# copy_df['OBLAST'] = merged_sheet1['OBLAST']
# copy_df.head()

In [266]:
# # Create an Excel writer object
# excel_writer1 = pd.ExcelWriter('Data_ Round 33 - Ukraine 5W - 2022-11-10.xlsx', engine='xlsxwriter')
# excel_writer2 = pd.ExcelWriter('Data_ Round 34 - Ukraine 5W - 2022-11-25.xlsx', engine='xlsxwriter')
# # Write each DataFrame to a separate sheet in the Excel file
# copy_df.to_excel(excel_writer1, sheet_name='Num_of_Orgs_by_Oblast', index=False)
# copy_df.to_excel(excel_writer2, sheet_name='Num_of_Orgs_by_Oblast', index=False)