In [110]:
import pandas as pd
import numpy as np
import os
import re
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import pickle

In [111]:
def getDate(filename):
    match_str = re.search(r'\d{4}-\d{2}-\d{2}',filename)
    try:
        return datetime.datetime.strptime(match_str.group(), '%Y-%m-%d').date()
    except ValueError:
        return None

In [112]:
def cleanColumn(name):
    name = name.upper()
    clean_name = re.sub(r'[^a-zA-Z0-9_]', '', name)
    clean_name = clean_name.replace('\n', ' ')
    if clean_name == 'SHELTERNFI':
        clean_name = 'SHELTER'
    if clean_name == 'PROTECTIONCP':
        clean_name = 'PC_CP'
    if clean_name == 'PROTECTIONGBV':
        clean_name = 'PC_GBV'
    if clean_name == 'PROTECTIONMA':
        clean_name = 'PC_MA'
    if clean_name == 'TOTALREACHED':
        clean_name = 'PEOPLEREACHED'
    return clean_name

In [113]:
#using csv and xlsx
csv_dir = 'ukraine_data_excel'


In [165]:
#using xlsx
sheet1_df = []
sheet2_df = []
sheet3_df = []

In [166]:
#using xlsx
for filename in os.listdir(csv_dir):
  if filename.endswith('.xlsx'):
    file_path = os.path.join(csv_dir,filename)
    excel_df = pd.ExcelFile(file_path)
    files_date = getDate(filename)
    for sheet in excel_df.sheet_names:
      df = pd.read_excel(excel_df,sheet_name=sheet)
      df.columns = [cleanColumn(col) for col in df.columns]
      df['DATE'] = files_date
      if sheet == 'Num_of_Orgs_by_Oblast':
        sheet1_df.append(df)
      elif sheet == 'People_Reached_by_Oblast':
        sheet2_df.append(df)
      elif sheet == 'UDE_Inputs':
        sheet3_df.append(df)

In [167]:
combined_sheet1 = pd.concat(sheet1_df, keys=[str(df['DATE'].iloc[0]) for df in sheet1_df])
res_sheet1 = combined_sheet1.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()
merged_sheet1 = pd.merge(res_sheet1, combined_sheet1[['ADMIN1_ID', 'DATE', 'OBLAST']], on=['ADMIN1_ID', 'DATE'], how='left')

  res_sheet1 = combined_sheet1.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()


In [168]:
combined_sheet2 = pd.concat(sheet2_df, keys=[str(df['DATE'].iloc[0]) for df in sheet2_df])
res_sheet2 = combined_sheet2.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()
merged_sheet2 = pd.merge(res_sheet2, combined_sheet2[['ADMIN1_ID', 'DATE', 'OBLAST']], on=['ADMIN1_ID', 'DATE'], how='left')

  res_sheet2 = combined_sheet2.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()


In [169]:
combined_sheet3 = pd.concat(sheet3_df, keys=[str(df['DATE'].iloc[0]) for df in sheet3_df])
res_sheet3 = combined_sheet3.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()
merged_sheet3 = pd.merge(res_sheet3, combined_sheet3[['ADMIN1_ID', 'DATE', 'OBLAST']], on=['ADMIN1_ID', 'DATE'], how='left')

  res_sheet3 = combined_sheet3.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()


In [180]:
#Num_of_Orgs_by_Oblast
merged_sheet1.iloc[20:35]


Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS,OBLAST
20,UA05,2022-09-22,2.0,1.0,0.0,4.0,20.0,13.0,6.0,0.0,0.0,4.0,3.0,3.0,0.0,7.0,9.0,58.0,0.0,Vinnytska
21,UA05,2022-09-29,3.0,1.0,0.0,10.0,34.0,24.0,17.0,0.0,0.0,18.0,7.0,4.0,0.0,23.0,15.0,107.0,0.0,Vinnytska
22,UA05,2022-10-06,,,,,,,,,,,,,,,,,,Vinnytska
23,UA05,2022-10-13,3.0,1.0,0.0,11.0,35.0,24.0,17.0,0.0,0.0,25.0,16.0,5.0,0.0,23.0,13.0,123.0,0.0,Vinnytska
24,UA05,2022-10-27,3.0,1.0,0.0,13.0,36.0,24.0,17.0,0.0,0.0,25.0,10.0,5.0,0.0,23.0,13.0,119.0,0.0,Vinnytska
25,UA05,2022-11-10,,,,,,,,,,,,,,,,,,Vinnytska
26,UA05,2022-11-25,,,,,,,,,,,,,,,,,,Vinnytska
27,UA05,2022-12-08,3.0,1.0,0.0,13.0,36.0,27.0,18.0,0.0,0.0,26.0,11.0,7.0,0.0,24.0,13.0,125.0,0.0,Vinnytska
28,UA05,2022-12-22,2.0,1.0,0.0,13.0,36.0,33.0,18.0,0.0,0.0,29.0,11.0,8.0,0.0,24.0,13.0,130.0,0.0,Vinnytska
29,UA05,2023-01-05,3.0,1.0,1.0,17.0,39.0,33.0,19.0,0.0,0.0,33.0,10.0,8.0,0.0,25.0,14.0,138.0,0.0,Vinnytska


In [188]:
#People_Reached_by_Oblast
merged_sheet2.iloc[18:35]

Unnamed: 0,ADMIN1_ID,DATE,FSL,GENERALPROTECTION,CP,MINEACTION,PROTECTIONTOTAL,HEALTH,MPC,SHELTER,WASH,PEOPLEREACHED,OBLAST
18,UA05,2022-09-29,39158.0,0.0,107050.0,105178.0,275036.0,88760.0,243513.0,107109.0,395394.0,395394.0,Vinnytska
19,UA05,2022-10-06,,,,,,,,,,,Vinnytska
20,UA05,2022-10-13,39158.0,0.0,107050.0,105178.0,275036.0,91073.0,243762.0,107109.0,395394.0,395394.0,Vinnytska
21,UA05,2022-10-27,39158.0,0.0,107050.0,105178.0,220783.0,91073.0,246287.0,141662.0,395394.0,395394.0,Vinnytska
22,UA05,2022-11-10,,,,,,,,,,,Vinnytska
23,UA05,2022-11-25,,,,,,,,,,,Vinnytska
24,UA05,2022-12-08,39158.0,0.0,143377.0,108493.0,233065.0,112754.0,268036.0,155608.0,449220.0,449220.0,Vinnytska
25,UA05,2022-12-22,39158.0,0.0,167914.0,108561.0,262403.0,112875.0,271149.0,157030.0,449220.0,449220.0,Vinnytska
26,UA05,2023-01-05,39158.0,0.0,167914.0,108561.0,272219.0,112875.0,271149.0,159927.0,449220.0,449220.0,Vinnytska
27,UA07,2022-04-28,27800.0,15000.0,300.0,100.0,15500.0,85800.0,7800.0,6700.0,0.0,85800.0,Volynska


In [164]:
merged_sheet3.iloc[20:35]

Unnamed: 0,ADMIN1_ID,DATE,PEOPLEREACHED,NUMBEROFORGANISATIONS,OBLAST
20,UA05,2022-07-07,429200,112,Vinnytska
21,UA05,2022-07-07,429200,112,Vinnytska
22,UA05,2022-07-14,432000,74,Vinnytska
23,UA05,2022-07-14,432000,74,Vinnytska
24,UA05,2022-07-21,434400,134,Vinnytska
25,UA05,2022-07-21,434400,134,Vinnytska
26,UA05,2022-07-28,782486,138,Vinnytska
27,UA05,2022-07-28,782486,138,Vinnytska
28,UA05,2022-08-04,782486,86,Vinnytska
29,UA05,2022-08-04,782486,86,Vinnytska


In [None]:
merged_sheet1['OBLAST'].value_counts()

In [174]:
sum(merged_sheet3['PEOPLEREACHED'].value_counts())

750

In [177]:
# replace zero values in the empty tuple to NAN for future linear interpolation of ORGS
#NOTE : THIS IS HARD CODED NOT OPTIMAL
merged_sheet1.iloc[22] = merged_sheet1.iloc[22].replace(0,np.nan)
merged_sheet1.iloc[25] = merged_sheet1.iloc[25].replace(0,np.nan)
merged_sheet1.iloc[26] = merged_sheet1.iloc[26].replace(0,np.nan)

# merged_sheet3.iloc[25] = merged_sheet3.iloc[25].replace(merged_sheet3.iloc[25]['PEOPLEREACHED'],np.nan)
# merged_sheet3.iloc[26] = merged_sheet3.iloc[26].replace(merged_sheet3.iloc[26]['PEOPLEREACHED'],np.nan)


In [187]:
# replace zero values in the empty tuple to NAN for future linear interpolation of People
#NOTE : THIS IS HARD CODED NOT OPTIMAL
merged_sheet2.iloc[19] = merged_sheet2.iloc[19].replace(0,np.nan)
merged_sheet2.iloc[22] = merged_sheet2.iloc[22].replace(0,np.nan)
merged_sheet2.iloc[23] = merged_sheet2.iloc[23].replace(0,np.nan)

In [None]:
# merged_sheet1['PEOPLE_REACHED'] = merged_sheet3['PEOPLEREACHED']

In [178]:
#check for null values
merged_sheet1.isna().any().any()

True

In [179]:
# check for null values in specific row
merged_sheet1.iloc[26].isna().any()

True

In [195]:
#filled null values using linear interpolation
num_of_org_interpolated_df = merged_sheet1.interpolate(method='linear')

In [196]:
num_of_org_interpolated_df.isna().any().any()

False

In [197]:
# rounded all int and float dtypes since interpolation caused floats
num_of_org_interpolated_df = num_of_org_interpolated_df.applymap(lambda x: round(x, 1) if isinstance(x, (int, float)) else x)

In [206]:
num_of_org_interpolated_df.iloc[20:35]

Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS,OBLAST
20,UA05,2022-09-22,2.0,1.0,0.0,4.0,20.0,13.0,6.0,0.0,0.0,4.0,3.0,3.0,0.0,7.0,9.0,58.0,0.0,Vinnytska
21,UA05,2022-09-29,3.0,1.0,0.0,10.0,34.0,24.0,17.0,0.0,0.0,18.0,7.0,4.0,0.0,23.0,15.0,107.0,0.0,Vinnytska
22,UA05,2022-10-06,3.0,1.0,0.0,10.5,34.5,24.0,17.0,0.0,0.0,21.5,11.5,4.5,0.0,23.0,14.0,115.0,0.0,Vinnytska
23,UA05,2022-10-13,3.0,1.0,0.0,11.0,35.0,24.0,17.0,0.0,0.0,25.0,16.0,5.0,0.0,23.0,13.0,123.0,0.0,Vinnytska
24,UA05,2022-10-27,3.0,1.0,0.0,13.0,36.0,24.0,17.0,0.0,0.0,25.0,10.0,5.0,0.0,23.0,13.0,119.0,0.0,Vinnytska
25,UA05,2022-11-10,3.0,1.0,0.0,13.0,36.0,25.0,17.3,0.0,0.0,25.3,10.3,5.7,0.0,23.3,13.0,121.0,0.0,Vinnytska
26,UA05,2022-11-25,3.0,1.0,0.0,13.0,36.0,26.0,17.7,0.0,0.0,25.7,10.7,6.3,0.0,23.7,13.0,123.0,0.0,Vinnytska
27,UA05,2022-12-08,3.0,1.0,0.0,13.0,36.0,27.0,18.0,0.0,0.0,26.0,11.0,7.0,0.0,24.0,13.0,125.0,0.0,Vinnytska
28,UA05,2022-12-22,2.0,1.0,0.0,13.0,36.0,33.0,18.0,0.0,0.0,29.0,11.0,8.0,0.0,24.0,13.0,130.0,0.0,Vinnytska
29,UA05,2023-01-05,3.0,1.0,1.0,17.0,39.0,33.0,19.0,0.0,0.0,33.0,10.0,8.0,0.0,25.0,14.0,138.0,0.0,Vinnytska


In [None]:
num_of_org_interpolated_df.sort_values(by=['DATE'])

In [None]:
num_of_org_interpolated_df.describe()

In [201]:
#filled null values using linear interpolation
people_reached_interpolated_df = merged_sheet2.interpolate(method='linear')

In [202]:
people_reached_interpolated_df.isna().any().any()

False

In [203]:
# rounded all int and float dtypes since interpolation caused floats
people_reached_interpolated_df = people_reached_interpolated_df.applymap(lambda x: round(x, 1) if isinstance(x, (int, float)) else x)

In [205]:
people_reached_interpolated_df.iloc[18:26]

Unnamed: 0,ADMIN1_ID,DATE,FSL,GENERALPROTECTION,CP,MINEACTION,PROTECTIONTOTAL,HEALTH,MPC,SHELTER,WASH,PEOPLEREACHED,OBLAST
18,UA05,2022-09-29,39158.0,0.0,107050.0,105178.0,275036.0,88760.0,243513.0,107109.0,395394.0,395394.0,Vinnytska
19,UA05,2022-10-06,39158.0,0.0,107050.0,105178.0,275036.0,89916.5,243637.5,107109.0,395394.0,395394.0,Vinnytska
20,UA05,2022-10-13,39158.0,0.0,107050.0,105178.0,275036.0,91073.0,243762.0,107109.0,395394.0,395394.0,Vinnytska
21,UA05,2022-10-27,39158.0,0.0,107050.0,105178.0,220783.0,91073.0,246287.0,141662.0,395394.0,395394.0,Vinnytska
22,UA05,2022-11-10,39158.0,0.0,119159.0,106283.0,224877.0,98300.0,253536.7,146310.7,413336.0,413336.0,Vinnytska
23,UA05,2022-11-25,39158.0,0.0,131268.0,107388.0,228971.0,105527.0,260786.3,150959.3,431278.0,431278.0,Vinnytska
24,UA05,2022-12-08,39158.0,0.0,143377.0,108493.0,233065.0,112754.0,268036.0,155608.0,449220.0,449220.0,Vinnytska
25,UA05,2022-12-22,39158.0,0.0,167914.0,108561.0,262403.0,112875.0,271149.0,157030.0,449220.0,449220.0,Vinnytska


In [None]:
# create label encoder instance
label_encoder = LabelEncoder()

#fit label encoder to transform Oblast column
encoded_oblast = label_encoder.fit_transform(interpolated_df['OBLAST'])

#create copy
df_encoded = interpolated_df.copy()

#Add column and assign values
df_encoded['OBLAST_ENCODED'] = encoded_oblast

In [None]:
# create label encoder instance
label_encoder = LabelEncoder()

#fit label encoder to transform Date column
encoded_oblast = label_encoder.fit_transform(df_encoded['DATE'])

#create copy
df_encode = df_encoded.copy()

#Add column and assign values
df_encode['DATE_ENCODED'] = encoded_oblast

In [None]:
df_encode.sort_values(by=['DATE'])

In [None]:
df_encoded.columns

In [None]:
# Linear Regression CCCM #############

In [None]:
corrs = df_encode.corr()['CCCM']
corrs_cccm = corrs.sort_values(ascending =False)
corrs_cccm

In [None]:
columns_to_drop = ['NUTRITION', 'PC_MA', 'CCS', 'HEALTH',
                   'LOGISTICS', 'ETC', 'PC_PC', 'PROTECTIONTOTAL','OBLAST','ADMIN1_ID','DATE']
df_cccm = df_encode.copy()
# Drop the specified columns from the copied DataFrame
df_cccm.drop(columns=columns_to_drop, inplace=True, axis=1)
df_cccm


In [None]:
X = df_cccm.drop(columns=['CCCM'], axis=1)
y = df_cccm['CCCM']

In [None]:
X_train, X_test , y_train, y_test = train_test_split(X,y, test_size=.2, random_state=42)

In [None]:
model_lr_CCCM = LinearRegression()
model_lr_CCCM.fit(X_train,y_train)
prediction = model_lr_CCCM.predict(X_test)

In [None]:
features = ['EDUCATION','FSL','MPC','PC_CP','PC_GBV','SHELTER','WASH','TOTAL','PEOPLE_REACHED','OBLAST_ENCODED','DATE_ENCODED']

In [None]:
print('Model Summary:\n')

# Print intercept (alpha), Value of the model's prediction when all input features are set to zero. Can be considered as the baseline prediction value.
print('Intercept:')
print('alpha = ' , model_lr_CCCM.intercept_)

# Print weights, Features with larger absolute weights have a stronger impact on the model's predictions
print('\nWeights:')
i = 0
for w in model_lr_CCCM.coef_:
    print('w_',i+1,'= ', w, ' [ weight of ', features[i],']')
    i += 1

In [None]:
#RMSE finds the average error, the differences between the predicted values and the actual values. high is bad, low is good
print('\nModel Performance\n\nRMSE = %.2f' % np.sqrt(mean_squared_error(y_test, prediction)))

#the coefficient of determination : 1 is perfect prediction
#measure of the proportion of variability in the prediction
print('R^2= % .2f' % r2_score(y_test,prediction))

In [None]:

param_grid = {
    'fit_intercept': [True, False],  # Whether to calculate intercept or not
    'positive': [True, False]  ,
    'copy_X': [True,False]
}
print('Running Grid Search...')
# negative sign is used because GridSearchCV maximizes a scoring function, and we want to minimize the MSE.
lr_grid_search = GridSearchCV(model_lr_CCCM, param_grid, cv=5 ,scoring='neg_mean_squared_error')
lr_grid_search.fit(X,y)
print('Done')


In [None]:
# A lower MSE is desirable, as it indicates that the model's predictions are closer to the actual values
print("Best Parameters: ", lr_grid_search.best_params_)
print("Best  Mean Squared Error: ",-1 * lr_grid_search.best_score_)

In [None]:
model_lr_CCCM_gs = LinearRegression(fit_intercept=False,positive=True)
model_lr_CCCM_gs.fit(X_train,y_train)
lr_prediction_gs = model_lr_CCCM_gs.predict(X_test)

In [None]:
print('Model Summary:\n')

# Print intercept (alpha)
print('Intercept:')
print('alpha = ' , model_lr_CCCM_gs.intercept_)

# Print weights
print('\nWeights:')
i = 0
for w in model_lr_CCCM_gs.coef_:
    print('w_',i+1,'= ', w, ' [ weight of ', features[i],']')
    i += 1

In [None]:
lr_rmse = mean_squared_error(y_test, lr_prediction_gs,squared=False)
lr_r2 = r2_score(y_test,lr_prediction_gs)

print('[LR] Root Mean Squared Error: {0}'.format(lr_rmse))
print('[LR] R2: {0}'.format(lr_r2))

In [None]:
###### LINEAR REGRESSION CCS #########

In [None]:
corrs = df_encode.corr()['CCS']
corrs_ccs = corrs.sort_values(ascending =False)
corrs_ccs

In [None]:
columns_to_drop = ['TOTAL','PC_CP', 'PC_MA', 'WASH','PC_GBV','CCCM','FSL','EDUCATION','SHELTER','MPC',
                   'PC_PC', 'PROTECTIONTOTAL','OBLAST','ADMIN1_ID','DATE']
df_ccs = df_encode.copy()
# Drop the specified columns from the copied DataFrame
df_ccs.drop(columns=columns_to_drop, inplace=True, axis=1)
df_ccs

In [None]:
#Multiple Linear Regression
X = df_ccs.drop(columns=['CCS'], axis=1)
y = df_ccs['CCS']

In [None]:
X_train, X_test , y_train, y_test = train_test_split(X,y, test_size=.2, random_state=42)

In [None]:
model_lr_CCS = LinearRegression()
model_lr_CCS.fit(X_train,y_train)
prediction = model_lr_CCS.predict(X_test)

In [None]:
features =[	'ETC','HEALTH','NUTRITION','LOGISTICS','PEOPLE_REACHED','OBLAST_ENCODED','DATE_ENCODED']

In [None]:
print('Model Summary:\n')

# Print intercept (alpha), Value of the model's prediction when all input features are set to zero. Can be considered as the baseline prediction value.
print('Intercept:')
print('alpha = ' , model_lr_CCS.intercept_)

# Print weights, Features with larger absolute weights have a stronger impact on the model's predictions
print('\nWeights:')
i = 0
for w in model_lr_CCS.coef_:
    print('w_',i+1,'= ', w, ' [ weight of ', features[i],']')
    i += 1

In [None]:
#RMSE finds the differences between the predicted values and the actual values.
print('\nModel Performance\n\nRMSE = %.2f' % np.sqrt(mean_squared_error(y_test, prediction)))

#the coefficient of determination : 1 is perfect prediction
#measure of the proportion of variability in the prediction
print('R^2= % .2f' % r2_score(y_test,prediction))

In [None]:

param_grid = {
    'fit_intercept': [True, False],  # Whether to calculate intercept or not
    'positive': [True, False]  ,
    'copy_X': [True,False]
}
print('Running Grid Search...')
lr_grid_search = GridSearchCV(model_lr_CCS, param_grid, cv=5 ,scoring='neg_mean_squared_error')
lr_grid_search.fit(X,y)
print('Done')

In [None]:

print("Best Parameters: ", lr_grid_search.best_params_)
print("Best  Mean Squared Error: ",-1 * lr_grid_search.best_score_)


In [None]:
model_lr_CCS_gs = LinearRegression(positive=True)
model_lr_CCS_gs.fit(X_train,y_train)
lr_prediction_gs = model_lr_CCS_gs.predict(X_test)

In [None]:
print('Model Summary:\n')

# Print intercept (alpha)
print('Intercept:')
print('alpha = ' , model_lr_CCS_gs.intercept_)
# Print weights
print('\nWeights:')
i = 0
for w in model_lr_CCS_gs.coef_:
    print('w_',i+1,'= ', w, ' [ weight of ', features[i],']')
    i += 1

In [None]:
lr_rmse = mean_squared_error(y_test, lr_prediction_gs,squared=False)
lr_r2 = r2_score(y_test,lr_prediction_gs)

print('[LR] Root Mean Squared Error: {0}'.format(lr_rmse))
print('[LR] R2: {0}'.format(lr_r2))

In [None]:
#### LINEAR REGRESSION ETC #####

In [None]:
corrs = df_encode.corr()['ETC']
corrs_etc = corrs.sort_values(ascending =False)
corrs_etc

In [None]:
columns_to_drop = ['TOTAL','PC_CP', 'PC_MA', 'WASH','PC_GBV','CCCM','FSL','EDUCATION','SHELTER','MPC',
                   'PC_PC', 'PROTECTIONTOTAL','OBLAST','ADMIN1_ID','DATE','NUTRITION']
df_etc = df_encode.copy()
# Drop the specified columns from the copied DataFrame
df_etc.drop(columns=columns_to_drop, inplace=True, axis=1)
df_etc

In [None]:
X = df_etc.drop(columns=['ETC'], axis=1)
y = df_etc['ETC']

In [None]:
X_train, X_test , y_train, y_test = train_test_split(X,y, test_size=.2, random_state=42)

In [None]:
model_lr_ETC = LinearRegression()
model_lr_ETC.fit(X_train,y_train)
prediction = model_lr_ETC.predict(X_test)

In [None]:
features =[	'CCS','HEALTH','LOGISTICS','PEOPLE_REACHED','OBLAST_ENCODED','DATE_ENCODED']

In [None]:
print('Model Summary:\n')

# Print intercept (alpha)
print('Intercept:')
print('alpha = ' , model_lr_ETC.intercept_)

# Print weights
print('\nWeights:')
i = 0
for w in model_lr_ETC.coef_:
    print('w_',i+1,'= ', w, ' [ weight of ', features[i],']')
    i += 1

In [None]:
#RMSE finds the differences between the predicted values and the actual values.
print('\nModel Performance\n\nRMSE = %.2f' % np.sqrt(mean_squared_error(y_test, prediction)))#
#the coefficient of determination : 1 is perfect prediction
#measure of the proportion of variability in the prediction
print('R^2= % .2f' % r2_score(y_test,prediction))

In [None]:
param_grid = {
    'fit_intercept': [True, False],  # Whether to calculate intercept or not
    'positive': [True, False]  ,
    'copy_X': [True,False]
}
print('Running Grid Search...')
lr_grid_search = GridSearchCV(model_lr_ETC, param_grid, cv=5 ,scoring='neg_mean_squared_error')
lr_grid_search.fit(X,y)
print('Done')

In [None]:
print("Best Parameters: ", lr_grid_search.best_params_)
print("Best  Mean Squared Error: ",-1 * lr_grid_search.best_score_)

In [None]:
model_lr_ETC_gs = LinearRegression(fit_intercept=False, positive=True)
model_lr_ETC_gs.fit(X_train,y_train)
lr_prediction_gs = model_lr_ETC_gs.predict(X_test)

In [None]:
print('Model Summary:\n')

# Print intercept (alpha)
print('Intercept:')
print('alpha = ' , model_lr_ETC_gs.intercept_)

# Print weights
print('\nWeights:')
i = 0
for w in model_lr_ETC_gs.coef_:
    print('w_',i+1,'= ', w, ' [ weight of ', features[i],']')
    i += 1

In [None]:
lr_rmse = mean_squared_error(y_test, lr_prediction_gs,squared=False)
lr_r2 = r2_score(y_test,lr_prediction_gs)

print('[LR] Root Mean Squared Error: {0}'.format(lr_rmse))
print('[LR] R2: {0}'.format(lr_r2))

In [None]:
# LINEAR REGRESSION TOTAL #######

In [None]:
corrs = df_encode.corr()['TOTAL']
corrs_total = corrs.sort_values(ascending =False)
corrs_total

In [None]:
columns_to_drop = ['CCS','LOGISTICS', 'ETC', 
                   'PC_PC', 'PROTECTIONTOTAL','OBLAST','ADMIN1_ID','DATE']
df_total = df_encode.copy()
# Drop the specified columns from the copied DataFrame
df_total.drop(columns=columns_to_drop, inplace=True, axis=1)
df_total

In [None]:
X = df_total.drop(columns=['TOTAL'], axis=1)
y = df_total['TOTAL']

In [None]:
X_train, X_test , y_train, y_test = train_test_split(X,y, test_size=.2, random_state=42)

In [None]:
model_lr_TOTAL = LinearRegression()
model_lr_TOTAL.fit(X_train,y_train)
prediction = model_lr_TOTAL.predict(X_test)

In [None]:
features =[	'CCCM','HEALTH','PEOPLE_REACHED','OBLAST_ENCODED','DATE_ENCODED','EDUCATION','FSL','MPC','NUTRITION','PC_CP','PC_GBV','PC_MA','SHELTER','WASH']

In [None]:
print('Model Summary:\n')

# Print intercept (alpha), Value of the model's prediction when all input features are set to zero. Can be considered as the baseline prediction value.
print('Intercept:')
print('alpha = ' , model_lr_TOTAL.intercept_)

# Print weights, Features with larger absolute weights have a stronger impact on the model's predictions
print('\nWeights:')
i = 0
for w in model_lr_TOTAL.coef_:
    print('w_',i+1,'= ', w, ' [ weight of ', features[i],']')
    i += 1

In [None]:
#RMSE finds the differences between the predicted values and the actual values.
print('\nModel Performance\n\nRMSE = %.2f' % np.sqrt(mean_squared_error(y_test, prediction)))#
#the coefficient of determination : 1 is perfect prediction
#measure of the proportion of variability in the prediction
print('R^2= % .2f' % r2_score(y_test,prediction))

In [None]:
param_grid = {
    'fit_intercept': [True, False],  # Whether to calculate intercept or not
    'positive': [True, False]  ,
    'copy_X': [True,False]
}
print('Running Grid Search...')
lr_grid_search = GridSearchCV(model_lr_TOTAL, param_grid, cv=5 ,scoring='neg_mean_squared_error')
lr_grid_search.fit(X,y)
print('Done')

In [None]:
print("Best Parameters: ", lr_grid_search.best_params_)
print("Best  Mean Squared Error: ",-1 * lr_grid_search.best_score_)

In [None]:
model_lr_TOTAL_gs = LinearRegression(fit_intercept=False,positive=True)
model_lr_TOTAL_gs.fit(X_train,y_train)
lr_prediction_gs = model_lr_TOTAL_gs.predict(X_test)

In [None]:
print('Model Summary:\n')

# Print intercept (alpha)
print('Intercept:')
print('alpha = ' , model_lr_TOTAL_gs.intercept_)

# Print weights
print('\nWeights:')
i = 0
for w in model_lr_TOTAL_gs.coef_:
    print('w_',i+1,'= ', w, ' [ weight of ', features[i],']')
    i += 1

In [None]:
lr_rmse = mean_squared_error(y_test, lr_prediction_gs,squared=False)
lr_r2 = r2_score(y_test,lr_prediction_gs)

print('[LR] Root Mean Squared Error: {0}'.format(lr_rmse))
print('[LR] R2: {0}'.format(lr_r2))

In [None]:
merged_sheet1.shape

In [None]:
#People_Reached_by_Oblast
merged_sheet2.head(10)

In [None]:
#UDE_Inputs
merged_sheet3.head()

In [None]:
# copy_df = pd.DataFrame(np.nan, index=merged_sheet1.index,columns=merged_sheet1.columns)
# copy_df['DATE'] = merged_sheet1['DATE']
# copy_df['OBLAST'] = merged_sheet1['OBLAST']
# copy_df.head()

In [None]:
# # Create an Excel writer object
# excel_writer1 = pd.ExcelWriter('Data_ Round 33 - Ukraine 5W - 2022-11-10.xlsx', engine='xlsxwriter')
# excel_writer2 = pd.ExcelWriter('Data_ Round 34 - Ukraine 5W - 2022-11-25.xlsx', engine='xlsxwriter')
# # Write each DataFrame to a separate sheet in the Excel file
# copy_df.to_excel(excel_writer1, sheet_name='Num_of_Orgs_by_Oblast', index=False)
# copy_df.to_excel(excel_writer2, sheet_name='Num_of_Orgs_by_Oblast', index=False)