<a href="https://colab.research.google.com/github/sarrahrose04/PovertyMapping/blob/main/RidgeRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Mount Google Drive at /content/gdrive; later cells copy their output files
# into /content/gdrive/MyDrive. force_remount=True replaces any existing mount.
drive.mount('/content/gdrive', force_remount=True)


In [None]:
# Reload the autoreload extension; mode 2 picks up edits to imported modules live.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import pandas as pd

# Location of the CSV with binned luminosity & poverty rates. A later cell parses
# country, year, satellite source and resolution out of this name, so it must
# contain a token shaped like "ABC_2015_XX_025".
CENI_full_file = " "#paste link to dataset containing binned luminosity & poverty rates
# Bind the frame to df_raw: that is the name the following cells read.
df_raw = pd.read_csv(CENI_full_file)
df_raw.head()


In [None]:
#Define parameters needed for model

import re
import os

#Extract country code, year, daytime satellite imagery source & imagery file resolution
#from the dataset filename (token shaped like AAA_0000_AA_000)
name_match = re.search("[A-Z]{3}_[0-9]{4}_[A-Z]{2}_[0-9]{3}",CENI_full_file)
if name_match is None:
    #Fail with an actionable message instead of "'NoneType' object has no attribute 'group'"
    raise ValueError(
        "CENI_full_file must contain a COUNTRY_YEAR_SAT_RES token "
        "(e.g. ABC_2015_XX_025); got: " + repr(CENI_full_file)
    )
country, year, day_sat, img_res = name_match.group().split("_")

#Name of the poverty-rate column for the parsed year
target_variable_name = "POV_"+year
#Work on a copy so that the raw frame stays untouched
df_full = df_raw.copy()

#print if the necessary columns are defined correctly
print(df_full[["geocode",target_variable_name]])
print(df_full.shape)
print(df_full.columns)

In [None]:
#Keep only the rows that have a value in every column, then show the new size
df_full = df_full.dropna(axis=0, how="any")
df_full.shape

In [None]:
#Build the name of the CNN features file from the parsed dataset parameters
features_name_parts = ["CNN_FOUT_RES34", country, year, day_sat, str(img_res)]
features_filename = "_".join(features_name_parts) + ".csv"

#The features file is read from the same directory as the dataset file
features_path = os.path.join(os.path.dirname(CENI_full_file), features_filename)
features_raw = pd.read_csv(features_path)
print(features_raw.shape)



In [None]:
#Compare filenames of daytime satellite imagery processed during feature extraction with filename list from original csv
all_img = features_raw["filename"]

#NOTE: despite their names, both masks are True where a match EXISTS:
#  missing_images_ID   -> df_full rows whose image is listed in the features file
#  missing_csventry_ID -> feature rows whose image is listed in df_full
#The names are kept because a later cell filters df_full with missing_images_ID.
missing_images_ID = df_full["filename"].isin(all_img)
missing_csventry_ID = all_img.isin(df_full["filename"])

#Invert the masks with ~ (boolean NOT) and index with [] to get the unmatched rows
missing_images = df_full[~missing_images_ID]
missing_entries = all_img[~missing_csventry_ID]
print("images in the df_full, but not in the features file: ")
print(missing_images)

print("_________________")
print(" ")
print("images in the features file, but not in the df_full: ")
print(missing_entries)

In [None]:
#Keep only the rows of the original CSV whose image was processed during feature
#extraction (missing_images_ID is True for exactly those rows)
df = df_full.loc[missing_images_ID].copy(deep=True)
print(df_full.shape)
print(df.shape)


In [None]:
#Build the image -> geocode lookup: keep only those two columns and collapse
#repeated rows so that each (filename, geocode) pair appears once
img_geocode = df.loc[:, ["filename", "geocode"]].drop_duplicates()

img_geocode.head()


In [None]:
#Generate a new df containing only the poverty data: geocode, data split and
#the target poverty rate
df_LHS = df[['geocode', 'data_split', target_variable_name]]
#Keep one row per geocode (the first occurrence)
df_LHS = df_LHS.drop_duplicates(subset='geocode')
print(df_LHS.shape)

In [None]:
#Merge geocode filename dataframe with features dataframe
#Ensure that datatypes align: cast the join key to str on both sides.
#astype returns a new Series, so its result has to be used for the cast to take
#effect; assign() does that on a copy and leaves both input dataframes unmodified.
features = img_geocode.assign(filename=img_geocode["filename"].astype(str)).merge(
    features_raw.assign(filename=features_raw["filename"].astype(str)),
    on = "filename",
)

In [None]:
#Compute avg features by geocode grp & generate 1 feature vector per geocode
#drop() returns a new dataframe, so it is chained here; otherwise the text
#"filename" column would still be present when the mean is taken
avg_features = (
    features.drop(columns=['filename'])
    .groupby('geocode', as_index=False)
    .mean()
)


In [None]:
#Attach the averaged feature vector of each geocode to its poverty record
avg_features_full = pd.merge(df_LHS, avg_features, on='geocode')

#Report the size of both inputs and of the merged frame, then preview a corner of it
print(df_LHS.shape)
print(avg_features.shape)
print(avg_features_full.shape)
print(avg_features_full.iloc[:5, :6])

In [None]:
#Load packages needed to perform ridge regression 
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

In [None]:
import numpy as np
outlier_flag = 4 #standard deviation
validation_size_percent = 10

#Determine geocodes of outliers from avged features based on the defined standard deviation
#specified in variable outlier flag. Only values ABOVE mean + outlier_flag * sd are flagged.
target_values = avg_features_full[target_variable_name]
outlier_threshold = target_values.mean() + outlier_flag * target_values.std()
outliers = avg_features_full['geocode'][target_values > outlier_threshold].unique()

print("outlier Region: ")
print(outliers)
print("number of outliers: " + str(len(outliers)))

#Extract validation datasets & drop outliers
#NOTE(review): a row counts as validation when data_split equals
#validation_size_percent/100 -- confirm that this matches how data_split is encoded
validation_regions = avg_features_full['geocode'][avg_features_full['data_split'] == (validation_size_percent/100)].unique()
print("number of validation_regions: "+str(len(validation_regions)))

#combine validation and outlier regions to drop them at once
drop_regions = np.append(outliers, validation_regions)

#drop outliers and validation set
#NOTE: this rebinds avg_features to the training subset of avg_features_full
avg_features = avg_features_full[~avg_features_full['geocode'].isin(drop_regions)]
avg_features_validation = avg_features_full[avg_features_full['geocode'].isin(validation_regions)]

#Create separate dataframes for full, training and test datasets
#Everything except the target, the geocode and the split label is a feature column
non_feature_columns = [target_variable_name, 'geocode', 'data_split']

#training set
Xs = avg_features.drop(non_feature_columns, axis=1)
y = avg_features[target_variable_name].values.reshape(-1,1)

#full dataset
Xs_full = avg_features_full.drop(non_feature_columns, axis=1)
y_full = avg_features_full[target_variable_name].values.reshape(-1,1)

#only validation set
Xs_validation = avg_features_validation.drop(non_feature_columns, axis=1)
y_validation = avg_features_validation[target_variable_name].values.reshape(-1,1)

print(avg_features_full.shape)
print("Xs shape: "+str(Xs.shape))
print("y shape: "+str(y.shape))
print("Outlier flag: "+str(outlier_flag) + " sd")
print("Validation Xs shape: "+str(Xs_validation.shape))
print("Validation relative size: "+str(round( Xs_validation.shape[0] / avg_features_full.shape[0],2)))


In [None]:
#Search space for lambda (the ridge regression penalty term): 15 values spaced
#evenly on a log10 scale from min_lambda up to max_lambda
min_lambda = 0.01
max_lambda = 10
print("maximum lambda: " + str(max_lambda))
print("minimum lambda: "+str(min_lambda))

log10_exponents = np.linspace(np.log10(min_lambda), np.log10(max_lambda), num = 15)
parameters = {'alpha': 10**log10_exponents}
print (parameters)

In [None]:
#Perform Ridge Regression 
ridge = Ridge(fit_intercept = True, normalize=True)
ridge_regressor = GridSearchCV(ridge, paramters, scoring = "neg_mean_squared_error")

%time ridge_regressor.fit(Xs,y)

In [None]:
#Identify the model with the best CV score
print(ridge_regressor.best_params_)
#The refit winner is exposed as best_estimator_ (note the trailing underscore)
best_ridge = ridge_regressor.best_estimator_

#Root mean squared error on the validation set and on the full dataset; targets and
#predictions are both scaled by 1/100 before they are compared.
#The results are named RMSE_* because that is what the statistics cell reads.
RMSE_valid = round(((y_validation/100-0.01*best_ridge.predict(Xs_validation))**2).mean()**0.5,4)
RMSE_full = round(((y_full/100-0.01*best_ridge.predict(Xs_full))**2).mean()**0.5,4)

print("Validation RMSE: " + str(RMSE_valid))
print("FullRMSE: "+str(RMSE_full))

In [None]:
#Define function for computing R-squared (the coefficient of determination)

import shutil 

def Ridge_Rsquared(predicted, true):
  """Return the coefficient of determination (R-squared), rounded to 4 decimals.

  Both inputs are flattened to 1-D first, so column vectors of shape (n, 1)
  and flat sequences of length n can be mixed without accidental broadcasting.

  Args:
    predicted: array-like of model predictions.
    true: array-like of observed values with as many elements as `predicted`.

  Returns:
    1 - SSE / SST as a Python float rounded to 4 decimal places. When every
    value in `true` is identical SST is 0 and the result is nan or -inf.

  Raises:
    ValueError: if the inputs are empty or differ in their number of elements.
  """
  predicted = np.asarray(predicted, dtype=float).ravel()
  true = np.asarray(true, dtype=float).ravel()
  if predicted.size != true.size:
    raise ValueError(
        "predicted and true must have the same number of elements, got "
        + str(predicted.size) + " and " + str(true.size))
  if true.size == 0:
    raise ValueError("predicted and true must not be empty")
  # np.sum reduces to a scalar; the builtin sum() over an (n, 1) array leaves a
  # 1-element array behind, which float() then has to unwrap
  SSE = np.sum((predicted - true)**2)
  SST = np.sum((true - true.mean())**2)
  return round(float(1 - SSE / SST), 4)

#Score the best model on the validation, full and training sets (in that order);
#predictions and targets are both scaled by 0.01 before they are compared
eval_valid, eval_full, eval_train = (
    Ridge_Rsquared(0.01*best_ridge.predict(inputs), 0.01*targets)
    for inputs, targets in ((Xs_validation, y_validation), (Xs_full, y_full), (Xs, y))
)

#Collect the RMSE and R-squared figures in a two-column table
stat_names = ['RMSE_valid', "RMSE_full", "R2_valid", "R2_full", "R2_train"]
stat_values = [RMSE_valid, RMSE_full, eval_valid, eval_full, eval_train]
ridgestats = pd.DataFrame({"stat": stat_names, "value": stat_values})
print(ridgestats)

#Write the statistics to a CSV file in the working directory, then copy it to GDrive
ridgestats_name_parts = ["CNN", "Ridgestats", "RES34", country, year, day_sat, str(img_res)]
ridgestats_file = "_".join(ridgestats_name_parts) + ".csv"
ridgestats.to_csv(ridgestats_file)
shutil.copy(os.path.join("/content/", ridgestats_file), "/content/gdrive/MyDrive")


In [None]:
import matplotlib.pyplot as plt

#add functionality to plot at 45deg line
def abline(slope, intercept):
  """Draw the dashed line y = intercept + slope * x on the current axes.

  The line is drawn between the x-axis limits that the current axes have at
  the time of the call.
  """
  x_limits = np.array(plt.gca().get_xlim())
  plt.plot(x_limits, intercept + slope * x_limits, "--")

In [None]:
#Plot government published poverty rates against predicted poverty rates

#NOTE(review): this name is joined with "-" while the other output files use "_" -- confirm this is intended
plot_filename = "-".join(["CNN", "PLOT", "RES34", country, year, day_sat, str(img_res), "validation"]) + ".eps"

#Colour the validation regions red ("r") and every other region blue ("b")
col_dict = {True: "r", False:"b"}
col = [col_dict[valid] for valid in avg_features_full["data_split"] == (validation_size_percent/100)]

plt.scatter(y_full, best_ridge.predict(Xs_full), c = col)
plt.ylabel("Predictions")
plt.xlabel("Survey")
plt.suptitle(country+ " " + year+ " " + "Ridge Regression")
txt = ""
#The text property is spelled "fontsize"
plt.figtext(0.5, -0.1, txt, wrap=True, horizontalalignment="center", fontsize=12)
#Reference line y = x: points on it are predicted exactly
abline(1,0)

plt.savefig(plot_filename, format="eps", dpi = 600)
#Copy the saved figure to GDrive: source path first, destination directory second
shutil.copy(os.path.join("/content/", plot_filename), "/content/gdrive/MyDrive")

In [None]:
#Exports ridge regression model to GDrive
import pickle
trained_ridge_regression_file = "_".join(["CNN", "RidgeModel","RES34", country, year, day_sat, str(img_res)]) +".pkl"

#Save file to current WD
with open(trained_ridge_regression_file, "wb") as file:
  pickle.dump(best_ridge, file)

#copy to gdrive
shutil.copy(os.path.join("/content/", trained_ridge_regression_file), "/content/gdrive/MyDrive/")


In [None]:
#Restore the fitted model from the pickle file written to the working directory
#NOTE: only unpickle files that this notebook produced itself
with open(trained_ridge_regression_file, mode="rb") as file:
  best_ridge = pickle.load(file)

In [None]:
#Predict a poverty rate for every image from its feature columns "0" through "511"

#Perform prediction for all grids
image_features = features_raw.loc[:, "0":"511"]
pred_out = best_ridge.predict(image_features)
#Flatten the predictions into one column and index it by the imagery filename
pred_out_pd = pd.DataFrame(data={'prediction': pred_out.flatten()}, index=features_raw["filename"])

#Both counts should agree: one prediction per row of the features file
print(len(pred_out))
print(len(features_raw["filename"]))

In [None]:
#Attach the predictions to the raw dataframe holding the gov-published poverty rates;
#rows are matched on the imagery filename and the outer join keeps unmatched rows of both sides
print(df.shape)
output = df_raw.join(other=pred_out_pd, on="filename", how="outer")
print(output.shape)

print("---------")
print(output.head(5))

In [None]:
#Generate poverty prediction output file as CSV file
poverty_prediction_file = "_".join(["CNN", "POV", "RES34", country, year, day_sat, str(img_res)] )+".csv"
output.to_csv(poverty_prediction_file)

#Copy to GDrive; the path is lower-case "gdrive", as in the other copy steps
shutil.copy(os.path.join("/content/",poverty_prediction_file), "/content/gdrive/MyDrive/")