In [62]:
import os

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
#from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [63]:
# Load Dataset
# The spreadsheet location can be overridden with the EPI_DATASET_PATH environment
# variable so the notebook runs on machines other than the original author's;
# the original absolute path is kept as the default for backward compatibility.
DATASET_PATH = os.environ.get(
    'EPI_DATASET_PATH',
    '/Users/architag/Workspaces/UW/Final_Project/combined_dataset.xlsx',
)
dataset = pd.read_excel(DATASET_PATH)
print(dataset.shape)
print(dataset.columns.values)

# Prepare data for training and evaluation.
# output_vars: the target columns; one model is fitted per target downstream.
output_vars = ['Overall_EPI', 'Sanitation_Drinking_Water_Score', 'Climate_Change_Score', 'Waste_Mgmt_Score', 'Pollution_Score']
# input_vars: the predictor columns shared by every model.
input_vars = ['GDP_Per_Capita', 'Urban_Population_Percent', 'Female_Population_Percent', 'Literacy_Rate', 'Poverty_Rate', 'Manufacturing_Percent_GDP']
X = dataset[input_vars]
Y = dataset[output_vars]
# 70/30 split with a fixed seed so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1234)
print('Y_train columns:', y_train.columns.values)

(132, 14)
['Unnamed: 0' 'Country Name' 'Country Code' 'GDP_Per_Capita'
 'Urban_Population_Percent' 'Manufacturing_Percent_GDP'
 'Female_Population_Percent' 'Literacy_Rate' 'Poverty_Rate' 'Overall_EPI'
 'Sanitation_Drinking_Water_Score' 'Climate_Change_Score'
 'Waste_Mgmt_Score' 'Pollution_Score']
Y_train columns: ['Overall_EPI' 'Sanitation_Drinking_Water_Score' 'Climate_Change_Score'
 'Waste_Mgmt_Score' 'Pollution_Score']


In [77]:
def train_and_evaluate_regression_model(model, model_name:str, use_pca:bool=False, print_coeffs:bool=False):
    """Fit `model` once per target variable and print train/test metrics.

    Reads the notebook-level globals `output_vars`, `X_train`, `X_test`,
    `y_train` and `y_test`. For every name in `output_vars` the model is
    re-fitted on the training split and two numbers are printed:

    * the value of `model.score(...)` on the *training* split, and
    * the mean squared error of the predictions on the *test* split.

    Args:
        model: Estimator exposing `fit`, `score` and `predict`.
        model_name: Human-readable name printed in the banner line.
        use_pca: When True, the model is wrapped in a pipeline of
            `StandardScaler()` -> `PCA(0.95)` -> `model`.
        print_coeffs: When True, also print the fitted estimator's `coef_`
            after each fit. If the estimator exposes no `coef_`, a message
            is printed instead of raising. With `use_pca=True` the
            coefficients refer to the PCA-transformed inputs, not the
            original columns.

    Returns:
        None. All results are written to stdout.
    """
    print('*********Model -> {:s}********'.format(model_name))
    # Keep a handle on the bare estimator: once wrapped, the pipeline object
    # itself has no `coef_`, but it fits this final-step object in place.
    estimator = model
    if use_pca:
        model = make_pipeline(StandardScaler(), PCA(0.95), model)
    for output_var in output_vars:
        model.fit(X_train, y_train[output_var])
        print('\nAccuracy Results for Variable = ',output_var)
        # `score` is evaluated on the training split and is not adjusted for
        # the number of predictors, so it is labelled as plain training R^2.
        print('R^2 (training set) = {:.4f}'.format(model.score(X_train, y_train[output_var])))
        print('Mean Squared Error = {:.2f}'.format(mean_squared_error(y_test[output_var], model.predict(X_test))))
        if print_coeffs:
            # getattr with a default also covers estimators whose `coef_`
            # property raises AttributeError.
            coeffs = getattr(estimator, 'coef_', None)
            if coeffs is None:
                print('Coefficients from training: not available for this model')
            else:
                print('Coefficients from training:', coeffs)
    print('\n\n\n')

# Use different regression models to predict Sustainability metrics from Demographic and Economic input variables.
# Every stochastic estimator is seeded with the same value used for the train/test
# split so that a fresh "Restart & Run All" reproduces the printed numbers.
train_and_evaluate_regression_model(LinearRegression(), "Linear Regression", print_coeffs=True)
train_and_evaluate_regression_model(RandomForestRegressor(random_state=1234), "Random Forest Regressor")
train_and_evaluate_regression_model(SVR(kernel='poly'), "Support Vector Regressor (with Polynomial Kernel)")
train_and_evaluate_regression_model(SVR(kernel='rbf'), "Support Vector Regressor (with Radial Basis Function Kernel)")
# The MLP is the only model run behind the StandardScaler + PCA pipeline (use_pca=True).
train_and_evaluate_regression_model(
    MLPRegressor(hidden_layer_sizes=(100,100), random_state=1234, max_iter=10000, activation='logistic'),
    "Multi-Layer Perceptron (Neural Network) Regressor",
    use_pca=True,
)
# Naive Bayes is intentionally not evaluated: MultinomialNB is a classifier
# and is not imported in this notebook.


*********Model -> Linear Regression********

Accuracy Results for Variable =  Overall_EPI
Adjusted R^2 = 0.8241
Mean Squared Error = 53.76
Coefficients from training: [ 4.46156628e-04  1.49814633e-01  1.08461852e+00  1.53079378e-01
 -1.30443985e-01  2.97824621e-03]

Accuracy Results for Variable =  Sanitation_Drinking_Water_Score
Adjusted R^2 = 0.8391
Mean Squared Error = 125.45
Coefficients from training: [ 0.00061194  0.28673144  0.573371    0.45487135 -0.39305077 -0.18239259]

Accuracy Results for Variable =  Climate_Change_Score
Adjusted R^2 = 0.6080
Mean Squared Error = 126.39
Coefficients from training: [ 2.97500300e-04  1.84426768e-01  8.60142926e-01  2.57825191e-01
 -6.67342851e-02  4.24858700e-01]

Accuracy Results for Variable =  Waste_Mgmt_Score
Adjusted R^2 = 0.5966
Mean Squared Error = 613.06
Coefficients from training: [ 7.53720389e-04  4.80553905e-01  2.09350824e+00  5.54808140e-05
 -5.13244136e-01  5.49239278e-01]

Accuracy Results for Variable =  Pollution_Score
Adjust