# Machine Learning Sandbox
This notebook is for the purpose of testing various machine learning model to predict Biocapacity Deficit or Reserve (i.e., the signed value of Biocapacity) based on other variables (e.g., Carbon Footprint).

## Linear Regression

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Allows better display of DataFrames
from IPython.display import display

In [3]:
# Create DataFrame from CSV file
df = pd.read_csv('countries_clean.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,Country,Region,Population (millions),HDI,GDP per Capita,Cropland Footprint,Grazing Footprint,Forest Footprint,Carbon Footprint,...,Cropland,Grazing Land,Forest Land,Fishing Water,Urban Land,Total Biocapacity,Biocapacity Deficit or Reserve,Earths Required,Countries Required,Data Quality
0,0,Afghanistan,Middle East/Central Asia,29.82,0.46,614.66,0.3,0.2,0.08,0.18,...,0.24,0.2,0.02,0.0,0.04,0.5,-0.3,0.46,1.6,6
1,1,Albania,Northern/Eastern Europe,3.16,0.73,4534.37,0.78,0.22,0.25,0.87,...,0.55,0.21,0.29,0.07,0.06,1.18,-1.03,1.27,1.87,6
2,2,Algeria,Africa,38.48,0.73,5430.57,0.6,0.16,0.17,1.14,...,0.24,0.27,0.03,0.01,0.03,0.59,-1.53,1.22,3.61,5
3,3,Angola,Africa,20.82,0.52,4665.91,0.33,0.15,0.12,0.2,...,0.2,1.42,0.64,0.26,0.04,2.55,1.61,0.54,0.37,6
4,5,Argentina,Latin America,41.09,0.83,13540.0,0.78,0.79,0.29,1.08,...,2.64,1.86,0.66,1.67,0.1,6.92,3.78,1.82,0.45,6


In [4]:
# Load features
X = df[['GDP per Capita','Carbon Footprint','Forest Footprint']]    # note to self, I keep forgetting this [[a,b,c]] syntax
y = df['Biocapacity Deficit or Reserve']

# Create a linear regression object
linreg = linear_model.LinearRegression()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


#print(X_train.shape, y_train.shape)
#print(X_test.shape, y_test.shape)

# Train the model using the training sets
linreg.fit(X, df['Biocapacity Deficit or Reserve'])

# Make predictions using the testing set
y_pred = linreg.predict(X_test)

# Display coefficients
print('Coefficients:', linreg.coef_)
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))

# Explained variance score where 1 is a perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))

Coefficients: [ -7.67859366e-05  -4.48234378e-01   4.72021739e+00]
Mean squared error: 119.40
Variance score: 0.11


## Lasso Linear Regression

In [8]:
clf = linear_model.Lasso(alpha=0.1)
clf.fit(X, df['Biocapacity Deficit or Reserve'])

# Display coefficients
print('Coefficients:', clf.coef_)

# Display intercept
print('Intercept: %.2f' % clf.intercept_)

Coefficients: [ -7.79060842e-05  -3.82632223e-01   3.89408656e+00]
Intercept: 0.64


## SVM

In [16]:
# Fit regression model
svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
svr_lin = SVR(kernel='linear', C=100, gamma='auto')
svr_poly = SVR(kernel='poly', C=100, gamma='auto', degree=3, epsilon=.1,
               coef0=1)

# Look at the results
lw = 2

svrs = [svr_rbf, svr_lin, svr_poly]
kernel_label = ['RBF', 'Linear', 'Polynomial']
model_color = ['m', 'c', 'g']

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 10), sharey=True)
for ix, svr in enumerate(svrs):
    axes[ix].plot(X, svr.fit(X, y).predict(X), color=model_color[ix], lw=lw,
                  label='{} model'.format(kernel_label[ix]))
    axes[ix].scatter(X[svr.support_], y[svr.support_], facecolor="none",
                     edgecolor=model_color[ix], s=50,
                     label='{} support vectors'.format(kernel_label[ix]))
    axes[ix].scatter(X[np.setdiff1d(np.arange(len(X)), svr.support_)],
                     y[np.setdiff1d(np.arange(len(X)), svr.support_)],
                     facecolor="none", edgecolor="k", s=50,
                     label='other training data')
    axes[ix].legend(loc='upper center', bbox_to_anchor=(0.5, 1.1),
                    ncol=1, fancybox=True, shadow=True)

fig.text(0.5, 0.04, 'data', ha='center', va='center')
fig.text(0.06, 0.5, 'target', ha='center', va='center', rotation='vertical')
fig.suptitle("Support Vector Regression", fontsize=14)
plt.show()

KeyError: '[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  17  18\n  19  20  21  22  23  24  26  27  28  29  30  31  32  33  34  35  37  38\n  39  40  41  42  43  44  45  46  48  49  50  51  52  53  54  55  56  57\n  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75\n  76  77  78  79  81  82  83  84  85  86  87  88  89  90  91  92  93  94\n  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112\n 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130\n 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148\n 149 150 151 152 153 154 155 156 157 158 159 160 161 162] not in index'

## Random Forests

In [18]:
X, y = make_regression(n_features=4, n_informative=2,
                       random_state=0, shuffle=False)
regr = RandomForestRegressor(max_depth=2, random_state=0,
                             n_estimators=100)
regr.fit(X, y)  

print(regr.feature_importances_)
print(regr.predict([[0, 0, 0, 0]]))

[ 0.18146984  0.81473937  0.00145312  0.00233767]
[-8.32987858]
