# Linear Regression
This notebook is for the purpose of fitting and running a linear regression model to predict Biocapacity Deficit or Reserve (i.e., the signed value of Biocapacity).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Allows better display of DataFrames
from IPython.display import display

In [2]:
# Create DataFrame from CSV file
df = pd.read_csv('countries_clean.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,Country,Region,Population (millions),HDI,GDP per Capita,Cropland Footprint,Grazing Footprint,Forest Footprint,Carbon Footprint,...,Cropland,Grazing Land,Forest Land,Fishing Water,Urban Land,Total Biocapacity,Biocapacity Deficit or Reserve,Earths Required,Countries Required,Data Quality
0,0,Afghanistan,Middle East/Central Asia,29.82,0.46,614.66,0.3,0.2,0.08,0.18,...,0.24,0.2,0.02,0.0,0.04,0.5,-0.3,0.46,1.6,6
1,1,Albania,Northern/Eastern Europe,3.16,0.73,4534.37,0.78,0.22,0.25,0.87,...,0.55,0.21,0.29,0.07,0.06,1.18,-1.03,1.27,1.87,6
2,2,Algeria,Africa,38.48,0.73,5430.57,0.6,0.16,0.17,1.14,...,0.24,0.27,0.03,0.01,0.03,0.59,-1.53,1.22,3.61,5
3,3,Angola,Africa,20.82,0.52,4665.91,0.33,0.15,0.12,0.2,...,0.2,1.42,0.64,0.26,0.04,2.55,1.61,0.54,0.37,6
4,5,Argentina,Latin America,41.09,0.83,13540.0,0.78,0.79,0.29,1.08,...,2.64,1.86,0.66,1.67,0.1,6.92,3.78,1.82,0.45,6


In [3]:
# Load features
X = df[['GDP per Capita','Carbon Footprint']]    # note to self, I keep forgetting this [[a,b,c]] syntax
y = df['Biocapacity Deficit or Reserve']

# Create a linear regression object
linreg = linear_model.LinearRegression()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


#print(X_train.shape, y_train.shape)
#print(X_test.shape, y_test.shape)

# Train the model using the training sets
linreg.fit(X, df['Biocapacity Deficit or Reserve'])

# Make predictions using the testing set
y_pred = linreg.predict(X_test)

# Display coefficients
print('Coefficients:', linreg.coef_)
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))

# Explained variance score where 1 is a perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))

Coefficients: [ -4.44257510e-05  -5.50980591e-01]
Mean squared error: 6.78
Variance score: 0.19


In [11]:
print(X_test.shape)
print(y_test.shape)

#Plot outputs
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()

(33, 2)
(33,)


ValueError: x and y must be the same size