In [41]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
pd.set_option('display.max_columns', 100)
import warnings
warnings.filterwarnings('ignore')


In [42]:
#Models to try:
# Lasso, Gaussian Process

In [43]:
# Load data and finalize dataset variables
df = pd.read_csv('YearlyByCountry.csv')

# Drop index column from csv
df = df.drop('Unnamed: 0',axis=1)

df

Unnamed: 0,Year,Country,Population,Gas consumption,Coal consumption,Oil consumption,FossilFuelGrowth,CoalGrowth,GasGrowth,OilGrowth,AverageTemperature,AverageTemperatureUncertainty,TempMinus1,TempMinus2,TempMinus5
0,1980,Algeria,19220000.0,1.303000e+10,92870.0,7026000.0,28.703,-0.093,27.913,0.884,23.160166,0.382917,,,
1,1981,Algeria,19820000.0,9.799000e+09,279100.0,7375000.0,37.721,-0.221,32.821,5.121,23.579250,0.359833,23.160166,,
2,1982,Algeria,20450000.0,1.566000e+10,746000.0,8304000.0,32.592,9.188,19.154,4.250,23.094168,0.342417,23.579250,23.160166,
3,1983,Algeria,21100000.0,1.858000e+10,1210000.0,7898000.0,23.576,-1.047,15.304,9.319,23.683000,0.333000,23.094168,23.579250,
4,1984,Algeria,21760000.0,1.926000e+10,1247000.0,8710000.0,-11.412,0.942,-21.464,9.110,23.063084,0.373667,23.683000,23.094168,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2411,2009,Slovakia,5401000.0,5.398000e+09,7593000.0,4677000.0,-12.596,-1.211,-8.617,-2.768,9.001667,0.287833,9.397417,9.478917,8.145583
2412,2010,Slovakia,5404000.0,6.100000e+09,7214000.0,4855000.0,9.081,0.243,6.757,2.080,8.073000,0.274000,9.001667,9.397417,7.936917
2413,2011,Slovakia,5409000.0,5.631000e+09,7209000.0,4818000.0,-8.171,-2.329,-4.296,-1.547,8.846000,0.321583,8.073000,9.001667,8.528334
2414,2012,Slovakia,5415000.0,5.290000e+09,6893000.0,4417000.0,-8.020,-2.417,-3.160,-2.442,8.873167,0.390667,8.846000,8.073000,9.478917


In [48]:
def train_model(X,y,type="Ridge"):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    if type == "LinearRegression":
        lm = LinearRegression().fit(X_train,y_train)
    elif type == "ElasticNet":
        lm = ElasticNet(alpha=0.1).fit(X_train,y_train)
    elif type == "Ridge":
        lm = Ridge(alpha=0.1).fit(X_train,y_train)
    elif type == "Lasso":
        lm = Lasso(alpha=0.1).fit(X_train,y_train)
    else:
        lm = LinearRegression().fit(X_train,y_train)
    
    print("r2 score: ",r2_score(lm.predict(X_test),y_test))
    print("mse: ",mean_squared_error(lm.predict(X_test),y_test))
    
    return lm, X_train, X_test, y_train, y_test
    

In [49]:
data_columns = ['Year',
 'Population',
 'Gas consumption',
 'Coal consumption',
 'Oil consumption',
 'FossilFuelGrowth',
 'CoalGrowth',
 'GasGrowth',
 'OilGrowth',
 'TempMinus1',
 'TempMinus2',
 'TempMinus5']
target_columns = ['AverageTemperature']

df = df.dropna()

y = df[target_columns]
X = df[data_columns]
for model in ['LinearRegression','Lasso','Ridge','ElasticNet']:
    print("Model Type: ",model)
    lm, X_train, X_test, y_train, y_test = train_model(X,y,model)


Model Type:  LinearRegression
r2 score:  0.9964515112576592
mse:  0.27619530866036274
Model Type:  Lasso
r2 score:  0.9964512052887347
mse:  0.2755247986313138
Model Type:  Ridge
r2 score:  0.9964578717944165
mse:  0.2756789612367969
Model Type:  ElasticNet
r2 score:  0.9964624591269693
mse:  0.2748826974457328


In [85]:
score_df = df.loc[X_test.index]
score_df['Prediction'] = lm.predict(X_test)
score_df['absError'] = np.abs(score_df['AverageTemperature'] - score_df['Prediction'])
score_df['Error'] = score_df['AverageTemperature'] - score_df['Prediction']
#compare to simply using the previous year's temperature
score_df['Minus1Error'] = score_df['TempMinus1'] - score_df['Prediction']
score_df['absMinus1Error'] = np.abs(score_df['TempMinus1'] - score_df['Prediction'])
score_df = score_df.sort_values(by = 'AverageTemperature')
score_df[['AverageTemperature','Prediction','Error','absError','Minus1Error','absMinus1Error']].mean()

AverageTemperature    15.397619
Prediction            15.406171
Error                 -0.008552
absError               0.400443
Minus1Error           -0.076667
absMinus1Error         0.271423
dtype: float64

In [86]:
score_df[['AverageTemperature','Prediction']]

Unnamed: 0,AverageTemperature,Prediction
451,-18.784584,-17.846396
452,-17.870333,-17.588107
453,-17.827917,-17.769118
463,-17.067750,-16.800957
468,-16.574333,-16.226737
...,...,...
1526,28.889584,28.047950
1963,28.992916,29.096014
1969,29.205416,29.340805
1957,29.298416,28.741291


In [83]:
#plt.scatter(score_df.reset_index().index,score_df[['AverageTemperature']])
#plt.scatter(score_df.reset_index().index,score_df[['Prediction']])
#plt.show()