In [56]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.svm import SVR
from sklearn.feature_selection import RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
pd.set_option('display.max_columns', 100)
import warnings
warnings.filterwarnings('ignore')


In [57]:
# Load data and finalize dataset variables
df = pd.read_csv('GlobalYearly.csv')

# Drop index column from csv
df = df.drop('Unnamed: 0',axis=1)

df.tail(5)

Unnamed: 0,Year,Population,Gas consumption,Coal consumption,Oil consumption,FossilFuelGrowth,CoalGrowth,GasGrowth,OilGrowth,AverageTemperature,AverageTemperatureUncertainty,TempMinus1,TempMinus2,TempMinus5,log_Gas,log_Coal,log_Oil,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty,Gas cumsum,Coal cumsum,Oil cumsum
29,2009,5673677000.0,2902559000000.0,6787239000.0,4712376000.0,-2063.064,-558.672,-586.854,-917.537,15.336614,0.268833,15.286109,15.456948,15.150121,28.696614,22.63831,22.273458,9.50525,0.085917,15.266583,0.10775,3.796917,0.126,15.827167,0.058917,57708710000000.0,125478100000.0,110838600000.0
30,2010,5729765000.0,3174016000000.0,7305800000.0,4866603000.0,5350.16,1837.893,2110.871,1401.398,15.289953,0.265576,15.336614,15.286109,15.154312,28.786019,22.711934,22.305662,9.703083,0.083417,15.449,0.103417,4.023917,0.115667,15.8955,0.058583,60882720000000.0,132783900000.0,115705200000.0
31,2011,5785305000.0,3264771000000.0,7729972000.0,4904080000.0,3290.304,2019.35,768.775,502.187,15.255487,0.275816,15.289953,15.336614,15.326454,28.814211,22.768371,22.313333,9.516,0.082,15.284833,0.114333,3.827667,0.136583,15.7695,0.059,64147490000000.0,140513800000.0,120609300000.0
32,2012,5842192000.0,3333059000000.0,7961497000.0,4973526000.0,1591.055,190.067,835.661,565.328,15.247377,0.36126,15.255487,15.289953,15.456948,28.834912,22.797883,22.327395,9.507333,0.083417,15.332833,0.107333,3.756167,0.145333,15.802333,0.0615,67480550000000.0,148475300000.0,125582800000.0
33,2013,5898015000.0,3373866000000.0,8033213000.0,5051871000.0,1876.422,828.76,530.413,517.259,15.906935,0.458943,15.247377,15.255487,15.286109,28.84708,22.80685,22.343024,9.6065,0.097667,15.373833,0.1155,3.911333,0.149833,15.854417,0.064667,70854420000000.0,156508500000.0,130634700000.0


In [58]:
def train_model(X,y,type="Ridge"):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    if type == "LinearRegression":
        lm = LinearRegression().fit(X_train,y_train)
    elif type == "ElasticNetScaled":
        pipe = Pipeline([('scaler', StandardScaler()), ('EN', ElasticNet(alpha=0.1))])
        lm = pipe.fit(X_train, y_train)
    elif type == "ElasticNet":
        lm = ElasticNet(alpha=0.1).fit(X_train,y_train)
    elif type == "Ridge":
        lm = Ridge(alpha=0.1).fit(X_train,y_train)
    elif type == "Lasso":
        lm = Lasso(alpha=0.1).fit(X_train,y_train)
    elif type == "OMP":
        lm = OrthogonalMatchingPursuit().fit(X_train,y_train)
    #elif type == "GaussianProcess":   <- this didn't work, so maybe I am doing something wrong.
    #    kernel = DotProduct() + WhiteKernel()
    #    lm = GaussianProcessRegressor(kernel=kernel,random_state=0,alpha = 1e-5).fit(X, y)
    #    print("Gscore: ",lm.score(X, y))
    else:
        lm = LinearRegression().fit(X_train,y_train)
    print("Model Type: ",type)     
    print("r2 score: ",r2_score(lm.predict(X_test),y_test))
    print("mse: ",mean_squared_error(lm.predict(X_test),y_test))
    
    return lm, X_train, X_test, y_train, y_test
    

In [59]:
pca_data_columns = ['Population',
 'Gas consumption',
 'Coal consumption',
 'Oil consumption',
 'Gas cumsum',
 'Coal cumsum',
 'Oil cumsum',
 'FossilFuelGrowth',
 'CoalGrowth',
 'GasGrowth',
 'OilGrowth',
 'TempMinus1',
 'TempMinus2']
 #'TempMinus5',


pca_df = df[pca_data_columns+['AverageTemperature']].dropna()

target_columns = ['AverageTemperature']

# create a base classifier used to evaluate a subset of attributes

# create the RFE model and select 3 attributes
rfe = RFE(estimator=RandomForestRegressor(), n_features_to_select=10)
rfe = rfe.fit(pca_df[pca_data_columns], pca_df[target_columns])
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

[ True  True  True  True  True  True  True False  True False False  True
  True]
[1 1 1 1 1 1 1 3 1 2 4 1 1]


In [60]:
data_columns = list(rfe.feature_names_in_[rfe.support_])
target_columns = ['AverageTemperature']

model_df = df[data_columns+['AverageTemperature']].dropna()

y = model_df[target_columns]
X = model_df[data_columns]
for model in ['LinearRegression','Lasso','ElasticNet','OMP','Ridge']:
    lm, X_train, X_test, y_train, y_test = train_model(X,y,model)

Model Type:  LinearRegression
r2 score:  0.8855876951129755
mse:  0.03642365402014431
Model Type:  Lasso
r2 score:  0.8280756568025082
mse:  0.0398913042263386
Model Type:  ElasticNet
r2 score:  0.828065370162928
mse:  0.039891928030044134
Model Type:  OMP
r2 score:  0.7866251039994681
mse:  0.06820216709280362
Model Type:  Ridge
r2 score:  0.887736376068946
mse:  0.03538607684919299


In [61]:
score_df = df.loc[X_test.index][['AverageTemperature']]
score_df['Model Prediction'] = lm.predict(X_test)
score_df['Minus1 Prediction'] = df.loc[X_test.index]['TempMinus1']
score_df = score_df.sort_values(by = 'AverageTemperature')
score_df[['AverageTemperature','Model Prediction','Minus1 Prediction']]

Unnamed: 0,AverageTemperature,Model Prediction,Minus1 Prediction
17,14.900671,14.93904,14.479931
14,15.049933,14.656628,14.510896
32,15.247377,15.49168,15.255487
31,15.255487,15.326292,15.289953
19,15.267058,15.34675,15.256991
26,15.326454,15.104865,15.154312
27,15.456948,15.195905,15.326454
6,15.936502,16.036697,15.837643
2,15.986446,16.079545,16.058908
11,16.016611,16.175804,16.562689


In [62]:
print("Model Scores: ")
print("r2 score: ",r2_score(lm.predict(X_test),y_test))
print("mse: ",mean_squared_error(lm.predict(X_test),y_test))
print()
print("Simple Lag Model Scores: ")
print("r2 score: ",r2_score(X_test['TempMinus1'],y_test))
print("mse: ",mean_squared_error(X_test['TempMinus1'],y_test))

Model Scores: 
r2 score:  0.887736376068946
mse:  0.03538607684919299

Simple Lag Model Scores: 
r2 score:  0.8040510061971596
mse:  0.08081815163263606
