In [171]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import datetime as dt
from math import cos,sin
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
pd.set_option('display.max_columns', 100)
import warnings
warnings.filterwarnings('ignore')


In [172]:
# Function to transform latitudes and longitudes into x,y,z 

def x_and_y(row,lat,lon):
    row['x'] = cos(lat) * cos(lon)
    row['y'] = cos(lat) * sin(lon)
    row['z'] = sin(lat)
    return row

In [205]:
# Load data and finalize dataset variables

lats = pd.read_csv('countries_lat_long.csv')

df = pd.read_csv('YearlyByCountry.csv')

# Drop index column from csv
df = df.drop('Unnamed: 0',axis=1)

# Merge in latitudes/longitudes and set up dataframe for a fancy apply step to convert lats/lon to x,y,z
df = df.merge(lats[['Country','Latitude','Longitude']],on='Country')
df_copy = df.copy()
df_copy['x'] = 0
df_copy['y'] = 0
df_copy['z'] = 0

# This applies the x,y,z transformation to a copy and then merges them into df
df=df.merge(df_copy[['Latitude','Longitude','x','y','z']].apply( \
                lambda row: x_and_y(row,row["Latitude"],row["Longitude"]),axis=1)[['x','y','z']], \
            left_index=True,right_index=True)
df.tail(3)

Unnamed: 0,Year,Country,Population,Gas consumption,Coal consumption,Oil consumption,FossilFuelGrowth,CoalGrowth,GasGrowth,OilGrowth,AverageTemperature,AverageTemperatureUncertainty,TempMinus1,TempMinus2,TempMinus5,Gas cumsum,Coal cumsum,Oil cumsum,Latitude,Longitude,x,y,z
2323,2011,Slovakia,5409000.0,5631000000.0,7209000.0,4818000.0,-8.171,-2.329,-4.296,-1.547,8.846,0.321583,8.073,9.001667,8.528334,126963000000.0,181710000.0,82634000.0,48.67,19.5,-0.019644,-0.014947,-0.999695
2324,2012,Slovakia,5415000.0,5290000000.0,6893000.0,4417000.0,-8.02,-2.417,-3.16,-2.442,8.873167,0.390667,8.846,8.073,9.478917,132253000000.0,188603000.0,87051000.0,48.67,19.5,-0.019644,-0.014947,-0.999695
2325,2013,Slovakia,5422000.0,5511000000.0,6647000.0,4485000.0,2.052,-0.24,2.241,0.051,9.685625,0.387125,8.873167,8.846,9.397417,137764000000.0,195250000.0,91536000.0,48.67,19.5,-0.019644,-0.014947,-0.999695


In [174]:
def train_model(X,y,type="Ridge"):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    if type == "LinearRegression":
        lm = LinearRegression().fit(X_train,y_train)
    elif type == "ElasticNet":
        pipe = Pipeline([('scaler', StandardScaler()), ('EN', ElasticNet(alpha=0.1))])
        lm = pipe.fit(X_train, y_train)
        #lm = ElasticNet(alpha=0.1).fit(X_train,y_train)
    elif type == "Ridge":
        lm = Ridge(alpha=0.1).fit(X_train,y_train)
    elif type == "Lasso":
        lm = Lasso(alpha=0.1).fit(X_train,y_train)
    else:
        lm = LinearRegression().fit(X_train,y_train)
    
    print("r2 score: ",r2_score(lm.predict(X_test),y_test))
    print("mse: ",mean_squared_error(lm.predict(X_test),y_test))
    
    return lm, X_train, X_test, y_train, y_test
    

In [189]:
pca_data_columns = ['Population',
 'Gas consumption',
 'Coal consumption',
 'Oil consumption',
 'Gas cumsum',
 'Coal cumsum',
 'Oil cumsum',
 'FossilFuelGrowth',
 'CoalGrowth',
 'GasGrowth',
 'OilGrowth',
 'TempMinus1',
 'TempMinus2',
 #'TempMinus5', # <- I removed this because it causes too many missing values.
 'x','y','z',
 'Latitude',
 'Longitude']

pca_df = df[pca_data_columns+['AverageTemperature']].dropna()

target_columns = ['AverageTemperature']


# create the RFE model and select 10 attributes
rfe = RFE(estimator=RandomForestRegressor(), n_features_to_select=10)
rfe = rfe.fit(pca_df[pca_data_columns], pca_df[target_columns])
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

[ True  True False  True  True False False  True False False False  True
  True  True  True False  True False]
[1 1 2 1 1 3 6 1 8 5 9 1 1 1 1 4 1 7]


In [198]:
# Use features from RFE
data_columns = list(rfe.feature_names_in_[rfe.support_])
target_columns = ['AverageTemperature']

model_df = df[data_columns+['AverageTemperature']].dropna()

# Grab data and target columns
y = model_df[target_columns]
X = model_df[data_columns]

#
for model in ['LinearRegression','Lasso','ElasticNet','Ridge']:
    print("Model Type: ",model)
    lm, X_train, X_test, y_train, y_test = train_model(X,y,model)


Model Type:  LinearRegression
r2 score:  0.9958093362739631
mse:  0.3465202454769953
Model Type:  Lasso
r2 score:  0.9958013874161913
mse:  0.346204115706394
Model Type:  ElasticNet
r2 score:  0.9943571805026353
mse:  0.43957183705909747
Model Type:  Ridge
r2 score:  0.9958093241677802
mse:  0.3465208462486449


In [199]:
score_df = df.loc[X_test.index][['AverageTemperature']]
score_df['Model Prediction'] = lm.predict(X_test)
score_df['Minus1 Prediction'] = df.loc[X_test.index]['TempMinus1']
score_df = score_df.sort_values(by = 'AverageTemperature')
score_df[['AverageTemperature','Model Prediction','Minus1 Prediction']]

Unnamed: 0,AverageTemperature,Model Prediction,Minus1 Prediction
451,-18.784584,-17.525716,-17.325000
452,-17.870333,-18.018846,-18.784584
453,-17.827917,-18.088910,-17.870333
450,-17.325000,-17.803124,-18.201666
463,-17.067750,-17.016354,-16.824500
...,...,...,...
1897,29.156084,29.041489,28.984083
1901,29.205416,29.598235,29.671333
1889,29.298416,28.919290,29.292583
1903,29.370625,29.491349,29.425833


In [204]:
print("Model Scores: ")
print("r2 score: ",r2_score(lm.predict(X_test),y_test))
print("mse: ",mean_squared_error(lm.predict(X_test),y_test))
print()
print("Simple Lag Model Scores: ")
print("r2 score: ",r2_score(X_test['TempMinus1'],y_test))
print("mse: ",mean_squared_error(X_test['TempMinus1'],y_test))

Model Scores: 
r2 score:  0.9958093241677802
mse:  0.3465208462486449

Simple Lag Model Scores: 
r2 score:  0.9951199076859735
mse:  0.4070568207752911
