In [27]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
pd.set_option('display.max_columns', 100)
import warnings
warnings.filterwarnings('ignore')


In [28]:
#Models to try:
# Lasso, Gaussian Process

In [29]:
# Load data and finalize dataset variables

def load_dataset():
    
    by_city = pd.read_csv('GlobalLandTemperaturesByCity.csv').astype({'dt':'datetime64','AverageTemperature':'float32', \
                                    'AverageTemperatureUncertainty':'float32'}).rename(columns={'dt':'Date'})                                    
    by_city['Year']= by_city['Date'].dt.year
    by_city['Month']= by_city['Date'].dt.month
    by_city = by_city.astype({'Year':'int32','Month':'int32'})
    
    by_country = pd.read_csv('GlobalLandTemperaturesByCountry.csv').astype({'dt':'datetime64','AverageTemperature':'float32', \
                                    'AverageTemperatureUncertainty':'float32'}).rename(columns={'dt':'Date'})
                                    
    by_country['Year']= by_country['Date'].dt.year
    by_country['Month']= by_country['Date'].dt.month
    by_country = by_country.astype({'Year':'int32','Month':'int32'})
    
    globaltemp_df = pd.read_csv("GlobalTemperatures.csv").astype({'dt':'datetime64'}).rename(columns={'dt':'Date'})
    globaltemp_df['Year']= globaltemp_df['Date'].dt.year
    globaltemp_df['Month']= globaltemp_df['Date'].dt.month
    
    df_fuel = pd.read_csv("annual-change-fossil-fuels-coal-gas-oil.csv")
    df_fuel.rename(columns = {'Entity':'Country', 'Fossil.fuels..TWh.growth...sub.method.':'FossilFuelGrowth', 
                          'Coal..TWh.growth...sub.method.':'CoalGrowth', 'Gas..TWh.growth...sub.method.':'GasGrowth',
                         'Oil..TWh.growth...sub.method.':'OilGrowth'}, inplace = True)
    df = df_fuel.merge(by_country,on=["Country","Year"]).reset_index()
    natural_resources = pd.read_csv("natural-resources.csv").rename(columns={'Entity':'Country'})
    natural_resources = natural_resources[["Year","Country","Gas consumption","Coal consumption","Oil consumption","Population"]]
    df = df.merge(natural_resources,on=['Year','Country'])
    df = df.drop('index',axis=1)[["Year","Country","Population","Date","Month","Gas consumption","Coal consumption","Oil consumption"]]
    
    
    df = df.merge(by_country,on=['Year','Date','Country']).dropna().drop('Month_x',axis=1).rename(columns={'Month_y': \
                                                   'Month'}).groupby(['Year','Country']).mean().drop("Month",axis=1).reset_index()
    
    model_df = pd.DataFrame()
    for country in df.Country.unique():
        mydf = df.query("Country == '"+country+"'")
        mydf['TempMinus1'] = mydf.shift(1)['AverageTemperature']
        mydf['TempMinus2'] = mydf.shift(2)['AverageTemperature']
        mydf['TempMinus3'] = mydf.shift(3)['AverageTemperature']
        model_df = pd.concat([model_df,mydf])
    model_df = model_df.dropna()
    return model_df
    

In [30]:
def train_model(X,y,type="Ridge"):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    if type == "LinearRegression":
        lm = LinearRegression().fit(X_train,y_train)
    elif type == "ElasticNet":
        lm = ElasticNet(alpha=0.1).fit(X_train,y_train)
    elif type == "Ridge":
        lm = Ridge(alpha=0.1).fit(X_train,y_train)
    elif type == "Lasso":
        lm = Lasso(alpha=0.1).fit(X_train,y_train)
    else:
        lm = LinearRegression().fit(X_train,y_train)
    
    print("r2 score: ",r2_score(lm.predict(X_test),y_test))
    print("mse: ",mean_squared_error(lm.predict(X_test),y_test))
    return lm
    

In [31]:
df = load_dataset()
y = df[['AverageTemperature']]
X = df[['Population','Gas consumption','Oil consumption','Coal consumption','TempMinus1','TempMinus2','TempMinus3']]
for model in ['LinearRegression','Lasso','Ridge','ElasticNet']:
    print("Model Type: ",model)
    lm = lm = train_model(X,y,model)


Model Type:  LinearRegression
r2 score:  0.996561525038971
mse:  0.32067360842370746
Model Type:  Lasso
r2 score:  0.9965583140293849
mse:  0.320137753471614
Model Type:  Ridge
r2 score:  0.9965615285088939
mse:  0.32067308592105975
Model Type:  ElasticNet
r2 score:  0.9965602342414729
mse:  0.3202293422920476
