In [1]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import datetime as dt
from math import cos,sin
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
pd.set_option('display.max_columns', 100)
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Function to transform latitudes and longitudes into x,y,z 

def x_and_y(row,lat,lon):
    row['x'] = cos(lat) * cos(lon)
    row['y'] = cos(lat) * sin(lon)
    row['z'] = sin(lat)
    return row

In [4]:
# Load data and finalize dataset variables

lats = pd.read_csv('data/countries_lat_long.csv')

df = pd.read_csv('data/YearlyByCountry.csv')

# Drop index column from csv
df = df.drop('Unnamed: 0',axis=1)

# Merge in latitudes/longitudes and set up dataframe for a fancy apply step to convert lats/lon to x,y,z
df = df.merge(lats[['Country','Latitude','Longitude']],on='Country')
df_copy = df.copy()
df_copy['x'] = 0
df_copy['y'] = 0
df_copy['z'] = 0

# This applies the x,y,z transformation to a copy and then merges them into df
df=df.merge(df_copy[['Latitude','Longitude','x','y','z']].apply( \
                lambda row: x_and_y(row,row["Latitude"],row["Longitude"]),axis=1)[['x','y','z']], \
            left_index=True,right_index=True)
df.tail(3)
df.to_csv('data/countries_processed_data.csv')

In [26]:
def train_model(X,y,type="Ridge"):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    if type == "LinearRegression":
        lm = LinearRegression().fit(X_train,y_train)
    elif type == "ElasticNet":
        pipe = Pipeline([('scaler', StandardScaler()), ('EN', ElasticNet(alpha=0.1))])
        lm = pipe.fit(X_train, y_train)
        #lm = ElasticNet(alpha=0.1).fit(X_train,y_train)
    elif type == "Ridge":
        lm = Ridge(alpha=0.1).fit(X_train,y_train)
    elif type == "Lasso":
        lm = Lasso(alpha=0.1).fit(X_train,y_train)
    else:
        lm = LinearRegression().fit(X_train,y_train)
    
    print("r2 score: ",r2_score(lm.predict(X_test),y_test))
    print("mse: ",mean_squared_error(lm.predict(X_test),y_test))
    
    return lm, X_train, X_test, y_train, y_test
    

In [39]:
pca_data_columns = ['Population',
 'Gas consumption',
 'Coal consumption',
 'Oil consumption',
 'Gas cumsum',
 'Coal cumsum',
 'Oil cumsum',
 'FossilFuelGrowth',
 'CoalGrowth',
 'GasGrowth',
 'OilGrowth',
 'TempMinus1',
 'TempMinus2',
 #'TempMinus5', # <- I removed this because it causes too many missing values.
 'x','y','z',
 'Latitude',
 'Longitude']

pca_df = df[pca_data_columns+['AverageTemperature']].dropna()

target_columns = ['AverageTemperature']

# create the RFE model and select 10 attributes
rfe = RFE(estimator=RandomForestRegressor(), n_features_to_select=10)
rfe = rfe.fit(pca_df[pca_data_columns], pca_df[target_columns])
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

[ True  True  True  True  True False False False False False False  True
  True  True  True False  True False]
[1 1 1 1 1 5 7 2 8 3 6 1 1 1 1 9 1 4]


In [37]:
rfe.feature_names_in_[rfe.support_]

AttributeError: 'RFE' object has no attribute 'feature_names_in_'

In [32]:
# Use features from RFE
data_columns = list(rfe.feature_names_in_[rfe.support_])
target_columns = ['AverageTemperature']

model_df = df[data_columns+['AverageTemperature']].dropna()

# Grab data and target columns
y = model_df[target_columns]
X = model_df[data_columns]

#
for model in ['LinearRegression','Lasso','ElasticNet','Ridge']:
    print("Model Type: ",model)
    lm, X_train, X_test, y_train, y_test = train_model(X,y,model)


AttributeError: 'RFE' object has no attribute 'feature_names_in_'

In [30]:
score_df = df.loc[X_test.index][['AverageTemperature']]
score_df['Model Prediction'] = lm.predict(X_test)
score_df['Minus1 Prediction'] = df.loc[X_test.index]['TempMinus1']
score_df = score_df.sort_values(by = 'AverageTemperature')
score_df[['AverageTemperature','Model Prediction','Minus1 Prediction']]

NameError: name 'X_test' is not defined

In [None]:
print("Model Scores: ")
print("r2 score: ",r2_score(lm.predict(X_test),y_test))
print("mse: ",mean_squared_error(lm.predict(X_test),y_test))
print()
print("Simple Lag Model Scores: ")
print("r2 score: ",r2_score(X_test['TempMinus1'],y_test))
print("mse: ",mean_squared_error(X_test['TempMinus1'],y_test))