In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 90)
from sklearn.preprocessing import StandardScaler , PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
import sklearn.utils
import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload

%autoreload 1

%aimport energy_clean

In [2]:
def multi_scores(model, train_pred, test_pred):
    r2_train = r2_score(y_train, train_pred)
    r2_test = r2_score(y_test, test_pred)
    mse_train = mean_squared_error(y_train, train_pred)
    mse_test = mean_squared_error(y_test, test_pred)
    score_table.loc[model,:] = r2_train, r2_test, mse_train, mse_test

In [3]:
index = ['LinearRegression','Lasso','Ridge','RandomForestRegressor','KNeighborsRegressor']
score_table = pd.DataFrame(index = index, columns= ['r2_train','r2_test','mse_train','mse_test'])

In [4]:
df_clean = energy_clean.clean_data()

In [5]:
df_clean['Year'].unique()

array([2010, 2011, 2012, 2013, 2014], dtype=int64)

In [6]:
# Turn Categorical fields to dummy numerical variables
state = pd.get_dummies(df_clean['State'], drop_first=True, )
division = pd.get_dummies(df_clean['Division'], drop_first=True)
energy_type = pd.get_dummies(df_clean['Energy_type'], drop_first=True)

# Merge in the dummy table rows
ml_df = pd.concat([df_clean, state, division, energy_type], axis = 1)


# Drop original catetgorical fields
ml_df.drop(['State','StateCodes', 'Division', 'Energy_type', 'TotalConsumption'], axis=1, inplace=True)

In [7]:
# Separate out training and test sets. Consumption will be our goal for predictions.
# We'll be using the last year in our dataset as our test set
train = ml_df[ml_df['Year'] != 2014]
test = ml_df[ml_df['Year'] == 2014]

# Split into predictors and column to predict
X_train = train.drop(['Consumption'], axis=1)
y_train = train['Consumption']
X_test = test.drop(['Consumption'], axis=1)
y_test = test['Consumption']


In [8]:
ml_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2040 entries, 0 to 2078
Data columns (total 90 columns):
RNETMIG                 2040 non-null float64
Coast                   2040 non-null float64
GDPQ3                   2040 non-null int64
POPESTIMATE             2040 non-null int64
TotalProduction         2040 non-null int64
Unemp_rate              2040 non-null float64
GDP                     2040 non-null float64
Great Lakes             2040 non-null float64
GDPQ1                   2040 non-null int64
RINTERNATIONALMIG       2040 non-null float64
Region                  2040 non-null float64
GDPQ2                   2040 non-null int64
RDEATH                  2040 non-null float64
TotalExpenditures       2040 non-null float64
RNATURALINC             2040 non-null float64
Year                    2040 non-null int64
RDOMESTICMIG            2040 non-null float64
GDPQ4                   2040 non-null int64
GDP_per_cap             2040 non-null int64
RBIRTH                  2040 non-nu

In [9]:
#Linear Regression
linreg = Pipeline(steps = [('scaler', StandardScaler()), ('linreg', LinearRegression())])
linreg.fit(X_train, y_train)
train_pred = linreg.predict(X_train)
test_pred = linreg.predict(X_test)
features = pd.DataFrame({'Feature':X_train.columns, 'Coefficient':linreg.named_steps.linreg.coef_})

multi_scores('LinearRegression', train_pred, test_pred)
features.sort_values('Coefficient', ascending=False)

Unnamed: 0,Feature,Coefficient
6,GDP,6.013068e+17
10,Region,1.953792e+17
76,4,1.897719e+17
75,3,1.394231e+17
1,Coast,1.375432e+17
79,7,1.296407e+17
61,Pennsylvania,9.793007e+16
81,9,8.646276e+16
74,2,8.130776e+16
55,New York,5.960516e+16


In [10]:
#LASSO 

lasso = Pipeline([('scaler', StandardScaler()), ('lasso', Lasso())])

parms = {'lasso__alpha': np.arange(0, 0.5, 0.05)}
lasso_cv = GridSearchCV(lasso, parms, scoring = 'r2')
lasso_cv.fit(X_train, y_train)

best_parm = lasso_cv.best_params_
print(best_parm, lasso_cv.best_score_)

lasso = Pipeline([('scaler', StandardScaler()), ('lasso', Lasso(alpha = best_parm['lasso__alpha']))])
lasso.fit(X_train, y_train)
train_pred = lasso.predict(X_train)
test_pred = lasso.predict(X_test)

features = pd.DataFrame({'Feature':X_train.columns, 'Coefficient':lasso.named_steps.lasso.coef_})

multi_scores('Lasso', train_pred, test_pred)
features.sort_values('Coefficient', ascending=False)

{'lasso__alpha': 0.45} -53.822735242505


Unnamed: 0,Feature,Coefficient
84,FossFuel,486721.073048
66,Texas,178029.573926
2,GDPQ3,164253.567634
88,NatGas,139989.168001
3,POPESTIMATE,106366.438282
82,Coal,80267.568542
7,Great Lakes,73005.245478
1,Coast,62991.977230
83,Elec,54693.008457
41,Louisiana,51203.680329


In [11]:
#Ridge
ridge = Pipeline([('scaler', StandardScaler()), ('ridge', Ridge())])

parms = {'ridge__alpha': np.arange(0.05, 5, 0.05)}
ridge_cv = GridSearchCV(ridge, parms, scoring = 'r2')
ridge_cv.fit(X_train, y_train)

best_parm = ridge_cv.best_params_
print(best_parm, ridge_cv.best_score_)

ridge = Pipeline([('scaler', StandardScaler()), ('ridge', Ridge(alpha = best_parm['ridge__alpha']))])
ridge.fit(X_train, y_train)
train_pred = ridge.predict(X_train)
test_pred = ridge.predict(X_test)

features = pd.DataFrame({'Feature':X_train.columns, 'Coefficient':ridge.named_steps.ridge.coef_})

multi_scores('Ridge', train_pred, test_pred)
features.sort_values('Coefficient', ascending=False)

{'ridge__alpha': 4.95} -53.26062678277982


Unnamed: 0,Feature,Coefficient
84,FossFuel,483605.599881
66,Texas,138413.720930
88,NatGas,137272.599639
82,Coal,78105.306097
3,POPESTIMATE,66419.321601
79,7,54736.847413
13,TotalExpenditures,51248.802319
83,Elec,51047.745435
22,Production,47294.708790
4,TotalProduction,44676.163540


In [12]:
#KNeighbors sample with default settings


sample = ml_df.sample(n=1000)
X_sample = sample.drop(['Consumption'], axis=1)
y_sample = sample['Consumption']
X_sample_train, X_sample_test, y_sample_train, y_sample_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

knn_default = Pipeline([('scaler', StandardScaler()),('knn',KNeighborsRegressor())])
knn_default.fit(X_sample_train, y_sample_train)
y_sample_pred = knn_default.predict(X_sample_test)
knn_default.score(X_sample_test, y_sample_test)

0.29493914267477483

In [13]:
#KNeighbors determine best parameters

knn = Pipeline([('scaler', StandardScaler()),('knn',KNeighborsRegressor())])
parameters = {'knn__n_neighbors': np.arange(5,40,5),'knn__weights':['distance','uniform']}
knn_cv = RandomizedSearchCV(knn, parameters, scoring='r2')
knn_cv.fit(X_sample_train, y_sample_train)
print(knn_cv.best_params_)
print('Best Score: ', knn_cv.best_score_)

best_n = knn_cv.best_params_['knn__n_neighbors']
best_weights = knn_cv.best_params_['knn__weights']

{'knn__weights': 'distance', 'knn__n_neighbors': 25}
Best Score:  0.2642520491636651


In [14]:
#KNeighbors continued

knn = Pipeline([('scaler', StandardScaler()),('knn',KNeighborsRegressor(n_neighbors=best_n, weights=best_weights))])
knn.fit(X_train, y_train)
train_pred = knn.predict(X_train)
test_pred = knn.predict(X_test)

multi_scores('KNeighborsRegressor', train_pred, test_pred)

In [15]:
# Random Forest sample with default settings

rf_default = Pipeline([('scaler', StandardScaler()),('random_forest',RandomForestRegressor())])
rf_default.fit(X_sample_train, y_sample_train)
y_sample_pred = rf_default.predict(X_sample_test)
rf_default.score(X_sample_test, y_sample_test)

0.8366810799620532

In [16]:
#RF determine best parameters

rf = Pipeline([('scaler', StandardScaler()),('rf',RandomForestRegressor())])
best_score = 0
for n in range(100):
    sample = ml_df.sample(n=1500)
    X_sample = sample.drop(['Consumption'], axis=1)
    y_sample = sample['Consumption']
    X_sample_train, X_sample_test, y_sample_train, y_sample_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)
    parameters = {'rf__n_estimators': np.arange(150,200,10), 'rf__max_features':['auto','sqrt','log2'], 'rf__max_depth': np.arange(1,101,5), 'rf__criterion':['mse','mae'], 'rf__min_impurity_decrease': np.arange(0,0.5,0.05), 'rf__min_weight_fraction_leaf': np.arange(0,0.5,0.05), 'rf__min_samples_split':np.arange(2,16,2) }
    random_forest_cv = RandomizedSearchCV(rf, parameters, scoring='r2')
    random_forest_cv.fit(X_sample_train, y_sample_train)
    if random_forest_cv.best_score_ > best_score:
        print("this run increased score: ", n)
        print("new best score: ", random_forest_cv.best_score_)
        print("and new parms", random_forest_cv.best_params_)
        best_score = random_forest_cv.best_score_
        best_est = random_forest_cv.best_params_['rf__n_estimators']
        best_feat = random_forest_cv.best_params_['rf__max_features']
        best_depth = random_forest_cv.best_params_['rf__max_depth']
        best_cri = random_forest_cv.best_params_['rf__criterion']
        best_imp = random_forest_cv.best_params_['rf__min_impurity_decrease']
        best_leaf = random_forest_cv.best_params_['rf__min_weight_fraction_leaf']
        best_ss = random_forest_cv.best_params_['rf__min_samples_split']

this run increased score:  0
new best score:  0.9090731567209425
and new parms {'rf__n_estimators': 170, 'rf__min_weight_fraction_leaf': 0.0, 'rf__min_samples_split': 2, 'rf__min_impurity_decrease': 0.05, 'rf__max_features': 'auto', 'rf__max_depth': 56, 'rf__criterion': 'mse'}
this run increased score:  8
new best score:  0.9233564450967371
and new parms {'rf__n_estimators': 180, 'rf__min_weight_fraction_leaf': 0.0, 'rf__min_samples_split': 2, 'rf__min_impurity_decrease': 0.2, 'rf__max_features': 'auto', 'rf__max_depth': 21, 'rf__criterion': 'mse'}
this run increased score:  64
new best score:  0.9271001920122016
and new parms {'rf__n_estimators': 160, 'rf__min_weight_fraction_leaf': 0.0, 'rf__min_samples_split': 2, 'rf__min_impurity_decrease': 0.45, 'rf__max_features': 'auto', 'rf__max_depth': 11, 'rf__criterion': 'mae'}
this run increased score:  65
new best score:  0.9507052511265626
and new parms {'rf__n_estimators': 170, 'rf__min_weight_fraction_leaf': 0.0, 'rf__min_samples_split'

In [17]:
#Random Forest full 

random_forest = Pipeline([('scaler', StandardScaler()),('random_forest', RandomForestRegressor(n_estimators=best_est, max_features=best_feat, n_jobs=2, max_depth=best_depth, criterion=best_cri, min_samples_split=best_ss, min_impurity_decrease=best_imp))])
random_forest.fit(X_train, y_train)
train_pred = random_forest.predict(X_train)
test_pred = random_forest.predict(X_test)

features = pd.DataFrame({'Feature':X_train.columns, 'Importance':random_forest.named_steps.random_forest.feature_importances_})

multi_scores('RandomForestRegressor', train_pred, test_pred)
features.sort_values('Importance', ascending=False)

Unnamed: 0,Feature,Importance
13,TotalExpenditures,0.233136
84,FossFuel,0.192146
21,Expenditure,0.163043
23,Price,0.070228
3,POPESTIMATE,0.054274
4,TotalProduction,0.046416
22,Production,0.035414
66,Texas,0.032281
18,GDP_per_cap,0.017895
88,NatGas,0.010908


In [18]:
score_table

Unnamed: 0,r2_train,r2_test,mse_train,mse_test
LinearRegression,0.544134,0.53622,320856000000.0,348258000000.0
Lasso,0.544119,0.53624,320866000000.0,348243000000.0
Ridge,0.544086,0.536131,320889000000.0,348325000000.0
RandomForestRegressor,0.995121,0.987111,3434310000.0,9678400000.0
KNeighborsRegressor,1.0,0.442412,0.0,418700000000.0
