In [28]:
# Import all the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import cross_val_predict
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import neighbors
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import ensemble
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from mlxtend.regressor import StackingRegressor
from sklearn.metrics import r2_score

In [46]:
# Nested Cross Validation to compare accuracies across models
def nested_cv_evaluation(model_name, parameters, X_train, y_train, scaler = None):
    NUM_TRIALS = 10
    nested_scores_err = np.zeros(NUM_TRIALS)
    nested_scores_r2 = np.zeros(NUM_TRIALS)

    for i in range(NUM_TRIALS):
        pipeline = Pipeline([('transformer', scaler), ('estimator', model_name)])

        inner_cv = KFold(n_splits=3, shuffle=True, random_state=i)
        outer_cv = KFold(n_splits=3, shuffle=True, random_state=i)

        #clf = GridSearchCV(estimator = model_name, param_grid = parameters, cv=inner_cv)
        clf = GridSearchCV(pipeline, param_grid = parameters, cv=inner_cv)
        clf.fit(X_train, y_train)
        
        pipeline2 = Pipeline([('transformer', scaler), ('estimator', clf)])

        # Nested CV with parameter optimization
        nested_score_error = cross_val_score(pipeline2, X_train, y_train, cv=outer_cv, scoring='neg_mean_squared_error')
        nested_score_r2 = cross_val_score(pipeline2, X_train, y_train, cv=outer_cv, scoring='r2')
        
        nested_scores_err[i] = nested_score_error.mean()
        nested_scores_r2[i] = nested_score_r2.mean()
        
    estimators.append(model_name)
    avg_error_all_models_nested_cv.append(nested_scores_err.mean())
    std_error_all_models_nested_cv.append(nested_scores_err.std())
    avg_Rsquare_all_models_nested_cv.append(nested_scores_r2.mean())
    std_Rsquare_all_models_nested_cv.append(nested_scores_r2.std())

    #std_all_models_nested_cv.append(nested_scores.std())
    print("Average error is {0} and standard deviation is {1}".format(nested_scores_err.mean(), nested_scores_err.std()))
    print("Average R-Square is {0} and standard deviation is {1}".format(nested_scores_r2.mean(), nested_scores_r2.std()))

In [47]:
X_train.head()

def feature_selection_func(estimator, params, X_train, y_train, scaler = None):
    sfs1 = sfs(estimator, k_features="best", forward=True, floating=False, cv=10)

    sfs2 = sfs1.fit(X_train, y_train)
    fig = plot_sfs(sfs2.get_metric_dict(), kind='std_err')
    plt.title('Sequential Forward Selection (w. StdErr)')
    plt.grid()
    plt.show()
    print('Selected features:', sfs2.k_feature_idx_)
    X_train_sfs = sfs2.transform(X_train)
    X_test_sfs = sfs2.transform(X_test)
    nested_cv_evaluation(estimator, params, X_train_sfs, y_train, scaler)

In [None]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train['Dt'] = pd.to_datetime(train['Date']).dt.day
train['Mon'] = pd.to_datetime(train['Date']).dt.month
train['Year'] = pd.to_datetime(train['Date']).dt.year
train['StateHoliday'] = train['StateHoliday'].apply(str)

test['Dt'] = pd.to_datetime(test['Date']).dt.day
test['Mon'] = pd.to_datetime(test['Date']).dt.month
test['Year'] = pd.to_datetime(test['Date']).dt.year
test['StateHoliday'] = test['StateHoliday'].apply(str)

In [None]:
# Initial feature selection 
# Not taking Store because it might create a lot of features
X_train = train[['DayOfWeek','Open','Promo','StateHoliday','SchoolHoliday','Dt','Mon','Year']]
y_train = train[['Sales']]
X_test = test[['DayOfWeek','Open','Promo','StateHoliday','SchoolHoliday','Dt','Mon','Year']]

In [44]:
X_train = pd.get_dummies(X_train, columns=['DayOfWeek','Open','Promo','StateHoliday',
                                           'SchoolHoliday','Dt','Mon','Year'], dummy_na=False)
X_test = pd.get_dummies(X_test, columns=['DayOfWeek','Open','Promo','StateHoliday',
                                         'SchoolHoliday','Dt','Mon','Year'], dummy_na=False)

In [50]:
X_test = X_test.rename(columns = {'Open_0.0' : 'Open_0', 'Open_1.0': 'Open_1'})
for c in X_train.columns:
    if c not in X_test.columns:
        X_test[c] = 0
X_test.columns

Index(['DayOfWeek_1', 'DayOfWeek_2', 'DayOfWeek_3', 'DayOfWeek_4',
       'DayOfWeek_5', 'DayOfWeek_6', 'DayOfWeek_7', 'Open_0', 'Open_1',
       'Promo_0', 'Promo_1', 'StateHoliday_0', 'StateHoliday_a',
       'SchoolHoliday_0', 'SchoolHoliday_1', 'Dt_1', 'Dt_2', 'Dt_3', 'Dt_4',
       'Dt_5', 'Dt_6', 'Dt_7', 'Dt_8', 'Dt_9', 'Dt_10', 'Dt_11', 'Dt_12',
       'Dt_13', 'Dt_14', 'Dt_15', 'Dt_16', 'Dt_17', 'Dt_18', 'Dt_19', 'Dt_20',
       'Dt_21', 'Dt_22', 'Dt_23', 'Dt_24', 'Dt_25', 'Dt_26', 'Dt_27', 'Dt_28',
       'Dt_29', 'Dt_30', 'Dt_31', 'Mon_8', 'Mon_9', 'Year_2015',
       'StateHoliday_b', 'StateHoliday_c', 'Mon_1', 'Mon_2', 'Mon_3', 'Mon_4',
       'Mon_5', 'Mon_6', 'Mon_7', 'Mon_10', 'Mon_11', 'Mon_12', 'Year_2013',
       'Year_2014'],
      dtype='object')

In [52]:
# Simple linear regression
# p_grid = {'penalty': ['l1', 'l2'],'C' : np.logspace(0, 4, 10)} This is not working. Ask
param = {'estimator__fit_intercept':[True,False], 'estimator__normalize':[True,False], 'estimator__copy_X':[True, False]}
lin_reg = linear_model.LinearRegression()
fit = lin_reg.fit(X_train, y_train)

In [None]:
# Linear Regression with feature selection
feature_selection_func(lin_reg, param, X_train, y_train)

In [53]:
y_pred = fit.predict(X_test)

In [55]:
lin_reg = linear_model.LinearRegression()
scores = cross_val_score(lin_reg,X_train,y_train,cv=3,scoring='neg_mean_squared_error')
scores_r2 = cross_val_score(lin_reg,X_train,y_train,cv=3,scoring='r2')
print(scores)
print(scores_r2*100)

In [59]:
ridge_reg = linear_model.Ridge()
scores = cross_val_score(ridge_reg,X_train,y_train,cv=3,scoring='neg_mean_squared_error')
scores_r2 = cross_val_score(ridge_reg,X_train,y_train,cv=3,scoring='r2')
print(np.sqrt(-1*scores))
print(scores_r2*100)

[2593.43990356 2593.07937437 2492.11791323]
[56.82561482 56.32411481 53.63110006]


In [62]:
clf_rf = GradientBoostingRegressor(random_state=123)
scores = cross_val_score(clf_rf,X_train,y_train,cv=3,scoring='neg_mean_squared_error')
scores_r2 = cross_val_score(clf_rf,X_train,y_train,cv=3,scoring='r2')
print(np.sqrt(-1*scores))
print(scores_r2*100)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[2564.31036017 2545.80871117 2468.08099115]
[57.7900383  57.90198383 54.52125902]


In [64]:
fit = clf_rf.fit(X_train, y_train)
y_pred = fit.predict(X_test)

  y = column_or_1d(y, warn=True)


In [66]:
print(y_train.min())
print(y_train.max())

Sales    0
dtype: int64
Sales    41551
dtype: int64


In [67]:
y_pred

array([8685.43609311, 8685.43609311, 8685.43609311, ..., 6385.36306696,
       6385.36306696, 6385.36306696])