In [28]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import re
import xgboost

from pandas.api.types import CategoricalDtype

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from statsmodels.graphics.tsaplots import plot_pacf
from xgboost import XGBRegressor
import lightgbm
from sklearn.model_selection import GridSearchCV
import joblib
import logging
# import keras

In [29]:
all_csv = pd.read_csv('cleaned.csv.gz',
                 dtype = {
                     'store_nbr' : 'category',
                     'family' : 'category',
                     'sales': 'float',
                     'city': 'category',
                     'state': 'category',
                     'type': 'category',
                     'holiday_type': 'category',
                     'holiday_transferred': 'category'
                 },
                  parse_dates=['date'])
all_csv['date'] = pd.to_datetime(all_csv['date']).dt.to_period('D')

In [30]:
all = all_csv.copy()  # we can start experimenting from here without reloading the csv file

In [31]:
# this is for experimentation

filter_by_stores = None  # note: please use string here (unlike Mine.ipynb)
filter_by_family = None
filter_by_dates = None

#filter_by_stores = ['1', '2']  # note: please use string here (unlike Mine.ipynb)
#filter_by_family = ['DAIRY', 'PRODUCE']
#filter_by_family = ['']
#filter_by_dates = '2014-06-05'

In [32]:
if filter_by_dates == None:
    train_start_date = '2013-01-01'
else:
    train_start_date = filter_by_dates
train_end_date = '2017-08-15'
test_start_date = '2017-08-16'
test_end_date = '2017-08-31'

In [33]:
if filter_by_family != None:
    all = all[all['family'].isin(filter_by_family)]
if filter_by_stores != None:
    all = all[all['store_nbr'].isin(filter_by_stores)]
if filter_by_dates != None:
    all = all[all['date'] >= filter_by_dates]

In [34]:
all.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3036528 entries, 0 to 3036527
Data columns (total 38 columns):
 #   Column               Dtype    
---  ------               -----    
 0   date                 period[D]
 1   store_nbr            category 
 2   family               category 
 3   sales                float64  
 4   onpromotion          int64    
 5   sales_lag_01         float64  
 6   sales_lag_02         float64  
 7   sales_lag_03         float64  
 8   sales_lag_04         float64  
 9   sales_lag_05         float64  
 10  sales_lag_06         float64  
 11  sales_lag_07         float64  
 12  sales_lag_08         float64  
 13  sales_lag_09         float64  
 14  sales_lag_10         float64  
 15  sales_lag_11         float64  
 16  sales_lag_12         float64  
 17  sales_lag_13         float64  
 18  sales_lag_14         float64  
 19  sales_lag_15         float64  
 20  sales_lag_16         float64  
 21  sales_lag_17         float64  
 22  sales_lag_18      

In [35]:
def one_hot_encode(df):
    return pd.get_dummies(data=df, columns=['store_nbr', 'family', 'city', 'state', 'type',
                                     'cluster', 'holiday_type', 'holiday_transferred', 'weekday']) 

In [38]:
all_ohe = one_hot_encode(all)
all_ohe = all_ohe.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))  # remove bad char in column names

X = all_ohe[all_ohe['date'] <= train_end_date]
X = X.drop(['sales'], axis=1)
y = all_ohe[['date', 'sales']][all_ohe['date'] <= train_end_date]
y.set_index('date', inplace=True)

X_test = all_ohe[all_ohe['date'] >= test_start_date]
X_test = X_test.drop(['sales'], axis=1)

X.drop('date', axis=1, inplace=True)
X_test.drop('date', axis=1, inplace=True)
y.set_index(X.index, inplace=True)

# from sklearn.decomposition import PCA
# pca = PCA(n_components=180)
# principalComponents = pca.fit_transform(X)
# principalDf = pd.DataFrame(data = principalComponents)

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=1)

In [None]:

# from sklearn.decomposition import PCA
# pca = PCA(n_components=180)
# principalComponents = pca.fit_transform(X)
# principalDf = pd.DataFrame(data = principalComponents)

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(principalDf, y, random_state=1)

In [None]:
# X_train.columns.size

XGBoost

In [39]:
xgb = xgboost.XGBRegressor(n_estimators=110, early_stopping_rounds=10, learning_rate=0.4)
history_callback =xgb.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=True)

loss_history = history_callback.history["verbose"]


numpy_loss_history = np.array(loss_history)
np.savetxt("loss_history.txt", numpy_loss_history, delimiter=",")


[0]	validation_0-rmse:2.36204	validation_1-rmse:2.36777
[1]	validation_0-rmse:1.47818	validation_1-rmse:1.48185
[2]	validation_0-rmse:0.970803	validation_1-rmse:0.973856
[3]	validation_0-rmse:0.692293	validation_1-rmse:0.695077
[4]	validation_0-rmse:0.557023	validation_1-rmse:0.559747
[5]	validation_0-rmse:0.496438	validation_1-rmse:0.498558
[6]	validation_0-rmse:0.470354	validation_1-rmse:0.472591
[7]	validation_0-rmse:0.45842	validation_1-rmse:0.460446
[8]	validation_0-rmse:0.452052	validation_1-rmse:0.453944
[9]	validation_0-rmse:0.448984	validation_1-rmse:0.450909
[10]	validation_0-rmse:0.446967	validation_1-rmse:0.44886
[11]	validation_0-rmse:0.445542	validation_1-rmse:0.447475
[12]	validation_0-rmse:0.443623	validation_1-rmse:0.445501
[13]	validation_0-rmse:0.442402	validation_1-rmse:0.44425
[14]	validation_0-rmse:0.441134	validation_1-rmse:0.442886
[15]	validation_0-rmse:0.440038	validation_1-rmse:0.441878
[16]	validation_0-rmse:0.437443	validation_1-rmse:0.439267
[17]	validatio

AttributeError: 'XGBRegressor' object has no attribute 'history'

In [40]:
xgb.save_model("xgbfinalmodel.json")
xgb.save_model("xgbfinalmodel.txt")

In [41]:
y_pred_train = xgb.predict(X_train)
y_pred_train[y_pred_train < 0] = 0
y_pred_val = xgb.predict(X_val)
y_pred_val[y_pred_val < 0] = 0

print("RMS log-error train: ", np.sqrt(mean_squared_log_error(y_train, y_pred_train)))
print("RMS log-error val: ", np.sqrt(mean_squared_log_error(y_val, y_pred_val)))
print("RMS log-error train (actual): ",
          np.sqrt(mean_squared_log_error(np.expm1(y_train), np.expm1(y_pred_train))))
print("RMS log-error val (actual): ",
          np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(y_pred_val))))


RMS log-error train:  0.15301606007499857
RMS log-error val:  0.1541208706889701
RMS log-error train (actual):  0.40065622873672846
RMS log-error val (actual):  0.40287645083933715


In [43]:
print(xgb.summary())

AttributeError: 'XGBRegressor' object has no attribute 'summary'

In [44]:
def main_predict(model, X_test):
    X_test_mod = X_test.copy()
    output = np.array([])
    start_day, end_day = X_test['day_of_month'].min(), X_test['day_of_month'].max()
        # we lost the dates, but we still have day_of_month, which is good enough for our experiment
        
    for day in range(start_day, end_day + 1):
        pred = model.predict(X_test_mod[X_test_mod['day_of_month'] == day])
        pred[pred < 0] = 0
        print(pred)
        output = np.concatenate([output, pred], axis=0)
        for future in range(day + 1, end_day + 1):
            X_test_mod.loc[X_test_mod[X_test_mod['day_of_month'] == future].index,
                           f'sales_lag_{(future - day):02d}'] = pred
            # fill out future values now that this sales figure is available
            
    return output

In [45]:
y_pred_test = main_predict(xgb, X_test)

[1.6030245  0.06487405 1.6090407  ... 7.318695   4.41523    2.816975  ]
[1.3723085  0.48425683 1.5762546  ... 7.123762   4.621746   2.7673562 ]
[1.5091102  0.34571868 1.4901665  ... 7.2624784  4.6534514  2.881816  ]
[1.449044  0.3658272 1.3249067 ... 7.3239083 4.9125714 3.1157362]
[1.3693335  0.53638554 1.2826777  ... 7.4323797  4.9647255  3.0348468 ]
[1.4960632 0.57153   1.6091845 ... 7.149037  4.7922344 2.9714577]
[1.4854034  0.39065775 1.3508189  ... 7.6213856  4.527902   2.821136  ]
[1.4637    0.4610877 1.5483489 ... 7.1153436 4.4269657 2.8312593]
[1.4504821 0.5739996 1.4708084 ... 7.0782733 4.590386  2.7400453]
[1.4958229  0.46088344 1.3727589  ... 7.0828004  4.6334486  2.8919709 ]
[1.4550418  0.40220675 1.250483   ... 7.1746626  4.921728   3.2882009 ]
[1.3992629 0.4285711 1.4084591 ... 7.489747  4.922186  3.0725062]
[1.485692  0.6704218 1.548816  ... 7.0633464 4.562362  2.8987868]
[1.4885128 0.5164683 1.4998486 ... 7.5372934 4.472973  2.8121877]
[1.6039565 0.4773856 1.5232067 ...

In [46]:
delta_index = 3008016 - 3000888  # we inserted 4 Christmas days, 4 x 54 x 33 = 7128, which is the difference
submission = pd.DataFrame({'id': X_test.index - delta_index, 'sales': np.expm1(y_pred_test)})
submission.to_csv('submissionxgb.csv', index=False)