<a href="https://colab.research.google.com/github/skywalker0803r/Job-OEM/blob/main/Store_Sales_Time_Series_Forecast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [76]:
import pandas as pd
pd.options.plotting.backend = "plotly"
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_squared_log_error
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from statsmodels.graphics import tsaplots
from sklearn.multioutput import RegressorChain, MultiOutputRegressor

In [53]:
holidays_events = pd.read_csv(
    '/content/drive/MyDrive/store-sales-time-series-forecasting/holidays_events.csv',
    dtype={
        'type': 'category',
        'locale': 'category',
        'locale_name': 'category',
        'description': 'category',
        'transferred': 'bool',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
holidays_events = holidays_events.set_index('date').to_period('D')

store_sales = pd.read_csv(
   '/content/drive/MyDrive/store-sales-time-series-forecasting/train.csv',
    usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
store_sales['date'] = store_sales.date.dt.to_period('D')
store_sales = store_sales.set_index(['store_nbr', 'family', 'date']).sort_index()
sales = store_sales.copy()
store_sales

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales,onpromotion
store_nbr,family,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,AUTOMOTIVE,2013-01-01,0.000000,0
1,AUTOMOTIVE,2013-01-02,2.000000,0
1,AUTOMOTIVE,2013-01-03,3.000000,0
1,AUTOMOTIVE,2013-01-04,3.000000,0
1,AUTOMOTIVE,2013-01-05,5.000000,0
...,...,...,...,...
9,SEAFOOD,2017-08-11,23.830999,0
9,SEAFOOD,2017-08-12,16.859001,4
9,SEAFOOD,2017-08-13,20.000000,0
9,SEAFOOD,2017-08-14,17.000000,0


In [54]:
sales.reset_index(inplace=True)
sales['dayofweek'] = sales['date'].dt.dayofweek
# sales

In [55]:
seasonal = sales.groupby(['family','dayofweek']).sum().reset_index()
# seasonal

In [58]:
dayOfWeekfig = go.Figure()
for fam in seasonal['family'].unique():
    x = seasonal[seasonal['family'] == fam]['dayofweek'].values.tolist()
    y = seasonal[seasonal['family'] == fam]['sales'].values.tolist()
    dayOfWeekfig.add_trace(go.Scatter(x=x, y=y, name= fam))
    
dayOfWeekfig.update_layout(title='Sales per family', xaxis_title='Day of Week', yaxis_title='Sales(M)')
dayOfWeekfig.show()

In [59]:
average_sales = (sales.groupby('date').mean().squeeze())
# average_sales

In [60]:
x = average_sales.reset_index()['date'].astype('str').values.tolist()
y = average_sales['sales'].values.tolist()
trendfig = go.Figure()
trendfig.add_trace(go.Scatter(x=x, y=y, name='Average Sales'))
trendfig.update_layout(title='Sales', xaxis_title='Date', yaxis_title='Sales(M)')
trendfig.show()

In [61]:
moving_avg_365 = average_sales['sales'].rolling(window=365, center=False).mean().reset_index()
# moving_avg_365

In [62]:
x = moving_avg_365['date'].astype('str').values.tolist()
y = moving_avg_365['sales'].values.tolist()
trendfig.add_trace(go.Scatter(x=x, y=y, name='Moving Average 365'))
trendfig.show()

In [63]:
dp = DeterministicProcess(
    index=average_sales.index,  # dates from the training data
    constant=True,       # dummy feature for the bias (y_intercept)
    order=1,             # the time dummy (trend)
    drop=True,           # drop terms if necessary to avoid collinearity
)
# `in_sample` creates features for the dates given in the `index` argument
X = dp.in_sample()
y = average_sales['sales']
# X.head()

In [64]:
lr = LinearRegression(fit_intercept=False)
lr.fit(X,y)

y_pred = pd.Series(lr.predict(X), index=X.index)

In [65]:
X = dp.out_of_sample(steps=180)
y_fore = pd.Series(lr.predict(X), index=X.index).reset_index().set_axis(['date', 'sales'], axis=1)
# y_fore

In [66]:
x = y_fore['date'].astype('str').values.tolist()
y = y_fore['sales'].values.tolist()
trendfig.add_trace(go.Scatter(x=x, y=y, name='Prediction'))
trendfig.show()

In [67]:
fourier = CalendarFourier(freq="A", order=10)  # 10 sin/cos pairs for "A"nnual seasonality

dp = DeterministicProcess(
    index=average_sales.index,
    constant=True,   # dummy feature for bias (y-intercept)
    order=1,         # trend ( order 1 means linear)
    seasonal=True,   # weekly seasonality (indicators)
    additional_terms=[fourier], # annual seasonality
    drop=True,       # drop terms to avoid collinearity
)

X = dp.in_sample() # create features for dates in tunnel.index
y = average_sales["sales"]
#X.head()

In [68]:
lr = LinearRegression().fit(X,y)
y_pred = pd.Series(lr.predict(X), index=X.index)
y_deseason = y - y_pred

X_fore = dp.out_of_sample(steps=180)
y_fore = pd.Series(lr.predict(X_fore), index=X_fore.index).reset_index().set_axis(['date', 'sales'], axis=1)

seasonalfig = go.Figure()
seasonalfig.add_trace(go.Scatter(x=y.reset_index()['date'].astype('str').values.tolist(), 
                                 y=y.values.tolist(),
                                 mode='lines+markers',
                                 name='Average Sales'))
seasonalfig.add_trace(go.Scatter(x=y_pred.reset_index()['date'].astype('str').values.tolist(),
                                 y=y_pred.values.tolist(),
                                 name='seasonal'))
seasonalfig.add_trace(go.Scatter(x=y_deseason.reset_index()['date'].astype('str').values.tolist(),
                                 y=y_deseason.values.tolist(),
                                 name='deseasonal'))
seasonalfig.add_trace(go.Scatter(x=y_fore['date'].astype('str').values.tolist(),
                                 y=y_fore['sales'].values.tolist(),
                                 name='predict'))
seasonalfig.update_layout(title='Seasonal')
seasonalfig.show()

In [69]:
family_sales = (
    store_sales
    .groupby(['store_nbr','family', 'date'])
    .mean() 
    .unstack(['family', 'store_nbr'])
    .loc['2017', ['sales','onpromotion']]
)
# family_sales = (
#     store_sales
#     .groupby(['family', 'date'])
#     .mean() 
#     .unstack('family')
#     .loc['2017', ['sales','onpromotion']]
# )

# display(family_sales)
# display(store_sales)
df_sales = family_sales.loc(axis=1)[:, 'MAGAZINES']
display(df_sales)

Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,...,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion
family,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,...,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES,MAGAZINES
store_nbr,1,10,11,12,13,14,15,16,17,18,...,5,50,51,52,53,54,6,7,8,9
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2017-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-02,1.0,1.0,5.0,3.0,1.0,4.0,1.0,1.0,7.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-03,1.0,0.0,3.0,1.0,0.0,1.0,1.0,0.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-04,6.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-05,6.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,4.0,1.0,1.0,2.0,0.0,2.0,2.0,4.0,8.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-12,6.0,0.0,2.0,0.0,0.0,4.0,2.0,2.0,8.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-13,4.0,1.0,1.0,1.0,1.0,3.0,2.0,0.0,4.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-14,8.0,0.0,3.0,3.0,1.0,4.0,3.0,2.0,6.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
# Target series
y = family_sales.loc[:, 'sales']
# display(y)

# X_1: Features for Linear Regression
fourier = CalendarFourier(freq='D', order=4)
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=5,
    seasonal=True,
    additional_terms=[fourier],
    drop=True,
)
X_1 = dp.in_sample()


# X_2: Features for XGBoost
X_2 = family_sales.drop('sales', axis=1).stack()  # onpromotion feature
# display(X_2)
# Label encoding for 'family'
le = LabelEncoder()  # from sklearn.preprocessing
X_2 = X_2.stack()
X_2 = X_2.reset_index(['family', 'store_nbr'])

X_2['family'] = le.fit_transform(X_2['family'])

# Label encoding for seasonality
X_2["day"] = X_2.index.day  # values are day of the month

display(X_1)
display(X_2)

Unnamed: 0_level_0,const,trend,trend_squared,trend_cubed,trend**4,trend**5,"s(2,7)","s(3,7)","s(4,7)","s(5,7)","s(6,7)","s(7,7)"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-01-01,1.0,1.0,1.0,1.0,1.000000e+00,1.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-02,1.0,2.0,4.0,8.0,1.600000e+01,3.200000e+01,1.0,0.0,0.0,0.0,0.0,0.0
2017-01-03,1.0,3.0,9.0,27.0,8.100000e+01,2.430000e+02,0.0,1.0,0.0,0.0,0.0,0.0
2017-01-04,1.0,4.0,16.0,64.0,2.560000e+02,1.024000e+03,0.0,0.0,1.0,0.0,0.0,0.0
2017-01-05,1.0,5.0,25.0,125.0,6.250000e+02,3.125000e+03,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,1.0,223.0,49729.0,11089567.0,2.472973e+09,5.514731e+11,0.0,0.0,0.0,0.0,1.0,0.0
2017-08-12,1.0,224.0,50176.0,11239424.0,2.517631e+09,5.639493e+11,0.0,0.0,0.0,0.0,0.0,1.0
2017-08-13,1.0,225.0,50625.0,11390625.0,2.562891e+09,5.766504e+11,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-14,1.0,226.0,51076.0,11543176.0,2.608758e+09,5.895793e+11,1.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,store_nbr,family,onpromotion,day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-01,1,0,0.0,1
2017-01-01,1,1,0.0,1
2017-01-01,1,2,0.0,1
2017-01-01,1,3,0.0,1
2017-01-01,1,4,0.0,1
...,...,...,...,...
2017-08-15,9,28,0.0,15
2017-08-15,9,29,1.0,15
2017-08-15,9,30,148.0,15
2017-08-15,9,31,8.0,15


In [71]:
class BoostedHybrid:
    def __init__(self, model_1, model_2):
        self.model_1 = model_1
        self.model_2 = model_2
        self.y_columns = None  # store column names from fit method
    
    def fit(self, X_1, X_2, y):
        # Train model_1
        self.model_1.fit(X_1, y)

        # Make predictions
        y_fit = pd.DataFrame(
            self.model_1.predict(X_1), 
            index=X_1.index, 
            columns=y.columns,
        )

        # Compute residuals
        y_resid = y - y_fit
        y_resid = y_resid.stack().squeeze() # wide to long
#         display(y_resid.stack())
        # Train model_2 on residuals
        self.model_2.fit(X_2, y_resid.stack())

        # Save column names for predict method
        self.y_columns = y.columns
        # Save data for question checking
        self.y_fit = y_fit
        self.y_resid = y_resid
        
    def predict(self, X_1, X_2):
        # Predict with model_1
        y_pred = pd.DataFrame(
            self.model_1.predict(X_1), 
            index=X_1.index, columns=self.y_columns,
        )
#         display('model 1 predict', self.model_1.predict(X_1), self.model_1.predict(X_1).shape)
#         display('y_pred before stack squeeze', y_pred)
        y_pred = y_pred.stack().squeeze()  # wide to long

        # Add model_2 predictions to model_1 predictions
#         display('y_pred after stack squeeze', y_pred)
#         display('model 2 predict', np.reshape(self.model_2.predict(X_2), (-1,33)))
        y_pred += np.reshape(self.model_2.predict(X_2), (-1, 33))
#         display(y_pred.unstack())
        return y_pred.unstack()

In [77]:
model = BoostedHybrid(
    model_1=Ridge(),
    model_2=KNeighborsRegressor(),
)

model.fit(X_1, X_2, y)
y_pred = model.predict(X_1, X_2)

y_pred = y_pred.clip(0.0)


Ill-conditioned matrix (rcond=1.86813e-25): result may not be accurate.


Arrays of bytes/strings is being converted to decimal numbers if dtype='numeric'. This behavior is deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26). Please convert your data to numeric values explicitly instead.


Arrays of bytes/strings is being converted to decimal numbers if dtype='numeric'. This behavior is deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26). Please convert your data to numeric values explicitly instead.



In [78]:
y_train, y_valid = y[:"2017-07-01"], y["2017-07-02":]
X1_train, X1_valid = X_1[: "2017-07-01"], X_1["2017-07-02" :]
X2_train, X2_valid = X_2.loc[:"2017-07-01"], X_2.loc["2017-07-02":]

# model.fit(X1_train, X2_train, y_train)
y_fit = model.predict(X1_train, X2_train).clip(0.0)
y_pred = model.predict(X1_valid, X2_valid).clip(0.0)

families = y.columns[0:6]

# display(y.loc(axis=1)[families])
# display(y_fit.loc(axis=1)[families])
# display(y_pred.loc(axis=1)[families])


Arrays of bytes/strings is being converted to decimal numbers if dtype='numeric'. This behavior is deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26). Please convert your data to numeric values explicitly instead.


Arrays of bytes/strings is being converted to decimal numbers if dtype='numeric'. This behavior is deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26). Please convert your data to numeric values explicitly instead.



In [79]:
total_training = 0
total_validation = 0
for fam in families:
    fam_string = " ".join(fam)
    famplot = go.Figure()
    x_train_val = y.loc(axis=1)[families].index.astype('str')
    y_train_val = y.loc(axis=1)[families][fam].values
#     print(f'x_train_val : {x_train_val} \n y_train_val : {y_train_val} \n fam : {fam_string}')
    famplot.add_trace(go.Scatter(x=x_train_val,y=y_train_val, name=fam_string+"-sales", mode='lines+markers'))
    
    x_fit_val = y_fit.loc(axis=1)[families].index.astype('str')
    y_fit_val = y_fit.loc(axis=1)[families][fam].values
    famplot.add_trace(go.Scatter(x=x_fit_val,y=y_fit_val, name=fam_string+"-fit"))
    
    x_predict = y_pred.loc(axis=1)[families].index.astype('str')
    y_predict = y_pred.loc(axis=1)[families][fam].values
    famplot.add_trace(go.Scatter(x=x_predict,y=y_predict, name=fam_string+"-predict"))
    famplot.update_layout(title=fam_string)
    famplot.show()

    rmsle_train = mean_squared_log_error(y_train_val[:len(y_fit_val)], y_fit_val) ** 0.5
    rmsle_valid = mean_squared_log_error(y_train_val[:len(y_predict)], y_predict) ** 0.5
    print(f'Training RMSLE for {fam} : {rmsle_train:.5f}')
    print(f'Validation RMSLE for {fam} : {rmsle_valid:.5f}')
    total_training += rmsle_train
    total_validation += rmsle_valid
    
print(f'Total _training : {total_training}')
print(f'Total _validation : {total_validation}')

Training RMSLE for ('AUTOMOTIVE', '1') : 0.50442
Validation RMSLE for ('AUTOMOTIVE', '1') : 0.66296


Training RMSLE for ('BABY CARE', '1') : 0.03952
Validation RMSLE for ('BABY CARE', '1') : 0.04588


Training RMSLE for ('BEAUTY', '1') : 0.44444
Validation RMSLE for ('BEAUTY', '1') : 0.75957


Training RMSLE for ('BEVERAGES', '1') : 0.55740
Validation RMSLE for ('BEVERAGES', '1') : 1.13580


Training RMSLE for ('BOOKS', '1') : 0.39395
Validation RMSLE for ('BOOKS', '1') : 0.57192


Training RMSLE for ('BREAD/BAKERY', '1') : 0.44099
Validation RMSLE for ('BREAD/BAKERY', '1') : 0.83738
Total _training : 2.3807179472551545
Total _validation : 4.013510332192631


In [81]:
df_test = pd.read_csv(
    '/content/drive/MyDrive/store-sales-time-series-forecasting/test.csv',
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'onpromotion': 'uint32',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
df_test['date'] = df_test.date.dt.to_period('D')
df_test = df_test.set_index(['store_nbr', 'family', 'date']).sort_index()
# df_test

In [82]:
y = df_test.unstack(['store_nbr', 'family']).loc["2017"]
# print(y)
# Create training data
# fourier = CalendarFourier(freq='M', order=4)
# dp = DeterministicProcess(
#     index=y.index,
#     constant=True,
#     order=5,
#     seasonal=True,
#     additional_terms=[fourier],
#     drop=True,
# )
# X = dp.in_sample()

fourier = CalendarFourier(freq='D', order=4)
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=5,
    seasonal=True,
    additional_terms=[fourier],
    drop=True,
)

X_1 = dp.in_sample()
# display(X_1)
# X['NewYear'] = (X.index.dayofyear == 1)

In [83]:
# X_1: Features for Linear Regression
X_1 = dp.in_sample()

# X_2: Features for XGBoost
X_2 = df_test  # onpromotion feature

# Label encoding for 'family'
le = LabelEncoder()  # from sklearn.preprocessing
X_2 = X_2.reset_index('family')

X_2['family'] = le.fit_transform(X_2['family'])
X_2.reset_index(inplace=True)
# display(X_2['date'].dt.day)
# Label encoding for seasonality
X_2["day"] = X_2['date'].dt.day  # values are day of the month
X_2 = X_2.groupby(['date', 'family', 'store_nbr']).sum()
X_2 = X_2.reset_index().set_index('date')[['family', 'onpromotion', 'day', 'store_nbr']]
# display(X_1)
# display(X_2)

model.predict(X_1, X_2)


The feature names should match those that were passed during fit. Starting version 1.2, an error will be raised.
Feature names must be in the same order as they were in fit.



Arrays of bytes/strings is being converted to decimal numbers if dtype='numeric'. This behavior is deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26). Please convert your data to numeric values explicitly instead.



family,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,...,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD
store_nbr,1,10,11,12,13,14,15,16,17,18,...,5,50,51,52,53,54,6,7,8,9
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2017-08-16,-5.318682,-1.911125,46.522914,113.823792,601.325995,22.508722,-8.898647,-498.499127,17.477179,-7.140043,...,125.726895,-464.263785,510.845508,-163.291305,-160.744816,8.944322,107.698278,108.073774,130.947821,29.784956
2017-08-17,-103.833581,81.125735,38.856512,9.034239,600.969833,19.477534,6.293606,-704.547891,1.850865,-9.996678,...,181.971847,-246.065265,505.344207,-280.731252,17.230195,558.192692,121.192809,86.135508,311.429407,-287.311006
2017-08-18,155.709944,-21.323037,25.750277,40.519451,599.713994,31.993207,-8.097501,-736.351867,41.26167,-9.951537,...,246.94051,-80.667331,293.984924,-525.263135,284.007389,443.740968,166.031035,99.626725,-19.022991,-166.457215
2017-08-19,155.96476,-4.776664,12.091729,41.371861,-143.176058,49.623691,23.870562,-321.766815,125.629323,19.297976,...,246.572996,44.536227,486.632949,-275.300003,125.655904,446.864011,-2.44186,146.881368,87.876229,-135.519744
2017-08-20,408.698698,-4.68186,-29.097784,-67.201253,-110.556284,6.445223,-10.625129,20.49583,136.636672,-5.381136,...,504.76608,-167.297679,472.94902,-480.322226,107.632897,445.657466,-154.551112,158.259842,-3.906751,-203.639232
2017-08-21,414.161039,54.237079,31.874338,33.657803,-31.756158,2.388805,27.289595,80.927463,46.712007,10.62977,...,236.674045,-207.501648,512.748461,-396.300997,-130.458369,444.663821,-217.203211,145.030231,32.232239,-139.741148
2017-08-22,45.031146,-44.432423,52.862356,37.92577,119.658136,41.34939,22.545495,179.113776,156.91847,-56.999135,...,113.037819,-242.326753,430.352474,-304.452758,-25.516012,8.513433,-88.956024,312.163211,-83.056704,-44.31887
2017-08-23,266.81749,17.342142,-33.304231,-17.508191,-148.333054,-445.832387,32.872317,145.08726,94.154766,-81.349653,...,-66.635833,-93.991542,433.524124,-415.344018,-120.962026,317.670078,-70.990979,153.235587,135.810533,-79.781306
2017-08-24,327.91508,167.417398,69.495762,-51.88621,-81.302992,-553.517041,-1.606773,-46.44099,90.407989,-8.880424,...,-309.815838,-248.392061,207.512719,-224.269701,-126.561529,200.587257,173.139589,282.903821,120.423444,-270.936023
2017-08-25,464.528704,132.918912,143.413431,-53.153873,-17.735602,-215.978071,-2.07938,-210.533413,-43.064678,-8.775932,...,-193.162025,-93.036605,58.006972,-356.240452,-115.795177,221.88417,150.166433,257.353845,-238.719559,44.871811


In [84]:
y_submit = pd.DataFrame(model.predict(X_1, X_2), index=X_1.index)
y_submit = y_submit.stack(['store_nbr', 'family'])


The feature names should match those that were passed during fit. Starting version 1.2, an error will be raised.
Feature names must be in the same order as they were in fit.



Arrays of bytes/strings is being converted to decimal numbers if dtype='numeric'. This behavior is deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26). Please convert your data to numeric values explicitly instead.



In [85]:
y_submit.reset_index()
df_test.reset_index()
results = pd.merge(y_submit.reset_index(), df_test.reset_index(), on=['store_nbr', 'family', 'date'])[['id', 0]]
results = results.set_axis(['id', 'sales'], axis=1)
results.to_csv('submission.csv', index=False)