In [None]:
import pandas as pd
import numpy as np
import seaborn as sns


In [None]:
import os

In [None]:
from datetime import datetime

In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,MinMaxScaler
from sklearn.metrics import mean_squared_log_error

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_transformer

In [None]:
from sklearn.model_selection import TimeSeriesSplit

In [None]:
import statsmodels.api as sm
from statsmodels.api import OLS
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# Notebook-wide plotting defaults: never warn about the number of open figures,
# use the FiveThirtyEight style sheet, and name two colormaps (cmap_cv colours
# the bands drawn by plot_cv_indices).
plt.rcParams['figure.max_open_warning'] = 0
plt.style.use('fivethirtyeight')
cmap_data, cmap_cv = plt.cm.Paired, plt.cm.coolwarm

In [None]:
def plot_cv_indices(cv, n_splits, X, y, date_col = None):
    """Plot which samples each cross-validation split trains and tests on.

    One horizontal band is drawn per split. Every sample position is coloured
    by its role in that split: training, testing, or unused (left blank).

    Parameters
    ----------
    cv : object
        Splitter exposing ``split(X=..., y=...)`` that yields
        ``(train_indices, test_indices)`` pairs of positional indices.
    n_splits : int
        Number of splits; used to label and bound the y-axis.
    X, y : array-like
        Passed straight to ``cv.split``; ``len(X)`` fixes the x-range.
    date_col : pandas.Series, optional
        Positionally aligned with ``X``. When given, every x tick inside the
        data range is labelled with the sample index and the value of
        ``date_col`` at that position.

    Returns
    -------
    None
        The figure is drawn on a newly created matplotlib figure.
    """
    fig, ax = plt.subplots(1, 1, figsize = (11, 7))
    n_samples = len(X)

    # One band per split: 0 = training sample, 1 = test sample, NaN = unused.
    n_bands = 0
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y)):
        indices = np.full(n_samples, np.nan)
        indices[tt] = 1
        indices[tr] = 0

        ax.scatter(range(n_samples), [ii + .5] * n_samples,
                   c=indices, marker='_', lw=10, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)
        n_bands = ii + 1
    # Never hide a band if the splitter produced more splits than announced.
    n_bands = max(n_bands, n_splits)

    if date_col is not None:
        tick_locations = ax.get_xticks()
        last_position = len(date_col) - 1
        # Matplotlib may place ticks outside the data (in the axis margins);
        # only in-range ticks can be looked up in date_col, the rest get a
        # blank date instead of raising an out-of-bounds error.
        positions = [int(loc) for loc in tick_locations]
        in_range = [pos for pos, loc in zip(positions, tick_locations)
                    if 0 <= loc <= last_position]
        date_text = dict(zip(in_range,
                             date_col.iloc[in_range].astype(str).tolist()))
        new_labels = ['\n\n'.join((str(pos), date_text.get(pos, " ")))
                      for pos in positions]
        ax.set_xticks(tick_locations)
        ax.set_xticklabels(new_labels)

    # Label each band with its split number. The tick count must equal the
    # label count, which is why ticks are generated from n_bands.
    ax.set_yticks(np.arange(n_bands) + .5)
    ax.set_yticklabels(list(range(n_bands)))
    ax.set_xlabel('Sample index')
    ax.set_ylabel("CV iteration")
    ax.set_ylim([n_bands + 0.2, -.2])  # inverted: split 0 is drawn at the top

    ax.legend([Patch(color=cmap_cv(.8)), Patch(color=cmap_cv(.02))],
              ['Testing set', 'Training set'], loc=(1.02, .8))
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    

In [None]:
class StoreCatTransformer(BaseEstimator, TransformerMixin):
    """Add an integer-coded column built by concatenating other columns.

    The values of ``input_features`` are joined, row by row, into one string
    key; the key is label-encoded and stored in ``feature_name``.

    The encoder is learned in ``fit`` and reused in ``transform`` so the same
    key always receives the same code, whichever frame is being transformed.
    Calling ``transform`` on an unfitted instance falls back to fitting on the
    frame being transformed (the historical behaviour).

    Parameters
    ----------
    input_features : list of str, optional
        Columns to concatenate, in order. ``None`` (default) means no columns.
    feature_name : str, default "StoreCat"
        Name of the column that receives the codes.
    """

    def __init__(self, input_features = None, feature_name = "StoreCat"):
        # Parameters are stored untouched, as BaseEstimator.get_params/clone expect.
        self.feature_name = feature_name
        self.input_features = input_features

    def _combined_key(self, X):
        """Return the row-wise string concatenation of the input columns."""
        key = pd.Series('', index=X.index, dtype=object)
        for col in (self.input_features or []):
            # astype(str) lets numeric or categorical columns take part too.
            key = key + X[col].astype(str)
        return key

    def fit(self, X, y=None):
        """Learn the key -> code mapping from ``X``; returns ``self``."""
        self.encoder_ = LabelEncoder().fit(self._combined_key(X))
        return self

    def transform(self, X, y = None):
        """Return a copy of ``X`` with the encoded ``feature_name`` column added.

        Raises
        ------
        ValueError
            From ``LabelEncoder.transform`` when a fitted instance meets a key
            that was not present during ``fit``.
        """
        X_ = X.copy()
        key = self._combined_key(X_)
        encoder = getattr(self, "encoder_", None)
        if encoder is None:
            # Unfitted use: keep working by encoding this frame on its own.
            X_[self.feature_name] = LabelEncoder().fit_transform(key)
        else:
            X_[self.feature_name] = encoder.transform(key)
        return X_

In [None]:
class DayOfWeekTransformer(BaseEstimator, TransformerMixin):
    """Derive ``Year``, ``Month`` and an integer ``DayOfWeek`` from ``Date``.

    ``DayOfWeek`` holds the alphabetical rank of the weekday name
    (Friday=0, Monday=1, Saturday=2, Sunday=3, Thursday=4, Tuesday=5,
    Wednesday=6). These are the codes a label encoder produces when all seven
    weekdays are present, but here they are fixed, so a frame that lacks some
    weekday is still encoded consistently.
    """

    def __init__(self):
        # pandas weekday number (Monday=0) -> weekday name.
        self.dayOfWeekDict = {0:"Monday",
                 1:"Tuesday",
                 2:"Wednesday",
                 3:"Thursday",
                 4:"Friday",
                 5:"Saturday",
                 6:"Sunday"}

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y = None):
        """Return a copy of ``X`` with ``Date`` parsed and the date parts added."""
        X_ = X.copy()
        X_['Date'] = pd.to_datetime(X_['Date'])
        X_['Year'] = X_['Date'].dt.year
        X_['Month'] = X_['Date'].dt.month
        # Rank weekday numbers by their name so the code is data-independent.
        by_name = sorted(self.dayOfWeekDict.items(), key=lambda item: item[1])
        day_codes = {day_number: code
                     for code, (day_number, _name) in enumerate(by_name)}
        X_['DayOfWeek'] = X_['Date'].dt.dayofweek.map(day_codes)
        return X_

In [None]:
class CustomLabelEncode(BaseEstimator, TransformerMixin):
    """Label-encode a single DataFrame column in place of its original values.

    The mapping is learned in ``fit`` and reused in ``transform`` so that
    different frames are encoded with the same codes. Calling ``transform`` on
    an unfitted instance falls back to fitting on the frame being transformed
    (the historical behaviour).

    Parameters
    ----------
    feature : str
        Name of the column to encode.
    """

    def __init__(self, feature):
        self.feature = feature

    def fit(self, X, y=None):
        """Learn the value -> code mapping of ``X[feature]``; returns ``self``."""
        self.encoder_ = LabelEncoder().fit(X[self.feature])
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``feature`` column holds integer codes.

        Raises
        ------
        ValueError
            From ``LabelEncoder.transform`` when a fitted instance meets a
            value that was not present during ``fit``.
        """
        X_ = X.copy()
        encoder = getattr(self, "encoder_", None)
        if encoder is None:
            # Unfitted use: keep working by encoding this frame on its own.
            encoder = LabelEncoder().fit(X_[self.feature])
        X_[self.feature] = encoder.transform(X_[self.feature])
        return X_

In [None]:
# Names of the two input CSVs: `filename` is loaded into `df`, `testfilename`
# into `validdf` further down.
filename, testfilename = "TRAIN.csv", "TEST_FINAL.csv"

In [None]:
# Data files are resolved against the current working directory. The previous
# expression took dirname() of the literal string "__name__", which is "", and
# only reached the working directory because abspath("") resolves to it.
homedir = os.path.abspath(os.getcwd())

In [None]:
# Full paths of the two input CSVs inside homedir.
fp = os.path.join(homedir, filename)
testfp = os.path.join(homedir, testfilename)

In [None]:
# Scaler that is fitted on the training 'Sales' values further down and reused
# there to map model predictions back with inverse_transform.
mms = MinMaxScaler()

In [None]:
# Load the training CSV.
df = pd.read_csv(fp)


In [None]:
df.head()

In [None]:
# Column-wise preprocessing: label-encode 'Discount' and pass every other
# column through unchanged. Each ColumnTransformer entry must be a
# (name, transformer, columns) triple; without the column list the transformer
# cannot be fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ("DiscountEncoder", CustomLabelEncode("Discount"), ["Discount"])
    ], remainder='passthrough'
)

In [None]:
# Columns referenced by the commented-out "selector" step of the pipeline below.
mycols = ['Store_id', "Date", "Holiday", "Discount", "StoreCat", "DayOfWeek"]

In [None]:
# Feature-engineering pipeline, applied in order:
#   1. label-encode 'Discount'
#   2. build 'StoreCat' from Store_Type + Location_Type + Region_Code
#   3. parse 'Date' and add 'Year', 'Month', 'DayOfWeek'
pipe = Pipeline(steps = [
    ("preprocess",CustomLabelEncode("Discount")),
    ("transform1", StoreCatTransformer(["Store_Type", "Location_Type", "Region_Code"], "StoreCat")),
    ("transform2", DayOfWeekTransformer()),
#     ("selector", ColumnTransformer([
#         ("selector", "passthrough", mycols)
#     ], remainder="drop")),
    
])

In [None]:
# Engineered copy of the training data; every step returns a DataFrame.
df2 = pipe.fit_transform(df)

In [None]:
df2.head()

In [None]:
# DayOfWeekTransformer already parses 'Date' with pd.to_datetime, so this
# repeats that conversion.
df2['Date'] = pd.to_datetime(df2['Date'])

In [None]:
df2.dtypes

In [None]:
df2.columns

In [None]:
# Split by calendar date. Slicing .loc with date strings needs a sorted
# DatetimeIndex, whereas df2 comes out of the pipeline with 'Date' as a regular
# column, so a date-indexed view is built first. df2 itself is left untouched
# because later cells still select df2["Date"] as a column.
df2_by_date = df2.set_index("Date") if "Date" in df2.columns else df2
df2_by_date = df2_by_date.sort_index(kind="mergesort")  # stable: keeps row order within a day

# testdf overlaps the last three months of traindf; the forecasting loop below
# uses those rows as history for its rolling windows. .copy() makes both frames
# independent so the later column assignments do not write into slices of df2.
traindf = df2_by_date.loc["2018-01-01":"2018-12-31"].copy()
testdf = df2_by_date.loc["2018-10-01":"2019-05-31"].copy()

In [None]:
# Move the current index into a regular column, leaving a fresh integer index
# that the per-store rolling results in the next cell are aligned on.
traindf.reset_index(inplace=True)

In [None]:
# Per-store simple (SMA) and exponentially weighted (EMA) moving averages of
# 'Sales'. The spans are visited as 60, 30, 90 so the new columns are appended
# in the same order as before.
sales_by_store = traindf.groupby("Store_id")['Sales']
for span in (60, 30, 90):
    sma = sales_by_store.rolling(window=span).mean()
    ema = sales_by_store.ewm(span=span).mean()
    # Drop the Store_id index level so assignment aligns on traindf's own index.
    traindf["{}SMA".format(span)] = sma.reset_index(0, drop=True)
    traindf["{}EMA".format(span)] = ema.reset_index(0, drop=True)

In [None]:
# Index the training frame by date again.
traindf.set_index("Date", inplace=True)

In [None]:
# Drop every row that has a NaN in any column. The 90-row rolling mean is NaN
# for each store's first 89 rows, so at least those rows are removed.
traindf.dropna(inplace=True)

In [None]:
# Target: 'Sales' rescaled by the MinMaxScaler, as an (n, 1) array.
yval = mms.fit_transform(np.array(traindf['Sales']).reshape(-1,1))

In [None]:
# endog is the scaled target; exog holds the eleven regressors, in this order.
endog = yval#traindf.loc[:, 'Sales']
exog = traindf.loc[:, ('Store_id', 'DayOfWeek', 'StoreCat', 'Discount', 'Holiday', 
                                       "30SMA", "60SMA", "90SMA", "30EMA", "60EMA", "90EMA")]

In [None]:
exog.dtypes

In [None]:
# Fit the model
# Regression on exog with ARMA(1, 1) errors; no constant column is included.
# NOTE(review): traindf holds the rows of all stores in one frame, so the ARMA
# terms run across store boundaries - confirm this is intended.
mod = sm.tsa.statespace.SARIMAX(endog, exog, order=(1,0,1))
res = mod.fit(disp=False)
print(res.summary())

In [None]:
# Working copy of 'Sales' that the forecasting loop reads its rolling
# features from.
testdf['zSales'] = testdf['Sales']


In [None]:
# Blank out the 2019 values of the working copy (presumably so actual sales of
# the forecast period cannot feed the rolling features - confirm).
testdf.loc["2019-01-01":"2019-05-31", "zSales"] = 0

In [None]:
testdf.head()

In [None]:
# Silence FutureWarning messages for the rest of the notebook.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Disable pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None 

In [None]:
# Recursive one-step-ahead forecast, store by store, for every day after
# 2018-12-31. Each day's features are rolling statistics of 'zSales' over the
# store's most recent 90 rows; the prediction is then written back into
# 'zSales' so that it feeds the following days' windows.
feature_cols = ['Store_id', 'DayOfWeek', 'StoreCat', 'Discount', 'Holiday',
                "30SMA", "60SMA", "90SMA", "30EMA", "60EMA", "90EMA"]  # same order as the fitted exog

predlist = []
for sid in testdf['Store_id'].unique().tolist():
    # Independent per-store copy: predictions are written here, not into testdf.
    sdf = testdf[testdf['Store_id']==sid].copy()
    sdf['zSales'] = sdf['zSales'].astype(float)  # predictions are floats
    forecast_days = [day for day in sdf.index.strftime("%Y-%m-%d").tolist()
                     if day > "2018-12-31"]
    for ix in forecast_days:
        tempdf = sdf.loc[:ix].iloc[-90:].copy()
        for span in (60, 30, 90):
            tempdf["{}SMA".format(span)] = tempdf['zSales'].rolling(window=span).mean()
            tempdf["{}EMA".format(span)] = tempdf['zSales'].ewm(span=span).mean()

        # The 90-row mean exists only on the last row of a full window, so at
        # most the row for day `ix` survives. Only the model inputs are
        # checked, so a NaN in an unrelated column cannot discard the row.
        tempdf = tempdf.dropna(subset=feature_cols)
        if tempdf.empty:
            # Fewer than 90 rows of history: nothing to forecast from.
            continue

        # The model was fitted on exactly these columns without a constant,
        # so the exog is passed as is.
        texog = tempdf.loc[:, feature_cols]
        pred = res.forecast(exog=texog, steps=len(texog))
        pred_scaled = pred.tolist()[0]  # in the MinMax-scaled units of endog
        predlist.append({"ID": tempdf["ID"].iloc[0], "Date": ix, "Pred": pred_scaled})

        # 'zSales' is in raw sales units while the model predicts scaled
        # values; convert back before feeding the prediction into later windows.
        sdf.loc[ix, "zSales"] = mms.inverse_transform(np.array([[pred_scaled]]))[0, 0]
    

In [None]:
len(predlist)

In [None]:
# One row per forecast (ID, Pred), indexed by the forecast date string.
outdf = pd.DataFrame(predlist).set_index("Date")

In [None]:
outdf.head()

In [None]:
# Map the model output back through the scaler that was fitted on 'Sales'.
outdf['zPred'] = mms.inverse_transform(np.array(outdf['Pred']).reshape(-1,1))

In [None]:
# Left join on 'ID': every testdf row is kept, rows without a forecast get a
# NaN 'zPred'. The merge result carries a fresh integer index.
testdf2 = pd.merge(testdf, outdf[["ID", "zPred"]], on="ID", how='left')

In [None]:
# Drop rows with a NaN in any column, which includes the rows without a forecast.
testdf2.dropna(inplace=True)

In [None]:
# 'Pred' is 'zPred' with negative values replaced by 0.
testdf2['Pred'] = testdf2['zPred'].apply(lambda x : 0 if x<0 else x)

In [None]:
testdf2.head()

In [None]:
# Rows where the model forecast negative sales. The filter has to use the raw
# 'zPred': 'Pred' has already been floored at zero, so filtering on it always
# yields an empty frame.
negativedf = testdf2[testdf2['zPred'] <0]

In [None]:
negativedf.shape

In [None]:
# Breakdown of the negativedf rows by store ...
negativedf.Store_id.value_counts()

In [None]:
# ... by store category ...
negativedf.StoreCat.value_counts()

In [None]:
# ... by holiday flag ...
negativedf.Holiday.value_counts()

In [None]:
# ... by discount flag ...
negativedf.Discount.value_counts()

In [None]:
# ... and by weekday code.
negativedf.DayOfWeek.value_counts()

In [None]:
# Repeats the dropna already applied to testdf2 above.
testdf2.dropna(inplace=True)

In [None]:
# Mean squared log error of the clipped predictions against 'Sales', times 1000.
mean_squared_log_error(testdf2['Sales'], testdf2['Pred'])*1000

In [None]:
# Display the single row at position 92 of store 1's slice of testdf.
store1 = testdf[testdf['Store_id']==1]
store1[92:93]

In [None]:
# Rolling features of 'zSales' for the whole of testdf, computed per store.
# testdf holds the rows of several stores and the training features were built
# per Store_id, so a window over the raw row order would average the sales of
# different stores together.
zsales_by_store = testdf.groupby("Store_id")['zSales']
for span in (60, 30, 90):
    sma = zsales_by_store.transform(lambda s, w=span: s.rolling(window=w).mean())
    ema = zsales_by_store.transform(lambda s, w=span: s.ewm(span=w).mean())
    # transform() returns one value per row in testdf's row order; assigning
    # the raw values avoids re-aligning on the repeated date index.
    testdf["{}SMA".format(span)] = sma.to_numpy()
    testdf["{}EMA".format(span)] = ema.to_numpy()

In [None]:
# Exogenous matrix for forecasting all testdf rows in one call.
# NOTE(review): the model above was fitted on exog without a constant column;
# add_constant may append a 'const' column here, which would not match the
# fitted exog width - confirm before relying on this cell.
# NOTE(review): the 60/90-row rolling columns are NaN for the leading rows and
# nothing drops them here - confirm the forecast accepts that.
texog = sm.add_constant(testdf.loc[:, ('Store_id', 'DayOfWeek', 'StoreCat', 'Discount', 'Holiday',
                                      "30SMA", "60SMA", "90SMA", "30EMA", "60EMA", "90EMA")])

In [None]:
# One forecast step per row of texog.
pred = res.forecast(exog=texog, steps=len(texog))

In [None]:
# NOTE(review): the model's endog is the MinMax-scaled 'Sales', and these
# values are stored without inverse_transform, so 'pred' is presumably in
# scaled units.
testdf['pred'] = pred.tolist()

In [None]:
testdf[testdf['pred']<0]

In [None]:
# NOTE(review): compares raw 'Sales' with the un-rescaled 'pred', and negative
# predictions are not clipped first - confirm the metric is meaningful here.
mean_squared_log_error(testdf['Sales'], testdf['pred'])*1000

# Test 

In [None]:
# Last three months of the labelled data (2019-03-01 .. 2019-05-31).
combdf = testdf.loc['2019-03-01':'2019-05-31']

In [None]:
combdf[combdf['Store_id']==1].shape

In [None]:
combdf

In [None]:
combdf.shape

In [None]:
# NOTE(review): validdf2 is only assigned in a later cell
# (validdf2 = pipe.fit_transform(validdf)); on a fresh kernel this cell and the
# ones below that use validdf2 fail when the notebook is run top to bottom.
validdf2.shape

In [None]:
validdf2.shape[0] + combdf.shape[0]

In [None]:
# Stack the labelled history on top of the validation rows.
mixdf = pd.concat([combdf, validdf2])

In [None]:
mixdf.shape[0]

In [None]:
# Reset the working column from 'Sales' (overwrites the zSales carried over
# from testdf).
mixdf['zSales'] = mixdf['Sales']

In [None]:
mixdf.head()

In [None]:
# Missing 'zSales' values start from a zero placeholder. The result is assigned
# back because fillna(inplace=True) on a selected column is chained assignment
# and is not guaranteed to modify mixdf.
mixdf['zSales'] = mixdf['zSales'].fillna(0)

In [None]:
# Missing '#Order' values become 0. The result is assigned back because
# fillna(inplace=True) on a selected column is chained assignment and is not
# guaranteed to modify mixdf.
mixdf['#Order'] = mixdf['#Order'].fillna(0)

In [None]:
# Show the last rows of the stacked frame.
mixdf.tail()

In [None]:
# Load the final test CSV.
validdf = pd.read_csv(testfp)

In [None]:
# NOTE(review): fit_transform re-fits the pipeline on the test file rather than
# reusing what was fitted on the training data - confirm the label codes agree
# between the two files.
validdf2 = pipe.fit_transform(validdf)

In [None]:
validdf2.head()

In [None]:
validdf2['Date'] = pd.to_datetime(validdf2['Date'])

In [None]:
validdf2.set_index("Date", inplace=True)

In [None]:
# NOTE(review): only five regressors (plus whatever add_constant appends) are
# selected here, while the model above was fitted on eleven exog columns -
# confirm the forecast call accepts this shape.
vexog = sm.add_constant(validdf2.loc[:, ('Store_id', 'DayOfWeek', 'StoreCat', 'Discount', 'Holiday')])

In [None]:
vpred = res.forecast(exog=vexog, steps=len(vexog))

In [None]:
# NOTE(review): vpred is stored without mms.inverse_transform, so these 'Sales'
# values are presumably in the scaled units of the model's endog.
validdf['Sales'] = vpred.tolist()

In [None]:
# Write ID/Sales to a CSV whose name ends in the current microsecond value
# (0-999999) of the clock.
validdf[["ID", "Sales"]].to_csv("Submission_{}.csv".format(datetime.now().microsecond), index=None)

In [None]:
# Recursive one-step-ahead forecast, store by store, for every day after
# 2019-05-31. Each day's features are rolling statistics of 'zSales' over the
# store's most recent 90 rows; the prediction is then written back into
# 'zSales' so that it feeds the following days' windows.
feature_cols = ['Store_id', 'DayOfWeek', 'StoreCat', 'Discount', 'Holiday',
                "30SMA", "60SMA", "90SMA", "30EMA", "60EMA", "90EMA"]  # same order as the fitted exog

predlist = []
for sid in mixdf['Store_id'].unique().tolist():
    # Independent per-store copy: predictions are written here, not into mixdf.
    sdf = mixdf[mixdf['Store_id']==sid].copy()
    sdf['zSales'] = sdf['zSales'].astype(float)  # predictions are floats
    forecast_days = [day for day in sdf.index.strftime("%Y-%m-%d").tolist()
                     if day > "2019-05-31"]
    for ix in forecast_days:
        # 'Sales' is dropped before the window is cut; 'zSales' is the
        # working series.
        tempdf = sdf.loc[:ix].drop(columns="Sales").iloc[-90:].copy()
        for span in (60, 30, 90):
            tempdf["{}SMA".format(span)] = tempdf['zSales'].rolling(window=span).mean()
            tempdf["{}EMA".format(span)] = tempdf['zSales'].ewm(span=span).mean()

        # The 90-row mean exists only on the last row of a full window, so at
        # most the row for day `ix` survives. Only the model inputs are
        # checked, so a NaN in an unrelated column cannot discard the row.
        tempdf = tempdf.dropna(subset=feature_cols)
        if tempdf.empty:
            # Fewer than 90 rows of history: nothing to forecast from.
            continue

        # The model was fitted on exactly these columns without a constant,
        # so the exog is passed as is.
        texog = tempdf.loc[:, feature_cols]
        pred = res.forecast(exog=texog, steps=len(texog))
        pred_scaled = pred.tolist()[0]  # in the MinMax-scaled units of endog
        predlist.append({"ID": tempdf["ID"].iloc[0], "Date": ix, "Pred": pred_scaled})

        # 'zSales' is in raw sales units while the model predicts scaled
        # values; convert back before feeding the prediction into later windows.
        sdf.loc[ix, "zSales"] = mms.inverse_transform(np.array([[pred_scaled]]))[0, 0]


In [None]:
# One row per forecast (ID, Pred), indexed by the forecast date string.
outdf = pd.DataFrame(predlist).set_index("Date")

In [None]:
# Map the model output back through the scaler that was fitted on 'Sales'.
outdf['zPred'] = mms.inverse_transform(np.array(outdf['Pred']).reshape(-1,1))

In [None]:
# Left join on 'ID': every validdf row is kept; rows without a forecast get NaN.
validmixdf = pd.merge(validdf, outdf[["ID", "zPred"]], on="ID", how='left')

In [None]:
# Submitted 'Sales' is 'zPred' with negative values replaced by 0 (NaN is left
# as is, since NaN < 0 is False).
validmixdf['Sales'] = validmixdf['zPred'].apply(lambda x : 0 if x<0 else x)

In [None]:
# Write ID/Sales to a CSV whose name ends in the current microsecond value
# (0-999999) of the clock.
validmixdf[["ID", "Sales"]].to_csv("Submission_{}.csv".format(datetime.now().microsecond), index=None)

In [None]:
# Number of folds for the time-series cross-validation demo below.
n_splits = 5

In [None]:
# Splitter with n_splits folds.
tscv = TimeSeriesSplit(n_splits)

In [None]:
tscv

In [None]:

# Feature frame and target used to illustrate the time-series CV splits.
X = df2[['Store_id',"Date", "Holiday", "Discount", "StoreCat", "DayOfWeek"]]
y = df2[['Sales']]

for fold, (train_index, test_index) in enumerate(tscv.split(X)):
    print("Fold: {}".format(fold))
    print("TRAIN indices:", train_index, "\n", "TEST indices:", test_index)
    print("\n")
    # split() yields positional row numbers, so rows are selected by position;
    # label-based .loc only gives the same rows on a default integer index.
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index, :], y.iloc[test_index, :]


plot_cv_indices(tscv,n_splits, X, y, date_col=df2["Date"])

In [None]:
# Print the ordinal of every split produced by tscv; the index arrays are
# unpacked but not used here.
for fold, (train_index, test_index) in enumerate(tscv.split(X)):
    print(fold)