In [None]:
import pandas as pd
import numpy as np
import seaborn as sns


In [None]:
import os

In [None]:
from datetime import datetime

In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

In [None]:
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,MinMaxScaler
from sklearn.metrics import mean_squared_log_error

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_transformer

In [None]:
from sklearn.model_selection import TimeSeriesSplit

In [None]:
import statsmodels.api as sm
from statsmodels.api import OLS
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# Global matplotlib configuration used by every figure in this notebook.
plt.rcParams['figure.max_open_warning'] = 0  # 0 turns off the open-figure-count warning
plt.style.use('fivethirtyeight')

# Colormaps: `cmap_cv` colours the train/test markers drawn by plot_cv_indices.
cmap_cv = plt.cm.coolwarm
cmap_data = plt.cm.Paired

In [None]:
def plot_cv_indices(cv, n_splits, X, y, date_col = None):
    """Plot which samples each split of a cross-validation object trains/tests on.

    One horizontal band of markers is drawn per split yielded by
    ``cv.split(X=X, y=y)``; samples in the test fold, in the train fold and in
    neither fold are coloured differently via ``cmap_cv``.

    Parameters
    ----------
    cv : object
        Anything exposing ``split(X=..., y=...)`` that yields
        ``(train_indices, test_indices)`` pairs.
    n_splits : int
        Kept for interface compatibility; the current implementation does not
        use it.
    X, y : array-like
        Forwarded to ``cv.split``; ``len(X)`` sets the number of samples drawn.
    date_col : pandas.Series, optional
        When given, every interior x tick is additionally labelled with the
        value found at that sample position (looked up with ``.iloc``).

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    fig, ax = plt.subplots(1, 1, figsize = (11, 7))

    n_samples = len(X)
    sample_positions = range(n_samples)

    # One row of markers per CV split.
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y)):
        # NaN = sample unused by this split, 1 = test fold, 0 = train fold.
        indices = np.full(n_samples, np.nan)
        indices[tt] = 1
        indices[tr] = 0

        ax.scatter(sample_positions, [ii + .5] * n_samples,
                   c=indices, marker='_', lw=10, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)

    if date_col is not None:
        tick_locations = ax.get_xticks()

        # Tick locations are floats; .iloc only accepts integer positions, so
        # cast them first. Ticks chosen by matplotlib may also fall outside the
        # data range, in which case the date part of the label is left blank.
        inner_positions = [int(loc) for loc in tick_locations[1:-1]]
        valid_positions = [pos for pos in inner_positions if 0 <= pos < len(date_col)]
        date_by_position = dict(zip(
            valid_positions,
            date_col.iloc[valid_positions].astype(str).tolist(),
        ))
        # The first and last ticks never receive a date label.
        tick_dates = ([" "]
                      + [date_by_position.get(pos, " ") for pos in inner_positions]
                      + [" "])

        tick_locations_str = [str(int(i)) for i in tick_locations]
        new_labels = ['\n\n'.join(pair) for pair in zip(tick_locations_str, tick_dates)]
        ax.set_xticks(tick_locations)
        ax.set_xticklabels(new_labels)

    ax.legend([Patch(color=cmap_cv(.8)), Patch(color=cmap_cv(.02))],
              ['Testing set', 'Training set'], loc=(1.02, .8))
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    

In [None]:
class StoreCatTransformer(BaseEstimator, TransformerMixin):
    """Add an integer-coded column built by concatenating other columns.

    The values of ``input_features`` are concatenated row-wise (with ``+``,
    starting from an empty string) and the result is label-encoded into
    ``feature_name``.

    Parameters
    ----------
    input_features : list of str, optional
        Columns to concatenate, in order. ``None`` (the default) means no
        columns, i.e. every row gets the same empty-string category.
    feature_name : str, default "StoreCat"
        Name of the column that receives the encoded category.

    Notes
    -----
    The ``LabelEncoder`` is created and fitted inside ``transform``, so the
    integer codes are only comparable between rows of the same call.
    """

    def __init__(self, input_features = None, feature_name = "StoreCat"):
        print("\n>>>>>>>>>>init() called.\n")
        self.feature_name = feature_name
        # Stored exactly as received: scikit-learn's clone()/get_params() rely
        # on __init__ not altering its arguments. The previous `[]` default was
        # one list object shared by every default-constructed instance.
        self.input_features = input_features

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        print("\n>>>>>>>>>>fit() called.\n")
        return self

    def transform(self, X, y = None):
        """Return a copy of ``X`` with the encoded ``feature_name`` column added."""
        print("\n>>>>>>>>>>transform() called.\n")
        source_cols = [] if self.input_features is None else self.input_features
        X_ = X.copy()
        X_[self.feature_name] = ''
        for col in source_cols:
            X_[self.feature_name] = X_[self.feature_name] + X_[col]
        le = LabelEncoder()
        X_[self.feature_name] = le.fit_transform(X_[self.feature_name])
        return X_

In [None]:
class DayOfWeekTransformer(BaseEstimator, TransformerMixin):
    """Derive ``Year``, ``Month`` and an integer ``DayOfWeek`` from ``Date``.

    ``DayOfWeek`` holds the alphabetical rank of the weekday name
    (Friday=0, Monday=1, Saturday=2, Sunday=3, Thursday=4, Tuesday=5,
    Wednesday=6). These are the codes a ``LabelEncoder`` assigns when all
    seven weekday names occur in the data, but here the mapping is fixed, so a
    weekday gets the same code even when a batch does not contain every
    weekday.
    """

    def __init__(self):
        print("\n>>>>>>>>>>init() called.\n")
        # pandas' dt.dayofweek value -> weekday name.
        self.dayOfWeekDict = {0:"Monday",
                 1:"Tuesday",
                 2:"Wednesday",
                 3:"Thursday",
                 4:"Friday",
                 5:"Saturday",
                 6:"Sunday"}

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        print("\n>>>>>>>>>>fit() called.\n")
        return self

    def transform(self, X, y = None):
        """Return a copy of ``X`` with ``Date`` parsed and the three columns added."""
        print("\n>>>>>>>>>>transform() called.\n")
        X_ = X.copy()
        X_['Date'] = pd.to_datetime(X_['Date'])
        X_['Year'] = X_['Date'].dt.year
        X_['Month'] = X_['Date'].dt.month

        # Fixed day-number -> code table. Previously a LabelEncoder was refit
        # on every call, which renumbered the weekdays whenever some of them
        # were absent from the batch being transformed.
        name_rank = {name: rank
                     for rank, name in enumerate(sorted(self.dayOfWeekDict.values()))}
        day_code = {day: name_rank[name] for day, name in self.dayOfWeekDict.items()}
        X_['DayOfWeek'] = X_['Date'].dt.dayofweek.map(day_code)
        return X_

In [None]:
class CustomLabelEncode(BaseEstimator, TransformerMixin):
    """Label-encode one column of a DataFrame, leaving the rest untouched.

    Parameters
    ----------
    feature : str
        Name of the column to replace with integer codes.
    """

    def __init__(self, feature):
        self.feature = feature

    def fit(self, X, y=None):
        """No-op: the encoder is fitted inside ``transform``. Returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``feature`` column holds integer codes."""
        encoded = X.copy()
        column = self.feature
        # A fresh encoder is fitted on every call, so codes are only
        # comparable within a single transform.
        encoder = LabelEncoder()
        encoded[column] = encoder.fit_transform(encoded[column])
        return encoded

In [None]:
# Input files; both are looked up in the directory the notebook is run from.
filename = "TRAIN.csv"
testfilename = "TEST_FINAL.csv"

# The previous expression, os.path.dirname("__name__"), was applied to a string
# literal and therefore always returned "", which abspath() resolves to the
# current working directory. State that directly.
homedir = os.path.abspath(os.getcwd())

fp = os.path.join(homedir, filename)
testfp = os.path.join(homedir, testfilename)

In [None]:
mms = MinMaxScaler()

In [None]:
df = pd.read_csv(fp)


In [None]:
df.head()

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        
        ("DiscountEncoder", CustomLabelEncode("Discount"))
    ], remainder='passthrough'
)

In [None]:
mycols = ['Store_id', "Date", "Holiday", "Discount", "StoreCat", "DayOfWeek"]

In [None]:
pipe = Pipeline(steps = [
    ("preprocess",CustomLabelEncode("Discount")),
    ("transform1", StoreCatTransformer(["Store_Type", "Location_Type", "Region_Code"], "StoreCat")),
    ("transform2", DayOfWeekTransformer()),
#     ("selector", ColumnTransformer([
#         ("selector", "passthrough", mycols)
#     ], remainder="drop")),
    
])

In [None]:
df2 = pipe.fit_transform(df)

In [None]:
df2[df2['Store_id']==1].head()

In [None]:
df2['SalesBak'] = df2['Sales']

In [None]:
df2['Sales'] = mms.fit_transform(np.array(df2['Sales']).reshape(-1,1))

In [None]:
df2["Sales_t1"] = df2.groupby("Store_id").Sales.shift(1)
df2["Sales_t2"] = df2.groupby("Store_id").Sales.shift(2)
df2["Sales_t3"] = df2.groupby("Store_id").Sales.shift(3)
df2["Sales_t4"] = df2.groupby("Store_id").Sales.shift(4)
df2["Sales_t5"] = df2.groupby("Store_id").Sales.shift(5)
df2["Sales_t6"] = df2.groupby("Store_id").Sales.shift(6)
df2["Sales_t7"] = df2.groupby("Store_id").Sales.shift(7)

In [None]:
df2.dropna(inplace=True)

In [None]:
df2['Date'] = pd.to_datetime(df2['Date'])

In [None]:
df2.dtypes

In [None]:
df2.columns

In [None]:
df2.set_index("Date", inplace=True)

In [None]:
df2.head()

In [None]:
traindf = df2.loc["2018-01-01":"2018-12-31"]
testdf = df2.loc["2018-10-01":"2019-05-31"]


# traindf =storedf[:trainlen] 
# testdf = storedf[trainlen:] 

In [None]:
traindf.reset_index(inplace=True)

In [None]:
traindf.shape

In [None]:
traindf["7SMA"] = traindf.groupby("Store_id")['Sales'].rolling(window=7).mean().reset_index(0,drop=True)
traindf["7EMA"] = traindf.groupby("Store_id")['Sales'].ewm(span=7).mean().reset_index(0,drop=True)

traindf["14SMA"] = traindf.groupby("Store_id")['Sales'].rolling(window=14).mean().reset_index(0,drop=True)
traindf["14EMA"] = traindf.groupby("Store_id")['Sales'].ewm(span=14).mean().reset_index(0,drop=True)

traindf["30SMA"] = traindf.groupby("Store_id")['Sales'].rolling(window=30).mean().reset_index(0,drop=True)
traindf["30EMA"] = traindf.groupby("Store_id")['Sales'].ewm(span=30).mean().reset_index(0,drop=True)

traindf["60SMA"] = traindf.groupby("Store_id")['Sales'].rolling(window=60).mean().reset_index(0,drop=True)
traindf["60EMA"] = traindf.groupby("Store_id")['Sales'].ewm(span=60).mean().reset_index(0,drop=True)

traindf["90SMA"] = traindf.groupby("Store_id")['Sales'].rolling(window=90).mean().reset_index(0,drop=True)
traindf["90EMA"] = traindf.groupby("Store_id")['Sales'].ewm(span=90).mean().reset_index(0,drop=True)

In [None]:
traindf.dropna(inplace=True)

In [None]:
traindf.head()

In [None]:
# yval = mms.fit_transform(np.array(traindf['Sales']).reshape(-1,1))

In [None]:
model_svr = SVR()

In [None]:
X = traindf.loc[:, ('Store_id', 'DayOfWeek', 'StoreCat', 'Discount', 'Holiday',
                    'Sales_t1', 'Sales_t2', 'Sales_t3', 'Sales_t4', 'Sales_t5', 'Sales_t6', 'Sales_t7',
                    "7SMA","7EMA", "14SMA", "14EMA","30SMA", "60SMA", "90SMA", "30EMA", "60EMA", "90EMA")]

In [None]:
model_svr.fit(X, yval)

In [None]:
testdf['zSales'] = testdf['Sales']


In [None]:
testdf.loc["2019-01-01":"2019-05-31", "zSales"] = 0
testdf.loc["2019-01-01":"2019-05-31", "Sales_t1"] = 0
testdf.loc["2019-01-01":"2019-05-31", "Sales_t2"] = 0
testdf.loc["2019-01-01":"2019-05-31", "Sales_t3"] = 0
testdf.loc["2019-01-01":"2019-05-31", "Sales_t4"] = 0
testdf.loc["2019-01-01":"2019-05-31", "Sales_t5"] = 0
testdf.loc["2019-01-01":"2019-05-31", "Sales_t6"] = 0
testdf.loc["2019-01-01":"2019-05-31", "Sales_t7"] = 0

In [None]:
testdf.head()

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
pd.options.mode.chained_assignment = None 

In [None]:
# Recursive one-day-ahead forecast for every store over the 2019 rows.
#
# Two defects of the previous version are fixed:
#   * After each prediction the lag columns were shuffled on the row that had
#     just been predicted, which is never predicted again, so every forecast
#     row was scored with all seven lags equal to 0. The lags are now rebuilt
#     from the running `zSales` history (true values, then earlier
#     predictions) before each prediction.
#   * The 7/14-day statistics read `Sales`, which still holds the true values
#     in the forecast window, while 30/60/90 read `zSales`. All of them now
#     read `zSales`, as the submission loop does.
# NOTE(review): `zSales` of the row being predicted is 0 when the window
# statistics are computed, so each window includes that placeholder.
feature_cols = ['Store_id', 'DayOfWeek', 'StoreCat', 'Discount', 'Holiday',
                'Sales_t1', 'Sales_t2', 'Sales_t3', 'Sales_t4', 'Sales_t5', 'Sales_t6', 'Sales_t7',
                "7SMA", "7EMA", "14SMA", "14EMA", "30SMA", "60SMA", "90SMA", "30EMA",
                "60EMA", "90EMA"]
lag_cols = ["Sales_t{}".format(k) for k in range(1, 8)]
window_spans = (7, 14, 30, 60, 90)

predlist = []
for sid in testdf['Store_id'].unique().tolist():
    # Independent per-store frame; predictions are written back into it.
    sdf = testdf[testdf['Store_id']==sid].copy()
    forecast_days = [d for d in sdf.index.strftime("%Y-%m-%d").tolist() if d > "2018-12-31"]
    for ix in forecast_days:
        # History up to and including the day being predicted, last 90 rows.
        tempdf = sdf.loc[:ix].tail(90).copy()

        # Lag k = working target k rows earlier.
        for k, lag_col in enumerate(lag_cols, start=1):
            tempdf[lag_col] = tempdf["zSales"].shift(k)

        for span in window_spans:
            tempdf["{}SMA".format(span)] = tempdf["zSales"].rolling(window=span).mean()
            tempdf["{}EMA".format(span)] = tempdf["zSales"].ewm(span=span).mean()

        # The 90-row window is only complete on the last row, so this keeps
        # just the row for `ix` (and nothing if fewer than 90 rows exist).
        tempdf = tempdf.dropna()

        texog = tempdf.loc[:, feature_cols]
        pred_value = model_svr.predict(texog).tolist()[-1]
        predlist.append({"ID": tempdf["ID"].iloc[-1], "Date": ix, "Pred": pred_value})

        # Feed the prediction back so later days see it in their lags/windows.
        sdf.loc[ix, "zSales"] = pred_value
    
    

In [None]:
len(predlist)

In [None]:
outdf = pd.DataFrame(predlist).set_index("Date")

In [None]:
outdf.head()

In [None]:
outdf['zPred'] = mms.inverse_transform(np.array(outdf['Pred']).reshape(-1,1))

In [None]:
testdf2 = pd.merge(testdf, outdf[["ID", "zPred"]], on="ID", how='left')

In [None]:
testdf2.dropna(inplace=True)

In [None]:
testdf2['Pred'] = testdf2['zPred'].apply(lambda x : 0 if x<0 else x)

In [None]:
testdf2.head()

In [None]:
testdf2['Diff'] = testdf2['Pred'] / testdf2['Sales']

In [None]:
testdf2['Pred_Signal'] = testdf2['Diff'].apply(lambda x : 'Acceptable' if (x < 1.1)&(x>0.9)  else 'NotAcceptable')

In [None]:
testdf2['Pred_Signal'].value_counts()

In [None]:
notacceptabledf = testdf2[testdf2['Pred_Signal']=='NotAcceptable']

In [None]:
pd.pivot_table(notacceptabledf, index="Store_id", columns="Pred_Signal", values="ID", aggfunc='count', fill_value=0)

In [None]:
negativedf = testdf2[testdf2['Pred'] <0]

In [None]:
negativedf.shape

In [None]:
testdf2.dropna(inplace=True)

In [None]:
mean_squared_log_error(testdf2['SalesBak'], testdf2['Pred'])*1000

In [None]:
359.1428448111441

# Test 

In [None]:
combdf = testdf.loc['2019-03-01':'2019-05-31']

In [None]:
combdf[combdf['Store_id']==1].shape

In [None]:
combdf

In [None]:
combdf.shape

In [None]:
validdf = pd.read_csv(testfp)

In [None]:
validdf2 = pipe.fit_transform(validdf)

In [None]:
validdf2['Date'] = pd.to_datetime(validdf2['Date'])

In [None]:
validdf2.set_index("Date", inplace=True)

In [None]:
validdf2.shape

In [None]:
validdf2.shape[0] + combdf.shape[0]

In [None]:
mixdf = pd.concat([combdf, validdf2])

In [None]:
mixdf.shape[0]

In [None]:
mixdf['zSales'] = mixdf['Sales']

In [None]:
mixdf.head()

In [None]:
mixdf['zSales'].fillna(0, inplace=True)

In [None]:
mixdf['#Order'].fillna(0, inplace=True)

In [None]:
mixdf['Sales_t1'].fillna(0, inplace=True)
mixdf['Sales_t2'].fillna(0, inplace=True)
mixdf['Sales_t3'].fillna(0, inplace=True)
mixdf['Sales_t4'].fillna(0, inplace=True)
mixdf['Sales_t5'].fillna(0, inplace=True)
mixdf['Sales_t6'].fillna(0, inplace=True)
mixdf['Sales_t7'].fillna(0, inplace=True)

In [None]:
mixdf.tail()

In [None]:
validdf2.head()

In [None]:
# Recursive one-day-ahead forecast for the submission rows (after 2019-05-31).
#
# Fix: the previous version shuffled the lag columns on the row that had just
# been predicted, which is never predicted again, so every submission row was
# scored with the filled-in lag value 0. The lags are now rebuilt from the
# running `zSales` history (true values, then earlier predictions) before each
# prediction.
# NOTE(review): `zSales` of the row being predicted is 0 when the window
# statistics are computed, so each window includes that placeholder.
feature_cols = ['Store_id', 'DayOfWeek', 'StoreCat', 'Discount', 'Holiday',
                'Sales_t1', 'Sales_t2', 'Sales_t3', 'Sales_t4', 'Sales_t5', 'Sales_t6', 'Sales_t7',
                "7SMA", "7EMA", "14SMA", "14EMA",
                "30SMA", "60SMA", "90SMA", "30EMA", "60EMA", "90EMA"]
lag_cols = ["Sales_t{}".format(k) for k in range(1, 8)]
window_spans = (7, 14, 30, 60, 90)

predlist = []
for sid in mixdf['Store_id'].unique().tolist():
    # Independent per-store frame; predictions are written back into it.
    sdf = mixdf[mixdf['Store_id']==sid].copy()
    forecast_days = [d for d in sdf.index.strftime("%Y-%m-%d").tolist() if d > "2019-05-31"]
    for ix in forecast_days:
        # History up to and including the day being predicted, last 90 rows.
        # Sales/SalesBak are NaN on submission rows and would otherwise make
        # dropna() discard the row to predict.
        tempdf = sdf.loc[:ix].drop(columns=["Sales", "SalesBak"]).tail(90).copy()

        # Lag k = working target k rows earlier.
        for k, lag_col in enumerate(lag_cols, start=1):
            tempdf[lag_col] = tempdf["zSales"].shift(k)

        for span in window_spans:
            tempdf["{}SMA".format(span)] = tempdf["zSales"].rolling(window=span).mean()
            tempdf["{}EMA".format(span)] = tempdf["zSales"].ewm(span=span).mean()

        # The 90-row window is only complete on the last row, so this keeps
        # just the row for `ix` (and nothing if fewer than 90 rows exist).
        tempdf = tempdf.dropna()

        texog = tempdf.loc[:, feature_cols]
        pred_value = model_svr.predict(texog).tolist()[-1]
        predlist.append({"ID": tempdf["ID"].iloc[-1], "Date": ix, "Pred": pred_value})

        # Feed the prediction back so later days see it in their lags/windows.
        sdf.loc[ix, "zSales"] = pred_value


In [None]:
outdf = pd.DataFrame(predlist).set_index("Date")

In [None]:
outdf['zPred'] = mms.inverse_transform(np.array(outdf['Pred']).reshape(-1,1))

In [None]:
validmixdf = pd.merge(validdf, outdf[["ID", "zPred"]], on="ID", how='left')

In [None]:
validmixdf['Sales'] = validmixdf['zPred'].apply(lambda x : 0 if x<0 else x)

In [None]:
validmixdf[["ID", "Sales"]].to_csv("Submission_{}.csv".format(datetime.now().microsecond), index=None)