In [None]:
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
sns.set(style="darkgrid")
# Show floats with two decimals in DataFrame output.
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# NOTE(review): this silences every warning for the whole session, including
# pandas warnings that point at real mistakes; consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Load both splits, using the `date` column as a parsed index.
train = pd.read_csv('../input/train.csv', parse_dates=True, index_col='date')
test = pd.read_csv('../input/test.csv', parse_dates=True, index_col='date')

# Remember where the training rows end so the stacked frame can be split again later.
train_len = train.shape[0]

# Stack train on top of test so features are engineered once for both.
df = pd.concat([train, test])
df.describe().T

In [None]:
# Show only the first and last rows (transposed) instead of transposing and
# rendering the whole frame; the two ends are enough to see the date range.
pd.concat([df.head(), df.tail()]).T

In [None]:
# The train and test data are split by date: train covers dates before 2018, test starts in January 2018.

In [None]:
def correlation_heatmap(df):
    """Draw an annotated heatmap of the pairwise column correlations of `df`.

    The correlations come from `df.corr()` and are drawn on a new 14x12 inch
    figure with a diverging palette. Nothing is returned.
    """
    _, axes = plt.subplots(figsize=(14, 12))
    palette = sns.diverging_palette(220, 10, as_cmap=True)

    heatmap_options = dict(
        cmap=palette,
        square=True,
        cbar_kws={'shrink': .9},
        ax=axes,
        annot=True,
        linewidths=0.1,
        vmax=1.0,
        linecolor='white',
        annot_kws={'fontsize': 12},
    )
    sns.heatmap(df.corr(), **heatmap_options)
    plt.title('Pearson Correlation of Features', y=1.05, size=15)


# Correlations of the raw columns, before any feature engineering.
correlation_heatmap(df)

In [None]:
# One bucket per store id: (0, 1] -> '1', ..., (9, 10] -> '10'.
bins = np.arange(0, 11)
labels = np.arange(1, 11).astype(str)
cuts = pd.cut(df['store'], bins=bins, labels=labels)

# Total sales per store bucket, on a log-scaled y axis.
df.groupby(cuts)['sales'].sum().plot.bar(logy=True, color='purple')

In [None]:
# Despite its name this is the distinct item count plus one, so that it can
# serve as the exclusive upper bound of the ranges below.
number_of_items = df['item'].nunique() + 1
bins = np.arange(number_of_items)
labels = np.arange(1, number_of_items).astype(str)
cuts = pd.cut(df['item'], bins=bins, labels=labels)

# Total sales per item bucket.
to_plot = df.groupby(cuts)['sales'].sum()
ax = to_plot.plot.bar(color='purple')

# Keep only every n-th tick and its label.
n = 5
ticks = ax.xaxis.get_ticklocs()
ticklabels = [tick.get_text() for tick in ax.xaxis.get_ticklabels()]
ax.xaxis.set_ticks(ticks[::n])
ax.xaxis.set_ticklabels(ticklabels[::n])

ax.figure.show()

In [None]:
# Number of distinct item ids in the combined frame.
df['item'].nunique()

In [None]:
# Calendar features derived from the DatetimeIndex.
df['year'] = df.index.year.astype('int32')
df['month'] = df.index.month.astype('int32')
# `DatetimeIndex.week` was deprecated in pandas 1.1 and removed in 2.0;
# `isocalendar().week` gives the same ISO week number. Older pandas versions
# without `isocalendar` keep using the original accessor.
if hasattr(df.index, 'isocalendar'):
    df['week'] = df.index.isocalendar().week.to_numpy(dtype='int32')
else:
    df['week'] = df.index.week.astype('int32')
df['dayofweek'] = df.index.dayofweek.astype('int32')
df['dayofmonth'] = df.index.day.astype('int32')

In [None]:
# Inspect the first rows (transposed) to check the calendar columns.
df.head().T

In [None]:
# Rescale the year into an exponential trend feature, exp((year - 2018) / 5).
# The value is derived from the index rather than from the current `year`
# column, so re-running this cell does not apply the transform twice.
df['year'] = np.exp((df.index.year.to_numpy() - 2018) / 5)
#correlation_heatmap(df)

In [None]:
# Count missing values per column.
# The former `df = df.fillna(np.NaN)` replaced NaN with NaN (a no-op) and
# relied on the `np.NaN` alias, which was removed in NumPy 2.0.
df.isnull().sum()

In [None]:
# Select 2013 through the DatetimeIndex: the `year` column has already been
# rescaled with exp((year - 2018) / 5), so comparing it to 2013 matches no rows
# and would leave `first_year` empty.
first_year = df[df.index.year == 2013]
yearplot = sns.lineplot(x="year", y="sales", data=df, hue='store')

In [None]:
# Sales by month for the first year, one line per store.
monthplot = sns.lineplot(data=first_year, x="month", y="sales", hue='store')

In [None]:
# Sales by week for the first year, one line per store.
weekplot = sns.lineplot(data=first_year, x="week", y="sales", hue='store')

In [None]:
# Sales by week for the first year, one line per item.
weekplot = sns.lineplot(data=first_year, x="week", y="sales", hue='item')

In [None]:
# Aggregate sales into 3-month bins.
# `min_count=1` keeps a bin that contains no observed sales as NaN instead of
# the misleading 0 that a plain `sum()` returns for all-NaN input
# (presumably the rows that come from the test file - TODO confirm).
df_sales = df['sales'].resample('3M').sum(min_count=1)
df_sales.plot(figsize=(9, 6))

In [None]:
# Drop bins without a value, then confirm none are left.
# This replaces `fillna(np.NaN)` (a no-op that relies on the `np.NaN` alias
# removed in NumPy 2.0) and the in-place `dropna`.
df_sales = df_sales.dropna()
df_sales.isnull().sum()

In [None]:
import statsmodels.api as sm
# `df_sales` holds 3-month sums, i.e. 4 observations per year, so the seasonal
# period is passed explicitly instead of being inferred from the index.
# NOTE(review): inference from a '3M' frequency appears to resolve to the
# monthly period (12); the `period` keyword needs statsmodels >= 0.11
# (older releases call it `freq`) - confirm against the installed version.
decomposition = sm.tsa.seasonal_decompose(df_sales, model='additive', period=4)
# Trailing semicolon: avoid rendering the returned figure a second time.
decomposition.plot();

In [None]:
# Adding mean sales grouped by store, item and month/week (the median variant is commented out below)

In [None]:
# Each new column holds the mean of `sales` over the rows sharing the listed
# keys, broadcast back to every row of that group.
for column_name, group_keys in [
    ("mean-store_item", ["item", "store"]),
    ("mean-month_item", ["month", "item"]),
    ("mean-item", ["item"]),
    ("mean-store", ["store"]),
]:
    df[column_name] = df.groupby(group_keys)["sales"].transform("mean")

In [None]:
# Inspect the first rows (transposed) to check the group-mean columns.
df.head().T

In [None]:
#df["median-store_item-month"] = df.groupby(['month', "item", "store"])["sales"].transform("median")
# Mean of `sales` per calendar period and store/item, broadcast to each row.
# NOTE(review): "item-month-mean" uses the same keys and aggregation as the
# "mean-month_item" column created earlier, so the two columns are identical.
for column_name, group_keys in [
    ("mean-store_item-week", ['week', "item", "store"]),
    ("item-month-mean", ['month', "item"]),
    ("store-month-mean", ['month', "store"]),
]:
    df[column_name] = df.groupby(group_keys)["sales"].transform("mean")

In [None]:
# Series with data shifted by a year

In [None]:
# Sales of the same (item, store) series 365 rows earlier.
df['store_item_shifted-365'] = df.groupby(["item", "store"])['sales'].shift(365)
# Per (week, item) group: mean of the group's sales after shifting by 12 rows
# (i.e. leaving out the group's last 12 rows), broadcast to the whole group.
# NOTE(review): the column name says 90 while the shift is 12 rows - confirm intent.
df["item-week_shifted-90"] = df.groupby(['week', "item"])["sales"].transform(
    lambda x: x.shift(12).mean())

# Fill the gaps with each column's most frequent value. The result is assigned
# back explicitly: `df[col].fillna(..., inplace=True)` is chained assignment,
# which acts on a temporary and leaves `df` unchanged under pandas Copy-on-Write.
for shifted_col in ('store_item_shifted-365', "item-week_shifted-90"):
    df[shifted_col] = df[shifted_col].fillna(df[shifted_col].mode()[0])

In [None]:
# Correlation heatmap including the calendar, group-mean and shifted columns.
correlation_heatmap(df)

In [None]:
def hot_encoder(df, column):
    """Return a copy of `df` with `column` replaced by one-hot indicator columns.

    The indicator columns are attached by row position. The earlier version
    used `DataFrame.join`, which matches rows on index labels: on a frame whose
    index labels repeat (e.g. many rows per date) every row is paired with every
    other row sharing its label, which multiplies the row count.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        Same rows and index as `df`, without `column`, followed by one
        indicator column per distinct value of `column`.
    """
    dummies = pd.get_dummies(df[column])
    encoded = pd.concat(
        [
            df.drop(columns=[column]).reset_index(drop=True),
            dummies.reset_index(drop=True),
        ],
        axis=1,
    )
    # Restore the original (possibly non-unique) index labels.
    encoded.index = df.index
    return encoded
# NOTE: the earlier index-based join ran out of memory on this data (repeated date index).

In [None]:
# One-hot encode the store id: one indicator column per distinct store value.
df_copy = pd.get_dummies(df['store'])

In [None]:
# Min-max scale sales and store the result as a real column. Item assignment is
# required here: `df_copy.sales = ...` only sets a Python attribute on the frame
# and does not create a column, so the heatmap would not include sales.
# `.to_numpy()` assigns by position, since both frames share the same row order.
df_copy['sales'] = (
    (df['sales'] - df['sales'].min()) / (df['sales'].max() - df['sales'].min())
).to_numpy()

In [None]:
# Correlation heatmap over the columns of the store-dummy frame.
correlation_heatmap(df_copy)

In [None]:
# store dummies are not correlated with sales!

In [None]:
# Correlation heatmap of the full feature frame `df`.
# NOTE(review): `df` does not appear to be modified since the previous
# `correlation_heatmap(df)` call, so this likely repeats that figure.
correlation_heatmap(df)

In [None]:
# Split the engineered frame back into its train and test parts by position
# and give both a fresh 0..n-1 index.
train = df.iloc[:train_len].reset_index(drop=True)
test = df.iloc[train_len:].reset_index(drop=True)

# Target, plus the columns that are excluded from the model inputs.
dropped_columns = ['sales', 'id', 'store', 'item', 'dayofmonth', 'month', 'week']
y = train.sales
X = train.drop(columns=dropped_columns)
to_predict = test.drop(columns=dropped_columns)

In [None]:
from sklearn.model_selection import KFold
import xgboost as xgb

from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

def XGBmodel(X_train, X_test, y_train, y_test, model):
    """Fit an XGBoost booster on the training split and return it.

    `X_test`/`y_test` are used as the evaluation set (named 'test') that
    drives early stopping after 5 rounds without improvement, with at most
    500 boosting rounds. `model` is forwarded as `xgb_model`, so passing a
    previously trained booster continues from it; pass None to start fresh.
    """
    booster_params = {'objective': 'reg:linear', 'eval_metric': 'mae'}
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test, label=y_test)
    return xgb.train(
        params=booster_params,
        dtrain=dtrain,
        num_boost_round=500,
        evals=[(dvalid, 'test')],
        early_stopping_rounds=5,
        xgb_model=model,
    )
def KFold_XGB_trainer(X, y, to_predict):
    """Train one booster across 10 unshuffled folds and predict `to_predict`.

    The booster returned for one fold is passed into the next fold's call to
    `XGBmodel`, so training is carried over from fold to fold. Predictions
    use the last booster, limited to its `best_ntree_limit`.
    """
    splitter = KFold(n_splits=10, shuffle=False, random_state=None)

    booster = None
    for fit_idx, eval_idx in splitter.split(X, y):
        booster = XGBmodel(
            X.iloc[fit_idx],
            X.iloc[eval_idx],
            y.iloc[fit_idx],
            y.iloc[eval_idx],
            booster,
        )

    return booster.predict(
        xgb.DMatrix(to_predict), ntree_limit=booster.best_ntree_limit
    )

def Polynomial_trainer(X, y, to_predict, degree=3):
    """Fit a polynomial-features + Ridge pipeline on (X, y) and predict `to_predict`.

    `degree` is the degree passed to `PolynomialFeatures`; Ridge uses its
    default settings. Returns the pipeline's predictions for `to_predict`.
    """
    pipeline = make_pipeline(PolynomialFeatures(degree), Ridge())
    return pipeline.fit(X, y).predict(to_predict)

# Predictions for the test rows from the degree-3 polynomial Ridge model.
y_pred = Polynomial_trainer(X, y, to_predict, degree=3)

In [None]:
#submission = pd.DataFrame(test.id.astype('int32'))

#submission['sales']= pd.DataFrame(y_pred)
#submission.set_index('id')
#submission.to_csv('store_forecast.csv',index=False)

In [None]:
#submission.head()

In [None]:
# Build the submission frame: integer ids next to the predicted sales,
# both on a fresh 0..n-1 index so they line up row by row.
submission = test[['id']].astype('int32').reset_index(drop=True)
submission['sales'] = pd.Series(np.ravel(y_pred))
submission.to_csv('store_forecast.csv', index=False)
submission.head()