## Predict Future Sales


### Problem Statement
With daily historical data, our main goal is to create and optimize a model that makes a forecast on the total number of items for **each** item id sold in **every** shop in the test set (month of November 2015) as their prices fluctuate across time. 

In [None]:
# saving all required libraries under requirements.txt
# ! pip freeze > requirements.txt

In [None]:
# import necessary libraries
import warnings
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from keras.models import Sequential
from keras.layers import Dense,LSTM, TimeDistributed, Flatten, MaxPooling1D,Conv1D,Dropout

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso,ElasticNet,HuberRegressor,PassiveAggressiveRegressor,SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor,ExtraTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor,BaggingRegressor,RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA
pd.set_option('display.float_format', lambda x: '%.3f' % x)

warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# with open('requirements.txt') as f:
#     print(f.read())

### Reading data

In [None]:
# Load all data
# sales_train.csv: raw sales records used to build the training matrix
train = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv')
# test.csv: the rows to forecast (its ID, shop_id and item_id columns are used later)
test = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/test.csv')
# items.csv: item lookup table
items = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/items.csv')
# item_categories.csv: item category lookup table
items_cat = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv')
# shops.csv: shop lookup table
shops = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/shops.csv')

## Exploratory Data Analysis
Quick EDA and data cleaning, mainly to remove anomalies that might skew our data and to establish a few assumptions! 

### Data visualization

In [None]:
# first 5 rows of train data
train.head()

In [None]:
# first 5 rows of test data
test.head()

In [None]:
# first 5 rows of shop name and id
shops.head()

In [None]:
# item name, id and category-id
items.head()

In [None]:
# item category and category id, to map to item names in items
items_cat.head()

In [None]:
# statistical summary of each feature, assuming negative means more items are returned than sold
train.describe()

In [None]:
# checking for missing values
train.isnull().sum()

In [None]:
# check for duplicates
train[train.duplicated(keep = False)]

In [None]:
# drop duplicates, keep one copy
train.drop_duplicates(keep = 'first',inplace = True)

In [None]:
# Parse the day.month.year strings in `date` into datetimes.
# The column stays a regular column here; it is not made the index.
train['date'] = pd.to_datetime(train['date'], format='%d.%m.%Y')

train.head()

In [None]:
# Looking at the correlation between features
# Compute the matrix once (it used to be computed twice on the full frame) and
# only on numeric columns: `date` is a datetime column at this point and newer
# pandas versions no longer drop non-numeric columns from corr() automatically.
corr = train.select_dtypes(include='number').corr()
# hide the upper triangle (including the diagonal) so each pair is shown once
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots()
sns.heatmap(corr, mask=mask, annot=True, ax=ax);

In [None]:
# Date-indexed, chronologically sorted copy of the sales data
# (set_index returns a new frame, so `train` itself is left untouched)
train_1 = train.set_index('date').sort_index()
train_1

In [None]:
# Total number of products sold per calendar month, summed over all shops
# NOTE(review): the earlier note said sales are highest "in Jan of every month";
# presumably a yearly peak around the turn of the year was meant - confirm on the plot
monthly_sold = train_1['item_cnt_day'].resample('M').sum()
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(monthly_sold)
ax.set_title('Total number of products sold per month for all shops', fontsize=14)
ax.set_ylabel('Total monthly products sold', fontsize=14)
ax.set_xlabel('Year - Month', fontsize=14)

### `item_cnt_day`

In [None]:
# Looking at the distribution of target variable
# Pass the series as `x=` explicitly: seaborn has changed how a positional data
# vector is interpreted between releases, so the keyword keeps the plot the same
# (horizontal box) regardless of the installed version.
sns.boxplot(x=train.item_cnt_day)

In [None]:
train[train.item_cnt_day > 2000]

In [None]:
# Based on the distribution of past item_cnt_day values of the same item,
# the very large count looks like an anomaly.
# `x=` is passed explicitly so the plot does not depend on how the installed
# seaborn version interprets a positional data vector.
sns.boxplot(x=train[train.item_id == 11373]["item_cnt_day"])

In [None]:
# Drop the anomaly
# Select the row(s) with the same condition used to find the anomaly
# (item_cnt_day > 2000) instead of a hard-coded row label: this stays valid if
# the row labels change and the cell can be re-run without raising KeyError.
train.drop(train.index[train.item_cnt_day > 2000], inplace=True)

### `item_price`

In [None]:
# Distribution of item prices
# `x=` is passed explicitly so the plot does not depend on how the installed
# seaborn version interprets a positional data vector; `ax=ax` draws on the
# figure created here.
fig, ax = plt.subplots(figsize=(20, 10))
sns.violinplot(x=train.item_price, ax=ax)

In [None]:
# There is an extreme outlier with price > 300000
train[train.item_price > 250000]

In [None]:
# Only 1 item with that price, might be an outlier
train[train.item_id == 6066]

In [None]:
# Drop the outlier
# Select the row(s) with the same condition used to find the outlier
# (item_price > 250000) instead of a hard-coded row label: this stays valid if
# the row labels change and the cell can be re-run without raising KeyError.
train.drop(train.index[train.item_price > 250000], inplace=True)

### Pre-target variable: `item_cnt_day`
target variable is the monthly value: `item_cnt_month`

In [None]:
# extracting year and month
# Rebuild the date-indexed frame from `train` first: the earlier `train_1` copy
# was taken before the item_cnt_day / item_price outliers were dropped from
# `train`, so without this step the monthly aggregation would still contain them.
train_1 = train.set_index('date').sort_index()
train_1['year_month'] = train_1.index.strftime('%Y-%m')

In [None]:
# group according to shop_id and item_id
# One row per (shop_id, item_id) pair and one column per 'YYYY-MM' month holding
# the summed item_cnt_day of that month; missing combinations are filled with 0.
df = pd.pivot_table(
    train_1,
    index=['shop_id', 'item_id'],
    columns='year_month',
    values='item_cnt_day',
    aggfunc='sum',  # string alias: same result as np.sum, without the pandas deprecation warning
    fill_value=0,
)
# flatten the index back into columns and drop the 'year_month' axis label
df = df.reset_index().rename_axis(None, axis=1)
df

In [None]:
# Keep only the (shop_id, item_id) pairs that have to be forecast; pairs that
# never appear in the sales history end up as all-zero rows.
test_history = test.merge(df, how='left', on=['shop_id', 'item_id'])
test_history = test_history.fillna(0)

# identifiers are not model inputs, so only the monthly count columns remain
df_test = test_history.drop(columns=['ID', 'shop_id', 'item_id'])
df_test.head()

In [None]:
# df_test_train: Training set where we use all dates except the latest one to train model
# df_test_val: Test set where we use latest date to validate model
df_test_train = df_test.iloc[:,:-1]
df_test_val = df_test.iloc[:,-1]

In [None]:
print(f'validation set: {df_test_val.shape}')
print(f'train set: {df_test_train.shape}')

### Data Modeling 
Using **2015-10** as our target feature and earlier time periods as our predictor features

In [None]:
# train test split df: X = all columns except last one, y = last column
X = df_test_train
y = df_test_val
# shuffle=False keeps the original row order: the leading 80% of rows form the
# training split and the trailing 20% the held-out split.
# NOTE(review): random_state presumably has no effect when shuffle=False - confirm
X_train,X_test,y_train,y_test = train_test_split(X,y,shuffle = False,test_size = 0.2, random_state = 42)

In [None]:
# looking at rows, columns for train and validation set
# Each label now names the object it reports; previously the full X / y were
# printed under "train" / "test" and the training split was never shown.
print(f'X_train: {X_train.shape}')
print(f'y_train: {y_train.shape}')
print(f'X_test: {X_test.shape}')
print(f'y_test: {y_test.shape}')

### StandardScaler
Standardize the features (zero mean, unit variance) so that variables on different scales or magnitudes contribute comparably to the models. Feature scale affects the performance of methods that rely on distance metrics (k-NN, PCA), and standardized inputs also help gradient descent converge faster when training deep neural networks.

In [None]:
# Standardise the features: the statistics are learned on the training split
# only and then re-used unchanged for the held-out split.
ss = StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)

In [None]:
def get_models(models=None):
    """Register the linear regressors under short names and return the mapping.

    Parameters
    ----------
    models : dict, optional
        Mapping to extend in place. When omitted a fresh dict is created on
        every call (the former ``dict()`` default was shared between calls, so
        a later call replaced the estimators held by an earlier result).

    Returns
    -------
    dict
        ``models`` with one unfitted estimator per name.
    """
    if models is None:
        models = {}
    # linear models
    models.update({
        'lr': LinearRegression(),
        'lasso': Lasso(),
        'ridge': Ridge(),
        'en': ElasticNet(),
        'huber': HuberRegressor(),
        'pa': PassiveAggressiveRegressor(max_iter=1000, tol=1e-3),
    })
    return models

def get_models_nl(models=None):
    """Register the non-linear and ensemble regressors and return the mapping.

    Parameters
    ----------
    models : dict, optional
        Mapping to extend in place. When omitted a fresh dict is created on
        every call (the former ``dict()`` default was shared between calls, so
        a later call replaced the estimators held by an earlier result).

    Returns
    -------
    dict
        ``models`` with one unfitted estimator per name.
    """
    if models is None:
        models = {}
    # every ensemble is built with the same number of base estimators
    n_trees = 100
    models.update({
        # non-linear models
        'svr': SVR(),
        # ensemble models
        'ada': AdaBoostRegressor(n_estimators=n_trees),
        'bag': BaggingRegressor(n_estimators=n_trees),
        'rf': RandomForestRegressor(n_estimators=n_trees),
        'et': ExtraTreesRegressor(n_estimators=n_trees),
        'gbm': GradientBoostingRegressor(n_estimators=n_trees),
    })
    return models

def pipeline(model, models=None):
    """Wrap the estimator registered under ``model`` in a one-step Pipeline.

    Parameters
    ----------
    model : str
        Short model name; it is also used as the pipeline step name.
    models : dict, optional
        Mapping of name -> estimator to look ``model`` up in. When omitted, a
        notebook-level ``model_dict`` is used if one exists (the name this
        function originally relied on, which is not defined in this notebook);
        otherwise the estimators from ``get_models`` and ``get_models_nl`` are
        used.

    Returns
    -------
    sklearn.pipeline.Pipeline

    Raises
    ------
    KeyError
        If ``model`` is not present in the mapping.
    """
    if models is None:
        models = globals().get('model_dict')
    if models is None:
        # explicit empty dicts so nothing is shared with other callers
        models = {**get_models({}), **get_models_nl({})}
    return Pipeline([(model, models[model])])

def params(model):
    """Return the hyper-parameter search grid registered for ``model``.

    Parameters
    ----------
    model : str
        Short model name, e.g. ``'lasso'``, ``'en'``, ``'svr'``.

    Returns
    -------
    dict or None
        Mapping of parameter name -> list of candidate values, built fresh on
        every call, or ``None`` when no grid is registered for ``model``.

    Notes
    -----
    The former if/elif chain contained a second branch for 'bag', 'rf' and
    'et' further down that could never be reached. The grids kept here are the
    ones that were actually returned, so behaviour is unchanged.
    NOTE(review): the unreachable 'rf'/'et' grids used a scalar
    ``'random_state': 42``; if they are ever restored, wrap it in a list.
    """
    grids = {
        'lasso': {
            'alpha': [0.01, 0.1, 1, 2, 5, 10],
        },
        'ridge': {
            'alpha': [0.01, 0.1, 1, 2, 5, 10],
        },
        'en': {
            'alpha': [0.01, 0.1, 1, 10],
            'l1_ratio': [0.2, 0.3, 0.4, 0.5, 0.6],
        },
        'knn': {
            'n_neighbors': [4, 5, 6, 7],
        },
        'dt': {
            'max_depth': [3, 4, 5],
            'min_samples_split': [2, 3, 4],
            'min_samples_leaf': [2, 3, 4],
        },
        'bag': {
            'max_features': [100, 150],
        },
        'rf': {
            'n_estimators': [100, 150],
            'max_depth': [4],
            'min_samples_leaf': [2, 3, 4],
        },
        'et': {
            'n_estimators': [50, 100, 150],
            'max_depth': [5],
            'min_samples_leaf': [2, 3],
        },
        'abc': {
            'n_estimators': [50, 100, 150],
            'learning_rate': [0.3, 0.6, 1],
        },
        'gbc': {
            'learning_rate': [0.2],
            'max_depth': [5],
            'min_samples_split': [2, 5],
        },
        'xgb': {
            'eval_metric': ['auc'],
            'subsample': [0.8],
            'colsample_bytree': [0.5],
            'learning_rate': [0.1],
            'max_depth': [5],
            'scale_pos_weight': [5],
            'n_estimators': [100, 200],
            'reg_alpha': [0, 0.05],
            'reg_lambda': [2, 3],
            'gamma': [0.01],
        },
        'svr': {
            'kernel': ['rbf', 'linear', 'poly'],
            'C': [1, 20, 50, 100],
            'gamma': ['scale', 'auto'],
            'epsilon': [0.1, 1, 10],
        },
        'ada': {
            'n_estimators': [50, 100, 150],
            'learning_rate': [0.01, 0.1, 1],
        },
        'gbm': {
            'learning_rate': [0.1, 0.3, 0.6, 1],
            'min_samples_split': [500, 1000, 2000, 3000, 5000],
            'min_samples_leaf': [50, 200, 400, 1000],
            'max_depth': [8, 10, 20, 30],
        },
    }
    # unknown names fall through to None, as the if/elif chain did
    return grids.get(model)

In [None]:
def evaluate_models(models, X_train_ss,y_train,X_test_ss,y_test):
    """Fit every model and print its mean squared error on both splits.

    Parameters
    ----------
    models : dict
        Mapping of name -> estimator; each estimator is fitted in place.
    X_train_ss, y_train
        Features and target used for fitting and for the train score.
    X_test_ss, y_test
        Features and target used for the held-out score.

    Returns
    -------
    None
        Results are printed only.
    """
    for name, model in models.items():
        fitted = model.fit(X_train_ss, y_train)
        train_mse = mean_squared_error(y_train, fitted.predict(X_train_ss))
        test_mse = mean_squared_error(y_test, fitted.predict(X_test_ss))
        print(f'{name}:')
        print('----')
        # the metric is the mean squared error; the label used to say "MAE"
        print(f'Train MSE: {round(train_mse,2)}')
        print(f'Test MSE: {round(test_mse,2)}')
        print('\n')
    



In [None]:
# grid search with gridsearchcv
def grid_search(model,models,X_train = X_train_ss,y_train = y_train,X_test = X_test_ss,y_test=y_test):
    """Run an exhaustive 5-fold grid search for one model and print the results.

    Parameters
    ----------
    model : str
        Short model name; selects both the estimator in ``models`` and the
        grid returned by ``params``.
    models : dict
        Mapping of name -> estimator.
    X_train, y_train, X_test, y_test
        Data used for the search and for the reported scores. The defaults are
        the notebook-level arrays as they were when this cell was executed.
        These arguments are now honoured; the body used to read the global
        ``X_train_ss`` / ``X_test_ss`` regardless of what was passed.

    Returns
    -------
    None
        Results are printed only.
    """
    param_grid = params(model)
    estimator = models[model]
    gs = GridSearchCV(estimator, param_grid=param_grid, cv=5,
                      scoring='neg_mean_squared_error', verbose=True, n_jobs=8)
    gs.fit(X_train, y_train)
    # the scorer is the *negated* MSE, so flip the sign before reporting it as MSE
    cv_mse = -gs.best_score_
    train_mse = -gs.score(X_train, y_train)
    test_mse = -gs.score(X_test, y_test)

    print(f'Results from: {estimator}')
    print('-----------------------------------')
    print(f'Best Hyperparameters: {gs.best_params_}')
    print(f'Mean CV MSE: {round(cv_mse,4)}')
    print(f'Train MSE: {round(train_mse,4)}')
    print(f'Test MSE: {round(test_mse,4)}')
    print(' ')

In [None]:
# grid search with randomizedsearchcv
def grid_search_rs(model,models,X_train = X_train_ss,y_train = y_train,X_test = X_test_ss,y_test=y_test):
    """Run a 5-fold randomized search for one model and print the results.

    Parameters
    ----------
    model : str
        Short model name; selects both the estimator in ``models`` and the
        candidate values returned by ``params``.
    models : dict
        Mapping of name -> estimator.
    X_train, y_train, X_test, y_test
        Data used for the search and for the reported scores. The defaults are
        the notebook-level arrays as they were when this cell was executed.
        These arguments are now honoured; the body used to read the global
        ``X_train_ss`` / ``X_test_ss`` regardless of what was passed.

    Returns
    -------
    None
        Results are printed only.
    """
    param_distributions = params(model)
    estimator = models[model]
    gs = RandomizedSearchCV(estimator, param_distributions=param_distributions, cv=5,
                            scoring='neg_mean_squared_error', verbose=True, n_jobs=8)
    gs.fit(X_train, y_train)
    # the scorer is the *negated* MSE, so flip the sign before reporting it as MSE
    cv_mse = -gs.best_score_
    train_mse = -gs.score(X_train, y_train)
    test_mse = -gs.score(X_test, y_test)

    print(f'Results from: {estimator}')
    print('-----------------------------------')
    print(f'Best Hyperparameters: {gs.best_params_}')
    print(f'Mean CV MSE: {round(cv_mse,4)}')
    print(f'Train MSE: {round(train_mse,4)}')
    print(f'Test MSE: {round(test_mse,4)}')
    print(' ')

### Linear models

In [None]:
models = get_models()
evaluate_models(models,X_train_ss,y_train,X_test_ss,y_test)

In [None]:
# best params of lasso
%time grid_search('lasso',models)

In [None]:
%%time
grid_search_rs('lasso',models)

In [None]:
# best params of en:
%time grid_search_rs('en',models)

In [None]:
# best params of en:
%time grid_search('en',models)

### LSTM
An extension of the RNN that mitigates the vanishing gradient problem and learns long-term dependencies in sequence prediction problems using the memory cells in its hidden state!

In [None]:
X_train_ss.shape

In [None]:
# LSTM layers take 3-D input [samples, timesteps, n_features]:
#   samples    = number of rows
#   timesteps  = number of past months fed to the network
#   n_features = 1, since only the monthly count is used to predict the next month's count
X_train_ss_rs = X_train_ss.reshape(*X_train_ss.shape, 1)
X_test_ss_rs = X_test_ss.reshape(*X_test_ss.shape, 1)

# The target gets its own scaler so that predictions can be mapped back to counts.
ss1 = StandardScaler()
y_train_ss = ss1.fit_transform(y_train.values.reshape(-1, 1))

In [None]:
%%time
# First LSTM variant: a 50-unit LSTM over the input sequence, followed by a
# 16-unit dense layer, dropout and a single-unit output layer.
model = Sequential()
model.add(LSTM(50, input_shape = (X_train_ss_rs.shape[1],X_train_ss_rs.shape[2]),activation = 'relu'))
model.add(Dense(16,activation = 'relu'))
model.add(Dropout(0.4))
model.add(Dense(1))
# the loss is the mean absolute error against the standardised target (y_train_ss)
model.compile(loss = 'mae',optimizer = 'adam')
# 30 epochs with batches of 50000 rows
history = model.fit(X_train_ss_rs,y_train_ss,epochs = 30, batch_size = 50000, verbose = 2,shuffle = False)
# training loss per epoch
plt.plot(history.history['loss'],label = 'loss')

In [None]:
%%time
# Second LSTM variant: same architecture as `model`, trained for more epochs
# (50 instead of 30) with a larger batch size (100000 instead of 50000).
model_1 = Sequential()
# [samples,timesteps,features]
model_1.add(LSTM(50, input_shape = (X_train_ss_rs.shape[1],X_train_ss_rs.shape[2]),activation = 'relu'))
model_1.add(Dense(16,activation = 'relu'))
model_1.add(Dropout(0.4))
model_1.add(Dense(1))
# the loss is the mean absolute error against the standardised target (y_train_ss)
model_1.compile(loss = 'mae',optimizer = 'adam')
history_1 = model_1.fit(X_train_ss_rs,y_train_ss,epochs = 50, batch_size = 100000, verbose = 2,shuffle = False)
# training loss per epoch
plt.plot(history_1.history['loss'],label = 'loss')

In [None]:
fig,ax = plt.subplots(figsize = (20,10))
ax.plot(history.history['loss'],label = 'Smaller batch size')
ax.plot(history_1.history['loss'],label = 'Larger batch size, more epochs')
# ax.plot(history_2.history['loss'],label = 'Larger batch size, more epochs, no relu')
plt.legend()

In [None]:
# function to evaluate performance of models
def mae_train_test(model,model_name, X_train,y_train,X_test,y_test,ss1):
    """Print the train and test mean squared error of a fitted network.

    Predictions are produced on the standardised target scale, mapped back
    with ``ss1`` and compared with the unscaled targets.

    Parameters
    ----------
    model
        Fitted model exposing ``predict``.
    model_name : str
        Label printed above the scores.
    X_train, X_test
        Inputs in the shape the model expects.
    y_train, y_test
        Unscaled targets.
    ss1
        Scaler that was fitted on the training target; used to undo the scaling.

    Returns
    -------
    None
        Results are printed only.
    """
    def _predict_unscaled(X):
        # Collapse the prediction to a single column first: a model that returns
        # sequences yields a 3-D array, which inverse_transform rejects.
        scaled = model.predict(X).reshape(-1, 1)
        return ss1.inverse_transform(scaled).ravel()

    trainpreds = _predict_unscaled(X_train)
    testpreds = _predict_unscaled(X_test)
    print(f'Model: {model_name}')
    print('----')
    # the metric is the mean squared error; the label used to say "MAE"
    print(f'Train MSE: {mean_squared_error(y_train,trainpreds)}')
    print(f'Test MSE: {mean_squared_error(y_test,testpreds)}')
    print('\n')
    

In [None]:
mae_train_test(model,"LSTM - Smaller Batch Size", X_train_ss_rs,y_train,X_test_ss_rs,y_test,ss1)
mae_train_test(model_1,"LSTM - Larger Batch Size, more epochs", X_train_ss_rs,y_train,X_test_ss_rs,y_test,ss1)
# mae_train_test(model_2,"LSTM -Larger Batch Size, more epochs, no relu", X_train_ss_rs,y_train,X_test_ss_rs,y_test,ss1)

### CNN-LSTM
CNN: for additional feature extraction; LSTM: to mitigate the vanishing gradient problem and to learn long-term dependencies.

In [None]:
print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of X_test: {X_test.shape}')

In [None]:
# required input shape: [samples, subsequences, timesteps, features]
subseq = 1
# Timesteps per subsequence are derived from the data instead of hard-coding 33,
# so the reshape below stays valid if the number of month columns changes.
ts = X_train.shape[1] // subseq
# scaler fitted on the training split only and re-used for the held-out split
ss_cnn = StandardScaler()
ss_cnn.fit(X_train)
X_train_sub = ss_cnn.transform(X_train).reshape((X_train.shape[0], subseq, ts, 1))
X_test_sub = ss_cnn.transform(X_test).reshape((X_test.shape[0], subseq, ts, 1))
print(f'Shape of X_train_sub: {X_train_sub.shape}')
print(f'Shape of X_test_sub: {X_test_sub.shape}')

In [None]:
%%time 
# CNN-LSTM: a Conv1D / Dropout / MaxPooling1D / Flatten stack is applied to each
# subsequence through TimeDistributed, and the result is fed to an LSTM and a
# dense head.
cnn_lstm = Sequential()
# NOTE(review): input_shape is given to the inner Conv1D rather than to the
# TimeDistributed wrapper - confirm it is actually picked up
cnn_lstm.add(TimeDistributed(Conv1D(filters = 64, kernel_size = 2, input_shape = (None,X_train_sub.shape[2],X_train_sub.shape[3]),activation = 'relu')))
cnn_lstm.add(TimeDistributed(Dropout(0.4)))
cnn_lstm.add(TimeDistributed(MaxPooling1D(pool_size = 1)))
cnn_lstm.add(TimeDistributed(Flatten()))
# NOTE(review): return_sequences=True presumably makes the model output 3-D
# (one value per subsequence) while the target is 2-D - confirm the shapes
cnn_lstm.add(LSTM(50, activation = 'relu',return_sequences = True))
cnn_lstm.add(Dense(16,activation = 'relu'))
cnn_lstm.add(Dropout(0.4))
cnn_lstm.add(Dense(1))
# the loss is the mean absolute error against the standardised target (y_train_ss)
cnn_lstm.compile(loss= 'mae',optimizer = 'adam')
cnn_lstm_history = cnn_lstm.fit(X_train_sub,y_train_ss, epochs = 50, batch_size = 100000, verbose = 2,shuffle = False)


In [None]:
# cnn_lstm.reset_states()

In [None]:
%%time
# Second CNN-LSTM variant: wider convolution kernel (11 instead of 2), the LSTM
# returns only its final output, the 16-unit dense layer has no activation, and
# training runs for 30 epochs. This is the model used for the final forecasts.
cnn_lstm_1 = Sequential()
# NOTE(review): input_shape is given to the inner Conv1D rather than to the
# TimeDistributed wrapper, and has one more dimension than in `cnn_lstm` -
# confirm it is actually picked up
cnn_lstm_1.add(TimeDistributed(Conv1D(filters = 64, kernel_size = 11, input_shape = (None,1,X_train_sub.shape[2],X_train_sub.shape[3]),activation = 'relu')))
cnn_lstm_1.add(TimeDistributed(Dropout(0.4)))
cnn_lstm_1.add(TimeDistributed(MaxPooling1D(pool_size = 1)))
cnn_lstm_1.add(TimeDistributed(Flatten()))
cnn_lstm_1.add(LSTM(50, activation = 'relu'))
cnn_lstm_1.add(Dense(16))
cnn_lstm_1.add((Dropout(0.4)))
cnn_lstm_1.add(Dense(1))
# the loss is the mean absolute error against the standardised target (y_train_ss)
cnn_lstm_1.compile(loss= 'mae',optimizer = 'adam')
cnn_lstm_1_history = cnn_lstm_1.fit(X_train_sub,y_train_ss, epochs = 30, batch_size = 100000, verbose = 2,shuffle = False)


In [None]:
fig,ax = plt.subplots(figsize = (20,10))
ax.plot(cnn_lstm_history.history['loss'],label = 'more epochs, return state')
ax.plot(cnn_lstm_1_history.history['loss'],label = 'fewer epochs, without relu')
ax.legend()

In [None]:
mae_train_test(cnn_lstm,"CNN-LSTM", X_train_sub,y_train,X_test_sub,y_test,ss1)

In [None]:
mae_train_test(cnn_lstm_1,"CNN-LSTM - No relu", X_train_sub,y_train,X_test_sub,y_test,ss1)

### Generating forecasts
Since our model is trained on the past 33 months of data, we will feed the same duration of historical data to make our forecasts! 

In [None]:
print(f'test: {df_test.shape}')
print(f'train: {df.shape}')

In [None]:
## Using only 33 columns to make our next forecasts
# Drop the first month column so the input window has the same length as the
# one used for training and ends at the latest available month.
df_test_1 = df_test.iloc[:,1:]
# NOTE(review): a new scaler is fitted on the forecast inputs here, whereas the
# networks were trained on inputs scaled with ss_cnn - confirm this is intended
ss_test = StandardScaler()
ss_test.fit(df_test_1)
# same [samples, subsequences, timesteps, features] layout as the training input
df_test_1_sub = ss_test.transform(df_test_1).reshape((df_test_1.shape[0],subseq,ts,1))

In [None]:
df_test_1.shape

In [None]:
print(f'Shape of test: {df_test_1_sub.shape}')

In [None]:
# generating forecasts, input = 33 months of data, output = 34th month
preds = cnn_lstm_1.predict(df_test_1_sub)

In [None]:
# Map the predictions back to item counts.
# The network was trained against y_train_ss, i.e. the target standardised with
# `ss1`, so its outputs have to be un-scaled with that same scaler. The feature
# scaler `ss_test` only holds per-month input statistics; using its first column
# (as was done before) applies the wrong mean and standard deviation.
forecasts = ss1.inverse_transform(preds.reshape(-1, 1)).ravel()

In [None]:
# saving as a new dataframe
submissions = pd.DataFrame({"ID":test.ID, "item_cnt_month":forecasts})
submissions.head()

In [None]:
# ready for submissions!
submissions.to_csv('submissions.csv',index = False)