In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Problem Statement

1) We have a dataset that includes data about the demand for soda, which we can use to predict the quantity. The data is in the form of a time series. 

2) We have created a Nested Cross Validation class, which also needs to be implemented while creating the model.  


# Importing all the libraries

We import pandas for reading and handling the data. NumPy provides very efficient ways to perform mathematical operations on arrays, as well as easy array creation. Scikit-learn provides the tools that we need to build a model in the later stages of the pipeline. Seaborn and Matplotlib are two important and widely used libraries for displaying and plotting graphs.


In [None]:
#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix

#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# The three settings below are applied in order; where they touch the same
# rcParams, the later statement takes precedence.
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8  # default figure size as (width, height)

# Here is the code for our nested class

In [None]:
import logging
from types import GeneratorType
from sklearn.utils import indexable
from sklearn.utils.validation import _num_samples

LOGGER = logging.getLogger(__name__)

class NestedCV():
    """Forward-chaining cross-validation splitter for time-ordered samples.

    The data is cut into ``k + 1`` consecutive folds of ``n_samples // (k + 1)``
    rows each. Every split trains on all rows from the start of the data up to
    the test window, leaves out ``delay`` rows, and tests on the next fold-sized
    window. The training set therefore grows from split to split while the test
    set keeps a constant size.

    Parameters:
        k : int
            Number of train/test splits to generate. Must be at least 3.

        delay : int, default 0
            Number of rows skipped between the end of the training window and
            the start of the test window. Must not be negative.
    """

    def __init__(self, k, delay: int = 0):
        # Test `k` explicitly: a truthiness check (`if k and k < 3`) would let
        # k=0 and k=None through, and split() would then silently yield nothing
        # (k=0) or fail with a TypeError (k=None).
        if k is None or k < 3:
            raise ValueError(f'Cannot have n_splits less than 3 (k={k})')
        self.k = k

        if delay < 0:
            raise ValueError(f'Cannot have negative values of delay (delay={delay})')
        self.delay = delay

    def __repr__(self):
        return f'{type(self).__name__}(k={self.k}, delay={self.delay})'

    def _fold_bounds(self, n_samples):
        """Yield ``(train_end, test_begin, test_end)`` positions for every split.

        The training rows of a split are ``[0, train_end)`` and its test rows
        are ``[test_begin, test_end)``; ``test_begin - train_end`` equals
        ``delay``. Works on plain integers only, so it needs no array input.

        Raises:
            ValueError: if there are more folds than samples, or if one test
                window plus the delay does not fit into the data.
        """
        n_splits = self.k
        n_folds = n_splits + 1
        delay = self.delay

        if n_folds > n_samples:
            raise ValueError(f'Cannot have number of folds={n_folds} greater than the number of samples: {n_samples}.')

        test_size = n_samples // n_folds
        full_test = test_size + delay  # rows consumed by the gap plus the test window

        if full_test + n_splits > n_samples:
            raise ValueError(f'test_size({test_size}) + delay({delay}) + n_splits({n_splits}) = '
                             f'{full_test + n_splits} is greater than the number of samples: '
                             f'{n_samples}. Cannot create fold logic.')

        # The remainder rows (n_samples % n_folds) are given to the first
        # training window, so every test window ends on a fold boundary.
        first_start = test_size - delay + n_samples % n_folds
        for test_start in range(first_start, n_samples, test_size):
            # Skip windows that do not fit completely inside the data, so that
            # every yielded test set has the same size.
            if test_start < 0 or test_start + full_test > n_samples:
                continue
            yield test_start, test_start + delay, test_start + full_test

    def split(self, X, date_column = None ,y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters:
            X : array-like, shape (n_samples, n_features)
                Training data, where n_samples is the number of samples  and n_features is the number of features.

            date_column : any, optional
                Never read. Kept so existing calls keep working; a caller that
                passes ``y`` positionally binds it to this parameter, which is
                harmless because ``y`` is ignored as well.

            y : array-like, shape (n_samples,)
                Always ignored, exists for compatibility.

            groups : array-like, with shape (n_samples,), optional
                Always ignored, exists for compatibility.

        Yields:
            train : ndarray
                The training set indices for that split.

            test : ndarray
                The testing set indices for that split.

        Raises:
            ValueError: if the data is too small for the requested number of
                splits and delay (raised when the generator is first advanced).
        """
        X, y, groups = indexable(X, y, groups)  # pylint: disable=unbalanced-tuple-unpacking
        n_samples = _num_samples(X)
        indices = np.arange(n_samples)

        for train_end, test_begin, test_end in self._fold_bounds(n_samples):
            yield indices[:train_end], indices[test_begin:test_end]

#this is our main method.

if __name__ == '__main__':
    # Small synthetic feature matrix (6 rows) and target vector used to exercise the splitter.
    xx = np.array([[1, 2], [3, 4]] * 3)
    yy = np.arange(1, 7)

    # Build the splitter and show it.
    tscv = NestedCV(k=3)
    print(tscv)

    # split() is a generator: each iteration gives one pair of (train, test) index arrays.
    for train_index, test_index in tscv.split(xx):
        print('TRAIN:', train_index, 'TEST:', test_index)
        X_train = xx[train_index]
        X_test = xx[test_index]
        y_train = yy[train_index]
        y_test = yy[test_index]
    print("---------------------------------------------")

# 1) Reading The Data

In [None]:
# `train` and `test` use the second CSV column (index_col=1) as their index.
# `not_index` is the same training file read with the default integer index.
train = pd.read_csv("../input/predict-demand/train.csv", index_col=1)
not_index = pd.read_csv("../input/predict-demand/train.csv")
test = pd.read_csv("../input/predict-demand/test.csv", index_col=1)

# Parse the index labels of both indexed frames into datetimes
# (not_index is left as loaded).
for frame in (train, test):
    frame.index = pd.to_datetime(frame.index)

print(train.head())

# Working copy of train, so the frame as loaded stays untouched.
data1 = train.copy(deep=True)

# Every frame that the cleaning cells iterate over and modify in place.
data_cleaner = [data1, test, not_index]

In [None]:
# Summary statistics of train, transposed so that each column becomes a row.
train.describe().transpose()

In [None]:
# Summary statistics of test, transposed so that each column becomes a row.
test.describe().transpose()

From this we can infer that there are 6480 values in our train set and 1058 values in our test set.

# 2) Cleaning The Dataset

In [None]:
for dataset in data_cleaner:
    # We drop id because it doesn't help us in prediction.
    # errors='ignore' keeps this cell re-runnable: once the column is gone, a
    # second run would otherwise fail with a KeyError.
    dataset.drop(columns='id', inplace=True, errors='ignore')

    # Check dtypes and count the null values in every column.
    dataset.info()
    print(dataset.isnull().sum())

## Handling Null Values

We can see that there are many missing values, which need to be either filled or removed. We can also infer that most of the columns are floats, so they don't need to be encoded. However, columns such as `city`, `shop`, `brand`, `capacity` and `container` are of the object type, so we need to handle those as well.

We could impute the missing values based on the other values in the same column, or we could simply drop all the rows with NA values. Here we'll drop the rows, because we won't lose a lot of data.

We can also see that the number of rows has increased from 6480 to 7560 after setting the index to datetime. Given that all those extra rows have null values, we can just drop them.

We also drop the ID column because it doesn't help us in our prediction. We can also drop lat and long because we have city code and shop code, which gives us the location

In [None]:
# Position after which rows are discarded from every frame.
N_ROWS_TO_KEEP = 6480

# Loop over our dataset list to drop surplus rows, null rows and the coordinates.
for dataset in data_cleaner:
    # Drop every row positioned after the first N_ROWS_TO_KEEP.
    # NOTE(review): drop() removes rows by index *label*. If a label found at
    # positions >= N_ROWS_TO_KEEP also occurs earlier (possible with the date
    # index), those earlier rows are removed as well -- confirm the trailing
    # labels do not repeat earlier ones.
    dataset.drop(dataset.index[N_ROWS_TO_KEEP:], inplace=True)
    # Drop all the rows that contain at least one null value.
    dataset.dropna(axis=0, how='any', inplace=True)
    # Drop the coordinates; errors='ignore' keeps the cell re-runnable after
    # the columns have already been removed.
    dataset.drop(columns=['lat', 'long'], inplace=True, errors='ignore')
    

## Handling Categorical variables

Here, we are using LabelEncoder to convert our categorical variables to numerical values. 

In [None]:
# Object-typed columns that get an integer `<name>_Code` companion column.
CATEGORICAL_COLUMNS = ['city', 'shop', 'brand', 'container', 'capacity']

label = LabelEncoder()  # creating label encoder instance
for column in CATEGORICAL_COLUMNS:
    # Fit once per column on the values of ALL frames. Re-fitting the encoder on
    # each frame separately would give the same category different codes in
    # train and test whenever the two frames do not contain the same categories.
    label.fit(pd.concat([dataset[column] for dataset in data_cleaner], ignore_index=True))
    for dataset in data_cleaner:
        dataset[f'{column}_Code'] = label.transform(dataset[column])


## After Pre-Processing Our Data

In [None]:
# Show dtypes and per-column null counts again for every cleaned frame.
for cleaned_frame in data_cleaner:
    cleaned_frame.info()
    null_counts = cleaned_frame.isna().sum()
    print(null_counts)

Now as we can see, all the null values have been taken care of and all the categorical values have been converted to numerical values which our model will be able to understand, in all our dataframes.

In [None]:
# Display the column labels of data1.
data1.columns

# 3) EDA

In [None]:
# Columns to draw as count plots: the label-encoded categorical columns plus 'quantity'.
count_plot_column_name = [ 'city_Code', 'shop_Code', 'brand_Code', 'container_Code', 'capacity_Code','quantity']

Here, we plot a graph that counts the number of observations of each type. Since a count plot is mainly used for categorical data, we will only use those columns. 

In [None]:
# One figure per listed column: print the value frequencies, then draw them as bars.
for i in count_plot_column_name:
    column_values = data1[i]

    plt.figure()
    print(column_values.value_counts())

    sns.set_style('whitegrid')
    sns.countplot(x=column_values, data=data1, palette='YlGnBu_r')


The correlation heatmap helps us identify how the variables are correlated with each other. It is really helpful in figuring out which features play a role in deciding our outcome. The function below helps us easily create a correlation matrix.

In [None]:
#correlation heatmap of dataset
def correlation_heatmap(df):
    """Draw an annotated heatmap of the pairwise Pearson correlations in ``df``.

    Only the numeric columns of ``df`` are used; columns of any other dtype
    are left out of the matrix.

    Parameters:
        df : pandas.DataFrame
            Frame whose numeric columns are correlated with each other.

    Returns:
        None. The heatmap is drawn on a newly created 14x12 figure.
    """
    # DataFrame.corr() raises on object (string) columns in pandas >= 2.0, so
    # restrict the frame to numeric columns before correlating.
    numeric_df = df.select_dtypes(include='number')

    _, ax = plt.subplots(figsize=(14, 12))
    colormap = sns.diverging_palette(220, 10, as_cmap=True)

    sns.heatmap(
        numeric_df.corr(),
        cmap=colormap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        linewidths=0.1, vmax=1.0, linecolor='white',
        annot_kws={'fontsize': 12}
    )

    ax.set_title('Pearson Correlation of Features', y=1.05, size=15)

correlation_heatmap(data1)

# Building Our Model

In [None]:
# Columns used for modelling. Note that 'quantity' (the target) is the last entry,
# so `x` contains the target column as well as the predictors.
column_names = [
    'pop', 'price',
    'city_Code', 'shop_Code', 'brand_Code', 'container_Code', 'capacity_Code',
    'quantity',
]

x = data1[column_names]   # predictors plus the target column
y = data1['quantity']     # target on its own

In order for our data to make sense and not throw off the predictions, we need to normalize/scale it. Scikit-learn provides a great tool, MinMaxScaler, for this very purpose. We first fit the MinMaxScaler object to our train values. We then store our target variable in y and delete it from the training features, giving us two scaled variables, x and y. 

We also apply PCA (Principal Component Analysis) in order to reduce the dimensionality of our data. The steps followed are the same as for MinMaxScaler.

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

# Scale every column of x (predictors and the 'quantity' target) with MinMaxScaler.
norm = MinMaxScaler().fit(x)
x_norm = pd.DataFrame(norm.transform(x))

# The scaled frame has positional column labels, so look the target up by the
# position it has in x instead of hard-coding it.
target_pos = x.columns.get_loc('quantity')
y_norm = x_norm[target_pos]
print(y_norm)

# Keep only the predictors in x_norm.
x_norm = x_norm.drop(columns=target_pos)
print(x_norm)

# Fit PCA on the same scaled predictor matrix that it transforms. Fitting on the
# unscaled x and then transforming the scaled data mixes two different scales,
# and including the target would leak it into the components.
pca = PCA()
pca_train = pd.DataFrame(pca.fit_transform(x_norm.to_numpy()))
# Target paired with pca_train: the scaled quantity, not a principal component.
pca_test = y_norm

correlation_heatmap(x)

As we can conclude from our heat map, Quantity has the highest correlation with the price variable. Let's see how those two are related in a scatter plot.

In [None]:
# Scatter plot of price against quantity, with points coloured by `capacity`.
sns.set_palette('RdPu')
plt.figure()
sns.set_context("poster", font_scale=0.7)  # "poster" context with fonts scaled by 0.7
sns.scatterplot(data = data1, y='price', x='quantity', hue='capacity')

# Splitting the data

## Here we are using our NestedCV class to split the data

In [None]:

from pandas import read_csv
from sklearn.model_selection import TimeSeriesSplit
from matplotlib import pyplot

X = x.to_numpy()
splits = NestedCV(k=3)  # instance of our splitter class
index = 1  # not read anywhere in this cell
# Each iteration of split() yields the train and validation row indices of one fold.
for train_index, test_index in splits.split(X):
    trainn, testt = X[train_index], X[test_index]
    n_train, n_test = len(trainn), len(testt)
    print('Observations: %d' % (n_train + n_test))
    print('Training Observations: %d' % n_train)
    print('Testing Observations: %d' % n_test)
    


## As we can see, our data has been split into 3 equal folds

# Checking stationarity

In [None]:
# Columns to plot, one subplot each (a column named 'hi' is skipped if present).
plot_columns = [col for col in x.columns if col != 'hi']

# Derive the number of subplot rows from the columns instead of hard-coding 8,
# so the cell keeps working when the column list changes. squeeze=False always
# returns a 2-D array of axes (even for a single column); ravel() flattens it
# to one axis per column.
fig, ax = plt.subplots(len(plot_columns), 1, figsize=(20, 15), squeeze=False)
ax = ax.ravel()
for i, column in enumerate(plot_columns):
    x[column].plot(ax=ax[i])
    ax[i].set_title(column)

**The function 'coint_johansen' is a function we use to check whether our multivariate data is stationary or not. If we have a single variable then we can check the stationarity using the adfuller test.**

In [None]:
from statsmodels.tsa.vector_ar.vecm import coint_johansen

# Run the Johansen test on the scaled features and display its eigenvalues.
coint_johansen(x_norm, det_order=-1, k_ar_diff=1).eig


## Since all our values are above 1, we can say that our data is not stationary.

# Here we are going to use our Nested KFold with Machine Learning Models With a Pipeline

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

from sklearn.linear_model           import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.model_selection import train_test_split
# Hold out the last 30% of the rows. shuffle=False keeps the split reproducible
# and stops later rows from ending up in the training part.
# NOTE(review): assumes the rows of pca_train are in chronological order -- confirm.
X_train, X_test, y_train, y_test = train_test_split(pca_train, pca_test, test_size=0.3, shuffle=False)

# Fixed seed so the estimators that use randomness give the same scores on every run.
RANDOM_STATE = 42

# (step name, estimator) pairs; each one is wrapped in a StandardScaler pipeline below.
candidate_models = [
    ('LR', LinearRegression()),
    ('LASSO', Lasso()),
    ('EN', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor(random_state=RANDOM_STATE)),
    ('GBM', GradientBoostingRegressor(random_state=RANDOM_STATE)),
]

# List of (pipeline name, pipeline): 'ScaledLR', 'ScaledLASSO', ... 'ScaledGBM'.
pipelines = [
    (f'Scaled{step_name}', Pipeline([('Scaler', StandardScaler()), (step_name, estimator)]))
    for step_name, estimator in candidate_models
]

# `x` still contains the 'quantity' target column; scoring on it would let every
# model read the answer from its inputs (target leakage), so drop it first.
features = x.drop(columns='quantity')

# Build the splitter here instead of relying on a variable left over from an
# earlier demo cell.
nested_cv = NestedCV(k=3)

results = []
names = []
for name, model in pipelines:
    # One R^2 score per NestedCV split for this pipeline.
    cv_results = cross_val_score(model, features, y, cv=nested_cv, scoring='r2')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)


# As we can see, ScaledGBM has the highest accuracy score. The scores of LR and LASSO are too high, which probably means that we've overfit our model