# This notebook is for Basic EDA

In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
import gc

# Folder (relative to the notebook) that holds the raw competition CSV files.
DATA_FOLDER = '..//data//'
# Parser for 'YYYY-MM-DD' date strings. `pd.datetime` was a deprecated alias that has
# been removed from pandas, so use `pd.to_datetime`, which accepts both a single
# string and an array of strings.
d_parser = lambda x: pd.to_datetime(x, format='%Y-%m-%d')

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the raw competition files from DATA_FOLDER.
df_sample_sub = pd.read_csv(os.path.join(DATA_FOLDER,'sample_submission.csv'))
df_stv        = pd.read_csv(os.path.join(DATA_FOLDER,'sales_train_validation.csv'))
df_ste        = pd.read_csv(os.path.join(DATA_FOLDER,'sales_train_evaluation.csv'))
df_prices     = pd.read_csv(os.path.join(DATA_FOLDER,'sell_prices.csv'))
# The `date_parser` argument of read_csv is deprecated. The dates are expected in
# 'YYYY-MM-DD' form (see d_parser), which `parse_dates` handles on its own.
df_calander   = pd.read_csv(os.path.join(DATA_FOLDER,'calendar.csv'), parse_dates=["date"])

### Checking the Sample Submission file

In [None]:
df_sample_sub.head()

**Important things to be noted about the Submission file**
* The submission file is a format, which we have to follow when putting our submissions
* The total number of rows in the submission files will be (total_items x total_stores x last 28 days) 

In [None]:
df_stv.head()

* **It is quite clear now that the columns which start from d_ are representing days.**
* **We need to convert those days into rows, so that we should be able to process them**

### Checking for the Uniques 

In [None]:
print('Evaluation Dataset: \n', df_ste.agg({'id':'nunique','item_id':'nunique', 'store_id':'nunique'}))
print('\n')
print('Validation Dataset: \n', df_stv.agg({'id':'nunique','item_id':'nunique', 'store_id':'nunique'}))

**So, the unique ID in both evaluation and validation is unique_item_id X unique_store_id**

### Converting the Days in the Evaluation and Validation Datasets 
* The column format to be converted into row format

In [None]:
# Wide -> long conversion: one row per (id, day) with the sold quantity in `qty`.
# Everything from the 7th column onwards is treated as a day column.
df = df_ste.iloc[:, 6:].T
ids_s = df_ste["id"].values
df.columns = ids_s

# Only as many calendar dates as there are day rows are used, so the dates line up
# with each id's rows.
# NOTE(review): assumes the calendar rows are ordered by day, starting at the first day column - confirm.
n_days = len(df)
dates_s = df_calander["date"].values[:n_days]

dfs = []
for col in tqdm_notebook(df.columns):
    # The renamed frame must be kept (it was previously assigned to a throw-away name),
    # otherwise every id becomes its own column after the concat.
    _df = df[[col]].reset_index(drop=True).rename(columns={col: 'qty'})
    _df['id'] = col
    # Attach the dates per id: a single calendar-length array cannot be assigned
    # to the concatenated frame.
    _df['date'] = dates_s
    dfs.append(_df)

df_tidy_ste = pd.concat(dfs)

#saving the dataframe
#df_tidy_ste.to_csv('df_ste_tidy.csv')

In [None]:
df_tidy_ste.head()

In [None]:
df_tidy_ste.info()

In [None]:
# Columns that identify a series; every other column is unpivoted into
# one row per (series, d) with the value stored in `target`.
id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
melt_kwargs = dict(id_vars=id_columns, var_name='d', value_name='target')

df_ste_rows = df_ste.melt(**melt_kwargs)
df_stv_rows = df_stv.melt(**melt_kwargs)

In [None]:
df_ste_rows.info()

In [None]:
df_stv_rows.info()

In [None]:
df_ste.info()

In [None]:
df_stv.info()

**The column d has a d_ prefix; we can remove it.**

In [None]:
# Turn the 'd_<n>' labels into the integer day number <n> (stored as int16).
df_ste_rows['d'] = df_ste_rows['d'].str.replace('d_', '', regex=False).astype('int16')
df_stv_rows['d'] = df_stv_rows['d'].str.replace('d_', '', regex=False).astype('int16')

**checking for the rows after conversion**

In [None]:
len(df_ste_rows), len(df_stv_rows), len(df_ste), len(df_stv)

**as we can see that there are 60 million rows, we have to do some serious down casting here...**
* I am also beginning to think that there must be a way to manage the data without having to do **melt**

In [None]:
df_ste_rows.info()

In [None]:
df_stv_rows.info()

# Checking the difference between the Evaluation and Validation Sales data

In [None]:
# Last day number present in each long-format frame.
max_d_in_e = df_ste_rows['d'].max()
max_d_in_v = df_stv_rows['d'].max()

# Spot check: total CA sales on one day, computed from both frames.
d = 1449
ca_on_d_eval  = (df_ste_rows['d'] == d) & (df_ste_rows['state_id'] == 'CA')
ca_on_d_valid = (df_stv_rows['d'] == d) & (df_stv_rows['state_id'] == 'CA')
s = df_ste_rows.loc[ca_on_d_eval, 'target'].sum()
t = df_stv_rows.loc[ca_on_d_valid, 'target'].sum()

print(f'Last day in evaluation: {max_d_in_e} and last day in Validation: {max_d_in_v}, means 28 days more')
print(f'Evaluation Dataset , total sales for day {d} is {s}, While in Validation it is {t}')
print(f'max for validation is {max_d_in_v} and max in evaluation is {max_d_in_e}')
print(f'total additional days in evaluation are {max_d_in_e - max_d_in_v}')

**This means that both datasets are the same apart from the 28 additional days in the evaluation set, so we can proceed in two steps:**

### Step-1
* train our model on the validation dataset, which runs through day 1913
* predict the next 28 days (days 1914 to 1941)
* evaluate the performance of our model against the evaluation dataset, since the actual values for these days are available there.

### Step-2 (final predictions)
* train our model on evaluation dataset which is until 1941
* predict the next 28 days (days 1942 to 1969)
* submit to kaggle

<font color=red> Or rather we don't use the validation data at all, and extract (last 28 days) from evaluation for test</font>

# Getting rid of validation data frame

In [None]:
del df_stv, df_stv_rows
gc.collect()

In [None]:
df_ste_rows.info()

In [None]:
df_ste_rows.head()

### Down casting

In [None]:
df_ste_rows.d.max(), df_ste_rows.target.max()

In [None]:
df_ste_rows.d   = df_ste_rows.d.astype('int16')
df_ste_rows.target   = df_ste_rows.target.astype('int16')
gc.collect()

In [None]:
df_ste_rows.info()

### Analyzing for Calendar

In [None]:
df_calander.head()

In [None]:
# Week identifiers (wm_yr_wk) of all calendar rows that fall in 2015, sorted.
weeks_2015 = df_calander.loc[df_calander.year == 2015, 'wm_yr_wk'].sort_values()
total_weeks_2015 = weeks_2015.nunique()
print('We have ', total_weeks_2015, ' weeks in 2015\n')
weeks_2015.unique()

In [None]:
df_calander[(df_calander.year==2015) &
           (df_calander.wm_yr_wk==11450)]

In [None]:
t = df_ste_rows[(df_ste_rows.d==1443) & (df_ste_rows.state_id=='CA')]['target'].sum()
print(f'So in california state, we had {t} items sold on day 1443')

# snap

There are 3 binary variables with a prefix "snap_" plus the state name.

snap_CA, snap_TX, and snap_WI: A binary variable (0 or 1) indicating whether the stores of CA, TX or WI allow SNAP purchases on the examined date. 1 indicates that SNAP purchases are allowed.

For those who are not familiar with SNAP, like me:
"The United States federal government provides a nutrition assistance benefit called the Supplemental Nutrition Assistance Program (SNAP). SNAP provides low income families and individuals with an Electronic Benefits Transfer debit card to purchase food products. In many states, the monetary benefits are dispersed to people across 10 days of the month and on each of these days 1/10 of the people will receive the benefit on their card."
Source: https://www.fns.usda.gov/snap/supplemental-nutrition-assistance-program

**There is no point in keeping the d_ prefix in the d column, as we all know that this is a day number sequence.**

In [None]:
# Turn the calendar's 'd_<n>' labels into the integer day number <n> (int16).
df_calander['d'] = df_calander['d'].str.replace('d_', '', regex=False).astype('int16')
gc.collect()

# Join Calendar and Sales (Evaluation)

In [None]:
dfmain = df_ste_rows.merge(df_calander[['date','wm_yr_wk','wday','d','month','year']], on=['d'], how='left')

**Verifying the merge**

In [None]:
t = df_ste_rows[(df_ste_rows.d==1443) & (df_ste_rows.state_id=='CA')]['target'].sum()
print(f'So in california state, we had {t} items sold on day 1443')

In [None]:
s = dfmain[(dfmain.d==1443) & (dfmain.state_id=='CA')]['target'].sum()
print(f'So in california state, we had {s} items sold on day 1443')

In [None]:
# delete the unwanted data sets
del df_ste,df_ste_rows, df_calander
gc.collect()

In [None]:
dfmain[(dfmain.d==1443)][['date','d','wm_yr_wk']].drop_duplicates()

# Checking the Prices

In [None]:
df_prices.head()

In [None]:
dfmain.head()

**so, this is basically item prices on a particular week in a particular store**
* We can easily join the item prices to the main data frame.

In [None]:
dfmain = dfmain.merge(df_prices, how='left', on=['store_id','item_id','wm_yr_wk'])

In [None]:
dfmain.info()

### Adding Group by for Item and store for lags

* what is the sales of a particular item across the country on a particular day
    * That will be used as a lagged feature
* what is the performance of a particular department across the country on a particular day 
    * That too, can be used as a lagged feature

In [None]:
# Total units sold per item per day, summed over all rows sharing that item and day.
gb_item = (
    dfmain.groupby(['item_id', 'd'], as_index=False)['target']
    .sum()
    .rename(columns={'target': 'target_item'})
)

# Total units sold per department per day.
gb_dept = (
    dfmain.groupby(['dept_id', 'd'], as_index=False)['target']
    .sum()
    .rename(columns={'target': 'target_dept'})
)

In [None]:
# Attach the per-department and per-item daily totals as feature columns.
# Note: fillna(0) is applied to the whole merged frame, so NaNs in every
# column are replaced, not only those in the newly added one.
for totals, keys in ((gb_dept, ['dept_id', 'd']), (gb_item, ['item_id', 'd'])):
    dfmain = dfmain.merge(totals, how='left', on=keys).fillna(0)

In [None]:
del gb_dept, gb_item
gc.collect()

In [None]:
dfmain.info()

### down casting, label encoding and removing unwanted columns

In [None]:
# Replace each string id column by an integer-coded '<name>_code' column.
# A single encoder is re-fitted per column; only the transformed codes are kept.
le = LabelEncoder()
id_cols = ['dept_id', 'cat_id', 'store_id', 'state_id', 'item_id']
for id_col in id_cols:
    dfmain[id_col + '_code'] = le.fit_transform(dfmain[id_col])

# The original string columns are no longer needed.
dfmain.drop(id_cols, axis=1, inplace=True)

dfmain.dept_id_code.max(), dfmain.cat_id_code.max(),dfmain.store_id_code.max(),dfmain.item_id_code.max()

In [None]:
dfmain.item_id_code.max()

In [None]:
# Target dtype per column, chosen to shrink the memory footprint.
# NOTE(review): float16 has roughly 3 significant decimal digits, so sell_price
# values are rounded by this cast - confirm that precision is acceptable.
downcast_dtypes = {
    'wm_yr_wk': 'int16',
    'wday': 'int8',
    'month': 'int8',
    'year': 'int16',
    'sell_price': 'float16',
    'dept_id_code': 'int8',
    'cat_id_code': 'int8',
    'store_id_code': 'int8',
    'state_id_code': 'int8',
    'item_id_code': 'int16',
}
# Cast column by column so that only one column is copied at a time.
for col_name, dtype in downcast_dtypes.items():
    dfmain[col_name] = dfmain[col_name].astype(dtype)

**delete the date as well, as it is not needed**

In [None]:
#but before deleting the date, we may add dofm (day of month)
def get_d_of_m(df):
    """Return the day of month of ``df['date']`` as an int8 Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``date`` column.

    Returns
    -------
    pandas.Series
        Day of month per row, aligned with ``df``'s index. ``df`` is not modified.
    """
    return df['date'].dt.day.astype('int8')

# Call once with the whole frame. Going through Series.apply would pass single
# Timestamps to the function, which cannot be indexed with ['date'].
dfmain['dom'] = get_d_of_m(dfmain)

In [None]:
dfmain.drop(['date'], inplace=True, axis=1)

In [None]:
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form may operate on a temporary copy and then
# leaves dfmain unchanged (it is deprecated under pandas Copy-on-Write).
dfmain['sell_price'] = dfmain['sell_price'].fillna(0)

In [None]:
dfmain.info()

In [None]:
dfmain.isna().sum()

# Preparing the Data for Training

# Adding Means

In [None]:
### Adding Means

'''
in future feature sets, we should be adding 
1 - store_state_target_mean 
2 - store_dept_target_mean etc 
3 - wday_target_mean
4 - month_target_mean
  - others
'''

def add_mean(dfmain, col):
    """Add a ``<col>_target_mean`` column holding the mean ``target`` of each ``col`` group.

    The frame is modified in place and also returned, so calls can be chained
    by re-assignment.

    Parameters
    ----------
    dfmain : pandas.DataFrame
        Frame containing ``col`` and a ``target`` column.
    col : str
        Name of the grouping column.

    Returns
    -------
    pandas.DataFrame
        The same ``dfmain`` object with the new column added.
    """
    # Mean target per distinct value of `col`, looked up for every row.
    group_means = dfmain.groupby(col)['target'].mean()
    dfmain[f'{col}_target_mean'] = dfmain[col].map(group_means)
    return dfmain

In [None]:
# Add one '<col>_target_mean' column per encoded id column (order fixes the column order).
for key_col in ('store_id_code', 'cat_id_code', 'state_id_code', 'item_id_code', 'dept_id_code'):
    dfmain = add_mean(dfmain, key_col)

In [None]:
dfmain[['store_id_code','store_id_code_target_mean']].drop_duplicates()

In [None]:
dfmain[['cat_id_code','cat_id_code_target_mean']].drop_duplicates()

In [None]:
dfmain[['dept_id_code','dept_id_code_target_mean']].drop_duplicates()

In [None]:
dfmain.columns

In [None]:
dfmain.info()

In [None]:
# Build the training frame by dropping columns that are not carried forward.
# NOTE(review): only cat_id_code, state_id_code and dept_id_code have a
# '<col>_target_mean' replacement; wm_yr_wk, wday and sell_price are dropped
# without a substitute - confirm that this is intended.
df_train = dfmain.drop(['cat_id_code','state_id_code','dept_id_code','wm_yr_wk','wday','sell_price'], axis=1)

In [None]:
del dfmain
gc.collect()

In [None]:
# Store the mean-encoded columns as float16 to reduce memory usage.
mean_cols_to_downcast = [
    'dept_id_code_target_mean',
    'item_id_code_target_mean',
    'state_id_code_target_mean',
    'store_id_code_target_mean',
    'cat_id_code_target_mean',
]
for mean_col in mean_cols_to_downcast:
    df_train[mean_col] = df_train[mean_col].astype('float16')

In [None]:
gc.collect()

In [None]:
df_train.info()

In [None]:
df_train.info()

In [None]:
df_train = df_train[df_train.year > 2013] 

In [None]:
def add_lags(df, shift_range,index_cols, lag_cols, exception_cols):
    """Append lagged copies of the feature columns to ``df``.

    For every ``day_shift`` in ``shift_range`` the columns to lag are copied, the
    ``d`` column of the copy is moved forward by ``day_shift`` and the copy is
    left-merged back on ``index_cols``. Each row therefore receives the values
    observed ``day_shift`` days earlier in columns named ``<col>_lag_<day_shift>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a numeric ``d`` column and all ``index_cols``.
    shift_range : iterable of int
        Day offsets to generate lags for.
    index_cols : list of str
        Merge keys. ``d`` is expected to be one of them.
    lag_cols : list of str
        Currently ignored: the lagged columns are *all* columns of ``df`` that
        are neither in ``index_cols`` nor in ``exception_cols``. Kept for
        interface compatibility.
    exception_cols : list of str
        Columns that must not be lagged. Names absent from ``df`` are harmless.

    Returns
    -------
    pandas.DataFrame
        A new frame with the lag columns added. After each merge every NaN in
        the frame (not only in the new lag columns) is replaced by 0.
    """
    cols_to_rename = list(df.columns.difference(index_cols + exception_cols))

    print('Columns to rename : ',cols_to_rename)

    print(index_cols + cols_to_rename)

    for day_shift in tqdm_notebook(shift_range):
        # Only the original (un-lagged) columns are selected, so lags of lags
        # are never produced even though `df` grows on every iteration.
        train_shift = df[index_cols + cols_to_rename].copy()
        print('copied to train_shift')
        train_shift['d'] = train_shift['d'] + day_shift

        print(f'performed the shifting of {day_shift}')

        lag_names = {name: '{}_lag_{}'.format(name, day_shift) for name in cols_to_rename}
        train_shift = train_shift.rename(columns=lag_names)

        df = pd.merge(df, train_shift, on=index_cols, how='left').fillna(0)
        # Log the shift that was merged; printing the whole shifted frame here
        # would dump a DataFrame repr on every iteration.
        print('--------------- ', day_shift ,'  ---------------')
        print('performed the merge')

    return df

**So the index columns where we will merge the lags will be**
* d
* item_id_code
* store_id_code

The reason for choosing these three is that the id column is a concatenation of item_id and store_id (plus a suffix), so item and store together identify a series.

**One thing I have not been able to find out is which suffix to use: the id ends in either _validation or _evaluation.**

I am not sure whether the _validation suffix should be used in the id when submitting; we will find out once we submit.

In [None]:
# Columns that must not be lagged: the mean encodings plus a fixed list of
# descriptive columns (names that are absent from df_train are simply ignored).
mean_enc_cols = [name for name in df_train.columns if 'mean' in str(name)]
exception_cols = mean_enc_cols + [
    'cat_id', 'date', 'day', 'id', 'sell_price', 'snap_CA', 'snap_TX', 'snap_WI',
    'state_id', 'wday', 'wm_yr_wk', 'year', 'month', 'dom',
]

# Merge keys, and the day offsets 1..28 to generate lags for.
index_cols = ['store_id_code', 'item_id_code', 'd']
lag_cols = ['target']
shift_range = list(range(1, 29))

df_train = add_lags(df_train, shift_range, index_cols, lag_cols, exception_cols)

In [None]:
df_train[(df_train.item_id_code==1) & (df_train.store_id_code==1)][['d','target_dept','target_dept_lag_1','target_dept_lag_2',
                                                                   'target_dept_lag_3','target_dept_lag_4',
                                                                   'target_dept_lag_5']].head(15)

In [None]:
df_train[(df_train.item_id_code==1) & (df_train.store_id_code==1)][['d','target_item','target_item_lag_1','target_item_lag_2',
                                                                   'target_item_lag_3','target_item_lag_4',
                                                                   'target_item_lag_5']].head(15)

In [None]:
df_train.target_dept.max(), df_train.target_item.max(), df_train.target.max()

In [None]:
df_train.info()

In [None]:
# Convert the lag columns to compact integers: int16 where the values fit,
# int32 otherwise. An unconditional astype('int16') would silently wrap values
# above 32767 (warnings are suppressed in this notebook), which is a risk for
# the aggregated target_dept / target_item lags.
int16_max = np.iinfo('int16').max
lag_cols = [col for col in df_train.columns if 'lag' in str(col)]
for col in lag_cols:
    fits_int16 = df_train[col].abs().max() <= int16_max
    df_train[col] = df_train[col].astype('int16' if fits_int16 else 'int32')

df_train.info()

# Prediction Idea

**Since we have to predict 28 days, 3049 items, and 10 stores**
* Days = 28
* Items = 3049
* Stores = 10

Total Predictions = 28 x 3049 x 10 = 853,720

**Let's see if this is roughly the sum of each month entries in our dataset**

**so, this clears the logic, as we can see that feb-2014 and feb-2015 have similar number of items**

**Let's also suppose that we are to predict only one day**

Then the equation would be : 
    
    * 3049 x 10 = 30,490
    
So, 
    There can be two ways to do that

    
* **First Procedure** 
    Train for all days (huge dataset and huge model size)

    for day in range(1,29):
        predict([item,day])
        
        
* **Second Procedure** 

    * separate each day and create 28 data sets
    for day in range(1,29)
        * load_data(day)
        * train data
        * predict(for day)
