# This notebook is for Basic EDA

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
import gc

DATA_FOLDER = '..//data//'
d_parser = lambda x: pd.datetime.strptime(x,'%Y-%m-%d')

import warnings
warnings.filterwarnings("ignore")

In [2]:
df_sample_sub = pd.read_csv(os.path.join(DATA_FOLDER,'sample_submission.csv'))
df_stv        = pd.read_csv(os.path.join(DATA_FOLDER,'sales_train_validation.csv'))
df_ste        = pd.read_csv(os.path.join(DATA_FOLDER,'sales_train_evaluation.csv'))
df_prices     = pd.read_csv(os.path.join(DATA_FOLDER,'sell_prices.csv'))
df_calander   = pd.read_csv(os.path.join(DATA_FOLDER,'calendar.csv'), parse_dates=["date"], date_parser=d_parser)

### Checking the Sample Submission file

In [3]:
df_sample_sub.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Important things to be noted about the Submission file**
* The submission file is a format, which we have to follow when putting our submissions
* The total number of rows in the submission files will be (total_items x total_stores x last 28 days) 

In [4]:
df_stv.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


* **It is quite clear now that the columns which start from d_ are representing days.**
* **We need to convert those days into rows, so that we should be able to process them**

### Checking for the Uniques 

In [5]:
print('Evaluation Dataset: \n', df_ste.agg({'id':'nunique','item_id':'nunique', 'store_id':'nunique'}))
print('\n')
print('Validation Dataset: \n', df_stv.agg({'id':'nunique','item_id':'nunique', 'store_id':'nunique'}))

Evaluation Dataset: 
 id          30490
item_id      3049
store_id       10
dtype: int64


Validation Dataset: 
 id          30490
item_id      3049
store_id       10
dtype: int64


**So, the unique ID in both evaluation and validation is unique_item_id X unique_store_id**

### Converting the Days in the Evaluation and Validation Datasets 
* The column format to be converted into row format

In [6]:
df_ste_rows = df_ste.melt(
id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
var_name ='d',
value_name ='target'
)

df_stv_rows = df_stv.melt(
id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
var_name ='d',
value_name ='target'
)

In [7]:
df_ste_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59181090 entries, 0 to 59181089
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        object
 1   item_id   object
 2   dept_id   object
 3   cat_id    object
 4   store_id  object
 5   state_id  object
 6   d         object
 7   target    int64 
dtypes: int64(1), object(7)
memory usage: 3.5+ GB


In [8]:
df_stv_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58327370 entries, 0 to 58327369
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        object
 1   item_id   object
 2   dept_id   object
 3   cat_id    object
 4   store_id  object
 5   state_id  object
 6   d         object
 7   target    int64 
dtypes: int64(1), object(7)
memory usage: 3.5+ GB


In [9]:
df_ste.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1947 entries, id to d_1941
dtypes: int64(1941), object(6)
memory usage: 452.9+ MB


In [10]:
df_stv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1919 entries, id to d_1913
dtypes: int64(1913), object(6)
memory usage: 446.4+ MB


**the colum d has a suffix d_ , we can remove that**

In [11]:
df_ste_rows['d']   = df_ste_rows['d'].apply(lambda x: x.replace('d_',''))
df_ste_rows['d']   = df_ste_rows['d'].astype('int16')

df_stv_rows['d']   = df_stv_rows['d'].apply(lambda x: x.replace('d_',''))
df_stv_rows['d']   = df_stv_rows['d'].astype('int16')

**checking for the rows after conversion**

In [12]:
len(df_ste_rows), len(df_stv_rows), len(df_ste), len(df_stv)

(59181090, 58327370, 30490, 30490)

**as we can see that there are 60 million rows, we have to do some serious down casting here...**
* I am also beginning to think that there must be a way to manage the data without having to do **melt**

In [13]:
df_ste_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59181090 entries, 0 to 59181089
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        object
 1   item_id   object
 2   dept_id   object
 3   cat_id    object
 4   store_id  object
 5   state_id  object
 6   d         int16 
 7   target    int64 
dtypes: int16(1), int64(1), object(6)
memory usage: 3.2+ GB


In [14]:
df_stv_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58327370 entries, 0 to 58327369
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        object
 1   item_id   object
 2   dept_id   object
 3   cat_id    object
 4   store_id  object
 5   state_id  object
 6   d         int16 
 7   target    int64 
dtypes: int16(1), int64(1), object(6)
memory usage: 3.2+ GB


# Checking the difference between the Evaluation and Validation Sales data

In [15]:
max_d_in_e = df_ste_rows.d.max()
max_d_in_v = df_stv_rows.d.max()

d = 1449
s = df_ste_rows[(df_ste_rows.d==d) & (df_ste_rows.state_id=='CA')]['target'].sum()
t = df_stv_rows[(df_stv_rows.d==d) & (df_stv_rows.state_id=='CA')]['target'].sum()


print(f'Last day in evaluation: {df_ste_rows.d.max()} and last day in Validation: {df_stv_rows.d.max()}, means 28 days more')
print(f'Evaluation Dataset , total sales for day {d} is {s}, While in Validation it is {t}')
print(f'max for validation is {max_d_in_v} and max in evaluation is {max_d_in_e}')
print(f'total additional days in evaluation are {max_d_in_e - max_d_in_v}')

Last day in evaluation: 1941 and last day in Validation: 1913, means 28 days more
Evaluation Dataset , total sales for day 1449 is 13997, While in Validation it is 13997
max for validation is 1913 and max in evaluation is 1941
total additional days in evaluation are 28


**This means that both data sets are same, and we have to train our model on validation dataset, and** 

### Step-1
* train our model on validation dataset which is until 1913
* predict for  1914 + 28
* evaluate the performance of our dataset from the evaluate dataset, as these dates are available.

### Step-2 (final predictions)
* train our model on evaluation dataset which is until 1941
* predict for 1942 + 28
* submit to kaggle

<font color=red> Or rather we don't use the validation data at all, and extract (last 28 days) from evaluation for test</font>

# Getting rid of validation data frame

In [16]:
del df_stv, df_stv_rows
gc.collect()

40

In [17]:
df_ste_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59181090 entries, 0 to 59181089
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        object
 1   item_id   object
 2   dept_id   object
 3   cat_id    object
 4   store_id  object
 5   state_id  object
 6   d         int16 
 7   target    int64 
dtypes: int16(1), int64(1), object(6)
memory usage: 3.2+ GB


In [18]:
df_ste_rows.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,target
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1,0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1,0


### Down casting

In [19]:
df_ste_rows.d.max(), df_ste_rows.target.max()

(1941, 763)

In [20]:
df_ste_rows.d   = df_ste_rows.d.astype('int16')
df_ste_rows.target   = df_ste_rows.target.astype('int16')
gc.collect()

80

In [21]:
df_ste_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59181090 entries, 0 to 59181089
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        object
 1   item_id   object
 2   dept_id   object
 3   cat_id    object
 4   store_id  object
 5   state_id  object
 6   d         int16 
 7   target    int16 
dtypes: int16(2), object(6)
memory usage: 2.9+ GB


### Analyzing for Calendar

In [22]:
df_calander.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [23]:
total_weeks_2015 = df_calander[df_calander.year==2015]['wm_yr_wk'].sort_values().nunique()
print('We have ',df_calander[df_calander.year==2015]['wm_yr_wk'].sort_values().nunique(), ' weeks in 2015\n')
df_calander[df_calander.year==2015]['wm_yr_wk'].sort_values().unique()

We have  53  weeks in 2015



array([11448, 11449, 11450, 11451, 11452, 11501, 11502, 11503, 11504,
       11505, 11506, 11507, 11508, 11509, 11510, 11511, 11512, 11513,
       11514, 11515, 11516, 11517, 11518, 11519, 11520, 11521, 11522,
       11523, 11524, 11525, 11526, 11527, 11528, 11529, 11530, 11531,
       11532, 11533, 11534, 11535, 11536, 11537, 11538, 11539, 11540,
       11541, 11542, 11543, 11544, 11545, 11546, 11547, 11548],
      dtype=int64)

In [24]:
df_calander[(df_calander.year==2015) &
           (df_calander.wm_yr_wk==11450)]

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
1442,2015-01-10,11450,Saturday,1,1,2015,d_1443,,,,,1,0,0
1443,2015-01-11,11450,Sunday,2,1,2015,d_1444,,,,,0,1,1
1444,2015-01-12,11450,Monday,3,1,2015,d_1445,,,,,0,1,1
1445,2015-01-13,11450,Tuesday,4,1,2015,d_1446,,,,,0,1,0
1446,2015-01-14,11450,Wednesday,5,1,2015,d_1447,,,,,0,0,1
1447,2015-01-15,11450,Thursday,6,1,2015,d_1448,,,,,0,1,1
1448,2015-01-16,11450,Friday,7,1,2015,d_1449,,,,,0,0,0


In [25]:
t = df_ste_rows[(df_ste_rows.d==1443) & (df_ste_rows.state_id=='CA')]['target'].sum()
print(f'So in california state, we had {t} items sold on day 1443')

So in california state, we had 19244 items sold on day 1443


# snap

There are 3 binary variables with a prefix "snap_" plus the state name.

snapCA, snapTX, and snap_WI: A binary variable (0 or 1) indicating whether the stores of CA, TX or WI allow SNAP purchases on the examined date. 1 indicates that SNAP purchases are allowed.

For those who is not familiar with SNAP like me;
"The United States federal government provides a nutrition assistance benefit called the Supplement Nutrition Assistance Program (SNAP). SNAP provides low income families and individuals with an Electronic Benefits Transfer debit card to purchase food products. In many states, the monetary benefits are dispersed to people across 10 days of the month and on each of these days 1/10 of the people will receive the benefit on their card."
Source: https://www.fns.usda.gov/snap/supplemental-nutrition-assistance-program

**there is not point in keeping a prefix of d_ with the d column as we all know that this is a day number sequence**

In [26]:
df_calander['d'] = df_calander['d'].apply(lambda x: x.replace('d_',''))
df_calander['d'] = df_calander['d'].astype('int16')
gc.collect()

60

# Join Calendar and Sales (Evaluation)

In [27]:
dfmain = df_ste_rows.merge(df_calander[['date','wm_yr_wk','wday','d','month','year']], on=['d'], how='left')

**Verifying the merge**

In [28]:
t = df_ste_rows[(df_ste_rows.d==1443) & (df_ste_rows.state_id=='CA')]['target'].sum()
print(f'So in california state, we had {t} items sold on day 1443')

So in california state, we had 19244 items sold on day 1443


In [29]:
s = dfmain[(dfmain.d==1443) & (dfmain.state_id=='CA')]['target'].sum()
print(f'So in california state, we had {s} items sold on day 1443')

So in california state, we had 19244 items sold on day 1443


In [30]:
# delete the unwanted data sets
del df_ste,df_ste_rows, df_calander
gc.collect()

80

In [31]:
dfmain[(dfmain.d==1443)][['date','d','wm_yr_wk']].drop_duplicates()

Unnamed: 0,date,d,wm_yr_wk
43966580,2015-01-10,1443,11450


# Checking the Prices

In [32]:
df_prices.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26


In [33]:
dfmain.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,target,date,wm_yr_wk,wday,month,year
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,1,1,2011
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,1,1,2011
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,1,1,2011
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,1,1,2011
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,1,1,2011


**so, this is basically item prices on a particular week in a particular store**
* We can easily join the item prices to the main data frame.

In [34]:
dfmain = dfmain.merge(df_prices, how='left', on=['store_id','item_id','wm_yr_wk'])

In [35]:
dfmain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59181090 entries, 0 to 59181089
Data columns (total 14 columns):
 #   Column      Dtype         
---  ------      -----         
 0   id          object        
 1   item_id     object        
 2   dept_id     object        
 3   cat_id      object        
 4   store_id    object        
 5   state_id    object        
 6   d           int16         
 7   target      int16         
 8   date        datetime64[ns]
 9   wm_yr_wk    int64         
 10  wday        int64         
 11  month       int64         
 12  year        int64         
 13  sell_price  float64       
dtypes: datetime64[ns](1), float64(1), int16(2), int64(4), object(6)
memory usage: 6.0+ GB


### Adding Group by for Item and store for lags

* what is the sales of a particular item across the country on a particular day
    * That will be used as a lagged feature
* what is the performance of a particular department across the country on a particular day 
    * That too, can be used as a lagged feature

In [37]:
# by item
gb_item  = dfmain.groupby(['item_id','d'], as_index=False).target.sum()
gb_item.rename(columns={'target':'target_item'}, inplace=True)    

# by dept
gb_dept  = dfmain.groupby(['dept_id','d'], as_index=False).target.sum()
gb_dept.rename(columns={'target':'target_dept'}, inplace=True)    

In [38]:
#adding the item groupped target and store groupped target as feature
dfmain = dfmain.merge(gb_dept, how='left', on=['dept_id','d']).fillna(0)
dfmain = dfmain.merge(gb_item, how='left', on=['item_id','d']).fillna(0)

In [40]:
del gb_dept, gb_item
gc.collect()

20

In [41]:
dfmain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59181090 entries, 0 to 59181089
Data columns (total 16 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           object        
 1   item_id      object        
 2   dept_id      object        
 3   cat_id       object        
 4   store_id     object        
 5   state_id     object        
 6   d            int16         
 7   target       int16         
 8   date         datetime64[ns]
 9   wm_yr_wk     int64         
 10  wday         int64         
 11  month        int64         
 12  year         int64         
 13  sell_price   float64       
 14  target_dept  int16         
 15  target_item  int16         
dtypes: datetime64[ns](1), float64(1), int16(4), int64(4), object(6)
memory usage: 6.2+ GB


### down casting, label encoding and removing unwanted columns

In [42]:
#converting all the ids to label encoded values
le = LabelEncoder()
dfmain['dept_id_code'] = le.fit_transform(dfmain.dept_id)
dfmain['cat_id_code'] = le.fit_transform(dfmain.cat_id)
dfmain['store_id_code'] = le.fit_transform(dfmain.store_id)
dfmain['state_id_code'] = le.fit_transform(dfmain.state_id)
dfmain['item_id_code'] = le.fit_transform(dfmain.item_id)

#deleting all such columns
dfmain.drop(['dept_id','cat_id','store_id','state_id','item_id'], axis=1, inplace=True)

dfmain.dept_id_code.max(), dfmain.cat_id_code.max(),dfmain.store_id_code.max(),dfmain.item_id_code.max()

(6, 2, 9, 3048)

In [43]:
dfmain.item_id_code.max()

3048

In [44]:
dfmain.wm_yr_wk     = dfmain.wm_yr_wk.astype('int16')
dfmain.wday         = dfmain.wday.astype('int8')
dfmain.month        = dfmain.month.astype('int8')
dfmain.year         = dfmain.year.astype('int16')
dfmain.sell_price   = dfmain.sell_price.astype('float16')
dfmain.dept_id_code = dfmain.dept_id_code.astype('int8')
dfmain.cat_id_code  = dfmain.cat_id_code.astype('int8')
dfmain.store_id_code= dfmain.store_id_code.astype('int8')
dfmain.state_id_code= dfmain.state_id_code.astype('int8')
dfmain.item_id_code= dfmain.item_id_code.astype('int16')

**delete the date as well, as it is not needed**

In [None]:
#but before deleting the date, we may add dofm (day of month)
def get_d_of_m(df):
    df['day'] = df['date'].dt.day
    
dfmain['dom'] = dfmain['date'].apply(get_d_of_m)

In [46]:
dfmain.drop(['date'], inplace=True, axis=1)

In [47]:
dfmain['sell_price'].fillna(0, inplace=True)

In [48]:
dfmain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59181090 entries, 0 to 59181089
Data columns (total 15 columns):
 #   Column         Dtype  
---  ------         -----  
 0   id             object 
 1   d              int16  
 2   target         int16  
 3   wm_yr_wk       int16  
 4   wday           int8   
 5   month          int8   
 6   year           int16  
 7   sell_price     float16
 8   target_dept    int16  
 9   target_item    int16  
 10  dept_id_code   int8   
 11  cat_id_code    int8   
 12  store_id_code  int8   
 13  state_id_code  int8   
 14  item_id_code   int16  
dtypes: float16(1), int16(7), int8(6), object(1)
memory usage: 2.1+ GB


In [51]:
dfmain.isna().sum()

id               0
d                0
target           0
wm_yr_wk         0
wday             0
month            0
year             0
sell_price       0
target_dept      0
target_item      0
dept_id_code     0
cat_id_code      0
store_id_code    0
state_id_code    0
item_id_code     0
dtype: int64

# Preparing the Data for Training

# Adding Means

In [52]:
### Adding Means

'''
in future feature sets, we should be adding 
1 - store_state_target_mean 
2 - store_dept_target_mean etc 
3 - wday_target_mean
4 - month_target_mean
  - others
'''

def add_mean(dfmain, col):
    mean_attrib = col + '_target_mean'
    mean_values = dfmain.groupby(col).target.mean()
    dfmain[mean_attrib] = dfmain[col].map(mean_values)
    return dfmain

In [53]:
dfmain = add_mean(dfmain,'store_id_code')
dfmain = add_mean(dfmain,'cat_id_code')
dfmain = add_mean(dfmain,'state_id_code')
dfmain = add_mean(dfmain,'item_id_code')
dfmain = add_mean(dfmain,'dept_id_code')

In [54]:
dfmain[['store_id_code','store_id_code_target_mean']].drop_duplicates()

Unnamed: 0,store_id_code,store_id_code_target_mean
0,0,1.323438
3049,1,0.983151
6098,2,1.92013
9147,3,0.706735
12196,4,0.961933
15245,5,1.238511
18294,6,1.048636
21343,7,0.889052
24392,8,1.131778
27441,9,1.105515


In [55]:
dfmain[['cat_id_code','cat_id_code_target_mean']].drop_duplicates()

Unnamed: 0,cat_id_code,cat_id_code_target_mean
0,1,0.569058
565,2,0.726498
1612,0,1.646427


In [56]:
dfmain[['dept_id_code','dept_id_code_target_mean']].drop_duplicates()

Unnamed: 0,dept_id_code,dept_id_code_target_mean
0,3,0.705799
416,4,0.187284
565,5,1.135262
1097,6,0.304241
1612,0,1.238003
1828,1,1.009041
2226,2,2.061858


In [57]:
dfmain.columns

Index(['id', 'd', 'target', 'wm_yr_wk', 'wday', 'month', 'year', 'sell_price',
       'target_dept', 'target_item', 'dept_id_code', 'cat_id_code',
       'store_id_code', 'state_id_code', 'item_id_code',
       'store_id_code_target_mean', 'cat_id_code_target_mean',
       'state_id_code_target_mean', 'item_id_code_target_mean',
       'dept_id_code_target_mean'],
      dtype='object')

In [58]:
dfmain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59181090 entries, 0 to 59181089
Data columns (total 20 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   id                         object 
 1   d                          int16  
 2   target                     int16  
 3   wm_yr_wk                   int16  
 4   wday                       int8   
 5   month                      int8   
 6   year                       int16  
 7   sell_price                 float16
 8   target_dept                int16  
 9   target_item                int16  
 10  dept_id_code               int8   
 11  cat_id_code                int8   
 12  store_id_code              int8   
 13  state_id_code              int8   
 14  item_id_code               int16  
 15  store_id_code_target_mean  float64
 16  cat_id_code_target_mean    float64
 17  state_id_code_target_mean  float64
 18  item_id_code_target_mean   float64
 19  dept_id_code_target_mean   float64
dtype

In [59]:
# drop the following attributes, since we've already added their means
df_train = dfmain.drop(['cat_id_code','state_id_code','dept_id_code','wm_yr_wk','wday','sell_price'], axis=1)

In [60]:
del dfmain
gc.collect()

228

In [61]:
df_train.dept_id_code_target_mean = df_train.dept_id_code_target_mean.astype('float16')
df_train.item_id_code_target_mean = df_train.item_id_code_target_mean.astype('float16')
df_train.state_id_code_target_mean= df_train.state_id_code_target_mean.astype('float16')
df_train.store_id_code_target_mean= df_train.store_id_code_target_mean.astype('float16')
df_train.cat_id_code_target_mean  = df_train.cat_id_code_target_mean.astype('float16')

In [62]:
gc.collect()

40

In [63]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59181090 entries, 0 to 59181089
Data columns (total 14 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   id                         object 
 1   d                          int16  
 2   target                     int16  
 3   month                      int8   
 4   year                       int16  
 5   target_dept                int16  
 6   target_item                int16  
 7   store_id_code              int8   
 8   item_id_code               int16  
 9   store_id_code_target_mean  float16
 10  cat_id_code_target_mean    float16
 11  state_id_code_target_mean  float16
 12  item_id_code_target_mean   float16
 13  dept_id_code_target_mean   float16
dtypes: float16(5), int16(6), int8(2), object(1)
memory usage: 2.2+ GB


In [64]:
df_train = df_train[df_train.year > 2013] 

In [65]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26617770 entries, 32563320 to 59181089
Data columns (total 14 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   id                         object 
 1   d                          int16  
 2   target                     int16  
 3   month                      int8   
 4   year                       int16  
 5   target_dept                int16  
 6   target_item                int16  
 7   store_id_code              int8   
 8   item_id_code               int16  
 9   store_id_code_target_mean  float16
 10  cat_id_code_target_mean    float16
 11  state_id_code_target_mean  float16
 12  item_id_code_target_mean   float16
 13  dept_id_code_target_mean   float16
dtypes: float16(5), int16(6), int8(2), object(1)
memory usage: 1015.4+ MB


In [84]:
def add_lags(df, shift_range,index_cols, lag_cols, exception_cols):
    cols_to_rename = list(df.columns.difference(index_cols + exception_cols)) 
    
    print('Columns to rename : ',cols_to_rename)
    
    print(index_cols + cols_to_rename)

    for day_shift in tqdm_notebook(shift_range):
        train_shift = df[index_cols + cols_to_rename].copy()
        print('copied to train_shift with columns ' , train_shift.columns)
        train_shift['d'] = train_shift['d'] + day_shift
        
        print(f'performed the shifting of {day_shift}')

        foo = lambda x: '{}_lag_{}'.format(x, day_shift) if x in cols_to_rename else x
        train_shift = train_shift.rename(columns=foo)

        print('\n')
        
        df = pd.merge(df, train_shift, on=index_cols, how='left').fillna(0)
        print('\n')
        print('performed the merge')
        
    return df

**So the index columns where we will merge the lags will be**
* d
* item_id_code
* store_id_code

the obvious reason to chose these three is the fact that, id column is a concatenation of dept_id and store_id

**while one thing, that I am not able to findout is that the id has either _validation or _evaluation suffix.**

whether we are going to use _validation suffice in the id while submitting, I am not sure but once we submit it. 

In [85]:
mean_enc_cols = [col for col in df_train.columns if 'mean' in str(col)]
exception_cols = mean_enc_cols + ['cat_id', 'date', 'day', 'id','sell_price', 'snap_CA', 'snap_TX', 'snap_WI',
                                  'state_id', 'wday', 'wm_yr_wk','year','month']

index_cols = ['store_id_code','item_id_code','d']
lag_cols = ['target']
shift_range = [x for x in range(1,29)]

df_train = add_lags(df_train,shift_range,index_cols,lag_cols,exception_cols)

Columns to rename :  ['target', 'target_dept', 'target_item']
['store_id_code', 'item_id_code', 'd', 'target', 'target_dept', 'target_item']


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

copied to train_shift with columns  Index(['store_id_code', 'item_id_code', 'd', 'target', 'target_dept',
       'target_item'],
      dtype='object')
performed the shifting of 1




performed the merge
copied to train_shift with columns  Index(['store_id_code', 'item_id_code', 'd', 'target', 'target_dept',
       'target_item'],
      dtype='object')
performed the shifting of 2




performed the merge
copied to train_shift with columns  Index(['store_id_code', 'item_id_code', 'd', 'target', 'target_dept',
       'target_item'],
      dtype='object')
performed the shifting of 3




performed the merge
copied to train_shift with columns  Index(['store_id_code', 'item_id_code', 'd', 'target', 'target_dept',
       'target_item'],
      dtype='object')
performed the shifting of 4




performed the merge
copied to train_shift with columns  Index(['store_id_code', 'item_id_code', 'd', 'target', 'target_dept',
       'target_item'],
      dtype='object')
performed the shifting of 5




perfo

In [102]:
df_train[(df_train.item_id_code==1) & (df_train.store_id_code==1)][['d','target_dept','target_dept_lag_1','target_dept_lag_2',
                                                                   'target_dept_lag_3','target_dept_lag_4',
                                                                   'target_dept_lag_5']].head(15)

Unnamed: 0,d,target_dept,target_dept_lag_1,target_dept_lag_2,target_dept_lag_3,target_dept_lag_4,target_dept_lag_5
4662,1069,1988,0,0,0,0,0
35152,1070,2254,1988,0,0,0,0
65642,1071,2682,2254,1988,0,0,0
96132,1072,2897,2682,2254,1988,0,0
126622,1073,2910,2897,2682,2254,1988,0
157112,1074,1978,2910,2897,2682,2254,1988
187602,1075,2177,1978,2910,2897,2682,2254
218092,1076,2472,2177,1978,2910,2897,2682
248582,1077,2646,2472,2177,1978,2910,2897
279072,1078,2577,2646,2472,2177,1978,2910


In [92]:
df_train[(df_train.item_id_code==1) & (df_train.store_id_code==1)][['d','target_item','target_item_lag_1','target_item_lag_2',
                                                                   'target_item_lag_3','target_item_lag_4',
                                                                   'target_item_lag_5']].head(15)

Unnamed: 0,target_item,target_item_lag_1,target_item_lag_2,target_item_lag_3,target_item_lag_4,target_item_lag_5
4662,0,0.0,0.0,0.0,0.0,0.0
35152,3,0.0,0.0,0.0,0.0,0.0
65642,4,3.0,0.0,0.0,0.0,0.0
96132,5,4.0,3.0,0.0,0.0,0.0
126622,3,5.0,4.0,3.0,0.0,0.0
157112,5,3.0,5.0,4.0,3.0,0.0
187602,2,5.0,3.0,5.0,4.0,3.0
218092,2,2.0,5.0,3.0,5.0,4.0
248582,4,2.0,2.0,5.0,3.0,5.0
279072,2,4.0,2.0,2.0,5.0,3.0


In [106]:
df_train.target_dept.max(), df_train.target_item.max(), df_train.target.max()

(28153, 1525, 606)

In [93]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26617770 entries, 0 to 26617769
Data columns (total 29 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   id                         object 
 1   d                          int16  
 2   target                     int16  
 3   month                      int8   
 4   year                       int16  
 5   target_dept                int16  
 6   target_item                int16  
 7   store_id_code              int8   
 8   item_id_code               int16  
 9   store_id_code_target_mean  float16
 10  cat_id_code_target_mean    float16
 11  state_id_code_target_mean  float16
 12  item_id_code_target_mean   float16
 13  dept_id_code_target_mean   float16
 14  target_lag_1               float64
 15  target_dept_lag_1          float64
 16  target_item_lag_1          float64
 17  target_lag_2               float64
 18  target_dept_lag_2          float64
 19  target_item_lag_2          float64
 20  

In [94]:
#converting all the lags to be int16
lag_cols = [col for col in df_train.columns if 'lag' in str(col)]
for col in lag_cols:
    df_train[col] = df_train[col].astype('int16')  
    
df_train.info()    

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26617770 entries, 0 to 26617769
Data columns (total 29 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   id                         object 
 1   d                          int16  
 2   target                     int16  
 3   month                      int8   
 4   year                       int16  
 5   target_dept                int16  
 6   target_item                int16  
 7   store_id_code              int8   
 8   item_id_code               int16  
 9   store_id_code_target_mean  float16
 10  cat_id_code_target_mean    float16
 11  state_id_code_target_mean  float16
 12  item_id_code_target_mean   float16
 13  dept_id_code_target_mean   float16
 14  target_lag_1               int16  
 15  target_dept_lag_1          int16  
 16  target_item_lag_1          int16  
 17  target_lag_2               int16  
 18  target_dept_lag_2          int16  
 19  target_item_lag_2          int16  
 20  

# Prediction Idea

**Since we have to predict 28 days, 3049 items, and 10 stores**
* Days = 28
* Items = 3049
* Stores = 10

Total Predictions = 28 x 3049 x 10 = 853,720

**Let's see if this is roughly the sum of each month entries in our dataset**

In [110]:
df_train.groupby(['month','year'], as_index=False).target.count()

Unnamed: 0,month,year,target
0,1,2014,945190
1,1,2015,945190
2,1,2016,945190
3,2,2014,853720
4,2,2015,853720
5,2,2016,884210
6,3,2014,945190
7,3,2015,945190
8,3,2016,945190
9,4,2014,914700


In [None]:
**so, this clears the logic, as we can see that feb-2014 and feb-2015 have similar number of items**

**Let's also suppose that we are to predict only one day**

Then the equation would be : 
    
    * 3049 x 10 = 30,490
    
So, 
    There can be two ways to do that

    
* **First Procedure** 


    for day in range(1,29):
        predict([item,day])
        
        
* **Second Procedure** 

    * seperate each day and create 28 data sets
    for day in range(1,29)
        load_data(day)
        train data
        predict(for day)
