# This notebook is for Basic EDA

In [151]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder

DATA_FOLDER = '..//data//'
d_parser = lambda x: pd.datetime.strptime(x,'%Y-%m-%d')

In [152]:
df_sample_sub = pd.read_csv(os.path.join(DATA_FOLDER,'sample_submission.csv'))
df_stv        = pd.read_csv(os.path.join(DATA_FOLDER,'sales_train_validation.csv'))
df_ste        = pd.read_csv(os.path.join(DATA_FOLDER,'sales_train_evaluation.csv'))
df_prices     = pd.read_csv(os.path.join(DATA_FOLDER,'sell_prices.csv'))
df_calander   = pd.read_csv(os.path.join(DATA_FOLDER,'calendar.csv'), parse_dates=["date"], date_parser=d_parser)

  


### Checking the Sample Submission file

In [153]:
df_sample_sub.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Important things to be noted about the Submission file**
* The submission file is a format, which we have to follow when putting our submissions
* The total number of rows in the submission files will be (total id in validation + total id in evaluation) 

In [154]:
df_stv.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


* **It is quite clear now that the columns which start from d_ are representing days.**
* **We need to convert those days into rows, so that we should be able to process them**

### Checking for the Uniques 

In [155]:
print('Evaluation Dataset: \n', df_ste.agg({'id':'nunique','item_id':'nunique', 'store_id':'nunique'}))
print('\n')
print('Validation Dataset: \n', df_stv.agg({'id':'nunique','item_id':'nunique', 'store_id':'nunique'}))

Evaluation Dataset: 
 id          30490
item_id      3049
store_id       10
dtype: int64


Validation Dataset: 
 id          30490
item_id      3049
store_id       10
dtype: int64


**So, the unique ID in both evaluation and validation is unique_item_id X unique_store_id**

### Converting the Days in the Evaluation and Validation Datasets 
* The column format to be converted into row format

In [156]:
df_ste_rows = df_ste.melt(
id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
var_name ='d',
value_name ='target'
)

df_stv_rows = df_stv.melt(
id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
var_name ='d',
value_name ='target'
)

In [157]:
df_ste_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59181090 entries, 0 to 59181089
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        object
 1   item_id   object
 2   dept_id   object
 3   cat_id    object
 4   store_id  object
 5   state_id  object
 6   d         object
 7   target    int64 
dtypes: int64(1), object(7)
memory usage: 3.5+ GB


In [158]:
df_stv_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58327370 entries, 0 to 58327369
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        object
 1   item_id   object
 2   dept_id   object
 3   cat_id    object
 4   store_id  object
 5   state_id  object
 6   d         object
 7   target    int64 
dtypes: int64(1), object(7)
memory usage: 3.5+ GB


In [159]:
df_ste.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1947 entries, id to d_1941
dtypes: int64(1941), object(6)
memory usage: 452.9+ MB


In [160]:
df_stv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1919 entries, id to d_1913
dtypes: int64(1913), object(6)
memory usage: 446.4+ MB


# down casting and changing the d column

In [161]:
df_ste_rows['d']   = df_ste_rows['d'].apply(lambda x: x.replace('d_',''))
df_ste_rows['d']   = df_ste_rows['d'].astype('int16')

df_stv_rows['d']   = df_stv_rows['d'].apply(lambda x: x.replace('d_',''))
df_stv_rows['d']   = df_stv_rows['d'].astype('int16')

**checking for the rows after conversion**

In [162]:
len(df_ste_rows), len(df_stv_rows), len(df_ste), len(df_stv)

(59181090, 58327370, 30490, 30490)

In [163]:
df_ste_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59181090 entries, 0 to 59181089
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        object
 1   item_id   object
 2   dept_id   object
 3   cat_id    object
 4   store_id  object
 5   state_id  object
 6   d         int16 
 7   target    int64 
dtypes: int16(1), int64(1), object(6)
memory usage: 3.2+ GB


In [164]:
df_stv_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58327370 entries, 0 to 58327369
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        object
 1   item_id   object
 2   dept_id   object
 3   cat_id    object
 4   store_id  object
 5   state_id  object
 6   d         int16 
 7   target    int64 
dtypes: int16(1), int64(1), object(6)
memory usage: 3.2+ GB


# Checking the difference between the Evaluation and Validation Sales data

In [168]:
max_d_in_e = df_ste_rows.d.max()
max_d_in_v = df_stv_rows.d.max()

d = 1449
s = df_ste_rows[(df_ste_rows.d==d) & (df_ste_rows.state_id=='CA')]['target'].sum()
t = df_stv_rows[(df_stv_rows.d==d) & (df_stv_rows.state_id=='CA')]['target'].sum()


print(f'Last day in evaluation: {df_ste_rows.d.max()} and last day in Validation: {df_stv_rows.d.max()}, means 28 days more')
print(f'Evaluation Dataset , total sales for day {d} is {s}, While in Validation it is {t}')
print(f'max for validation is {max_d_in_v} and max in evaluation is {max_d_in_e}')
print(f'total additional days in evaluation are {max_d_in_e - max_d_in_v}')

Last day in evaluation: 1941 and last day in Validation: 1913, means 28 days more
Evaluation Dataset , total sales for day 1449 is 13997, While in Validation it is 13997
max for validation is 1913 and max in evaluation is 1941
total additional days in evaluation are 28


**This means that both data sets are same, and we have to train our model on validation dataset, and** 

### Step-1
* train our model on validation dataset which is until 1913
* predict for  1914 + 28
* evaluate the performance of our dataset from the evaluate dataset, as these dates are available.

### Step-2 (final predictions)
* train our model on evaluation dataset which is until 1941
* predict for 1942 + 28
* submit to kaggle

<font color=red> Or rather we don't use the validation data at all, and extract (last 28 days) from evaluation for test</font>

# Getting rid of validation data frame

In [169]:
del df_stv, df_stv_rows

In [170]:
df_ste_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59181090 entries, 0 to 59181089
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        object
 1   item_id   object
 2   dept_id   object
 3   cat_id    object
 4   store_id  object
 5   state_id  object
 6   d         int16 
 7   target    int64 
dtypes: int16(1), int64(1), object(6)
memory usage: 3.2+ GB


In [171]:
df_ste_rows.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,target
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1,0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1,0


In [179]:
df_ste_rows.item_id.nunique(), df_ste_rows.dept_id.nunique(), df_ste_rows.cat_id.nunique(), df_ste_rows.store_id.nunique()

(3049, 7, 3, 10)

In [183]:
df_ste_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59181090 entries, 0 to 59181089
Data columns (total 13 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   id             object
 1   item_id        object
 2   dept_id        object
 3   cat_id         object
 4   store_id       object
 5   state_id       object
 6   d              int16 
 7   target         int16 
 8   dept_id_code   int32 
 9   cat_id_code    int32 
 10  store_id_code  int32 
 11  state_id_code  int32 
 12  item_id_code   int32 
dtypes: int16(2), int32(5), object(6)
memory usage: 4.0+ GB


### Down casting

In [176]:

df_ste_rows.target   = df_ste_rows.target.astype('int16')

In [177]:
df_ste_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59181090 entries, 0 to 59181089
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        object
 1   item_id   object
 2   dept_id   object
 3   cat_id    object
 4   store_id  object
 5   state_id  object
 6   d         int16 
 7   target    int16 
dtypes: int16(2), object(6)
memory usage: 2.9+ GB


In [None]:
**converting to 

### Analyzing for Calendar

In [34]:
df_calander.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [35]:
df_ste_rows.item_id.nunique(), df_ste_rows.id.nunique(), df_ste_rows.store_id.nunique()

(3049, 30490, 10)

In [36]:
df_ste_rows.target.max(),df_ste_rows.target.min()

(763, 0)

In [37]:
total_weeks_2015 = df_calander[df_calander.year==2015]['wm_yr_wk'].sort_values().nunique()
print('We have ',df_calander[df_calander.year==2015]['wm_yr_wk'].sort_values().nunique(), ' weeks in 2015\n')
df_calander[df_calander.year==2015]['wm_yr_wk'].sort_values().unique()

We have  53  weeks in 2015



array([11448, 11449, 11450, 11451, 11452, 11501, 11502, 11503, 11504,
       11505, 11506, 11507, 11508, 11509, 11510, 11511, 11512, 11513,
       11514, 11515, 11516, 11517, 11518, 11519, 11520, 11521, 11522,
       11523, 11524, 11525, 11526, 11527, 11528, 11529, 11530, 11531,
       11532, 11533, 11534, 11535, 11536, 11537, 11538, 11539, 11540,
       11541, 11542, 11543, 11544, 11545, 11546, 11547, 11548],
      dtype=int64)

In [38]:
df_calander[(df_calander.year==2015) &
           (df_calander.wm_yr_wk==11450)]

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
1442,2015-01-10,11450,Saturday,1,1,2015,d_1443,,,,,1,0,0
1443,2015-01-11,11450,Sunday,2,1,2015,d_1444,,,,,0,1,1
1444,2015-01-12,11450,Monday,3,1,2015,d_1445,,,,,0,1,1
1445,2015-01-13,11450,Tuesday,4,1,2015,d_1446,,,,,0,1,0
1446,2015-01-14,11450,Wednesday,5,1,2015,d_1447,,,,,0,0,1
1447,2015-01-15,11450,Thursday,6,1,2015,d_1448,,,,,0,1,1
1448,2015-01-16,11450,Friday,7,1,2015,d_1449,,,,,0,0,0


In [39]:
t = df_ste_rows[(df_ste_rows.d==1443) & (df_ste_rows.state_id=='CA')]['target'].sum()
print(f'So in california state, we had {t} items sold on day 1443')

So in california state, we had 19244 items sold on day 1443


# snap

There are 3 binary variables with a prefix "snap_" plus the state name.

snapCA, snapTX, and snap_WI: A binary variable (0 or 1) indicating whether the stores of CA, TX or WI allow SNAP purchases on the examined date. 1 indicates that SNAP purchases are allowed.

For those who is not familiar with SNAP like me;
"The United States federal government provides a nutrition assistance benefit called the Supplement Nutrition Assistance Program (SNAP). SNAP provides low income families and individuals with an Electronic Benefits Transfer debit card to purchase food products. In many states, the monetary benefits are dispersed to people across 10 days of the month and on each of these days 1/10 of the people will receive the benefit on their card."
Source: https://www.fns.usda.gov/snap/supplemental-nutrition-assistance-program

**there is not point in keeping a prefix of d_ with the d column as we all know that this is a day number sequence**

In [40]:
df_calander['d'] = df_calander['d'].apply(lambda x: x.replace('d_',''))
df_calander['d'] = df_calander['d'].astype('int16')

# Join Calendar and Sales (Evaluation)

In [41]:
dfmain = df_ste_rows.merge(df_calander[['date','wm_yr_wk','wday','d','snap_CA','snap_TX','snap_WI']], on=['d'], how='left')

**Verifying the merge**

In [25]:
t = df_ste_rows[(df_ste_rows.d==1443) & (df_ste_rows.state_id=='CA')]['target'].sum()
print(f'So in california state, we had {t} items sold on day 1443')

So in california state, we had 19244 items sold on day 1443


In [42]:
dfmain[(dfmain.d==1443)][['date','d','wm_yr_wk']].drop_duplicates()

Unnamed: 0,date,d,wm_yr_wk
43966580,2015-01-10,1443,11450


In [43]:
df_calander[(df_calander.d==1443)][['date','d','wm_yr_wk']].drop_duplicates()

Unnamed: 0,date,d,wm_yr_wk
1442,2015-01-10,1443,11450


# Checking the Prices

In [44]:
df_prices.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26


**so, this is basically item prices on a particular week in a particular store**
* We can easily join the item prices to the main data frame.

In [45]:
dfmain = dfmain.merge(df_prices, how='left', on=['store_id','item_id','wm_yr_wk'])

### Verifying

### Cleaning the memory and removing the unwanted datasets

In [46]:
dfmain[(dfmain.d==1443)]

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,target,date,wm_yr_wk,wday,snap_CA,snap_TX,snap_WI,sell_price
43966580,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1443,1,2015-01-10,11450,1,1,0,0,8.26
43966581,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1443,1,2015-01-10,11450,1,1,0,0,3.97
43966582,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1443,0,2015-01-10,11450,1,1,0,0,2.97
43966583,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1443,1,2015-01-10,11450,1,1,0,0,4.64
43966584,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1443,1,2015-01-10,11450,1,1,0,0,2.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43997065,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,1443,1,2015-01-10,11450,1,1,0,0,2.88
43997066,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,1443,0,2015-01-10,11450,1,1,0,0,2.68
43997067,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,1443,1,2015-01-10,11450,1,1,0,0,3.98
43997068,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,1443,0,2015-01-10,11450,1,1,0,0,1.28


In [47]:
dfmain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59181090 entries, 0 to 59181089
Data columns (total 15 columns):
 #   Column      Dtype         
---  ------      -----         
 0   id          category      
 1   item_id     object        
 2   dept_id     category      
 3   cat_id      category      
 4   store_id    object        
 5   state_id    category      
 6   d           int16         
 7   target      int16         
 8   date        datetime64[ns]
 9   wm_yr_wk    int64         
 10  wday        int64         
 11  snap_CA     int64         
 12  snap_TX     int64         
 13  snap_WI     int64         
 14  sell_price  float64       
dtypes: category(4), datetime64[ns](1), float64(1), int16(2), int64(5), object(2)
memory usage: 4.9+ GB


In [35]:
dfmain.store_id.nunique()

10

### down casting, label encoding and removing unwanted columns

In [None]:
le = LabelEncoder()
df_ste_rows['dept_id_code'] = le.fit_transform(df_ste_rows.dept_id)
df_ste_rows['cat_id_code'] = le.fit_transform(df_ste_rows.cat_id)
df_ste_rows['store_id_code'] = le.fit_transform(df_ste_rows.store_id)
df_ste_rows['state_id_code'] = le.fit_transform(df_ste_rows.state_id)
df_ste_rows['item_id_code'] = le.fit_transform(df_ste_rows.item_id)

In [52]:
dfmain.store_id = dfmain.store_id.astype('category')
dfmain.item_id  = dfmain.item_id.astype('category')
dfmain.snap_CA  = dfmain.snap_CA.astype('int8')
dfmain.snap_TX  = dfmain.snap_TX.astype('int8')
dfmain.snap_WI  = dfmain.snap_WI.astype('int8')
dfmain.sell_price = dfmain.sell_price.astype('float16')
dfmain.wday  = dfmain.wday.astype('int8')
dfmain.wm_yr_wk  = dfmain.wm_yr_wk.astype('int16')

In [53]:
dfmain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59181090 entries, 0 to 59181089
Data columns (total 15 columns):
 #   Column      Dtype         
---  ------      -----         
 0   id          category      
 1   item_id     category      
 2   dept_id     category      
 3   cat_id      category      
 4   store_id    category      
 5   state_id    category      
 6   d           int16         
 7   target      int16         
 8   date        datetime64[ns]
 9   wm_yr_wk    int16         
 10  wday        int8          
 11  snap_CA     int8          
 12  snap_TX     int8          
 13  snap_WI     int8          
 14  sell_price  float16       
dtypes: category(6), datetime64[ns](1), float16(1), int16(3), int8(4)
memory usage: 2.0 GB


In [54]:
def add_date_attributes(dfmain):
    dfmain['day'] = dfmain['date'].dt.day
    return dfmain

In [55]:
dfmain = add_date_attributes(dfmain)

In [56]:
dfmain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59181090 entries, 0 to 59181089
Data columns (total 16 columns):
 #   Column      Dtype         
---  ------      -----         
 0   id          category      
 1   item_id     category      
 2   dept_id     category      
 3   cat_id      category      
 4   store_id    category      
 5   state_id    category      
 6   d           int16         
 7   target      int16         
 8   date        datetime64[ns]
 9   wm_yr_wk    int16         
 10  wday        int8          
 11  snap_CA     int8          
 12  snap_TX     int8          
 13  snap_WI     int8          
 14  sell_price  float16       
 15  day         int64         
dtypes: category(6), datetime64[ns](1), float16(1), int16(3), int64(1), int8(4)
memory usage: 2.4 GB


In [57]:
dfmain.day  = dfmain.day.astype('int8')

In [58]:
dfmain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59181090 entries, 0 to 59181089
Data columns (total 16 columns):
 #   Column      Dtype         
---  ------      -----         
 0   id          category      
 1   item_id     category      
 2   dept_id     category      
 3   cat_id      category      
 4   store_id    category      
 5   state_id    category      
 6   d           int16         
 7   target      int16         
 8   date        datetime64[ns]
 9   wm_yr_wk    int16         
 10  wday        int8          
 11  snap_CA     int8          
 12  snap_TX     int8          
 13  snap_WI     int8          
 14  sell_price  float16       
 15  day         int8          
dtypes: category(6), datetime64[ns](1), float16(1), int16(3), int8(5)
memory usage: 2.0 GB


In [66]:
dfmain.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,target,date,wm_yr_wk,wday,snap_CA,snap_TX,snap_WI,sell_price,day
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,1,0,0,0,0.0,29
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,1,0,0,0,0.0,29
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,1,0,0,0,0.0,29
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,1,0,0,0,0.0,29
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,1,0,0,0,0.0,29


In [65]:
dfmain['sell_price'].fillna(0, inplace=True)

# Preparing the Data for Training

# Adding Means

In [67]:
### Adding Means
def add_mean(dfmain, col):
    mean_attrib = col + '_target_mean'
    mean_values = dfmain.groupby(col).target.mean()
    dfmain[mean_attrib] = dfmain[col].map(mean_values)
    return dfmain

In [68]:
dfmain = add_mean(dfmain,'store_id')
dfmain = add_mean(dfmain,'id')
dfmain = add_mean(dfmain,'cat_id')
dfmain = add_mean(dfmain,'state_id')
dfmain = add_mean(dfmain,'item_id')

In [76]:
dfmain = add_mean(dfmain,'dept_id')

In [71]:
dfmain[['store_id','store_id_target_mean']].drop_duplicates()

Unnamed: 0,store_id,store_id_target_mean
0,CA_1,1.323438
3049,CA_2,0.983151
6098,CA_3,1.92013
9147,CA_4,0.706735
12196,TX_1,0.961933
15245,TX_2,1.238511
18294,TX_3,1.048636
21343,WI_1,0.889052
24392,WI_2,1.131778
27441,WI_3,1.105515


In [72]:
dfmain[['cat_id','cat_id_target_mean']].drop_duplicates()

Unnamed: 0,cat_id,cat_id_target_mean
0,HOBBIES,0.569058
565,HOUSEHOLD,0.726498
1612,FOODS,1.646427


In [77]:
dfmain[['dept_id','dept_id_target_mean']].drop_duplicates()

Unnamed: 0,dept_id,dept_id_target_mean
0,HOBBIES_1,0.705799
416,HOBBIES_2,0.187284
565,HOUSEHOLD_1,1.135262
1097,HOUSEHOLD_2,0.304241
1612,FOODS_1,1.238003
1828,FOODS_2,1.009041
2226,FOODS_3,2.061858


In [136]:
dfmain.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'target', 'date', 'wm_yr_wk', 'wday', 'snap_CA', 'snap_TX', 'snap_WI',
       'sell_price', 'day', 'store_id_target_mean', 'id_target_mean',
       'cat_id_target_mean', 'state_id_target_mean', 'item_id_target_mean',
       'dept_id_target_mean'],
      dtype='object')

In [138]:
df_train = dfmain.drop(['cat_id','state_id','date','wm_yr_wk','wday','sell_price'], axis=1)

In [147]:
df_train.dept_id_target_mean = df_train.dept_id_target_mean.astype('float16')
df_train.item_id_target_mean = df_train.item_id_target_mean.astype('float16')
df_train.state_id_target_mean= df_train.state_id_target_mean.astype('float16')
df_train.store_id_target_mean= df_train.store_id_target_mean.astype('float16')
df_train.id_target_mean      = df_train.id_target_mean.astype('float16')

In [149]:
df_train.item_id.nunique(), df_train.dept_id.nunique(), 

3049

In [148]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59181090 entries, 0 to 59181089
Data columns (total 16 columns):
 #   Column                Dtype   
---  ------                -----   
 0   id                    category
 1   item_id               category
 2   dept_id               category
 3   store_id              category
 4   d                     int16   
 5   target                int16   
 6   snap_CA               int8    
 7   snap_TX               int8    
 8   snap_WI               int8    
 9   day                   int8    
 10  store_id_target_mean  float16 
 11  id_target_mean        float16 
 12  cat_id_target_mean    category
 13  state_id_target_mean  float16 
 14  item_id_target_mean   float16 
 15  dept_id_target_mean   float16 
dtypes: category(5), float16(5), int16(2), int8(4)
memory usage: 4.3 GB


In [121]:
def add_lags(df, shift_range,index_cols, lag_cols, exception_cols):
    cols_to_rename = list(df.columns.difference(index_cols + exception_cols)) 
    
    print('Columns to rename : ',cols_to_rename)

    for day_shift in tqdm_notebook(shift_range):
        train_shift = df[index_cols + cols_to_rename].copy()
        print('copied to train_shift')
        train_shift['d'] = train_shift['d'] + day_shift
        print(f'performed the shifting of {day_shift}')

        foo = lambda x: '{}_lag_{}'.format(x, day_shift) if x in cols_to_rename else x
        train_shift = train_shift.rename(columns=foo)
        print('columns are renamed for ', foo)

        print('\n')
        print(df.columns)
        print('\n')
        print('\n')
        print(train_shift.columns)        
        df = pd.merge(df, train_shift, on=index_cols, how='left')
        print('\n')
        print('performed the merge')
        
    return df

**So the index columns where we will merge the lags will be**
* d
* item_id
* store_id

the obvious reason to chose these three is the fact that, id column is a concatenation of dept_id and store_id

**while one thing, that I am not able to findout is that the id has either _validation or _evaluation suffix.**

whether we are going to use _validation suffice in the id while submitting, I am not sure but once we submit it. 

In [123]:
dfmain.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,target,date,wm_yr_wk,...,snap_TX,snap_WI,sell_price,day,store_id_target_mean,id_target_mean,cat_id_target_mean,state_id_target_mean,item_id_target_mean,dept_id_target_mean
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,...,0,0,0.0,29,1.323438,0.326121,0.569058,1.233363,0.219629,0.705799
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,...,0,0,0.0,29,1.323438,0.257599,0.569058,1.233363,0.263524,0.705799
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,...,0,0,0.0,29,1.323438,0.159196,0.569058,1.233363,0.077795,0.705799
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,...,0,0,0.0,29,1.323438,1.719217,0.569058,1.233363,2.04137,0.705799
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,...,0,0,0.0,29,1.323438,0.972694,0.569058,1.233363,0.766358,0.705799


In [124]:
mean_enc_cols = [col for col in dfmain.columns if 'mean' in str(col)]
exception_cols = mean_enc_cols + ['cat_id', 'date', 'day', 'id','sell_price', 'snap_CA', 'snap_TX', 'snap_WI',
                                  'state_id', 'wday', 'wm_yr_wk']

index_cols = ['store_id','item_id','d']
lag_cols = ['target','dept_id','store_id','item_id']
shift_range = [x for x in range(1,3)]

dfmain = add_lags(dfmain,shift_range,index_cols,lag_cols,exception_cols)

Columns to rename :  ['dept_id', 'target']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

copied to train_shift
performed the shifting of 1
columns are renamed for  <function add_lags.<locals>.<lambda> at 0x0000019F1E8595E8>


Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'target', 'date', 'wm_yr_wk', 'wday', 'snap_CA', 'snap_TX', 'snap_WI',
       'sell_price', 'day', 'store_id_target_mean', 'id_target_mean',
       'cat_id_target_mean', 'state_id_target_mean', 'item_id_target_mean',
       'dept_id_target_mean'],
      dtype='object')




Index(['store_id', 'item_id', 'd', 'dept_id_lag_1', 'target_lag_1'], dtype='object')


performed the merge
copied to train_shift
performed the shifting of 2
columns are renamed for  <function add_lags.<locals>.<lambda> at 0x0000019F1E859798>


Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'target', 'date', 'wm_yr_wk', 'wday', 'snap_CA', 'snap_TX', 'snap_WI',
       'sell_price', 'day', 'store_id_target_mean', 'id_target_mean',
       'cat_id_target_mean', 'state_id_tar

['item_id', 'd', 'dept_id', 'store_id']

In [128]:
dfmain[dfmain.dept_id=='HOBBIES_1'][['dept_id','dept_id_lag_1','dept_id_lag_2','dept_id_lag_3','dept_id_lag_4']]

Unnamed: 0,dept_id,dept_id_lag_1,dept_id_lag_2,dept_id_lag_3,dept_id_lag_4
0,HOBBIES_1,,,,
1,HOBBIES_1,,,,
2,HOBBIES_1,,,,
3,HOBBIES_1,,,,
4,HOBBIES_1,,,,
...,...,...,...,...,...
59178452,HOBBIES_1,HOBBIES_1,HOBBIES_1,HOBBIES_1,HOBBIES_1
59178453,HOBBIES_1,HOBBIES_1,HOBBIES_1,HOBBIES_1,HOBBIES_1
59178454,HOBBIES_1,HOBBIES_1,HOBBIES_1,HOBBIES_1,HOBBIES_1
59178455,HOBBIES_1,HOBBIES_1,HOBBIES_1,HOBBIES_1,HOBBIES_1


In [127]:
print(dfmain.dept_id.unique())

[HOBBIES_1, HOBBIES_2, HOUSEHOLD_1, HOUSEHOLD_2, FOODS_1, FOODS_2, FOODS_3]
Categories (7, object): [HOBBIES_1, HOBBIES_2, HOUSEHOLD_1, HOUSEHOLD_2, FOODS_1, FOODS_2, FOODS_3]


In [129]:
dfmain.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'target', 'date', 'wm_yr_wk', 'wday', 'snap_CA', 'snap_TX', 'snap_WI',
       'sell_price', 'day', 'store_id_target_mean', 'id_target_mean',
       'cat_id_target_mean', 'state_id_target_mean', 'item_id_target_mean',
       'dept_id_target_mean', 'dept_id_lag_1', 'target_lag_1', 'dept_id_lag_2',
       'target_lag_2', 'dept_id_lag_3', 'target_lag_3', 'dept_id_lag_4',
       'target_lag_4'],
      dtype='object')

In [133]:
dfmain.drop(['dept_id_lag_1', 'target_lag_1', 'dept_id_lag_2',
       'target_lag_2', 'dept_id_lag_3', 'target_lag_3', 'dept_id_lag_4',
       'target_lag_4'], inplace=True, axis=1)

0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
           ... 
59181085    1.0
59181086    1.0
59181087    0.0
59181088    1.0
59181089    5.0
Name: target_lag_1, Length: 59181090, dtype: float64