# This notebook is for Basic EDA

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
import gc
from datetime import datetime

DATA_FOLDER = '..//data//'
d_parser = lambda x: pd.datetime.strptime(x,'%Y-%m-%d')

import warnings
warnings.filterwarnings("ignore")

In [2]:
start_time = datetime.now()
print(datetime.now())
df_sample_sub = pd.read_csv(os.path.join(DATA_FOLDER,'sample_submission.csv'))
df_stv        = pd.read_csv(os.path.join(DATA_FOLDER,'sales_train_validation.csv'))
df_ste        = pd.read_csv(os.path.join(DATA_FOLDER,'sales_train_evaluation.csv'))
df_prices     = pd.read_csv(os.path.join(DATA_FOLDER,'sell_prices.csv'))
df_calander   = pd.read_csv(os.path.join(DATA_FOLDER,'calendar.csv'), parse_dates=["date"], date_parser=d_parser)


2020-06-25 03:27:26.648604


### Checking the Sample Submission file

In [171]:
df_submission.id.nunique()

30490

In [178]:
df_sample_sub.id.head(), df_sample_sub.id.tail()

(0    HOBBIES_1_001_CA_1_validation
 1    HOBBIES_1_002_CA_1_validation
 2    HOBBIES_1_003_CA_1_validation
 3    HOBBIES_1_004_CA_1_validation
 4    HOBBIES_1_005_CA_1_validation
 Name: id, dtype: object,
 60975    FOODS_3_823_WI_3_evaluation
 60976    FOODS_3_824_WI_3_evaluation
 60977    FOODS_3_825_WI_3_evaluation
 60978    FOODS_3_826_WI_3_evaluation
 60979    FOODS_3_827_WI_3_evaluation
 Name: id, dtype: object)

**Important things to be noted about the Submission file**
* The submission file is a format, which we have to follow when putting our submissions
* The total number of rows in the submission files will be (total_items x total_stores x last 28 days) 

In [4]:
df_stv.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


* **It is quite clear now that the columns which start from d_ are representing days.**
* **We need to convert those days into rows, so that we should be able to process them**

### Checking for the Uniques 

In [5]:
print('Evaluation Dataset: \n', df_ste.agg({'id':'nunique','item_id':'nunique', 'store_id':'nunique'}))
print('\n')
print('Validation Dataset: \n', df_stv.agg({'id':'nunique','item_id':'nunique', 'store_id':'nunique'}))

Evaluation Dataset: 
 id          30490
item_id      3049
store_id       10
dtype: int64


Validation Dataset: 
 id          30490
item_id      3049
store_id       10
dtype: int64


**So, the unique ID in both evaluation and validation is unique_item_id X unique_store_id**

### Converting the Days in the Evaluation and Validation Datasets 
* The column format to be converted into row format

In [6]:
df_ste_rows = df_ste.melt(
id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
var_name ='d',
value_name ='target'
)

df_stv_rows = df_stv.melt(
id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
var_name ='d',
value_name ='target'
)

In [7]:
df_ste_rows.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,target
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0


In [8]:
df_ste_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59181090 entries, 0 to 59181089
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        object
 1   item_id   object
 2   dept_id   object
 3   cat_id    object
 4   store_id  object
 5   state_id  object
 6   d         object
 7   target    int64 
dtypes: int64(1), object(7)
memory usage: 3.5+ GB


In [9]:
df_stv_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58327370 entries, 0 to 58327369
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        object
 1   item_id   object
 2   dept_id   object
 3   cat_id    object
 4   store_id  object
 5   state_id  object
 6   d         object
 7   target    int64 
dtypes: int64(1), object(7)
memory usage: 3.5+ GB


In [10]:
df_ste.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1947 entries, id to d_1941
dtypes: int64(1941), object(6)
memory usage: 452.9+ MB


In [11]:
df_stv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1919 entries, id to d_1913
dtypes: int64(1913), object(6)
memory usage: 446.4+ MB


**the colum d has a suffix d_ , we can remove that**

In [12]:
df_ste_rows['d']   = df_ste_rows['d'].apply(lambda x: x.replace('d_',''))
df_ste_rows['d']   = df_ste_rows['d'].astype('int16')

df_stv_rows['d']   = df_stv_rows['d'].apply(lambda x: x.replace('d_',''))
df_stv_rows['d']   = df_stv_rows['d'].astype('int16')

**checking for the rows after conversion**

In [13]:
len(df_ste_rows), len(df_stv_rows), len(df_ste), len(df_stv)

(59181090, 58327370, 30490, 30490)

In [14]:
df_ste_rows.shape, df_stv_rows.shape

((59181090, 8), (58327370, 8))

**as we can see that there are 60 million rows, we have to do some serious down casting here...**
* I am also beginning to think that there must be a way to manage the data without having to do **melt**

In [15]:
df_ste_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59181090 entries, 0 to 59181089
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        object
 1   item_id   object
 2   dept_id   object
 3   cat_id    object
 4   store_id  object
 5   state_id  object
 6   d         int16 
 7   target    int64 
dtypes: int16(1), int64(1), object(6)
memory usage: 3.2+ GB


In [16]:
df_stv_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58327370 entries, 0 to 58327369
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        object
 1   item_id   object
 2   dept_id   object
 3   cat_id    object
 4   store_id  object
 5   state_id  object
 6   d         int16 
 7   target    int64 
dtypes: int16(1), int64(1), object(6)
memory usage: 3.2+ GB


# Checking the difference between the Evaluation and Validation Sales data

In [17]:
max_d_in_e = df_ste_rows.d.max()
max_d_in_v = df_stv_rows.d.max()

d = 600
s = df_ste_rows[(df_ste_rows.d==d) & (df_ste_rows.state_id=='CA')]['target'].sum()
t = df_stv_rows[(df_stv_rows.d==d) & (df_stv_rows.state_id=='CA')]['target'].sum()


print(f'Last day in evaluation: {df_ste_rows.d.max()} and last day in Validation: {df_stv_rows.d.max()}, means 28 days more')
print(f'Evaluation Dataset , total sales for day {d} is {s}, While in Validation it is {t}')
print(f'max for validation is {max_d_in_v} and max in evaluation is {max_d_in_e}')
print(f'total additional days in evaluation are {max_d_in_e - max_d_in_v}')

Last day in evaluation: 1941 and last day in Validation: 1913, means 28 days more
Evaluation Dataset , total sales for day 600 is 11513, While in Validation it is 11513
max for validation is 1913 and max in evaluation is 1941
total additional days in evaluation are 28


**This means that both data sets are same, and we have to train our model on validation dataset, and** 

### Step-1
* train our model on validation dataset which is until 1913
* predict for  1913 + 28
* evaluate the performance of our dataset from the evaluate dataset, as these dates are available.

### Step-2 (final predictions)
* train our model on evaluation dataset which is until 1941
* predict for 1942 + 28
* submit to kaggle

<font color=red> Or rather we don't use the validation data at all, and extract (last 28 days) from evaluation for test</font>

# Getting rid of validation data frame

In [18]:
del df_stv, df_stv_rows
gc.collect()

20

In [19]:
df_ste_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59181090 entries, 0 to 59181089
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        object
 1   item_id   object
 2   dept_id   object
 3   cat_id    object
 4   store_id  object
 5   state_id  object
 6   d         int16 
 7   target    int64 
dtypes: int16(1), int64(1), object(6)
memory usage: 3.2+ GB


In [20]:
df_ste_rows.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,target
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1,0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1,0


### Down casting

In [21]:
df_ste_rows.d.max(), df_ste_rows.target.max()

(1941, 763)

In [22]:
df_ste_rows.d   = df_ste_rows.d.astype('int16')
df_ste_rows.target   = df_ste_rows.target.astype('int16')
gc.collect()

80

In [23]:
df_ste_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59181090 entries, 0 to 59181089
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        object
 1   item_id   object
 2   dept_id   object
 3   cat_id    object
 4   store_id  object
 5   state_id  object
 6   d         int16 
 7   target    int16 
dtypes: int16(2), object(6)
memory usage: 2.9+ GB


### Analyzing for Calendar

In [24]:
df_calander.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [25]:
len(df_calander)

1969

In [26]:
total_weeks_2015 = df_calander[df_calander.year==2017]['wm_yr_wk'].sort_values().nunique()
print('We have ',df_calander[df_calander.year==2015]['wm_yr_wk'].sort_values().nunique(), ' weeks in 2015\n')
df_calander[df_calander.year==2015]['wm_yr_wk'].sort_values().unique()

We have  53  weeks in 2015



array([11448, 11449, 11450, 11451, 11452, 11501, 11502, 11503, 11504,
       11505, 11506, 11507, 11508, 11509, 11510, 11511, 11512, 11513,
       11514, 11515, 11516, 11517, 11518, 11519, 11520, 11521, 11522,
       11523, 11524, 11525, 11526, 11527, 11528, 11529, 11530, 11531,
       11532, 11533, 11534, 11535, 11536, 11537, 11538, 11539, 11540,
       11541, 11542, 11543, 11544, 11545, 11546, 11547, 11548],
      dtype=int64)

In [27]:
df_calander[(df_calander.year==2015) &
           (df_calander.wm_yr_wk==11507)]

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
1505,2015-03-14,11507,Saturday,1,3,2015,d_1506,,,,,0,0,1
1506,2015-03-15,11507,Sunday,2,3,2015,d_1507,,,,,0,1,1
1507,2015-03-16,11507,Monday,3,3,2015,d_1508,,,,,0,0,0
1508,2015-03-17,11507,Tuesday,4,3,2015,d_1509,StPatricksDay,Cultural,,,0,0,0
1509,2015-03-18,11507,Wednesday,5,3,2015,d_1510,,,,,0,0,0
1510,2015-03-19,11507,Thursday,6,3,2015,d_1511,,,,,0,0,0
1511,2015-03-20,11507,Friday,7,3,2015,d_1512,,,,,0,0,0


In [28]:
t = df_ste_rows[(df_ste_rows.d==1443) & (df_ste_rows.state_id=='CA')]['target'].sum()
print(f'So in california state, we had {t} items sold on day 1443')

So in california state, we had 19244 items sold on day 1443


# snap

There are 3 binary variables with a prefix "snap_" plus the state name.

snapCA, snapTX, and snap_WI: A binary variable (0 or 1) indicating whether the stores of CA, TX or WI allow SNAP purchases on the examined date. 1 indicates that SNAP purchases are allowed.

For those who is not familiar with SNAP like me;
"The United States federal government provides a nutrition assistance benefit called the Supplement Nutrition Assistance Program (SNAP). SNAP provides low income families and individuals with an Electronic Benefits Transfer debit card to purchase food products. In many states, the monetary benefits are dispersed to people across 10 days of the month and on each of these days 1/10 of the people will receive the benefit on their card."
Source: https://www.fns.usda.gov/snap/supplemental-nutrition-assistance-program

**there is not point in keeping a prefix of d_ with the d column as we all know that this is a day number sequence**

In [29]:
df_calander['d'] = df_calander['d'].apply(lambda x: x.replace('d_',''))
df_calander['d'] = df_calander['d'].astype('int16')
gc.collect()

60

In [30]:
#but before deleting the date, we may add dofm (day of month)
def get_d_of_m(col):
    return col.day
df_calander['dom'] = df_calander['date'].apply(get_d_of_m)

In [31]:
df_calander[['date','dom']]

Unnamed: 0,date,dom
0,2011-01-29,29
1,2011-01-30,30
2,2011-01-31,31
3,2011-02-01,1
4,2011-02-02,2
...,...,...
1964,2016-06-15,15
1965,2016-06-16,16
1966,2016-06-17,17
1967,2016-06-18,18


# Join Calendar and Sales (Evaluation)

In [32]:
dfmain = df_ste_rows.merge(df_calander[['date','wm_yr_wk','wday','d','month','year','dom']], on=['d'], how='left')

**Verifying the merge**

In [33]:
t = df_ste_rows[(df_ste_rows.d==1443) & (df_ste_rows.state_id=='CA')]['target'].sum()
print(f'So in california state, we had {t} items sold on day 1443')

So in california state, we had 19244 items sold on day 1443


In [34]:
s = dfmain[(dfmain.d==1443) & (dfmain.state_id=='CA')]['target'].sum()
print(f'So in california state, we had {s} items sold on day 1443')

So in california state, we had 19244 items sold on day 1443


In [35]:
# delete the unwanted data sets
del df_ste,df_ste_rows, df_calander
gc.collect()

100

In [36]:
dfmain[(dfmain.d==1443)][['date','d','wm_yr_wk','dom']].drop_duplicates()

Unnamed: 0,date,d,wm_yr_wk,dom
43966580,2015-01-10,1443,11450,10


# Checking the Prices

In [37]:
df_prices.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26


In [38]:
dfmain.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,target,date,wm_yr_wk,wday,month,year,dom
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,1,1,2011,29
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,1,1,2011,29
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,1,1,2011,29
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,1,1,2011,29
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,11101,1,1,2011,29


**so, this is basically item prices on a particular week in a particular store**
* We can easily join the item prices to the main data frame.

In [39]:
dfmain = dfmain.merge(df_prices, how='left', on=['store_id','item_id','wm_yr_wk'])

In [40]:
dfmain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59181090 entries, 0 to 59181089
Data columns (total 15 columns):
 #   Column      Dtype         
---  ------      -----         
 0   id          object        
 1   item_id     object        
 2   dept_id     object        
 3   cat_id      object        
 4   store_id    object        
 5   state_id    object        
 6   d           int16         
 7   target      int16         
 8   date        datetime64[ns]
 9   wm_yr_wk    int64         
 10  wday        int64         
 11  month       int64         
 12  year        int64         
 13  dom         int64         
 14  sell_price  float64       
dtypes: datetime64[ns](1), float64(1), int16(2), int64(5), object(6)
memory usage: 6.4+ GB


### Adding Group by for Item and store for lags

* what is the sales of a particular item across the country on a particular day
    * That will be used as a lagged feature
* what is the performance of a particular department across the country on a particular day 
    * That too, can be used as a lagged feature

In [41]:
# by item
gb_item  = dfmain.groupby(['item_id','d'], as_index=False).target.sum()
gb_item.rename(columns={'target':'target_item'}, inplace=True)    

# by dept
gb_dept  = dfmain.groupby(['dept_id','d'], as_index=False).target.sum()
gb_dept.rename(columns={'target':'target_dept'}, inplace=True)    

In [42]:
#adding the item groupped target and store groupped target as feature
dfmain = dfmain.merge(gb_dept, how='left', on=['dept_id','d']).fillna(0)
dfmain = dfmain.merge(gb_item, how='left', on=['item_id','d']).fillna(0)

In [43]:
del gb_dept, gb_item
gc.collect()

20

In [44]:
dfmain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59181090 entries, 0 to 59181089
Data columns (total 17 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           object        
 1   item_id      object        
 2   dept_id      object        
 3   cat_id       object        
 4   store_id     object        
 5   state_id     object        
 6   d            int16         
 7   target       int16         
 8   date         datetime64[ns]
 9   wm_yr_wk     int64         
 10  wday         int64         
 11  month        int64         
 12  year         int64         
 13  dom          int64         
 14  sell_price   float64       
 15  target_dept  int16         
 16  target_item  int16         
dtypes: datetime64[ns](1), float64(1), int16(4), int64(5), object(6)
memory usage: 6.6+ GB


### down casting, label encoding and removing unwanted columns

In [45]:
#converting all the ids to label encoded values
le = LabelEncoder()
dfmain['dept_id_code'] = le.fit_transform(dfmain.dept_id)
dfmain['cat_id_code'] = le.fit_transform(dfmain.cat_id)
dfmain['store_id_code'] = le.fit_transform(dfmain.store_id)
dfmain['state_id_code'] = le.fit_transform(dfmain.state_id)
dfmain['item_id_code'] = le.fit_transform(dfmain.item_id)

#deleting all such columns
dfmain.drop(['dept_id','cat_id','store_id','state_id','item_id'], axis=1, inplace=True)

dfmain.dept_id_code.max(), dfmain.cat_id_code.max(),dfmain.store_id_code.max(),dfmain.item_id_code.max()

(6, 2, 9, 3048)

In [46]:
dfmain.item_id_code.max()

3048

In [47]:
dfmain.wm_yr_wk     = dfmain.wm_yr_wk.astype('int16')
dfmain.wday         = dfmain.wday.astype('int8')
dfmain.month        = dfmain.month.astype('int8')
dfmain.year         = dfmain.year.astype('int16')
dfmain.sell_price   = dfmain.sell_price.astype('float16')
dfmain.dept_id_code = dfmain.dept_id_code.astype('int8')
dfmain.cat_id_code  = dfmain.cat_id_code.astype('int8')
dfmain.store_id_code= dfmain.store_id_code.astype('int8')
dfmain.state_id_code= dfmain.state_id_code.astype('int8')
dfmain.item_id_code = dfmain.item_id_code.astype('int16')
dfmain.dom          = dfmain.dom.astype('int8')

**delete the date as well, as it is not needed**

In [48]:
dfmain.drop(['date'], inplace=True, axis=1)

In [49]:
dfmain['sell_price'].fillna(0, inplace=True)

In [50]:
dfmain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59181090 entries, 0 to 59181089
Data columns (total 16 columns):
 #   Column         Dtype  
---  ------         -----  
 0   id             object 
 1   d              int16  
 2   target         int16  
 3   wm_yr_wk       int16  
 4   wday           int8   
 5   month          int8   
 6   year           int16  
 7   dom            int8   
 8   sell_price     float16
 9   target_dept    int16  
 10  target_item    int16  
 11  dept_id_code   int8   
 12  cat_id_code    int8   
 13  store_id_code  int8   
 14  state_id_code  int8   
 15  item_id_code   int16  
dtypes: float16(1), int16(7), int8(7), object(1)
memory usage: 2.1+ GB


In [51]:
dfmain.isna().sum()

id               0
d                0
target           0
wm_yr_wk         0
wday             0
month            0
year             0
dom              0
sell_price       0
target_dept      0
target_item      0
dept_id_code     0
cat_id_code      0
store_id_code    0
state_id_code    0
item_id_code     0
dtype: int64

# Preparing the Data for Training

# Adding Means

In [52]:
### Adding Means

'''
in future feature sets, we should be adding 
1 - store_state_target_mean 
2 - store_dept_target_mean etc 
3 - wday_target_mean
4 - month_target_mean
  - others
'''

def add_mean(dfmain, col):
    mean_attrib = col + '_target_mean'
    mean_values = dfmain.groupby(col).target.mean()
    dfmain[mean_attrib] = dfmain[col].map(mean_values)
    return dfmain

In [53]:
dfmain = add_mean(dfmain,'store_id_code') # will not delete, as it is index column
dfmain = add_mean(dfmain,'cat_id_code') # will DELETE, as we got the mean from it
dfmain = add_mean(dfmain,'state_id_code') # will DELETE, as we got the mean from it
dfmain = add_mean(dfmain,'item_id_code') # will not delete, as it is index column
dfmain = add_mean(dfmain,'dept_id_code')# will DELETE, as we got the mean from it

In [54]:
dfmain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59181090 entries, 0 to 59181089
Data columns (total 21 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   id                         object 
 1   d                          int16  
 2   target                     int16  
 3   wm_yr_wk                   int16  
 4   wday                       int8   
 5   month                      int8   
 6   year                       int16  
 7   dom                        int8   
 8   sell_price                 float16
 9   target_dept                int16  
 10  target_item                int16  
 11  dept_id_code               int8   
 12  cat_id_code                int8   
 13  store_id_code              int8   
 14  state_id_code              int8   
 15  item_id_code               int16  
 16  store_id_code_target_mean  float64
 17  cat_id_code_target_mean    float64
 18  state_id_code_target_mean  float64
 19  item_id_code_target_mean   float64
 20  

# <font color=red>Getting the records only for last 6 months </font>

In [55]:
dfmain =dfmain[(dfmain.d > (dfmain.d.max() - 180))]

In [56]:
dfmain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5488200 entries, 53692890 to 59181089
Data columns (total 21 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   id                         object 
 1   d                          int16  
 2   target                     int16  
 3   wm_yr_wk                   int16  
 4   wday                       int8   
 5   month                      int8   
 6   year                       int16  
 7   dom                        int8   
 8   sell_price                 float16
 9   target_dept                int16  
 10  target_item                int16  
 11  dept_id_code               int8   
 12  cat_id_code                int8   
 13  store_id_code              int8   
 14  state_id_code              int8   
 15  item_id_code               int16  
 16  store_id_code_target_mean  float64
 17  cat_id_code_target_mean    float64
 18  state_id_code_target_mean  float64
 19  item_id_code_target_mean   float64

In [57]:
dfmain[['store_id_code','store_id_code_target_mean']].drop_duplicates()

Unnamed: 0,store_id_code,store_id_code_target_mean
53692890,0,1.323438
53695939,1,0.983151
53698988,2,1.92013
53702037,3,0.706735
53705086,4,0.961933
53708135,5,1.238511
53711184,6,1.048636
53714233,7,0.889052
53717282,8,1.131778
53720331,9,1.105515


In [58]:
dfmain[['cat_id_code','cat_id_code_target_mean']].drop_duplicates()

Unnamed: 0,cat_id_code,cat_id_code_target_mean
53692890,1,0.569058
53693455,2,0.726498
53694502,0,1.646427


In [59]:
dfmain[['dept_id_code','dept_id_code_target_mean']].drop_duplicates()

Unnamed: 0,dept_id_code,dept_id_code_target_mean
53692890,3,0.705799
53693306,4,0.187284
53693455,5,1.135262
53693987,6,0.304241
53694502,0,1.238003
53694718,1,1.009041
53695116,2,2.061858


In [60]:
dfmain.columns

Index(['id', 'd', 'target', 'wm_yr_wk', 'wday', 'month', 'year', 'dom',
       'sell_price', 'target_dept', 'target_item', 'dept_id_code',
       'cat_id_code', 'store_id_code', 'state_id_code', 'item_id_code',
       'store_id_code_target_mean', 'cat_id_code_target_mean',
       'state_id_code_target_mean', 'item_id_code_target_mean',
       'dept_id_code_target_mean'],
      dtype='object')

In [61]:
dfmain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5488200 entries, 53692890 to 59181089
Data columns (total 21 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   id                         object 
 1   d                          int16  
 2   target                     int16  
 3   wm_yr_wk                   int16  
 4   wday                       int8   
 5   month                      int8   
 6   year                       int16  
 7   dom                        int8   
 8   sell_price                 float16
 9   target_dept                int16  
 10  target_item                int16  
 11  dept_id_code               int8   
 12  cat_id_code                int8   
 13  store_id_code              int8   
 14  state_id_code              int8   
 15  item_id_code               int16  
 16  store_id_code_target_mean  float64
 17  cat_id_code_target_mean    float64
 18  state_id_code_target_mean  float64
 19  item_id_code_target_mean   float64

In [62]:
# drop the following attributes, since we've already added their means
df_train = dfmain.drop(['cat_id_code','state_id_code','dept_id_code','wm_yr_wk','wday','sell_price'], axis=1)

In [63]:
del dfmain
gc.collect()

40

In [64]:
df_train.dept_id_code_target_mean = df_train.dept_id_code_target_mean.astype('float16')
df_train.item_id_code_target_mean = df_train.item_id_code_target_mean.astype('float16')
df_train.state_id_code_target_mean= df_train.state_id_code_target_mean.astype('float16')
df_train.store_id_code_target_mean= df_train.store_id_code_target_mean.astype('float16')
df_train.cat_id_code_target_mean  = df_train.cat_id_code_target_mean.astype('float16')

In [65]:
gc.collect()

40

In [66]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5488200 entries, 53692890 to 59181089
Data columns (total 15 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   id                         object 
 1   d                          int16  
 2   target                     int16  
 3   month                      int8   
 4   year                       int16  
 5   dom                        int8   
 6   target_dept                int16  
 7   target_item                int16  
 8   store_id_code              int8   
 9   item_id_code               int16  
 10  store_id_code_target_mean  float16
 11  cat_id_code_target_mean    float16
 12  state_id_code_target_mean  float16
 13  item_id_code_target_mean   float16
 14  dept_id_code_target_mean   float16
dtypes: float16(5), int16(6), int8(3), object(1)
memory usage: 214.6+ MB


In [68]:
# create three datasets
#df_train = df_train.merge(df_tmp, on='id', how='inner')

#let's go with all the data set and let's see what happens...anyways we are experimenting and learning
#df_train = df_train[df_train.year > 2013] 
#df_tmp[(df_tmp.year==2015) & (df_tmp.dom==1)][['dom']].drop_duplicates
'''
one = df_tmp[(df_tmp.year==2015) & (df_tmp.dom==1)]['d'].count()
two = df_tmp[(df_tmp.year==2015) & (df_tmp.dom==2)]['d'].count()
three = df_tmp[(df_tmp.year==2015) & (df_tmp.dom==3)]['d'].count()

four = df_tmp[(df_tmp.year==2015) & (df_tmp.dom==4)]['d'].count()
five = df_tmp[(df_tmp.year==2015) & (df_tmp.dom==5)]['d'].count()
six = df_tmp[(df_tmp.year==2015) & (df_tmp.dom==6)]['d'].count()

one,two, three, four, five, six
df_tmp['dom'].unique()
'''
pass

In [69]:
len(df_train)/df_train['store_id_code'].nunique()

548820.0

In [70]:
def add_lags(df, shift_range,index_cols, lag_cols, exception_cols):
    cols_to_rename = list(df.columns.difference(index_cols + exception_cols)) 
    
    #print('Columns to rename : ',cols_to_rename)
    
    #print(index_cols + cols_to_rename)

    for day_shift in tqdm_notebook(shift_range):
        train_shift = df[index_cols + cols_to_rename].copy()
        print('copied to train_shift')
        train_shift['d'] = train_shift['d'] + day_shift
        
        print(f'performed the shifting of {day_shift}')

        foo = lambda x: '{}_lag_{}'.format(x, day_shift) if x in cols_to_rename else x
        train_shift = train_shift.rename(columns=foo)

        df = pd.merge(df, train_shift, on=index_cols, how='left').fillna(0)
        
        print(f'performed the merge ---------{day_shift}--------------')
        
    return df

**So the index columns where we will merge the lags will be**
* d
* item_id_code
* store_id_code

the obvious reason to chose these three is the fact that, id column is a concatenation of dept_id and store_id

**while one thing, that I am not able to findout is that the id has either _validation or _evaluation suffix.**

whether we are going to use _validation suffice in the id while submitting, I am not sure but once we submit it. 

In [71]:
df_train.columns

Index(['id', 'd', 'target', 'month', 'year', 'dom', 'target_dept',
       'target_item', 'store_id_code', 'item_id_code',
       'store_id_code_target_mean', 'cat_id_code_target_mean',
       'state_id_code_target_mean', 'item_id_code_target_mean',
       'dept_id_code_target_mean'],
      dtype='object')

In [None]:
# divide the columns to add the lags, that will be joined at a later stage
'''
tmp_columns = ['id', 'd','month', 'year', 'dom','store_id_code_target_mean', 'cat_id_code_target_mean',
       'state_id_code_target_mean', 'item_id_code_target_mean',
       'dept_id_code_target_mean','target_dept','target_item']

df_tmp = df_train[tmp_columns]

df_train.head(15)
df_train.tail(15)
df_tmp.info()
df_train.drop(list(set(tmp_columns) - set(['id','d'])), axis=1, inplace=True)


# Keeping on last 6 months

**we will use later, the id column to merge df_train and df_tmp**

In [72]:
mean_enc_cols = [col for col in df_train.columns if 'mean' in str(col)]
exception_cols = mean_enc_cols + ['cat_id', 'date', 'day', 'id','sell_price', 'snap_CA', 'snap_TX', 'snap_WI',
                                  'state_id', 'wday', 'wm_yr_wk','year','month','dom']

index_cols = ['store_id_code','item_id_code','d']
lag_cols = ['target']
shift_range = [x for x in range(1,29)] # from 4 to 28

#divide the dataframe to add the lags

df_train = add_lags(df_train,shift_range,index_cols,lag_cols,exception_cols)

HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))

copied to train_shift
performed the shifting of 1
performed the merge ---------1--------------
copied to train_shift
performed the shifting of 2
performed the merge ---------2--------------
copied to train_shift
performed the shifting of 3
performed the merge ---------3--------------
copied to train_shift
performed the shifting of 4
performed the merge ---------4--------------
copied to train_shift
performed the shifting of 5
performed the merge ---------5--------------
copied to train_shift
performed the shifting of 6
performed the merge ---------6--------------
copied to train_shift
performed the shifting of 7
performed the merge ---------7--------------
copied to train_shift
performed the shifting of 8
performed the merge ---------8--------------
copied to train_shift
performed the shifting of 9
performed the merge ---------9--------------
copied to train_shift
performed the shifting of 10
performed the merge ---------10--------------
copied to train_shift
performed the shifting of 

In [74]:
df_train[(df_train.item_id_code==1) & (df_train.store_id_code==1)][['d','target_dept','target_dept_lag_1',
                                                                   'target_dept_lag_2',
                                                                   'target_dept_lag_3','target_dept_lag_4',
                                                                   'target_dept_lag_5','target_dept_lag_6',
                                                                   'target_dept_lag_7','target_dept_lag_8',
                                                                   'target_dept_lag_9']].head(15)

Unnamed: 0,d,target_dept,target_dept_lag_1,target_dept_lag_2,target_dept_lag_3,target_dept_lag_4,target_dept_lag_5,target_dept_lag_6,target_dept_lag_7,target_dept_lag_8,target_dept_lag_9
4662,1762,2915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35152,1763,1715,2915.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65642,1764,2737,1715.0,2915.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96132,1765,2664,2737.0,1715.0,2915.0,0.0,0.0,0.0,0.0,0.0,0.0
126622,1766,2247,2664.0,2737.0,1715.0,2915.0,0.0,0.0,0.0,0.0,0.0
157112,1767,2169,2247.0,2664.0,2737.0,1715.0,2915.0,0.0,0.0,0.0,0.0
187602,1768,2795,2169.0,2247.0,2664.0,2737.0,1715.0,2915.0,0.0,0.0,0.0
218092,1769,2637,2795.0,2169.0,2247.0,2664.0,2737.0,1715.0,2915.0,0.0,0.0
248582,1770,3042,2637.0,2795.0,2169.0,2247.0,2664.0,2737.0,1715.0,2915.0,0.0
279072,1771,3270,3042.0,2637.0,2795.0,2169.0,2247.0,2664.0,2737.0,1715.0,2915.0


In [75]:
df_train[(df_train.item_id_code==1) & (df_train.store_id_code==1)][['d','target_item','target_item_lag_1','target_item_lag_2',
                                                                   'target_item_lag_3','target_item_lag_4',
                                                                   'target_item_lag_5']].head(15)

Unnamed: 0,d,target_item,target_item_lag_1,target_item_lag_2,target_item_lag_3,target_item_lag_4,target_item_lag_5
4662,1762,2,0.0,0.0,0.0,0.0,0.0
35152,1763,1,2.0,0.0,0.0,0.0,0.0
65642,1764,0,1.0,2.0,0.0,0.0,0.0
96132,1765,5,0.0,1.0,2.0,0.0,0.0
126622,1766,4,5.0,0.0,1.0,2.0,0.0
157112,1767,4,4.0,5.0,0.0,1.0,2.0
187602,1768,3,4.0,4.0,5.0,0.0,1.0
218092,1769,3,3.0,4.0,4.0,5.0,0.0
248582,1770,9,3.0,3.0,4.0,4.0,5.0
279072,1771,7,9.0,3.0,3.0,4.0,4.0


In [76]:
df_train.target_dept.max(), df_train.target_item.max(), df_train.target.max()

(28153, 1525, 323)

In [77]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5488200 entries, 0 to 5488199
Data columns (total 99 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   id                         object 
 1   d                          int16  
 2   target                     int16  
 3   month                      int8   
 4   year                       int16  
 5   dom                        int8   
 6   target_dept                int16  
 7   target_item                int16  
 8   store_id_code              int8   
 9   item_id_code               int16  
 10  store_id_code_target_mean  float16
 11  cat_id_code_target_mean    float16
 12  state_id_code_target_mean  float16
 13  item_id_code_target_mean   float16
 14  dept_id_code_target_mean   float16
 15  target_lag_1               float64
 16  target_dept_lag_1          float64
 17  target_item_lag_1          float64
 18  target_lag_2               float64
 19  target_dept_lag_2          float64
 20  ta

In [78]:
#converting all the lags to be int16
lag_cols = [col for col in df_train.columns if 'lag' in str(col)]
for col in lag_cols:
    df_train[col] = df_train[col].astype('int16')  

In [79]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5488200 entries, 0 to 5488199
Data columns (total 99 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   id                         object 
 1   d                          int16  
 2   target                     int16  
 3   month                      int8   
 4   year                       int16  
 5   dom                        int8   
 6   target_dept                int16  
 7   target_item                int16  
 8   store_id_code              int8   
 9   item_id_code               int16  
 10  store_id_code_target_mean  float16
 11  cat_id_code_target_mean    float16
 12  state_id_code_target_mean  float16
 13  item_id_code_target_mean   float16
 14  dept_id_code_target_mean   float16
 15  target_lag_1               int16  
 16  target_dept_lag_1          int16  
 17  target_item_lag_1          int16  
 18  target_lag_2               int16  
 19  target_dept_lag_2          int16  
 20  ta

In [80]:
df_train['id'] = df_train['id'].astype('category')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5488200 entries, 0 to 5488199
Data columns (total 99 columns):
 #   Column                     Dtype   
---  ------                     -----   
 0   id                         category
 1   d                          int16   
 2   target                     int16   
 3   month                      int8    
 4   year                       int16   
 5   dom                        int8    
 6   target_dept                int16   
 7   target_item                int16   
 8   store_id_code              int8    
 9   item_id_code               int16   
 10  store_id_code_target_mean  float16 
 11  cat_id_code_target_mean    float16 
 12  state_id_code_target_mean  float16 
 13  item_id_code_target_mean   float16 
 14  dept_id_code_target_mean   float16 
 15  target_lag_1               int16   
 16  target_dept_lag_1          int16   
 17  target_item_lag_1          int16   
 18  target_lag_2               int16   
 19  target_dept_lag_2    

In [81]:
df_train.head()

Unnamed: 0,id,d,target,month,year,dom,target_dept,target_item,store_id_code,item_id_code,...,target_item_lag_25,target_lag_26,target_dept_lag_26,target_item_lag_26,target_lag_27,target_dept_lag_27,target_item_lag_27,target_lag_28,target_dept_lag_28,target_item_lag_28
0,HOBBIES_1_001_CA_1_evaluation,1762,0,11,2015,25,3740,0,0,1437,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_evaluation,1762,0,11,2015,25,3740,1,0,1438,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,1762,3,11,2015,25,3740,4,0,1439,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_evaluation,1762,3,11,2015,25,3740,15,0,1440,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_evaluation,1762,0,11,2015,25,3740,10,0,1441,...,0,0,0,0,0,0,0,0,0,0


# Train Test Vdlidation Split

In [174]:
drop_columns= ['id','target','target_dept','target_item','month','year','dom']

#this is training data with the exclusion of 28 days
X_train = df_train[df_train.d <= df_train.d.max()-28].drop(drop_columns, axis=1) 
y_train = df_train[df_train.d <= df_train.d.max()-28]['target']

X_valid = df_train[df_train.d > df_train.d.max()-28].drop(list(set(drop_columns) - set(['id'])), axis=1)
y_valid = df_train[df_train.d > df_train.d.max()-28]['target']
#convert the id
X_valid['id'] = X_valid['id'].apply(lambda x: x.replace('_evaluation','_validation'))



X_test  =  df_train[df_train.d > df_train.d.max()-28].drop(list(set(drop_columns) - set(['id'])), axis=1)
X_test.d = X_test.d + 28 #adding 28 for all the days

# y_test are the predictions

#this is final training data without the exclusion of 28 days
X_train_final = df_train.drop(drop_columns, axis=1) 
y_train_final = df_train['target']

In [175]:
X_valid.d.unique(), X_test.d.unique()

(array([1914, 1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923, 1924,
        1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935,
        1936, 1937, 1938, 1939, 1940, 1941], dtype=int16),
 array([1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952,
        1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963,
        1964, 1965, 1966, 1967, 1968, 1969], dtype=int16))

In [177]:
X_valid.id.head(), X_test.id.head()

(4634480    HOBBIES_1_001_CA_1_validation
 4634481    HOBBIES_1_002_CA_1_validation
 4634482    HOBBIES_1_003_CA_1_validation
 4634483    HOBBIES_1_004_CA_1_validation
 4634484    HOBBIES_1_005_CA_1_validation
 Name: id, dtype: category
 Categories (30490, object): [FOODS_1_001_CA_1_validation, FOODS_1_001_CA_2_validation, FOODS_1_001_CA_3_validation, FOODS_1_001_CA_4_validation, ..., HOUSEHOLD_2_516_TX_3_validation, HOUSEHOLD_2_516_WI_1_validation, HOUSEHOLD_2_516_WI_2_validation, HOUSEHOLD_2_516_WI_3_validation],
 4634480    HOBBIES_1_001_CA_1_evaluation
 4634481    HOBBIES_1_002_CA_1_evaluation
 4634482    HOBBIES_1_003_CA_1_evaluation
 4634483    HOBBIES_1_004_CA_1_evaluation
 4634484    HOBBIES_1_005_CA_1_evaluation
 Name: id, dtype: category
 Categories (30490, object): [FOODS_1_001_CA_1_evaluation, FOODS_1_001_CA_2_evaluation, FOODS_1_001_CA_3_evaluation, FOODS_1_001_CA_4_evaluation, ..., HOUSEHOLD_2_516_TX_3_evaluation, HOUSEHOLD_2_516_WI_1_evaluation, HOUSEHOLD_2_516_WI_2_eval

In [179]:
# Trying Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
lr = LinearRegression(n_jobs=-1, normalize=False, fit_intercept=True, copy_X=True)

In [180]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [181]:
#Prediction for Valid set
pred_lr_valid = lr_stack_pred = lr.predict(X_valid.drop(['id'], axis=1))
df_submission_valid= X_valid[['id','d']]
df_submission_valid['target'] = pred_lr_valid

#Prediction for Eval Set
pred_lr_eval = lr_stack_pred = lr.predict(X_test.drop(['id'], axis=1))
df_submission_eval = X_test[['id','d']]
df_submission_eval['target'] = pred_lr_eval

In [182]:
col_append = ['F'+str(i) for i in range(1,29)]

#1. Converting 

#valid set
submission_pivot_valid = df_submission_valid.pivot_table('target', ['id'], 'd')
submission_pivot_valid.columns = col_append
submission_pivot_valid = submission_pivot_valid.reset_index()

#eval set
submission_pivot_eval = df_submission_eval.pivot_table('target', ['id'], 'd')
submission_pivot_eval.columns = col_append
submission_pivot_eval = submission_pivot_eval.reset_index()

#Concatenating
df_submission = pd.concat([submission_pivot_valid, submission_pivot_eval], axis=0)


In [184]:
#Saving
df_submission.to_csv('lr_submission_21jun2020_1914_41_1942_69.csv', index=False)

In [183]:
df_submission.id.nunique()

60980

In [None]:
for store in range(1, 11):
    print('store : ',store)

# Prediction Idea

**Since we have to predict 28 days, 3049 items, and 10 stores**
* Days = 28
* Items = 3049
* Stores = 10

Total Predictions = 28 x 3049 x 10 = 853,720

**Let's see if this is roughly the sum of each month entries in our dataset**

**so, this clears the logic, as we can see that feb-2014 and feb-2015 have similar number of items**

**Let's also suppose that we are to predict only one day**

Then the equation would be : 
    
    * 3049 x 10 = 30,490
    
So, 
    There can be two ways to do that

    
* **First Procedure** 
    Train for all days (huge dataset and huge model size)

    for day in range(1,29):
        predict([item,day])
        
        bnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnmnjkm,k
* **Second Procedure** 

    * seperate each day and create 28 data sets
    for day in range(1,29)
        * load_data(day)
        * train data
        * predict(for day)



just a crazy thought...

**also to be noted that, if the 1 to 28 is for a particular month, then 23, 30 31 are not required for any of the month**

* we will see once the data gets compiled


In [103]:
print(mean_squared_error(y_valid,pred_r), r2_score(y_valid, pred_r))

3.9990506 0.6982803664150627


In [148]:
lr.fit(X_train_final, y_train_final)
pred_lr_final_1jun2020 = lr_stack_pred = lr.predict(X_test.drop(['id'], axis=1))

In [120]:
len(pred_lr_final_1jun2020)

853720

In [121]:
X_test.head()

Unnamed: 0,id,d,store_id_code,item_id_code,store_id_code_target_mean,cat_id_code_target_mean,state_id_code_target_mean,item_id_code_target_mean,dept_id_code_target_mean,target_lag_1,...,target_item_lag_25,target_lag_26,target_dept_lag_26,target_item_lag_26,target_lag_27,target_dept_lag_27,target_item_lag_27,target_lag_28,target_dept_lag_28,target_item_lag_28
4634480,HOBBIES_1_001_CA_1_evaluation,1942,0,1437,1.323242,0.568848,1.233398,0.219604,0.705566,1,...,2,0,2958,6,0,3123,9,1,3231,6
4634481,HOBBIES_1_002_CA_1_evaluation,1942,0,1438,1.323242,0.568848,1.233398,0.263428,0.705566,0,...,2,0,2958,4,0,3123,0,1,3231,3
4634482,HOBBIES_1_003_CA_1_evaluation,1942,0,1439,1.323242,0.568848,1.233398,0.07782,0.705566,1,...,0,0,2958,0,0,3123,2,0,3231,2
4634483,HOBBIES_1_004_CA_1_evaluation,1942,0,1440,1.323242,0.568848,1.233398,2.041016,0.705566,2,...,10,0,2958,10,0,3123,13,0,3231,12
4634484,HOBBIES_1_005_CA_1_evaluation,1942,0,1441,1.323242,0.568848,1.233398,0.766602,0.705566,4,...,9,4,2958,11,0,3123,9,1,3231,12


In [123]:
len(df_submission)

853720

In [126]:
submission_trans.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, d to target
Columns: 853720 entries, 4634480 to 5488199
dtypes: float32(853720)
memory usage: 6.5+ MB


Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_evaluation,1.282654,1.202532,0.903100,0.739114,0.722886,0.897184,1.025901,1.261501,0.513765,...,0.757762,1.268006,1.507601,1.364873,1.184933,0.803448,0.656094,1.146351,0.978135,0.643805
1,FOODS_1_001_CA_2_evaluation,1.740583,1.191805,1.720940,1.152399,0.862649,1.238713,1.756865,1.674244,1.081507,...,0.574088,1.382080,1.713957,0.697148,0.879808,0.820173,0.608149,0.533720,0.960836,1.220133
2,FOODS_1_001_CA_3_evaluation,0.769530,0.680082,0.560077,0.590597,0.614635,2.709026,2.160303,1.235490,0.780345,...,0.805515,1.327993,1.655731,0.815431,0.620212,0.624196,0.855104,1.157865,1.680206,1.244082
3,FOODS_1_001_CA_4_evaluation,0.752946,0.291519,0.531976,0.386735,0.286859,0.632291,0.479894,0.588857,0.189715,...,0.202372,0.695013,0.470369,0.311594,0.225238,0.194480,0.352906,0.677917,0.648406,0.375202
4,FOODS_1_001_TX_1_evaluation,0.120321,0.047658,0.118845,0.283642,0.090254,0.633421,0.367644,0.461131,0.394150,...,0.631626,0.979811,0.845172,0.721358,0.818234,1.579166,0.926591,1.326470,1.577242,1.104552
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,HOUSEHOLD_2_516_TX_2_evaluation,0.170452,0.208212,0.315818,0.116524,0.109979,0.201319,0.134440,0.114504,0.311089,...,0.117435,0.211692,0.151852,0.058970,0.279734,0.194776,0.073547,0.080491,0.209900,0.189697
30486,HOUSEHOLD_2_516_TX_3_evaluation,0.015645,0.127556,0.199359,0.028193,0.055681,0.117191,0.013771,0.199754,0.187065,...,0.180481,0.152454,0.385275,0.117312,0.132531,0.119840,0.116183,0.151722,0.426033,0.459308
30487,HOUSEHOLD_2_516_WI_1_evaluation,-0.050681,0.175376,0.181067,0.080190,0.319270,0.213434,0.116731,0.098961,0.082777,...,0.189825,0.181651,0.079075,0.032673,0.048532,0.042464,0.043327,0.045992,0.363359,0.198981
30488,HOUSEHOLD_2_516_WI_2_evaluation,0.014328,0.056116,0.174431,0.104903,0.074131,0.100362,0.006227,-0.008182,0.038841,...,0.015435,0.013940,0.030405,0.175110,0.304392,0.212012,0.171433,0.113376,0.220949,0.235481


In [173]:
df_submission.shape

(60980, 29)

Index(['id', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10',
       'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F17', 'F18', 'F19', 'F20',
       'F21', 'F22', 'F23', 'F24', 'F25', 'F26', 'F27', 'F28'],
      dtype='object')