In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# read raw data
df = pd.read_csv('../data/raw/sales_train_validation.csv') # sales train validation
price_df = pd.read_csv('../data/raw/sell_prices.csv') # sell prices
cal_df = pd.read_csv('../data/raw/calendar.csv') # calendar

In [3]:
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [4]:
price_df.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26


In [5]:
cal_df.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [6]:
# Removing 'd' from the days columns
cal_df["d"]=cal_df["d"].apply(lambda x: int(x.split("_")[1]))
cal_df.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,5,,,,,1,0,1


In [7]:
# creating 'id' column in price_df which is a combination of item_id + store_id + "valuation" so as to make a match with the id column of df
price_df["id"] = price_df["item_id"] + "_" + price_df["store_id"] + "_validation"
price_df.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price,id
0,CA_1,HOBBIES_1_001,11325,9.58,HOBBIES_1_001_CA_1_validation
1,CA_1,HOBBIES_1_001,11326,9.58,HOBBIES_1_001_CA_1_validation
2,CA_1,HOBBIES_1_001,11327,8.26,HOBBIES_1_001_CA_1_validation
3,CA_1,HOBBIES_1_001,11328,8.26,HOBBIES_1_001_CA_1_validation
4,CA_1,HOBBIES_1_001,11329,8.26,HOBBIES_1_001_CA_1_validation


In [8]:
cal_df_2 = cal_df.copy()
df_2 = df.copy()
price_df_2 = price_df.copy()

In [9]:
# period 1886-1914 is the validation set 
# period 1914-1941 is the test set for competition till may 31
# in the second phase we want to predict for 1942 to 1969

In [10]:
for day in tqdm(range(1886, 1914)):
    wk_id = list(cal_df[cal_df["d"]==day]["wm_yr_wk"])[0]
    wk_price_df = price_df[price_df["wm_yr_wk"]==wk_id]
    df = df.merge(wk_price_df[["sell_price", "id"]], on=["id"], how='inner') # merging df and price_df on id column using inner join 'sell prices' 
    df["unit_sales_" + str(day)] = df["sell_price"] * df["d_" + str(day)] # create a column in df 'unit_sales' = sell_price * demand 
    df.drop(columns=["sell_price"], inplace=True)

100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:25<00:00,  1.09it/s]


In [11]:
wk_price_df.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price,id
145,CA_1,HOBBIES_1_001,11613,8.38,HOBBIES_1_001_CA_1_validation
407,CA_1,HOBBIES_1_002,11613,3.97,HOBBIES_1_002_CA_1_validation
532,CA_1,HOBBIES_1_003,11613,2.97,HOBBIES_1_003_CA_1_validation
809,CA_1,HOBBIES_1_004,11613,4.64,HOBBIES_1_004_CA_1_validation
1075,CA_1,HOBBIES_1_005,11613,2.88,HOBBIES_1_005_CA_1_validation


In [12]:
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,unit_sales_1904,unit_sales_1905,unit_sales_1906,unit_sales_1907,unit_sales_1908,unit_sales_1909,unit_sales_1910,unit_sales_1911,unit_sales_1912,unit_sales_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,8.38,25.14,0.0,8.38,8.38,8.38,25.14,0.0,8.38,8.38
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,3.97,0.0,0.0,0.0,0.0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,5.94,2.97,5.94,2.97,2.97,2.97,0.0,2.97,2.97,2.97
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,4.64,0.0,23.2,18.56,4.64,0.0,4.64,13.92,32.48,9.28
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,5.76,2.88,2.88,0.0,2.88,2.88,5.76,5.76,5.76,11.52


In [13]:
cal_df.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,5,,,,,1,0,1


In [14]:
df["dollar_sales"] = df[[c for c in df.columns if c.find("unit_sales")==0]].sum(axis=1)

In [15]:
df.drop(columns=[c for c in df.columns if c.find("unit_sales")==0], inplace=True)

In [16]:
df["weight"] = df["dollar_sales"] / df["dollar_sales"].sum()

In [17]:
df.drop(columns=["dollar_sales"], inplace=True)

In [18]:
df["weight"] /= 12

In [19]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))

In [20]:
print('Largest sales value is', df[[c for c in df.columns if c.find('d_')==0]].max().max(), 
      '\nLargest int16 is', np.iinfo(np.int16).max)

Largest sales value is 763 
Largest int16 is 32767


In [21]:
for d in range(1914, 1942):
    df["d_" + str(d)] = np.nan

In [22]:
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,,,,,,,,,,
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,,,,,,,,,,
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,,,,,,,,,,
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,,,,,,,,,,
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,,,,,,,,,,


In [23]:
df['weight'].head()

0    5.001094e-06
1    1.765301e-07
2    1.056513e-06
3    5.261220e-06
4    2.433181e-06
Name: weight, dtype: float64

In [24]:
df.filter(regex='^[d]',axis=1).head()

Unnamed: 0,dept_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
1,HOBBIES_1,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
2,HOBBIES_1,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
3,HOBBIES_1,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
4,HOBBIES_1,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,


In [25]:
# printing column names that dont start with d_
df.loc[:, ~df.columns.str.startswith('d_')].head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,weight
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,5.001094e-06
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1.765301e-07
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1.056513e-06
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,5.26122e-06
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,2.433181e-06


In [26]:
temp = df.drop(columns = [c for c in df.columns if c.find('d_')==0 and int(c.split('_')[1]) < 1100])

In [27]:
temp.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1100,d_1101,d_1102,d_1103,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1,0,0,0,...,,,,,,,,,,
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1,0,0,1,...,,,,,,,,,,
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,,,,,,,,,,
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,4,2,0,0,...,,,,,,,,,,
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,3,1,0,0,...,,,,,,,,,,


In [28]:
df_melted = temp.melt(id_vars=[n for n in temp.columns if n.find("id")!=-1],
       value_vars=[n for n in temp.columns if n.find("d_")==0],
       var_name = 'day', value_name = 'sales')
del temp

In [29]:
df[['id', 'd_1100']].head(1)

Unnamed: 0,id,d_1100
0,HOBBIES_1_001_CA_1_validation,1


In [30]:
df_melted.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,sales
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1100,1.0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1100,1.0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1100,0.0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1100,4.0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1100,3.0


In [31]:
df_melted["day"]=df_melted["day"].apply(lambda x: int(x.split("_")[1]))

In [32]:
df_melted.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,sales
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1100,1.0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1100,1.0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1100,0.0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1100,4.0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1100,3.0


In [33]:
cal_df.drop(columns=["date",  "weekday"], inplace= True)

In [34]:
cal_df.head()

Unnamed: 0,wm_yr_wk,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,11101,1,1,2011,1,,,,,0,0,0
1,11101,2,1,2011,2,,,,,0,0,0
2,11101,3,1,2011,3,,,,,0,0,0
3,11101,4,2,2011,4,,,,,1,1,0
4,11101,5,2,2011,5,,,,,1,0,1


In [35]:
reduce_mem_usage(cal_df)

Mem. usage decreased to  0.08 Mb (55.2% reduction)


In [36]:
reduce_mem_usage(df_melted)

Mem. usage decreased to 1273.13 Mb (18.7% reduction)


In [37]:
df_melted=df_melted.merge(cal_df, left_on=["day"], right_on=["d"]).drop(columns=["d"])

In [38]:
df_melted.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,sales,wm_yr_wk,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1100,1.0,11401,1,2,2014,,,,,1,1,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1100,1.0,11401,1,2,2014,,,,,1,1,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1100,0.0,11401,1,2,2014,,,,,1,1,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1100,4.0,11401,1,2,2014,,,,,1,1,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1100,3.0,11401,1,2,2014,,,,,1,1,0


In [39]:
df_melted['event_name_1'].value_counts(dropna=False).head()

NaN              23538280
Easter              91470
StPatricksDay       91470
LentWeek2           91470
Purim End           91470
Name: event_name_1, dtype: int64

In [40]:
df_melted['event_name_1'].astype('category').cat.codes.astype("int8").value_counts().head()

-1     23538280
 22       91470
 2        91470
 4        91470
 11       91470
dtype: int64

In [41]:
df_melted["event_name_1"]=df_melted["event_name_1"].astype('category').cat.codes.astype("int8")
df_melted["event_name_2"]=df_melted["event_name_2"].astype('category').cat.codes.astype("int8")
df_melted["event_type_1"]=df_melted["event_type_1"].astype('category').cat.codes.astype("int8")
df_melted["event_type_2"]=df_melted["event_type_2"].astype('category').cat.codes.astype("int8")

In [42]:
df_melted.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,sales,wm_yr_wk,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1100,1.0,11401,1,2,2014,-1,-1,-1,-1,1,1,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1100,1.0,11401,1,2,2014,-1,-1,-1,-1,1,1,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1100,0.0,11401,1,2,2014,-1,-1,-1,-1,1,1,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1100,4.0,11401,1,2,2014,-1,-1,-1,-1,1,1,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1100,3.0,11401,1,2,2014,-1,-1,-1,-1,1,1,0


In [43]:
useful_ids = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
id_encodings = [id_col + '_encoding' for id_col in useful_ids]

for id_col in useful_ids:
    if id_col == 'item_id':
        df_melted[id_col + '_encoding'] = df_melted[id_col].astype('category').cat.codes.astype("int16")
    else: 
        df_melted[id_col + '_encoding'] = df_melted[id_col].astype('category').cat.codes.astype("int8")

In [44]:
df_melted.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,sales,wm_yr_wk,wday,...,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,item_id_encoding,dept_id_encoding,cat_id_encoding,store_id_encoding,state_id_encoding
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1100,1.0,11401,1,...,-1,-1,1,1,0,1437,3,1,0,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1100,1.0,11401,1,...,-1,-1,1,1,0,1438,3,1,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1100,0.0,11401,1,...,-1,-1,1,1,0,1439,3,1,0,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1100,4.0,11401,1,...,-1,-1,1,1,0,1440,3,1,0,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1100,3.0,11401,1,...,-1,-1,1,1,0,1441,3,1,0,0


In [45]:
df_melted.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'day',
       'sales', 'wm_yr_wk', 'wday', 'month', 'year', 'event_name_1',
       'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX',
       'snap_WI', 'item_id_encoding', 'dept_id_encoding', 'cat_id_encoding',
       'store_id_encoding', 'state_id_encoding'],
      dtype='object')

In [46]:
df_melted.drop(columns=['year'] + useful_ids, inplace=True)

In [47]:
reduce_mem_usage(df_melted)

Mem. usage decreased to 905.88 Mb (0.0% reduction)


In [48]:
df_melted.head()

Unnamed: 0,id,day,sales,wm_yr_wk,wday,month,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,item_id_encoding,dept_id_encoding,cat_id_encoding,store_id_encoding,state_id_encoding
0,HOBBIES_1_001_CA_1_validation,1100,1.0,11401,1,2,-1,-1,-1,-1,1,1,0,1437,3,1,0,0
1,HOBBIES_1_002_CA_1_validation,1100,1.0,11401,1,2,-1,-1,-1,-1,1,1,0,1438,3,1,0,0
2,HOBBIES_1_003_CA_1_validation,1100,0.0,11401,1,2,-1,-1,-1,-1,1,1,0,1439,3,1,0,0
3,HOBBIES_1_004_CA_1_validation,1100,4.0,11401,1,2,-1,-1,-1,-1,1,1,0,1440,3,1,0,0
4,HOBBIES_1_005_CA_1_validation,1100,3.0,11401,1,2,-1,-1,-1,-1,1,1,0,1441,3,1,0,0


In [49]:
df_melted['test'] = df_melted[["id","sales"]].groupby("id")["sales"].shift(1).fillna(-1).astype(np.int16)

In [50]:
print(list(df_melted[df_melted['day']==1101]['test']) == list(df_melted[df_melted['day']==1100]['sales']))

True


In [51]:
df_melted.head()

Unnamed: 0,id,day,sales,wm_yr_wk,wday,month,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,item_id_encoding,dept_id_encoding,cat_id_encoding,store_id_encoding,state_id_encoding,test
0,HOBBIES_1_001_CA_1_validation,1100,1.0,11401,1,2,-1,-1,-1,-1,1,1,0,1437,3,1,0,0,-1
1,HOBBIES_1_002_CA_1_validation,1100,1.0,11401,1,2,-1,-1,-1,-1,1,1,0,1438,3,1,0,0,-1
2,HOBBIES_1_003_CA_1_validation,1100,0.0,11401,1,2,-1,-1,-1,-1,1,1,0,1439,3,1,0,0,-1
3,HOBBIES_1_004_CA_1_validation,1100,4.0,11401,1,2,-1,-1,-1,-1,1,1,0,1440,3,1,0,0,-1
4,HOBBIES_1_005_CA_1_validation,1100,3.0,11401,1,2,-1,-1,-1,-1,1,1,0,1441,3,1,0,0,-1


In [52]:
df_melted.drop(columns=['test'], inplace=True)

In [53]:
df_melted.head()

Unnamed: 0,id,day,sales,wm_yr_wk,wday,month,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,item_id_encoding,dept_id_encoding,cat_id_encoding,store_id_encoding,state_id_encoding
0,HOBBIES_1_001_CA_1_validation,1100,1.0,11401,1,2,-1,-1,-1,-1,1,1,0,1437,3,1,0,0
1,HOBBIES_1_002_CA_1_validation,1100,1.0,11401,1,2,-1,-1,-1,-1,1,1,0,1438,3,1,0,0
2,HOBBIES_1_003_CA_1_validation,1100,0.0,11401,1,2,-1,-1,-1,-1,1,1,0,1439,3,1,0,0
3,HOBBIES_1_004_CA_1_validation,1100,4.0,11401,1,2,-1,-1,-1,-1,1,1,0,1440,3,1,0,0
4,HOBBIES_1_005_CA_1_validation,1100,3.0,11401,1,2,-1,-1,-1,-1,1,1,0,1441,3,1,0,0


In [54]:
# create lags, lags starts from 28 days ago to 77 days ago, spaced by 7 days
for lag in tqdm([28, 35, 42, 49, 56, 63, 70, 77]):
    df_melted["lag_" + str(lag)] = df_melted[["id","sales"]].groupby("id")["sales"].shift(lag).fillna(-1).astype(np.int16)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [01:36<00:00, 12.09s/it]


In [55]:
df_melted=df_melted[df_melted['lag_77']!=-1]

In [56]:
assert list(df_melted[df_melted['day']==1528]['lag_28']) == list(df_melted[df_melted['day']==1500]['sales'])

In [57]:
df_melted.head()

Unnamed: 0,id,day,sales,wm_yr_wk,wday,month,event_name_1,event_type_1,event_name_2,event_type_2,...,store_id_encoding,state_id_encoding,lag_28,lag_35,lag_42,lag_49,lag_56,lag_63,lag_70,lag_77
2347730,HOBBIES_1_001_CA_1_validation,1177,2.0,11412,1,4,-1,-1,-1,-1,...,0,0,1,0,0,1,2,1,2,1
2347731,HOBBIES_1_002_CA_1_validation,1177,1.0,11412,1,4,-1,-1,-1,-1,...,0,0,0,0,1,1,0,0,0,1
2347732,HOBBIES_1_003_CA_1_validation,1177,1.0,11412,1,4,-1,-1,-1,-1,...,0,0,0,0,0,2,0,1,0,0
2347733,HOBBIES_1_004_CA_1_validation,1177,3.0,11412,1,4,-1,-1,-1,-1,...,0,0,5,2,6,0,12,5,4,4
2347734,HOBBIES_1_005_CA_1_validation,1177,2.0,11412,1,4,-1,-1,-1,-1,...,0,0,1,2,2,0,1,1,0,3


In [58]:
cal_df.head()

Unnamed: 0,wm_yr_wk,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,11101,1,1,2011,1,,,,,0,0,0
1,11101,2,1,2011,2,,,,,0,0,0
2,11101,3,1,2011,3,,,,,0,0,0
3,11101,4,2,2011,4,,,,,1,1,0
4,11101,5,2,2011,5,,,,,1,0,1


In [59]:
df_melted=df_melted.merge(cal_df[['d', 'wm_yr_wk']], left_on=['day'], right_on=['d']).drop(columns=['d'])

In [60]:
df_melted.head()

Unnamed: 0,id,day,sales,wm_yr_wk_x,wday,month,event_name_1,event_type_1,event_name_2,event_type_2,...,state_id_encoding,lag_28,lag_35,lag_42,lag_49,lag_56,lag_63,lag_70,lag_77,wm_yr_wk_y
0,HOBBIES_1_001_CA_1_validation,1177,2.0,11412,1,4,-1,-1,-1,-1,...,0,1,0,0,1,2,1,2,1,11412
1,HOBBIES_1_002_CA_1_validation,1177,1.0,11412,1,4,-1,-1,-1,-1,...,0,0,0,1,1,0,0,0,1,11412
2,HOBBIES_1_003_CA_1_validation,1177,1.0,11412,1,4,-1,-1,-1,-1,...,0,0,0,0,2,0,1,0,0,11412
3,HOBBIES_1_004_CA_1_validation,1177,3.0,11412,1,4,-1,-1,-1,-1,...,0,5,2,6,0,12,5,4,4,11412
4,HOBBIES_1_005_CA_1_validation,1177,2.0,11412,1,4,-1,-1,-1,-1,...,0,1,2,2,0,1,1,0,3,11412


In [62]:
price_df.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price,id
0,CA_1,HOBBIES_1_001,11325,9.58,HOBBIES_1_001_CA_1_validation
1,CA_1,HOBBIES_1_001,11326,9.58,HOBBIES_1_001_CA_1_validation
2,CA_1,HOBBIES_1_001,11327,8.26,HOBBIES_1_001_CA_1_validation
3,CA_1,HOBBIES_1_001,11328,8.26,HOBBIES_1_001_CA_1_validation
4,CA_1,HOBBIES_1_001,11329,8.26,HOBBIES_1_001_CA_1_validation


In [63]:
df_melted.head()

Unnamed: 0,id,day,sales,wm_yr_wk_x,wday,month,event_name_1,event_type_1,event_name_2,event_type_2,...,state_id_encoding,lag_28,lag_35,lag_42,lag_49,lag_56,lag_63,lag_70,lag_77,wm_yr_wk_y
0,HOBBIES_1_001_CA_1_validation,1177,2.0,11412,1,4,-1,-1,-1,-1,...,0,1,0,0,1,2,1,2,1,11412
1,HOBBIES_1_002_CA_1_validation,1177,1.0,11412,1,4,-1,-1,-1,-1,...,0,0,0,1,1,0,0,0,1,11412
2,HOBBIES_1_003_CA_1_validation,1177,1.0,11412,1,4,-1,-1,-1,-1,...,0,0,0,0,2,0,1,0,0,11412
3,HOBBIES_1_004_CA_1_validation,1177,3.0,11412,1,4,-1,-1,-1,-1,...,0,5,2,6,0,12,5,4,4,11412
4,HOBBIES_1_005_CA_1_validation,1177,2.0,11412,1,4,-1,-1,-1,-1,...,0,1,2,2,0,1,1,0,3,11412


In [65]:
df_melted=df_melted.merge(price_df[['id', 'sell_price', 'wm_yr_wk']], on=['id', 'wm_yr_wk'], how='inner')

KeyError: 'wm_yr_wk_x'

In [None]:
del price_df
del cal_df

In [None]:
import lightgbm as lgb

In [None]:
best_params = {
            "objective" : "poisson",
            "metric" :"rmse",
            "force_row_wise" : True,
            "learning_rate" : 0.05,
    #         "sub_feature" : 0.8,
            "sub_row" : 0.75,
            "bagging_freq" : 1,
            "lambda_l2" : 0.1,
    #         "nthread" : 4
            "metric": ["rmse"],
        'verbosity': 1,
        'num_iterations' : 2048,
        'num_leaves': 64,
        "min_data_in_leaf": 50,
    }

In [None]:
X_train = df_melted[df_melted["day"] < 1886].drop(columns=["sales"])
y_train = df_melted[df_melted["day"] < 1886]["sales"]

X_val = df_melted[df_melted["day"].between(1886, 1913)].drop(columns=["sales"])
y_val = df_melted[df_melted["day"].between(1886, 1913)]["sales"]

X_test = df_melted[df_melted["day"] > 1913].drop(columns=["sales"])