In [1]:
import numpy as np
import pandas as pd
import os
import re

In [2]:
key_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'date']

In [3]:
TEST_SIZE = 28

df_sales = pd.read_csv('./data/sales_train_evaluation.csv')
df_calendar = pd.read_csv('./data/calendar.csv', parse_dates=['date'])
df_prices = pd.read_csv('./data/sell_prices.csv')

# Add null sales for the remaining days 1942-1969
# d_1 to d_1941: train set & valid set
# d_1942 - d_1969: test set (forecast F1 to F28 in sample submission)

series_d = pd.Series(df_sales.columns)
series_d = series_d[series_d.str.contains('d_')].reset_index(drop=True)
series_d = series_d.apply(lambda x: x.split('_')[1]).astype(int)
max_d = max(series_d.values)

for d in range(max_d+1, max_d+1+TEST_SIZE):
    col = 'd_' + str(d)
    df_sales[col] = np.nan
    

# Trimming

In [4]:
# ma and lag params
MA = [7, 28]
LAG = [1,1]

In [5]:
START_DATE = '2016'

start_date = pd.to_datetime(START_DATE) - pd.Timedelta(max(max(LAG), 1+max(MA)),'days')      # lag:1 + max(ma window)
df_calendar_trim = df_calendar[(df_calendar.date>=start_date)].copy()

# For trimming
d_min = int(df_calendar_trim.d.min().split('_')[1])
d_max = int(df_calendar_trim.d.max().split('_')[1])
week_min = df_calendar_trim.wm_yr_wk.min()
week_max = df_calendar_trim.wm_yr_wk.max()

# preprocess on calendar
df_calendar_trim['is_holiday'] = df_calendar_trim['event_name_1'].notnull() | df_calendar_trim['event_name_2'].notnull()
df_calendar_trim['is_weekend'] = df_calendar_trim.weekday.isin(['Saturday','Sunday'])

df_calendar_trim

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,is_holiday,is_weekend
1769,2015-12-03,11544,Thursday,6,12,2015,d_1770,,,,,1,1,1,False,False
1770,2015-12-04,11544,Friday,7,12,2015,d_1771,,,,,1,0,0,False,False
1771,2015-12-05,11545,Saturday,1,12,2015,d_1772,,,,,1,1,1,False,True
1772,2015-12-06,11545,Sunday,2,12,2015,d_1773,,,,,1,1,1,False,True
1773,2015-12-07,11545,Monday,3,12,2015,d_1774,,,,,1,1,0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1964,2016-06-15,11620,Wednesday,5,6,2016,d_1965,,,,,0,1,1,False,False
1965,2016-06-16,11620,Thursday,6,6,2016,d_1966,,,,,0,0,0,False,False
1966,2016-06-17,11620,Friday,7,6,2016,d_1967,,,,,0,0,0,False,False
1967,2016-06-18,11621,Saturday,1,6,2016,d_1968,,,,,0,0,0,False,True


In [6]:
df_sales_trim = df_sales[[col for col in key_cols if col != 'date'] + [f"d_{n}" for n in range(d_min, d_max+1)]]
df_sales_trim

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1770,d_1771,d_1772,d_1773,...,d_1960,d_1961,d_1962,d_1963,d_1964,d_1965,d_1966,d_1967,d_1968,d_1969
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,,,,,,,,,,
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,2,1,...,,,,,,,,,,
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,,,,,,,,,,
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1,3,3,2,...,,,,,,,,,,
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,1,2,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,1,0,0,2,...,,,,,,,,,,
30486,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,,,,,,,,,,
30487,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,1,0,1,0,...,,,,,,,,,,
30488,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,1,5,...,,,,,,,,,,


In [7]:
df_prices_trim = df_prices[(df_prices.wm_yr_wk>=week_min) & (df_prices.wm_yr_wk<=week_max)]

# preprocess on price
df_prices_trim['sell_price_diff'] = df_prices_trim.groupby(['store_id', 'item_id'])['sell_price'].transform(lambda x: (x-x.iloc[0])/x.iloc[0])

df_prices_trim

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prices_trim['sell_price_diff'] = df_prices_trim.groupby(['store_id', 'item_id'])['sell_price'].transform(lambda x: (x-x.iloc[0])/x.iloc[0])


Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price,sell_price_diff
124,CA_1,HOBBIES_1_001,11544,8.26,0.0
125,CA_1,HOBBIES_1_001,11545,8.26,0.0
126,CA_1,HOBBIES_1_001,11546,8.26,0.0
127,CA_1,HOBBIES_1_001,11547,8.26,0.0
128,CA_1,HOBBIES_1_001,11548,8.26,0.0
...,...,...,...,...,...
6841116,WI_3,FOODS_3_827,11617,1.00,0.0
6841117,WI_3,FOODS_3_827,11618,1.00,0.0
6841118,WI_3,FOODS_3_827,11619,1.00,0.0
6841119,WI_3,FOODS_3_827,11620,1.00,0.0


In [8]:
df = pd.melt(df_sales_trim, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name='d', value_name='sold')
df = pd.merge(df, df_calendar_trim[['date', 'wm_yr_wk', 'weekday', 'd', 'is_holiday', 'is_weekend', 'snap_CA', 'snap_TX', 'snap_WI']], how='left', on='d')
df = pd.merge(df, df_prices_trim, how='left', on=['store_id','item_id','wm_yr_wk'])
df['d'] = df['d'].apply(lambda x: x.split('_')[1]).astype(int)
df


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sold,date,wm_yr_wk,weekday,is_holiday,is_weekend,snap_CA,snap_TX,snap_WI,sell_price,sell_price_diff
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1770,0.0,2015-12-03,11544,Thursday,False,False,1,1,1,8.26,0.000
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1770,0.0,2015-12-03,11544,Thursday,False,False,1,1,1,3.97,0.000
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1770,0.0,2015-12-03,11544,Thursday,False,False,1,1,1,2.97,0.000
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1770,1.0,2015-12-03,11544,Thursday,False,False,1,1,1,4.64,0.000
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1770,0.0,2015-12-03,11544,Thursday,False,False,1,1,1,2.88,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6097995,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,1969,,2016-06-19,11621,Sunday,True,True,0,0,0,2.98,0.192
6097996,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,1969,,2016-06-19,11621,Sunday,True,True,0,0,0,2.48,0.240
6097997,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,1969,,2016-06-19,11621,Sunday,True,True,0,0,0,3.98,0.000
6097998,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,1969,,2016-06-19,11621,Sunday,True,True,0,0,0,1.28,0.000


In [9]:
# # utils
# def create_lag_ma(df, lag: int, MA:list, key:list) -> pd.DataFrame:
#     key_name = '_'.join(key)+'_' if len(key) > 0 else ''

#     df_temp = df[key + ["sold", "date"]].groupby(key+["date"]).sum().shift(lag).reset_index().rename(columns={'sold':f"{key_name}sold_lag{lag}"})

#     for ma in MA :
#         if len(key)==0:
#             df_temp[f"{key_name}sold_lag{lag}_ma{ma}"] = df_temp[key + [f"{key_name}sold_lag{lag}"]].transform(lambda x : x.rolling(ma).mean())
#         else:
#             df_temp[f"{key_name}sold_lag{lag}_ma{ma}"] = df_temp[key + [f"{key_name}sold_lag{lag}"]].groupby(key)[f"{key_name}sold_lag{lag}"].transform(lambda x : x.rolling(ma).mean())

#     df = df.merge(df_temp, how='left', on=key+["date"])
#     return df

In [10]:
# utils
def create_ma_and_ma_diff(df, MA:list, key:list, lag: int) -> pd.DataFrame:
    key_name = ('_'.join(key) if len(key) > 0 else 'global') + '_'

    # create lag for ma
    df_temp = df[key + ["sold", "date"]].groupby(key+["date"]).sum().shift(lag).reset_index().rename(columns={'sold':f"{key_name}sold_lag{lag}"})

    # create ma
    for ma in MA :
        if len(key)==0:
            df_temp[f"{key_name}sold_lag{lag}_ma{ma}"] = df_temp[[f"{key_name}sold_lag{lag}"]].transform(lambda x : x.rolling(ma).mean())   # create ma
            df_temp[f"{key_name}sold_lag{lag}_ma{ma}_diff"] = df_temp[[f"{key_name}sold_lag{lag}_ma{ma}"]].diff(1)                          # create ma diff
        else:
            df_temp[f"{key_name}sold_lag{lag}_ma{ma}"] = df_temp[key + [f"{key_name}sold_lag{lag}"]].groupby(key)[f"{key_name}sold_lag{lag}"].transform(lambda x : x.rolling(ma).mean())    # create ma
            df_temp[f"{key_name}sold_lag{lag}_ma{ma}_diff"] = df_temp[key + [f"{key_name}sold_lag{lag}_ma{ma}"]].groupby(key)[f"{key_name}sold_lag{lag}_ma{ma}"].diff(1)                    # create ma diff

    df_temp = df_temp[[col for col in df_temp.columns if col != f"{key_name}sold_lag{lag}"]]
    df = df.merge(df_temp, how='left', on=key+["date"])
    return df

In [11]:
# create ma features
df = create_ma_and_ma_diff(df, MA, key=['id'], lag=1)           # id lv
# df = create_ma_and_ma_diff(df, MA, key=['item_id'], lag=1)                # item lv
# df = create_ma_and_ma_diff(df, MA, key=['cat_id'], lag=1)                 # cat lv
# df = create_ma_and_ma_diff(df, MA, key=['store_id'], lag=1)               # store lv
# df = create_ma_and_ma_diff(df, MA, key=['dept_id','store_id'], lag=1)     # dept-store lv
# df = create_ma_and_ma_diff(df, MA, key=['cat_id','state_id'], lag=1)      # cat-state lv
df = create_ma_and_ma_diff(df, MA, key=[], lag=1)               # global lv (like prospect index)

df = create_ma_and_ma_diff(df, MA, key=['id'], lag=0)
df = create_ma_and_ma_diff(df, MA, key=[], lag=0)

df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sold,date,wm_yr_wk,...,global_sold_lag1_ma28,global_sold_lag1_ma28_diff,id_sold_lag0_ma7,id_sold_lag0_ma7_diff,id_sold_lag0_ma28,id_sold_lag0_ma28_diff,global_sold_lag0_ma7,global_sold_lag0_ma7_diff,global_sold_lag0_ma28,global_sold_lag0_ma28_diff
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1770,0.0,2015-12-03,11544,...,,,,,,,,,,
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1770,0.0,2015-12-03,11544,...,,,,,,,,,,
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1770,0.0,2015-12-03,11544,...,,,,,,,,,,
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1770,1.0,2015-12-03,11544,...,,,,,,,,,,
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1770,0.0,2015-12-03,11544,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6097995,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,1969,,2016-06-19,11621,...,1940.642857,-1839.928571,0.0,0.0,0.0,-0.035714,0.0,0.0,0.0,-1940.642857
6097996,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,1969,,2016-06-19,11621,...,1940.642857,-1839.928571,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,-1940.642857
6097997,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,1969,,2016-06-19,11621,...,1940.642857,-1839.928571,0.0,0.0,0.0,-0.071429,0.0,0.0,0.0,-1940.642857
6097998,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,1969,,2016-06-19,11621,...,1940.642857,-1839.928571,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,-1940.642857


In [12]:
# create lag features
for i in range(LAG[0], LAG[1]+1):
    df[f"sold_lag{i}"] = df.groupby(["id"])["sold"].shift(i)

df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sold,date,wm_yr_wk,...,global_sold_lag1_ma28_diff,id_sold_lag0_ma7,id_sold_lag0_ma7_diff,id_sold_lag0_ma28,id_sold_lag0_ma28_diff,global_sold_lag0_ma7,global_sold_lag0_ma7_diff,global_sold_lag0_ma28,global_sold_lag0_ma28_diff,sold_lag1
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1770,0.0,2015-12-03,11544,...,,,,,,,,,,
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1770,0.0,2015-12-03,11544,...,,,,,,,,,,
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1770,0.0,2015-12-03,11544,...,,,,,,,,,,
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1770,1.0,2015-12-03,11544,...,,,,,,,,,,
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1770,0.0,2015-12-03,11544,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6097995,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,1969,,2016-06-19,11621,...,-1839.928571,0.0,0.0,0.0,-0.035714,0.0,0.0,0.0,-1940.642857,
6097996,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,1969,,2016-06-19,11621,...,-1839.928571,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,-1940.642857,
6097997,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,1969,,2016-06-19,11621,...,-1839.928571,0.0,0.0,0.0,-0.071429,0.0,0.0,0.0,-1940.642857,
6097998,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,1969,,2016-06-19,11621,...,-1839.928571,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,-1940.642857,


In [13]:
# trimming on start date (due to lag & ma)
df = df[(df.date>=START_DATE)].copy()

df = df.sort_values(['id', 'date']).reset_index(drop=True)
df.isnull().sum()

id                                 0
item_id                            0
dept_id                            0
cat_id                             0
store_id                           0
state_id                           0
d                                  0
sold                          853720
date                               0
wm_yr_wk                           0
weekday                            0
is_holiday                         0
is_weekend                         0
snap_CA                            0
snap_TX                            0
snap_WI                            0
sell_price                       513
sell_price_diff                  513
id_sold_lag1_ma7                   0
id_sold_lag1_ma7_diff              0
id_sold_lag1_ma28                  0
id_sold_lag1_ma28_diff             0
global_sold_lag1_ma7               0
global_sold_lag1_ma7_diff          0
global_sold_lag1_ma28              0
global_sold_lag1_ma28_diff         0
id_sold_lag0_ma7                   0
i

In [14]:
# trimming on sell price
df = df[df.sell_price.notnull()].reset_index(drop=True)
df.isnull().sum()

# There are few null in sell price diff which is fine
# coz these are the first week of the product
# null != 0 for lgbm which is fine for lgbm to distinct null and 0
# you may fillna(0) but must not remove null in sell price diff

id                                 0
item_id                            0
dept_id                            0
cat_id                             0
store_id                           0
state_id                           0
d                                  0
sold                          853720
date                               0
wm_yr_wk                           0
weekday                            0
is_holiday                         0
is_weekend                         0
snap_CA                            0
snap_TX                            0
snap_WI                            0
sell_price                         0
sell_price_diff                    0
id_sold_lag1_ma7                   0
id_sold_lag1_ma7_diff              0
id_sold_lag1_ma28                  0
id_sold_lag1_ma28_diff             0
global_sold_lag1_ma7               0
global_sold_lag1_ma7_diff          0
global_sold_lag1_ma28              0
global_sold_lag1_ma28_diff         0
id_sold_lag0_ma7                   0
i

In [15]:
# avg(sold) over "id" across all dates within scope
df['avg_sold_per_id'] = df[['id', 'sold']].groupby(['id'])['sold'].transform(np.mean)

In [16]:
# for sold_ma_diff.div(df['avg_sold_per_id'])
for col in [col for col in df.columns if re.search("ma\d_diff", col)]:
    df[col] = df[col].div(df['avg_sold_per_id'])

df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sold,date,wm_yr_wk,...,id_sold_lag0_ma7,id_sold_lag0_ma7_diff,id_sold_lag0_ma28,id_sold_lag0_ma28_diff,global_sold_lag0_ma7,global_sold_lag0_ma7_diff,global_sold_lag0_ma28,global_sold_lag0_ma28_diff,sold_lag1,avg_sold_per_id
0,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,1799,0.0,2016-01-01,11548,...,0.428571,0.00000,0.785714,-0.107143,35064.142857,5797.628571,36218.607143,-197.714286,2.0,0.804196
1,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,1800,0.0,2016-01-02,11549,...,0.285714,-0.17764,0.785714,0.000000,35972.714286,1129.788820,36136.892857,-81.714286,0.0,0.804196
2,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,1801,0.0,2016-01-03,11549,...,0.285714,0.00000,0.785714,0.000000,37253.000000,1592.007453,35991.357143,-145.535714,0.0,0.804196
3,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,1802,1.0,2016-01-04,11549,...,0.428571,0.17764,0.750000,-0.035714,38214.285714,1195.337888,36037.250000,45.892857,0.0,0.804196
4,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,1803,0.0,2016-01-05,11549,...,0.428571,0.00000,0.607143,-0.142857,38701.857143,606.284472,36122.714286,85.464286,1.0,0.804196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5213272,HOUSEHOLD_2_516_WI_3_evaluation,HOUSEHOLD_2_516,HOUSEHOLD_2,HOUSEHOLD,WI_3,WI,1965,,2016-06-15,11620,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,6620.392857,-1324.857143,,0.062937
5213273,HOUSEHOLD_2_516_WI_3_evaluation,HOUSEHOLD_2_516,HOUSEHOLD_2,HOUSEHOLD,WI_3,WI,1966,,2016-06-16,11620,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,5300.285714,-1320.107143,,0.062937
5213274,HOUSEHOLD_2_516_WI_3_evaluation,HOUSEHOLD_2_516,HOUSEHOLD_2,HOUSEHOLD,WI_3,WI,1967,,2016-06-17,11620,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,3780.571429,-1519.714286,,0.062937
5213275,HOUSEHOLD_2_516_WI_3_evaluation,HOUSEHOLD_2_516,HOUSEHOLD_2,HOUSEHOLD,WI_3,WI,1968,,2016-06-18,11621,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,1940.642857,-1839.928571,,0.062937


In [17]:
df = df.sort_values(['id', 'date']).reset_index(drop=True)
df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sold,date,wm_yr_wk,...,id_sold_lag0_ma7,id_sold_lag0_ma7_diff,id_sold_lag0_ma28,id_sold_lag0_ma28_diff,global_sold_lag0_ma7,global_sold_lag0_ma7_diff,global_sold_lag0_ma28,global_sold_lag0_ma28_diff,sold_lag1,avg_sold_per_id
0,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,1799,0.0,2016-01-01,11548,...,0.428571,0.00000,0.785714,-0.107143,35064.142857,5797.628571,36218.607143,-197.714286,2.0,0.804196
1,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,1800,0.0,2016-01-02,11549,...,0.285714,-0.17764,0.785714,0.000000,35972.714286,1129.788820,36136.892857,-81.714286,0.0,0.804196
2,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,1801,0.0,2016-01-03,11549,...,0.285714,0.00000,0.785714,0.000000,37253.000000,1592.007453,35991.357143,-145.535714,0.0,0.804196
3,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,1802,1.0,2016-01-04,11549,...,0.428571,0.17764,0.750000,-0.035714,38214.285714,1195.337888,36037.250000,45.892857,0.0,0.804196
4,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,1803,0.0,2016-01-05,11549,...,0.428571,0.00000,0.607143,-0.142857,38701.857143,606.284472,36122.714286,85.464286,1.0,0.804196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5213272,HOUSEHOLD_2_516_WI_3_evaluation,HOUSEHOLD_2_516,HOUSEHOLD_2,HOUSEHOLD,WI_3,WI,1965,,2016-06-15,11620,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,6620.392857,-1324.857143,,0.062937
5213273,HOUSEHOLD_2_516_WI_3_evaluation,HOUSEHOLD_2_516,HOUSEHOLD_2,HOUSEHOLD,WI_3,WI,1966,,2016-06-16,11620,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,5300.285714,-1320.107143,,0.062937
5213274,HOUSEHOLD_2_516_WI_3_evaluation,HOUSEHOLD_2_516,HOUSEHOLD_2,HOUSEHOLD,WI_3,WI,1967,,2016-06-17,11620,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,3780.571429,-1519.714286,,0.062937
5213275,HOUSEHOLD_2_516_WI_3_evaluation,HOUSEHOLD_2_516,HOUSEHOLD_2,HOUSEHOLD,WI_3,WI,1968,,2016-06-18,11621,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,1940.642857,-1839.928571,,0.062937


In [18]:
df.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'sold', 'date', 'wm_yr_wk', 'weekday', 'is_holiday', 'is_weekend',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'sell_price_diff',
       'id_sold_lag1_ma7', 'id_sold_lag1_ma7_diff', 'id_sold_lag1_ma28',
       'id_sold_lag1_ma28_diff', 'global_sold_lag1_ma7',
       'global_sold_lag1_ma7_diff', 'global_sold_lag1_ma28',
       'global_sold_lag1_ma28_diff', 'id_sold_lag0_ma7',
       'id_sold_lag0_ma7_diff', 'id_sold_lag0_ma28', 'id_sold_lag0_ma28_diff',
       'global_sold_lag0_ma7', 'global_sold_lag0_ma7_diff',
       'global_sold_lag0_ma28', 'global_sold_lag0_ma28_diff', 'sold_lag1',
       'avg_sold_per_id'],
      dtype='object')

In [19]:
print(f"df memory usage: {np.round(df.memory_usage().sum()/(1024*1024),1)}mb")
df.to_pickle('./saved/data/'+'preprocessed_df.pkl')

df memory usage: 1342.4mb
