In [1]:
# The directory for top level folder
dir_ = "/home/sugam/Work/20-29 Deep Learning/22 Projects/M5-Forecasting/m5-forecasting-accuracy/"

In [2]:
raw_data_dir = dir_ + '2.data/'
processed_data_dir = dir_+'2.data/processed/'

# 1. MAIN SETUP

In [3]:
# Genderal imports 
import numpy as np
import pandas as pd
import os, sys, gc,time,warnings,pickle,psutil,random
from math import ceil
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [4]:
# Simple memory profilers to see memeory usage
def get_memory_usage():
    return np.round(psutil.Process(os.getpid()).memory_info()[0]/2.**30, 2) 
        
def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)

In [5]:
## Memory Reducer
# :df pandas dataframe to reduce size             # type: pd.DataFrame()
# :verbose                                        # type: bool
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                       df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [6]:
## Merging by concat to not lose dtypes
def merge_by_concat(df1,df2,merge_on):
    merged_df = df1[merge_on] # merge_on is multiple columns
    merged_df = merged_df.merge(df2,on=merge_on,how='left')
    new_columns = [col for col in list(merged_df) if col not in merge_on]
    df1 = pd.concat([df1,merged_df[new_columns]],axis=1)
    return df1

In [7]:
######################### Vars
###################################################################
TARGET = 'sales' # main target
END_TRAIN = 1941 # Last day of train set
MAIN_INDEX = ['id','d'] # Identify each entry by these columns

# 2. PART -1 
- Melting train data => grid_part_1
- creating price features => grid_part_2
- creating calender features => grid_part_3

In [8]:
####################### LOAD DATA
##################################################################
print("Load Main Data")
# Refering our data without any modification and dtype
train_df = pd.read_csv(raw_data_dir+'sales_train_evaluation.csv')
prices_df = pd.read_csv(raw_data_dir+'sell_prices.csv')
calendar_df = pd.read_csv(raw_data_dir+'calendar.csv')

Load Main Data


In [9]:
####################### MAKE GRID
##################################################################
print("Create Grid")

# We are unpivoting the table inorder to convert wide format table into long format
index_columns = ['id','item_id','dept_id','cat_id','store_id','state_id']
grid_df = pd.melt(train_df,
                 id_vars = index_columns,
                 var_name = 'd',
                 value_name = TARGET)
# In the train_df, there are very few training rows
# But each day can provide a lot of training data
print(f"Train rows: {len(train_df)} --- {len(grid_df)}")

# To be able to make predictions we need to add test set to our grid
# Below code adds the new rows for future data.
# It will add 28 days in the future
add_grid = pd.DataFrame()
for i in range(1,29):
    temp_df = train_df[index_columns]
    temp_df = temp_df.drop_duplicates() # Ensures data is unique which you going to predict in the future
    temp_df['d'] = "d_"+str(END_TRAIN+i)
    temp_df[TARGET] = np.nan
    add_grid = pd.concat([add_grid,temp_df])

grid_df = pd.concat([grid_df,add_grid])
grid_df.reset_index(drop=False,inplace=True)
del temp_df, add_grid
del train_df

# Let's check our memory usage
print("{:>20}: {:>8}".format('Original grid_df',sizeof_fmt(grid_df.memory_usage(index=True).sum())))
# We can free some memory 
# by converting "strings" to categorical
# it will not affect merging and 
# we will not lose any valuable data
for col in index_columns:
    grid_df[col] = grid_df[col].astype('category')

# Let's check again memory usage
print("{:>20}: {:>8}".format('Reduced grid_df',sizeof_fmt(grid_df.memory_usage(index=True).sum())))

Create Grid
Train rows: 30490 --- 59181090
    Original grid_df:   4.0GiB
     Reduced grid_df:   1.8GiB


In [10]:
########################### Product Release date
#################################################################################
print('Release week')

# It seems that leadings zero values
# in each train_df item row
# are not real 0 sales but mean
# absence for the item in the store
# we can safe some memory by removing
# such zeros

# Prices are set by week
# so it we will have not very accurate release week 
release_df = prices_df.groupby(['store_id','item_id'])['wm_yr_wk'].agg(['min']).reset_index()
release_df.columns = ['store_id','item_id','release']

# Now we can merge release_df
grid_df = merge_by_concat(grid_df, release_df, ['store_id','item_id'])
del release_df

# We want to remove some "zeros" rows
# from grid_df 
# to do it we need wm_yr_wk column
# let's merge partly calendar_df to have it
grid_df = merge_by_concat(grid_df, calendar_df[['wm_yr_wk','d']], ['d'])
                      
# Now we can cutoff some rows 
# and safe memory 
grid_df = grid_df[grid_df['wm_yr_wk']>=grid_df['release']]
grid_df = grid_df.reset_index(drop=True)

# Let's check our memory usage
print("{:>20}: {:>8}".format('Original grid_df',sizeof_fmt(grid_df.memory_usage(index=True).sum())))

# Should we keep release week 
# as one of the features?
# Only good CV can give the answer.
# Let's minify the release values.
# Min transformation will not help here 
# as int16 -> Integer (-32768 to 32767)
# and our grid_df['release'].max() serves for int16
# but we have have an idea how to transform other columns in case we will need it
grid_df['release'] = grid_df['release'] - grid_df['release'].min()
grid_df['release'] = grid_df['release'].astype(np.int16)

# Let's check again memory usage
print("{:>20}: {:>8}".format('Reduced grid_df',sizeof_fmt(grid_df.memory_usage(index=True).sum())))


Release week
    Original grid_df:   2.1GiB
     Reduced grid_df:   1.9GiB


In [11]:


########################### Save part 1
#################################################################################
print('Save Part 1')

# We have our BASE grid ready
# and can save it as pickle file
# for future use (model training)
grid_df.to_pickle(processed_data_dir+'grid_part_1.pkl')
print('Size:', grid_df.shape)

Save Part 1
Size: (47735397, 11)


In [12]:
prices_df.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26


In [13]:
########################## PRICES
######################################################
print("Prices")

# We can do some basic aggregration
prices_df['price_max'] = prices_df.groupby(["store_id","item_id"])["sell_price"].transform("max") # this gives you the maximum price each unique product has been sold to .
# Same product can be sold to many prices but the above code finds out the maximum price. Finally it replaces each item in that group with the maximum value.
prices_df['price_min'] = prices_df.groupby(['store_id','item_id'])['sell_price'].transform('min')
prices_df["price_mean"] = prices_df.groupby(["store_id","item_id"])['sell_price'].transform("mean")
prices_df["price_std"] = prices_df.groupby(["store_id","item_id"])['sell_price'].transform("std")


# Doing price normalization(min/max scaling)
prices_df["price_norm"] = prices_df["sell_price"]/prices_df["price_max"]


Prices


In [15]:
# Some items can be inflation dependent and some can be stable
prices_df["price_nunique"] = prices_df.groupby(["store_id","item_id"])["sell_price"].transform("nunique")
prices_df["item_nuinque"] = prices_df.groupby(["store_id","item_id"])["item_id"].transform("nunique")

In [18]:
# Making rolling aggregations but with months and years as window
calendar_prices = calendar_df[["wm_yr_wk","month","year"]]
calendar_prices = calendar_prices.drop_duplicates(subset=["wm_yr_wk"])
prices_df = prices_df.merge(calendar_prices[["wm_yr_wk","month","year"]],on=["wm_yr_wk"],how="left")

In [19]:
del calendar_prices

In [20]:
prices_df

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price,price_max,price_min,price_mean,price_std,price_norm,price_nunique,item_nuinque,month,year
0,CA_1,HOBBIES_1_001,11325,9.58,9.58,8.26,8.285714,0.152139,1.000000,3,1,7,2013
1,CA_1,HOBBIES_1_001,11326,9.58,9.58,8.26,8.285714,0.152139,1.000000,3,1,7,2013
2,CA_1,HOBBIES_1_001,11327,8.26,9.58,8.26,8.285714,0.152139,0.862213,3,1,7,2013
3,CA_1,HOBBIES_1_001,11328,8.26,9.58,8.26,8.285714,0.152139,0.862213,3,1,8,2013
4,CA_1,HOBBIES_1_001,11329,8.26,9.58,8.26,8.285714,0.152139,0.862213,3,1,8,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6841116,WI_3,FOODS_3_827,11617,1.00,1.00,1.00,1.000000,0.000000,1.000000,1,1,5,2016
6841117,WI_3,FOODS_3_827,11618,1.00,1.00,1.00,1.000000,0.000000,1.000000,1,1,5,2016
6841118,WI_3,FOODS_3_827,11619,1.00,1.00,1.00,1.000000,0.000000,1.000000,1,1,6,2016
6841119,WI_3,FOODS_3_827,11620,1.00,1.00,1.00,1.000000,0.000000,1.000000,1,1,6,2016


In [21]:
# Now we can add price "momentum" (some sort of)
# Shifted by week 
# by month mean
# by year mean
prices_df['price_momentum'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id'])['sell_price'].transform(lambda x: x.shift(1))
prices_df['price_momentum_m'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id','month'])['sell_price'].transform('mean')
prices_df['price_momentum_y'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id','year'])['sell_price'].transform('mean')

del prices_df['month'], prices_df['year']