In [0]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# Any results you write to the current directory are saved as output.

# module import
import numpy as np
import pandas as pd

# Widen the pandas display limits so large frames print without truncation
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', 50),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

In [0]:
# Memory optimization
def optimize_floats(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast the float64 columns of ``df`` to a smaller float dtype where possible.

    Only columns whose dtype is float64 are touched. The frame is modified
    in place and the same object is returned.
    """
    float_cols = list(df.select_dtypes(include=['float64']).columns)
    # pd.to_numeric picks the smallest float dtype able to hold each column
    shrunk = df[float_cols].apply(lambda column: pd.to_numeric(column, downcast='float'))
    df[float_cols] = shrunk
    return df


def optimize_ints(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast the int64 columns of ``df`` to the smallest integer dtype that fits.

    Only columns whose dtype is int64 are touched. The frame is modified
    in place and the same object is returned.
    """
    int_cols = list(df.select_dtypes(include=['int64']).columns)
    # pd.to_numeric picks the smallest signed integer dtype able to hold each column
    shrunk = df[int_cols].apply(lambda column: pd.to_numeric(column, downcast='integer'))
    df[int_cols] = shrunk
    return df


def optimize_objects(df: pd.DataFrame) -> pd.DataFrame:
    """Convert low-cardinality object columns of ``df`` to the ``category`` dtype.

    An object column is converted when its number of distinct values is less
    than half of the number of rows. The frame is modified in place and the
    same object is returned.
    """
    num_total_values = len(df)
    if num_total_values == 0:
        # With no rows the unique/total ratio would be 0/0; nothing to convert.
        return df

    for col in df.select_dtypes(include=['object']).columns:
        num_unique_values = len(df[col].unique())

        if num_unique_values / num_total_values < 0.5:
            df[col] = df[col].astype('category')

    return df

def optimize_sparse(df: pd.DataFrame) -> pd.DataFrame:
    """Store columns dominated by a single value as pandas sparse columns.

    For every column, the most frequent value (NaN counts as a value) is
    determined; when it occupies more than half of the rows, the column is
    cast to a ``pd.SparseDtype`` that uses that value as fill value. The frame
    is modified in place and the same object is returned.
    """
    n_all = len(df)
    if n_all == 0:
        # No rows means no "most frequent value" to look up; leave the frame as is.
        return df

    for col in df.columns:
        # value_counts sorts by descending frequency, so position 0 is the top entry
        counts = df[col].value_counts(dropna=False)
        n_top1 = counts.iloc[0]

        if n_top1 > 0.5 * n_all:
            entry = counts.index[0]

            df[col] = df[col].astype(pd.SparseDtype(df[col].dtype, entry))

    return df

def optimize(df: pd.DataFrame) -> pd.DataFrame:
    """Shrink the memory footprint of ``df`` and report its size before and after.

    Runs the integer, float and object optimizers in that order. Those helpers
    assign the converted columns back onto ``df``, so the frame passed in is
    modified in place; the same frame is also returned.
    """
    print('optimizing... Pre optimize size')
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would add a stray "None" line.
    df.info()

    df = optimize_ints(df)
    df = optimize_floats(df)
    # Sparse conversion is currently disabled:
    #df = optimize_sparse(df)
    df = optimize_objects(df)

    print('DF optimised... Post optimize size')
    df.info()

    return df

In [0]:
# Loading data
# Read the calendar, price and sales tables from the Kaggle input directory
df_cal = pd.read_csv(filepath_or_buffer='/kaggle/input/m5-forecasting-accuracy/calendar.csv')
df_prices = pd.read_csv(filepath_or_buffer='/kaggle/input/m5-forecasting-accuracy/sell_prices.csv')
df_sales = pd.read_csv(filepath_or_buffer='/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv')

# Preview the first ten rows of each table
print('\n', 'Calendar data:')
print(df_cal.head(n=10))

print('\n', 'Pricing data:')
print(df_prices.head(n=10))

# The sales table is wide, so only the columns up to and including 'd_5' are shown
print('\n', 'Sales data:')
print(df_sales.loc[:, slice(None, 'd_5')].head(n=10))

In [0]:
def remDdeli(df, col):
    '''
    Removes the 'd_' delimiter from the 'd' (day number) column

    The first two characters of every value in ``df[col]`` are dropped and
    the remainder is converted to int16. ``df`` is modified in place and
    the same frame is returned.
    '''
    # Assign through df[col] rather than df.loc[:, col]: newer pandas releases
    # write .loc[:, col] values into the existing column in place, which would
    # keep the original string/object dtype instead of switching to int16.
    df[col] = df[col].str.slice(start=2).astype('int16')
    return df

In [0]:
# Calendar data massage

# Columns to discard from the calendar table
drop_cols_cal = ['date', 'weekday']
df_cal.drop(columns=drop_cols_cal, inplace=True)

# Turn the 'd' labels into integer day numbers
df_cal = remDdeli(df_cal, 'd')

# First and last day numbers of the validation window and of the test window
cal_val_ini = 1914
cal_val_fin = 27 + cal_val_ini
cal_test_ini = 1 + cal_val_fin
cal_test_fin = 27 + cal_test_ini

In [0]:
# Run the memory optimizer over the calendar, price and sales frames, in that order
df_cal, df_prices, df_sales = (
    optimize(frame) for frame in (df_cal, df_prices, df_sales)
)

In [0]:
# add sales columns for validation and test dates
# this is in order to get a full data set with all dates
# range() excludes its stop value, so +1 is needed to include the last test day
for d in range(cal_val_ini, cal_test_fin + 1):
    c_name = 'd_' + str(d)

    # Unknown sale counts are marked with NaN
    df_sales[c_name] = np.nan

# Melt dataframe into long format: one row per (series, day)
print('\n','Convert from wide to long df format')
cols_keep = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
df_tot = df_sales.melt(id_vars=cols_keep, var_name ='d', value_name='saleCount')

# Remove 'd_' delimiter on the day number column
df_tot = remDdeli(df_tot, 'd')

# Add train/val/test type column
# Series.between() includes both end points by default; the boolean form
# `inclusive=True` is rejected by newer pandas releases, so it is omitted.
df_tot.loc[df_tot['d'].between(1, cal_val_ini-1), 'type'] = 'train'
df_tot.loc[df_tot['d'].between(cal_val_ini, cal_val_fin), 'type'] = 'val'
df_tot.loc[df_tot['d'].between(cal_test_ini, cal_test_fin), 'type'] = 'test'

# memory optimization by downcasting data types
df_tot = optimize(df_tot)

In [0]:
# Merging 3 dataframes
# Left joins keep every sales row, even when no calendar or price match exists

# Calendar columns are attached by day number
print('\n','Merging sales and calander data')
df_tot = df_tot.merge(df_cal, how='left', on='d', copy=False)

# Prices are attached per store, item and week
print('\n','Merging sales and price data')
df_tot = df_tot.merge(
    df_prices,
    how='left',
    on=['store_id', 'item_id', 'wm_yr_wk'],
    copy=False,
)

# The week key was only needed for the price join
drop_cols = ['wm_yr_wk']
df_tot.drop(columns=drop_cols, inplace=True)

In [0]:
# Persist the merged long-format frame in the working directory
print('Saving merged data to pickle')
df_tot.to_pickle(path='df_data.pkl')
print('Done')