In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler
from scipy.sparse import csr_matrix

from sklearn.neighbors import LocalOutlierFactor
from sklearn.impute import KNNImputer
from scipy import stats
from tqdm.notebook import tqdm

import gc
%matplotlib inline

In [2]:
# Memory reduction helper function:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column (object, category, bool, int8, ...) is left
    untouched. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are overwritten with the downcast versions.
    verbose : bool, default True
        Print the resulting memory footprint and the relative reduction.
    use_float16 : bool, default True
        When True (the historical behaviour) float columns whose min/max fit in
        the float16 range are stored as float16. float16 keeps only about three
        significant decimal digits, so pass False to use float32 as the
        smallest float type instead.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column whose extreme equals the dtype
            # limit (e.g. 127 for int8) still fits in that dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing smaller fits (or min/max are NaN, so every
                # comparison above is False): keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# M5 - Step 1: Clean

## Clean: Impute Gaps & Cap Outliers
#### Capping and imputing sales timeseries while preserving the target:

Sales are noisy: they contain outliers and long zero gaps. We split the target into three time series:

* **sale_outlier_diff** - difference between an outlier and the maximum non-outlier value; outliers are flagged using the z-score.
* **sales_cap_impute** - sales with capped outliers and gap days imputed with a flat mean
* **imputed_gaps_1** & **imputed_gaps_23** - store/state-specific gaps imputed with a flat mean

# Load Datasets
All three datasets are needed because we need to calculate sales in USD.

In [3]:
# DEFAULT datasets
data_pass = '/kaggle/input/m5-forecasting-accuracy/'

# Sales quantities: not read from CSV in this cell.

# Calendar to get week number to join sell prices:
calendar = pd.read_csv(data_pass+'calendar.csv')
# Keep only the digits of the day label as an int16 day number.
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
# expand=False: a single capture group then yields a Series, not a DataFrame.
calendar['d'] = calendar['d'].str.extract(r'(\d+)', expand=False).astype(np.int16)
calendar = reduce_mem_usage(calendar)

# Sell prices to calculate sales in USD:
sell_prices = pd.read_csv(data_pass+'sell_prices.csv')
sell_prices = reduce_mem_usage(sell_prices)

Mem. usage decreased to  0.11 Mb (44.3% reduction)
Mem. usage decreased to 130.48 Mb (37.5% reduction)


In [4]:
# GAPS datasets
# Load grid with gaps and imputed sales.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
file_pass = '/kaggle/input/'
drop_cols = ['dept_id', 'cat_id','state_id','gaps','gaps_levels','gap_2std','gap_interval_2std','prob_zero','gaps10','gaps9','release']

agg_levels_file = file_pass+'m5-agg-out-of-stock-levels/grid_part_1_agglvl.pkl'
imputed_sales_file = file_pass+'m5-imputing-accurate-out-of-stock-feature/grid_part_1_sales_imputed.pkl'

grid_df = pd.read_pickle(agg_levels_file).drop(columns=drop_cols)
grid_df = reduce_mem_usage(grid_df)
grid_df_2 = reduce_mem_usage(pd.read_pickle(imputed_sales_file))

# Column-wise concat: rows are matched on the index labels of the two frames.
grid_df = pd.concat([grid_df, grid_df_2], axis=1)

# Keep only the rows of the bottom level 11:
bottom_level_rows = grid_df['level'] == 11
grid_df = grid_df[bottom_level_rows]

del grid_df_2, bottom_level_rows

Mem. usage decreased to 947.67 Mb (0.0% reduction)
Mem. usage decreased to 405.42 Mb (62.5% reduction)


In [5]:
# Remove empty categories:
# casting to str and back to category rebuilds each categorical from the
# values actually present in the column.
for key_col in ('id', 'store_id', 'item_id'):
    as_text = grid_df[key_col].astype('str')
    grid_df[key_col] = as_text.astype('category')

In [6]:
# Add week of year (the `wm_yr_wk` key), matched on the day number `d`:
week_lookup = calendar[['wm_yr_wk', 'd']]
grid_df = grid_df.merge(week_lookup, how='left', on=['d'])

# Add price, matched on store, item and week:
price_keys = ['store_id', 'item_id', 'wm_yr_wk']
grid_df = pd.merge(grid_df, sell_prices, how='left', on=price_keys)

del sell_prices

In [7]:
# The gaps are re-imputed further down, so only keep flags marking where the
# earlier imputation changed something.
# imputed_sales -> 1 where the imputed value exceeds the raw sales by more than 0.001, else 0:
grid_df['imputed_sales'] = (
    grid_df['imputed_sales']
    .sub(grid_df['sales'])
    .gt(0.001)
    .astype(np.int16, copy=False)
)
# Gap columns -> True where a positive value had been imputed:
for gap_col in ('imputed_gaps_1', 'imputed_gaps_23'):
    grid_df[gap_col] = grid_df[gap_col].gt(0)

grid_df = reduce_mem_usage(grid_df)
gc.collect()

Mem. usage decreased to 1700.46 Mb (7.3% reduction)


0

In [8]:
# Preview the first five rows of the merged grid:
grid_df.head(n=5)

Unnamed: 0,id,item_id,store_id,d,sales,level,imputed_sales,imputed_gaps_1,imputed_gaps_23,wm_yr_wk,sell_price
0,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,CA_1,1,12,11,0,False,False,11101,0.459961
1,HOBBIES_1_009_CA_1_validation,HOBBIES_1_009,CA_1,1,2,11,0,False,False,11101,1.55957
2,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,CA_1,1,0,11,0,False,False,11101,3.169922
3,HOBBIES_1_012_CA_1_validation,HOBBIES_1_012,CA_1,1,0,11,0,False,False,11101,5.980469
4,HOBBIES_1_015_CA_1_validation,HOBBIES_1_015,CA_1,1,4,11,0,False,False,11101,0.700195


# Variables
Here we work only with the bottom level==11:

* **sale_outlier_diff** - difference between an outlier and the maximum non-outlier value; outliers are flagged using the z-score.
* **sales_cap_impute** - sales with capped outliers and gap days imputed with a flat mean
* **imputed_sales** - flag for sales that have been imputed
* **imputed_gaps_1** - store-specific gaps imputed with a flat mean
* **imputed_gaps_23** - state- and network-specific gaps imputed with a flat mean

# Outliers: Find & Cap

In [9]:
def z3_outlier(grp, col_name, z_thresh=3):
    """Return the index labels of rows whose ``col_name`` value is a z-score outlier.

    Only rows with a strictly positive value in ``col_name`` are scored; zeros,
    negatives and NaN are ignored. A row is an outlier when the absolute
    z-score of its value exceeds ``z_thresh``, so both unusually high and
    unusually low positive values are flagged.

    Parameters
    ----------
    grp : pandas.DataFrame
        Frame (typically one group of a groupby) containing ``col_name``.
    col_name : str
        Name of the numeric column to score.
    z_thresh : float, default 3
        Absolute z-score above which a value counts as an outlier.

    Returns
    -------
    pandas.Index
        Labels of the outlier rows; empty when there are fewer than two
        positive values or when all positive values are identical.
    """
    positive = grp[grp[col_name] > 0]
    # Score in float64 so narrow dtypes cannot overflow inside the mean/std.
    values = positive[col_name].to_numpy(dtype=np.float64)
    # With no spread the z-score is 0/0: there is nothing to flag, and
    # skipping the call avoids NaN results and RuntimeWarnings.
    if values.size < 2 or values.std() == 0:
        return positive.index[:0]
    z_abs = np.abs(stats.zscore(values))
    return positive.index[z_abs > z_thresh]

In [10]:
# Find outliers based on z score:
## Keep only sales greater than 0, as otherwise all sales might be flagged as outliers:
mask = grid_df.sales > 0

## Group by 'id' as sales values have different mean from store to store:
df_group = grid_df[mask].groupby(['id'])

## Find indices of all outliers (upper tail only: z > 3, no absolute value):
grid_df['sale_outlier_diff'] = 0
outlier_idx = []
# A group whose positive sales are all identical has zero standard deviation,
# so its z-scores are 0/0 = NaN. NaN > 3 is False, so nothing is flagged for
# it; errstate only silences the divide/invalid RuntimeWarnings.
with np.errstate(divide='ignore', invalid='ignore'):
    for group_name, g in tqdm(df_group):
        is_outlier = np.asarray(stats.zscore(g.sales)) > 3
        outlier_idx.extend(g.index[is_outlier])

## Create separate column with outliers (holds the raw outlier sales for now):
grid_df.loc[outlier_idx,'sale_outlier_diff']=grid_df.loc[outlier_idx,'sales']

HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))

  return (a - mns) / sstd
  if sys.path[0] == '':





In [11]:
# Cap sales outliers with maximum non outlier value:
# Start from a float copy of sales: the outlier rows are blanked with NaN
# below, and writing NaN into an integer column forces an implicit upcast.
grid_df['sales_cap_impute'] = grid_df['sales'].astype(np.float64)
# NOTE: despite its name, nan_idx holds the labels of rows whose sales are NOT
# missing. It is taken from the index directly, without copying the frame.
nan_idx = grid_df.index[grid_df['sales'].notna()]
grid_df.loc[outlier_idx,'sales_cap_impute'] = np.nan

# Per-id maximum of the remaining values ('max' skips the NaN just written),
# i.e. the largest non-outlier sale of each id.
cap_by_id = grid_df.groupby('id')['sales_cap_impute'].transform('max')
# Fill the blanked outliers with that cap; rows whose original sales were
# missing are not in nan_idx and therefore stay NaN.
grid_df.loc[nan_idx,'sales_cap_impute'] = grid_df.loc[nan_idx,'sales_cap_impute'].fillna(cap_by_id)
del cap_by_id

In [12]:
## For the outlier column, calculate the difference with the max non-outlier value:
outlier_sales = grid_df.loc[outlier_idx, 'sale_outlier_diff']
outlier_cap = grid_df.loc[outlier_idx, 'sales_cap_impute']
grid_df.loc[outlier_idx, 'sale_outlier_diff'] = outlier_sales - outlier_cap

# Gaps: Impute with flat mean

In [13]:
# After outliers are capped we can impute gaps with means:
# per-id mean of the capped sales; the 'mean' aggregation skips NaN.
m = grid_df.groupby('id')['sales_cap_impute'].transform('mean')
# Turn each gap flag into the value to add on flagged days (the per-id mean)
# and 0 elsewhere. Plain column assignment replaces the flag column outright,
# so its dtype can change from bool to float.
grid_df['imputed_gaps_1'] = m * grid_df['imputed_gaps_1']
grid_df['imputed_gaps_23'] = m * grid_df['imputed_gaps_23']

# Imputing store and network specific gaps on top of the CAPPED sales.
# Building this from the raw `sales` column would undo the outlier capping.
grid_df['sales_cap_impute'] = grid_df['sales_cap_impute'] + grid_df['imputed_gaps_1'] + grid_df['imputed_gaps_23']
del m

In [14]:
# Downcast the columns once more, then ask the garbage collector to free memory:
grid_df = reduce_mem_usage(df=grid_df, verbose=True)
gc.collect()

Mem. usage decreased to 3248.72 Mb (24.8% reduction)


16

In [15]:
# Persist the cleaned grid as a pickle in the current working directory:
output_file = 'grid_part_1_outlier.pkl'
grid_df.to_pickle(output_file)

In [16]:
# Preview the first five rows of the final grid:
grid_df.head(n=5)

Unnamed: 0,id,item_id,store_id,d,sales,level,imputed_sales,imputed_gaps_1,imputed_gaps_23,wm_yr_wk,sell_price,sale_outlier_diff,sales_cap_impute
0,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,CA_1,1,12,11,0,0.0,0.0,11101,0.459961,0.0,12.0
1,HOBBIES_1_009_CA_1_validation,HOBBIES_1_009,CA_1,1,2,11,0,0.0,0.0,11101,1.55957,0.0,2.0
2,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,CA_1,1,0,11,0,0.0,0.0,11101,3.169922,0.0,0.0
3,HOBBIES_1_012_CA_1_validation,HOBBIES_1_012,CA_1,1,0,11,0,0.0,0.0,11101,5.980469,0.0,0.0
4,HOBBIES_1_015_CA_1_validation,HOBBIES_1_015,CA_1,1,4,11,0,0.0,0.0,11101,0.700195,0.0,4.0


# DANGER: np.float16

`reduce_mem_usage` stores float columns as np.float16 whenever their range fits. These columns should be cast to np.float32 before feature engineering or training.