> # M5 - accurate 'out_of_stock' feature

In the [previous notebook](https://www.kaggle.com/sibmike/m5-out-of-stock-feature-640x-faster), we calculated supply gaps in under 8 minutes. However, we used an arbitrary 100-year threshold to distinguish them. In this notebook, we instead use confidence intervals from this amazing paper: [Longest Run of Heads](https://www.jstor.org/stable/2686886?origin=JSTOR-pdf&seq=1) by Mark F. Schilling.

\\( ER_{n} = \log_{\frac{1}{p}}(nq) + \frac{\gamma}{\ln(\frac{1}{p})} - \frac{1}{2} \\)

\\( StdR_{n} = \sqrt{\frac{\pi^2}{6\ln^2(\frac{1}{p})}+\frac{1}{12}} \\)


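Prediction intervals:
\\( ER_{n} \pm 2\,StdR_{n} \\)

Note: the `gap_interval` function below uses only the upper bound, and computes it with 3 (not 2) standard deviations: \\( ER_{n} + 3\,StdR_{n} \\).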

**_PPS: Special thanks to @nadare for outstanding 8x booster that cut time from 1 hr to under 8 mins!_**

In [None]:
def gap_interval(p, N, n_sigma=3):
    '''
    Return an upper bound on the longest run of zero-sales days that is
    still expected by chance, using the approximation from Schilling's
    "Longest Run of Heads":

        R     = log_{1/p}(N*q) + gamma / ln(1/p) - 1/2
        sigma = sqrt(pi^2 / (6 * ln(1/p)^2) + 1/12)

    p       - avg probability of zero sales on a given day ("Heads");
              the formula is only meaningful for 0 < p < 1
    N       - number of trials (days)
    n_sigma - how many standard deviations to add to the expected run
              length (default 3, the value this function has always used)

    Returns R + n_sigma*sigma rounded to the nearest integer, and never
    less than 1.
    '''
    q = 1 - p
    log_inv_p = np.log(1 / p)

    # np.euler_gamma is the Euler-Mascheroni constant (0.5772...).
    R = np.log(N * q) / log_inv_p + np.euler_gamma / log_inv_p - 0.5
    sigma = np.sqrt(np.pi**2 / (6 * log_inv_p**2) + 1 / 12)

    # Upper end of the interval: expected longest run plus n_sigma std devs.
    d = R + n_sigma * sigma
    if d < 1:
        return 1
    return int(np.rint(d))

In [None]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random
from tqdm.auto import tqdm

import seaborn as sns # data visualization library  
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

In [None]:
## Memory Reducer
def reduce_mem_usage(df, verbose=True):
    '''
    Downcast the numeric columns of df to the narrowest dtype whose range
    strictly contains the column's min and max.

    df      - pandas DataFrame; its columns are reassigned in place
    verbose - when True, print the resulting size and the reduction in %

    Only columns whose dtype is listed in `numerics` are touched (int8 is
    not in that list, so int8 columns are skipped). Integer columns try
    int8 -> int16 -> int32 -> int64; other numeric columns try
    float16 -> float32 and otherwise become float64.

    Returns the same DataFrame object.
    '''
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    bytes_per_mb = 1024**2

    start_mem = df.memory_usage().sum() / bytes_per_mb

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        lo = df[col].min()
        hi = df[col].max()

        if str(col_type)[:3] == 'int':
            # First integer width that strictly contains [lo, hi]; if none
            # does, the column is left as it is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if lo > limits.min and hi < limits.max:
                    df[col] = df[col].astype(candidate, copy=False)
                    break
        else:
            # First float width that strictly contains [lo, hi], with
            # float64 as the fallback.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if lo > limits.min and hi < limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target, copy=False)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
# Load the base grid from the pickle shipped with the input dataset
grid_df = pd.read_pickle('/kaggle/input/m5-simple-fe/grid_part_1.pkl')

# Fix data type: missing sales are replaced with 0 and stored as int32
grid_df.sales = grid_df.sales.fillna(0).astype(np.int32, copy=False)

# Add aggregation level column: every row loaded here is tagged level 11
# (the rolled-up frames built later are tagged 0..10)
grid_df['level']=11
grid_df.level = grid_df.level.astype(np.int16, copy=False)
grid_df.info()

# Roll Up Data

In [None]:
# Column combinations to aggregate over. roll_up() tags the frame built from
# the entry at position i with level i+1, so the order of this list matters.
group_ids = [
    ['state_id'],               # level 1
    ['store_id'],               # level 2
    ['cat_id'],                 # level 3
    ['dept_id'],                # level 4
    ['state_id', 'cat_id'],     # level 5
    ['state_id', 'dept_id'],    # level 6
    ['store_id', 'cat_id'],     # level 7
    ['store_id', 'dept_id'],    # level 8
    ['item_id'],                # level 9
    ['state_id', 'item_id'],    # level 10
]

def roll_up(df, groups):
    '''
    Aggregate daily sales of df for every column combination in groups.

    df     - DataFrame with a 'd' column, a 'sales' column and every
             column named in groups
    groups - list of lists of column names (one or two names each; only
             the first two names are used to build the id)

    For the i-th entry of groups, sales are summed per (entry columns, 'd').
    The resulting frame gets an 'id' column (the single key, or the two keys
    joined with '_') stored as category, and a 'level' column equal to i+1.
    Level 0 (the grand total) is not produced here.

    Returns the list of aggregated DataFrames, in the order of groups.
    '''
    t_list = []
    for i, g in enumerate(tqdm(groups)):
        # Aggregate the frame passed in as `df`, not a module-level frame,
        # so the function works on whatever the caller hands it.
        t = df[g+['d','sales']].groupby(g+['d']).sum().fillna(0).astype(np.int32)
        t.reset_index(inplace=True)

        if len(g)>1:
            t['id']=t[g[0]].astype(str)+'_'+t[g[1]].astype(str)
        else:
            t['id']=t[g[0]]
        t['id'] = t['id'].astype('category', copy=False)

        t['level'] = i+1
        t_list.append(t)

    return t_list

In [None]:
# Create aggregate for level 0: total sales per day over the whole frame,
# stored under the single id 'all'
level_0 = grid_df[['d','sales']].groupby(['d'])[['sales']].sum()
level_0.reset_index(inplace=True)
level_0['id'] = 'all'
level_0['level'] = 0

# Calculate remaining levels and concat them:
roll_list = [grid_df, level_0]
roll_list += roll_up(grid_df, group_ids)

# Stack the original rows and all aggregates into one frame and downcast
# its numeric columns; the index is rebuilt so every row has a unique label.
grid_df = reduce_mem_usage(pd.concat(roll_list, sort=False))
grid_df.reset_index(drop=True, inplace=True)
del roll_list, level_0

# free some memory with categorical vars:
cols=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id','level']
for col in cols:
    grid_df[col]=grid_df[col].astype('category', copy=False)

#free memory
gc.collect()

# Find Gaps

In [None]:
# Note: in the 'gaps' column, 1 marks a day without sales:
grid_df['gaps'] = (~(grid_df['sales'] > 0)).astype(np.int16)
# rows with d > 1941-28 are never counted as gap days
grid_df.loc[grid_df.d>(1941-28),'gaps'] = 0
total_days = 1942

prods = list(grid_df.id.unique())
e_list = [] # per-id Series of gap scores (run length divided by threshold d)
d_dict = {} # id -> run-length threshold d returned by gap_interval()
p_dict = {} # id -> estimated probability of a zero-sales day

# Run-length counter used with ufunc.accumulate: (x+y)*y extends the running
# count while y == 1 and resets it to 0 when y == 0. Built once, outside the loop.
accum_add_prod = np.frompyfunc(lambda x, y: int((x+y)*y), 2, 1)

# magic x8 speed booster thanks to @nadare
for prod_id, df in tqdm(grid_df.groupby("id")):
    # extract gap_series for a prod_id
    sales_gaps = df.loc[:,'gaps']

    # find and mark gaps: first turn the 0/1 flags into running counts
    # (builtin `object` / `np.nan` are used because the `np.object` and
    # `np.NaN` aliases no longer exist in current NumPy releases)
    sales_gaps[:] = accum_add_prod.accumulate(df["gaps"], dtype=object).astype(int)
    # Blank every position whose successor has a larger count, then back-fill,
    # so each day of a run carries the full length of that run.
    # NOTE(review): the sales day directly before a run also satisfies this
    # comparison (0 < 1), so it receives the run length as well - confirm
    # whether that is intended.
    sales_gaps[sales_gaps < sales_gaps.shift(-1)] = np.nan
    sales_gaps = sales_gaps.bfill().ffill()

    # calculate initial probability
    zero_days = (sales_gaps > 0).sum()

    # our dataset does not have series with 0 days, but its subsets may not have 0 days
    if zero_days == 0:
        e_list.append(sales_gaps)
        d_dict[prod_id] = 1
        p_dict[prod_id] = 0

    else:
        # small epsilon keeps p strictly positive
        p = zero_days/total_days+0.00001

        # Longest run still expected by chance, as computed by gap_interval():
        d = gap_interval(p, total_days)

        # cut out supply_gap days and re-estimate until p stops decreasing
        p1 = 0
        d1 = 0
        while p1 < p:

            # from the second pass on, adopt the estimate of the previous pass
            if p1!=0:
                p = p1
                d = d1

            # days inside runs at least as long as the current threshold;
            # change gap_interval() to your taste
            gap_days = (sales_gaps >= d).sum()

            p1 = (zero_days-gap_days+0.0001)/(total_days-gap_days)
            d1 = gap_interval(p1, total_days-gap_days)

        # add results to list it turns out masked replacement is a very expensive operation in pandas, so better do it in one go
        e_list.append(sales_gaps/d)
        d_dict[prod_id] = d
        p_dict[prod_id] = p

In [None]:
# build gap series in one go, sort index
# (each element of e_list keeps the row labels of grid_df, so the
# concatenated Series aligns with grid_df on assignment)
s = pd.concat(e_list)
s.sort_index(inplace=True)

# create features:
#   gap_2std          - gap score collected in e_list (run length / threshold d)
#   gap_interval_2std - per-id threshold d
#   prob_zero         - per-id probability of a zero-sales day
# NOTE(review): despite the `_2std` suffix, gap_interval() adds 3 standard
# deviations by default - confirm the naming.
grid_df['gap_2std'] = s
grid_df['gap_interval_2std'] = grid_df.id.map(d_dict)
grid_df['prob_zero']=grid_df.id.map(p_dict)

# optimize memory, which we will need for dumping pickle
grid_df=reduce_mem_usage(grid_df)

In [None]:
# Keep the gap score only on days with zero sales (0 everywhere else).
# Bracket assignment is required here: assigning to a new attribute name
# (grid_df.gap_2std2 = ...) does not create a DataFrame column, so the
# feature would be missing from the pickled frame.
grid_df['gap_2std2'] = grid_df.gap_2std * (grid_df['sales'] == 0).astype(np.int16)

In [None]:
# free memory: the concatenated Series and the per-id list are no longer
# needed once the features have been written into grid_df
del s, e_list
gc.collect()

In [None]:
# dump pickle: write the frame with the gap features to the working directory
grid_df.to_pickle('grid_part_1_agg.pkl')

# Gaps EDA
### Level 11 Gaps
Unit sales of product x, aggregated for each State

In [None]:
# Cap the gap score at 1 so every value at or above the threshold gets the
# same colour in the heatmaps below
m = grid_df['gap_2std']>=1
grid_df.loc[m,'gap_2std']=1

# take a subsample to visualize (fixed seed so the same items are drawn each run):
np.random.seed(19)
depts = list(grid_df.dept_id.dropna().unique())

# draw 5 item ids per department; np.random.choice samples with replacement
# by default, so a department may contribute fewer than 5 distinct items
prod_list = []
for d in depts:
    prod_by_dept=grid_df['item_id'][grid_df.dept_id == d].unique()
    prod_list += list(np.random.choice(prod_by_dept,5))
    
# keep every row (at any aggregation level) whose item_id was drawn
m = grid_df.item_id.isin(prod_list)
viz_df = grid_df[m]
viz_df.head()

### Level 10: Supply gaps by item_id aggregated over all stores:
Intuition: our algorithm will provide more confident predictions of supply-failure gaps at the top aggregation level. (The `level` column in this notebook is 0-based, so the cell below filters on `level == 9`.)

In [None]:
# Rows tagged level 9 (the ['item_id'] roll-up)
m = viz_df.level == 9
# One row per day, one column per id, cell value = capped gap score
v_df = viz_df.loc[m].pivot(index='d', columns='id', values='gap_2std')
# Order the columns by id so the plot layout is stable
v_df = v_df.reindex(sorted(v_df.columns), axis=1)
f, ax = plt.subplots(figsize=(15, 20))
temp = sns.heatmap(v_df, cmap='Reds')
plt.show()

### Level 11: Supply gaps by item_id aggregated by state:
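Intuition: our algorithm will provide slightly less confident predictions of supply-failure gaps here than at the top aggregation level. (The `level` column in this notebook is 0-based, so the cell below filters on `level == 10`.)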

In [None]:
# Rows tagged level 10 (the ['state_id', 'item_id'] roll-up)
m = viz_df.level == 10
# One row per day, one column per id, cell value = capped gap score
v_df = viz_df.loc[m].pivot(index='d', columns='id', values='gap_2std')
# Order the columns by id so the plot layout is stable
v_df = v_df.reindex(sorted(v_df.columns), axis=1)
f, ax = plt.subplots(figsize=(15, 20))
temp = sns.heatmap(v_df, cmap='Reds')
plt.show()

### Level 12: Supply gaps by item_id specific for a store:
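These gaps will include the gaps from the previous level plus store-specific supply failures. (The `level` column in this notebook is 0-based, so the cells below filter on `level == 11`.)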

In [None]:
# Rows tagged level 11 (the rows of the originally loaded grid)
m = viz_df.level == 11

# ids that occur at least once in viz_df, and the matching row mask;
# both m and n are consumed by the next cell
ids = list(viz_df.id.value_counts()[viz_df.id.value_counts()>0].index)
n = viz_df.id.isin(ids)

In [None]:
# Work on a copy of just the three columns needed for the plot
v_df = viz_df.loc[m&n, ['d','id','gap_2std']].copy()
# Cast the score to float32 before pivoting
v_df.gap_2std = v_df.gap_2std.astype(np.float32)
# One row per day, one column per id, cell value = capped gap score
v_df = v_df.pivot(index='d', columns='id', values='gap_2std')
# Order the columns by id so the plot layout is stable
v_df = v_df.reindex(sorted(v_df.columns), axis=1)
f, ax = plt.subplots(figsize=(15, 20))
temp = sns.heatmap(v_df, cmap='Reds')
plt.show()

In [None]:
# Finally, let's calculate the proportion of rows flagged as non-random gaps
# (gap score >= 1).
# As mentioned by @Amphi2 we should have dropped the last 28 days, so
# 42840*28 is subtracted from the count.
# Series.sum() counts the True values in one vectorized pass instead of
# iterating over every row with the builtin sum().
# NOTE(review): grid_df.shape[0] is the row count of the frame as it stands
# in this cell - confirm that this is the intended denominator.

((grid_df['gap_2std'] >= 1).sum() - 42840*28)/grid_df.shape[0]

 [Discussion here.](https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/138085#790628)