# PREDICT FUTURE SALES
# PART I: Feature Engineering

## Description
This challenge serves as the final project for the "How to win a data science competition" Coursera course.
In this competition you will work with a challenging time-series dataset consisting of daily sales data, kindly provided by one of the largest Russian software firms - 1C Company. 
We are asking you to predict total sales for every product and store in the next month. By solving this competition you will be able to apply and enhance your data science skills.

## Evaluation
Submissions are evaluated by root mean squared error (RMSE). True target values are clipped into [0,20] range.

## Reference

In [None]:
# useful links:
# https://pbpython.com/pandas-qcut-cut.html
# https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65
# https://towardsdatascience.com/categorical-encoding-using-label-encoding-and-one-hot-encoder-911ef77fb5bd
# https://towardsdatascience.com/better-heatmaps-and-correlation-matrix-plots-in-python-41445d0f2bec

# Setup

## Imports

In [None]:
import numpy as np 
import pandas as pd 
import os
from itertools import product
import seaborn as sns
import matplotlib as mpl
from matplotlib import pyplot as plt
import gc
import pickle
import re
from sklearn.preprocessing import LabelEncoder
# Configure seaborn globally, selecting the 'talk' plotting context.
sns.set(context='talk')

## Constants

In [None]:
# Directory that read_csv_from_input() joins with a file name to locate a CSV file.
PATH_TO_INPUT_FILES = '../input/competitive-data-science-predict-future-sales'
# Maps a regular expression to the dtype that transform_dtypes_in_df() applies
# to every column whose name matches it (matching is done with re.search).
# NOTE: 'avg_' and 'item_cnt_month' are not anchored, so they also match column
# names that merely contain them (e.g. names carrying a '_lag_1' suffix).
DTYPES_FOR_COLUMN_NAMES = pd.Series({
    r'avg_':'float16',
    r'^date_block_num$':'uint8',
    r'^item_id$':'uint16',
    r'^shop_id$':'uint8',
    r'item_cnt_month':'uint8',
    r'^item_id_bucket$':'uint8',
    r'^shop_city$':'uint8',
    r'^item_category_id$':'uint8',
    r'^item_category_group$':'uint8',
    r'^item_first_sold$':'uint8',
    r'^item_sold_before$':'uint8', 
    r'^item_sold_before_in_shop$':'uint8',
    r'^month$':'uint8',
    r'^year$':'uint16',
    r'^daysinmonth$':'uint8',
    r'^quarter$':'uint8',
})
# Matches every run of characters that is neither a Cyrillic letter in the
# ranges а-я / А-Я nor a digit; clean_word() deletes these runs.
# NOTE(review): the ranges do not contain 'ё'/'Ё' - confirm that removing those
# letters is intended.
RUSSIAN_REGEX_PATTERN = '[^а-яА-Я0-9]+'

## Helper functions

In [None]:
def read_csv_from_input(file_name):
    """Read ``file_name`` from the ``PATH_TO_INPUT_FILES`` directory as a DataFrame."""
    csv_path = os.path.join(PATH_TO_INPUT_FILES, file_name)
    return pd.read_csv(csv_path)

def print_df_info(df, name_df='DF'):
    """Print an overview of ``df``: info, statistics, NaN counts and 5 sampled rows.

    ``name_df`` is only used to label the printed sections. Nothing is returned.
    """
    rule = '-----------------'
    # Each report is produced lazily so that its banner is printed first.
    sections = (
        ('INFO', lambda: df.info()),
        ('STATISTICS', lambda: df.describe()),
        ('COUNT NAN VALUES', lambda: df.isna().sum()),
        ('SAMPLE', lambda: df.sample(5)),
    )
    print('DATAFRAME:', name_df)
    for title, build_report in sections:
        print(f'\n{rule}{name_df} {title}{rule}\n')
        print(build_report())

def drop_columns_in_df(df, cols):
    """Drop, in place, every column of ``df`` that is named in ``cols``.

    Names in ``cols`` that are not columns of ``df`` are ignored.
    """
    # dict.fromkeys removes repeated names while keeping their order.
    present = [name for name in dict.fromkeys(cols) if name in df.columns]
    if present:
        df.drop(columns=present, inplace=True)

def print_columns_in_df(df):
    """Print each column name of ``df`` on its own line as ``'name',``."""
    for quoted_name in map("'{}',".format, df.columns):
        print(quoted_name)

def transform_dtypes_in_df(df, col_dtype_pairs=DTYPES_FOR_COLUMN_NAMES):
    """Cast the columns of ``df`` whose names match a pattern to the paired dtype.

    ``col_dtype_pairs`` maps regular expressions to dtypes. Patterns are applied
    in order, so a column matching several patterns is cast once per match.
    ``df`` is modified in place and also returned.
    """
    for pattern, target_dtype in col_dtype_pairs.items():
        matcher = re.compile(pattern)
        matching_names = [name for name in df.columns if matcher.search(name)]
        for name in matching_names:
            df[name] = df[name].astype(target_dtype)
    return df

def rel_value_counts_in_df(df):
    """Return the value counts of ``df`` divided by its total length.

    The divisor is ``len(df)``, i.e. it also counts entries that
    ``value_counts`` leaves out of the result.
    """
    n_entries = len(df)
    counts = df.value_counts()
    return counts / n_entries

def merge_rare_cat_values(df_col, threshold=0.005, replace_by='other'):
    """Replace values whose relative frequency is below ``threshold``.

    Returns a new column in which every such value is set to ``replace_by``.
    """
    shares = rel_value_counts_in_df(df_col)
    is_rare = shares < threshold
    rare_levels = shares[is_rare].index
    return df_col.replace(rare_levels, replace_by)

def clean_word(word):
    """Return ``word`` with every match of ``RUSSIAN_REGEX_PATTERN`` removed."""
    unwanted = re.compile(RUSSIAN_REGEX_PATTERN)
    return unwanted.sub('', word)

def transform_column(df_column, dtype):
    """Return ``df_column`` cast to ``dtype``, or unchanged when ``dtype`` is falsy."""
    return df_column.astype(dtype) if dtype else df_column

def fillna_column(df_column, fillna_value):
    """Return ``df_column`` with NaNs replaced by ``fillna_value``.

    ``None`` means "do not fill": the column is returned unchanged.
    """
    if fillna_value is None:
        return df_column
    return df_column.fillna(fillna_value)

def get_column_name_from_merge_cols(merge_cols, column_name=''):
    """Return ``column_name`` followed by the ``merge_cols`` joined with ``'_'``."""
    suffix = '_'.join(merge_cols)
    return column_name + suffix

def get_buckets_from_continuous_feature(df_col, n_bins):
    '''
    Bin a continuous feature into ``n_bins`` buckets with ``pd.cut``.

    The buckets are labelled ``bin_0`` ... ``bin_{n_bins - 1}``. ``pd.cut`` is
    called with an integer bin count; ``pd.qcut`` would be the alternative for
    buckets holding an equal number of items.
    '''
    bucket_labels = [f'bin_{position}' for position in range(n_bins)]
    buckets = pd.cut(df_col, bins=n_bins, labels=bucket_labels)
    return buckets

def _pick_integer_dtype(lowest, highest):
    """Return the narrowest integer dtype for values in ``[lowest, highest]``.

    The bounds are compared strictly, so a value equal to a type limit selects
    the next wider type. Returns ``None`` when no signed type qualifies.
    """
    if lowest >= 0:
        for candidate in (np.uint8, np.uint16, np.uint32):
            if highest < np.iinfo(candidate).max:
                return candidate
        return np.uint64
    for candidate in (np.int8, np.int16, np.int32, np.int64):
        limits = np.iinfo(candidate)
        if lowest > limits.min and highest < limits.max:
            return candidate
    return None


# The following function is mainly copied from
# https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65
def reduce_mem_usage(props, float_type=np.float32, verbose=0):
    """Downcast the non-object columns of ``props`` to smaller dtypes.

    Columns holding only whole numbers get the narrowest fitting (un)signed
    integer dtype; all other processed columns are cast to ``float_type``.
    Missing values are replaced by ``min - 1`` of the column first, because
    integer dtypes cannot hold NaN.

    Parameters
    ----------
    props : pd.DataFrame
        Frame to shrink. It is modified in place.
    float_type : dtype
        Target dtype for columns that are not whole-numbered.
    verbose : int or bool
        When truthy, print the dtypes and the memory usage before and after.

    Returns
    -------
    pd.DataFrame
        The same ``props`` object.
    """
    if verbose:
        start_mem_usg = props.memory_usage().sum() / 1024**2
        print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    for col in props.columns:
        if props[col].dtype == object:  # Exclude strings
            continue

        if verbose:
            # Print current column type
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",props[col].dtype)

        values = props[col]
        mn = values.min()
        mx = values.max()

        # Integer does not support NA, therefore, NA needs to be filled.
        # The sentinel becomes the new minimum, so ``mn`` has to follow it;
        # otherwise a non-negative column would get an unsigned dtype that
        # cannot represent the sentinel.
        if values.isna().any() and np.isfinite(mn):
            mn = mn - 1
            values = values.fillna(mn)
            props[col] = values

        # A column is whole-numbered only if every single value survives the
        # round trip through int64. (Summing the differences would let values
        # such as 0.5 and -0.5 cancel each other out.)
        is_int = False
        if np.isfinite(values).all():
            is_int = bool((values == values.astype(np.int64)).all())

        # Columns with remaining non-finite values fall through to float_type.
        target_dtype = _pick_integer_dtype(mn, mx) if is_int else float_type
        if target_dtype is not None:
            props[col] = values.astype(target_dtype)

        if verbose:
            # Print new column type
            print("dtype after: ",props[col].dtype)
            print("******************************")
    if verbose:
        # Print final result
        print("___MEMORY USAGE AFTER COMPLETION:___")
        mem_usg = props.memory_usage().sum() / 1024**2
        print("Memory usage is: ",mem_usg," MB")
        print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props

## Visualize the item_price and item_cnt_day

In [None]:
def plot_item_price_cnt_day(df_sales):
    """Draw box plots of ``item_price`` and ``item_cnt_day`` side by side.

    The data is passed through the ``x`` keyword: what a positional first
    argument of ``sns.boxplot`` means depends on the seaborn version.
    """
    _, (ax_price, ax_cnt_day) = plt.subplots(ncols=2, figsize=(18, 5))
    sns.boxplot(x=df_sales['item_price'], ax=ax_price)
    sns.boxplot(x=df_sales['item_cnt_day'], ax=ax_cnt_day)

## Delete outliers in training data

In [None]:
def delete_outliers_from_quantiles(df_sales):
    '''
    Return a copy of df_sales without item_price / item_cnt_day outliers.

    A row is kept when both its item_price and its item_cnt_day lie between
    the 0.5 % and the 99.5 % quantile of the respective column (bounds included).
    '''
    quantile_range = [0.005, 0.995]
    price = df_sales['item_price']
    cnt_day = df_sales['item_cnt_day']
    price_low, price_high = price.quantile(quantile_range)
    cnt_low, cnt_high = cnt_day.quantile(quantile_range)

    keep_row = price.between(price_low, price_high) & cnt_day.between(cnt_low, cnt_high)
    return df_sales[keep_row].copy()

def delete_outliers_from_absolute_values(df_sales):
    """Return a copy of ``df_sales`` restricted to fixed value ranges.

    Kept rows satisfy ``0 < item_price < 100000`` and ``item_cnt_day < 1001``.
    """
    price = df_sales['item_price']
    valid_price = (price < 100000) & (price > 0)
    valid_cnt = df_sales['item_cnt_day'] < 1001
    return df_sales[valid_price & valid_cnt].copy()

# Get monthly data

## Create shop-item grid from train set

Up to now, only sold products are tracked in the training data set. Therefore, one of two possible data augmentation strategies can be followed:
1. each shop should contain all items that are sold in that specific month, i.e. the product portfolio changes every month
2. each shop should contain all items that are sold in the whole period, i.e. the product portfolio is constant

Next, we are following the first approach:

In [None]:
def create_shop_item_grid(df_sales):
    '''
    Build the monthly shop-item grid with the summed item counts.

    For every date_block_num the grid holds all combinations of the items and
    the shops that occur in df_sales in that month. Each grid row receives the
    sum of item_cnt_day as item_cnt_month; combinations without sales get 0.
    '''
    merge_cols = ['date_block_num', 'item_id', 'shop_id']

    # One block of (month, item, shop) combinations per month.
    monthly_blocks = []
    for block in np.sort(df_sales['date_block_num'].unique()):
        in_block = df_sales[df_sales['date_block_num'] == block]
        combinations = product(
            [block],
            in_block['item_id'].unique(),
            in_block['shop_id'].unique(),
        )
        monthly_blocks.append(np.array(list(combinations)))

    grid = pd.DataFrame(np.vstack(monthly_blocks), columns=merge_cols, dtype=np.int16)
    grid.sort_values(merge_cols, inplace=True)

    # Monthly totals per (month, item, shop).
    monthly_cnt = df_sales.groupby(merge_cols).agg({'item_cnt_day': 'sum'})
    monthly_cnt = monthly_cnt.rename(columns={'item_cnt_day': 'item_cnt_month'}).reset_index()

    # Attach the totals; grid rows without any sale stay NaN and become 0.
    grid = pd.merge(grid, monthly_cnt, on=merge_cols, how='left')
    grid['item_cnt_month'] = grid['item_cnt_month'].fillna(0)
    return grid

## Add unlabeled data to grid

In [None]:
def concat_with_unlabeled_data(matrix, df_unlabeled, date_block_num=None):
    """Append the unlabeled rows to ``matrix`` and return the combined frame.

    Parameters
    ----------
    matrix : pd.DataFrame
        Labeled grid.
    df_unlabeled : pd.DataFrame
        Rows to append. The frame itself is left untouched.
    date_block_num : int, optional
        When given, it is written to the ``date_block_num`` column of the
        appended rows.

    Returns
    -------
    pd.DataFrame
        ``matrix`` followed by the unlabeled rows with a fresh 0..n-1 index.
        Appended rows get ``item_cnt_month = 0.0`` unless the unlabeled frame
        already has that column.
    """
    extra_columns = {}
    if date_block_num is not None:
        extra_columns['date_block_num'] = date_block_num
    if 'item_cnt_month' not in df_unlabeled.columns:
        extra_columns['item_cnt_month'] = 0.0
    # assign() works on a copy, so the caller's frame gains no columns.
    unlabeled = df_unlabeled.assign(**extra_columns)
    return pd.concat([matrix, unlabeled], ignore_index=True, sort=False)

# Additional feature engineering

## Add date-based features

In [None]:
def merge_with_date_features(matrix, df_sales, n_periods=100):
    """Merge calendar features of each ``date_block_num`` into ``matrix``.

    Block 0 is the month of the earliest date in ``df_sales``; block ``k`` is
    the month ``k`` months later. The added columns are ``month``, ``year``,
    ``daysinmonth`` and ``quarter``.

    Parameters
    ----------
    matrix : pd.DataFrame
        Frame with a ``date_block_num`` column.
    df_sales : pd.DataFrame
        Sales records with a ``date`` column (``%d.%m.%Y`` strings or datetimes).
    n_periods : int
        Number of consecutive months to generate calendar rows for.

    Returns
    -------
    pd.DataFrame
        ``matrix`` left-joined with the calendar features.
    """
    # Only the earliest date is needed, so the sales frame is not copied.
    first_day = pd.to_datetime(df_sales['date'], format='%d.%m.%Y').min()
    # Monthly periods avoid the month-end frequency alias of date_range.
    months = pd.period_range(start=first_day, periods=n_periods, freq='M')
    df_time = pd.DataFrame({
        'date_block_num': np.arange(n_periods),
        'month': months.month,
        'year': months.year,
        'daysinmonth': months.days_in_month,
        'quarter': months.quarter,
    })
    df_time = reduce_mem_usage(df_time, float_type=np.float16, verbose=0)
    return pd.merge(matrix, df_time, on='date_block_num', how='left')

## Add item-based features

In [None]:
def merge_with_item_features(matrix, df_items, df_item_cat):
    """Merge the category id and category group of each item into ``matrix``.

    The category group is the part of ``item_category_name`` in front of the
    first ``'-'``. The name columns themselves are not merged.
    """
    item_info = pd.merge(df_items, df_item_cat, on='item_category_id', how='left')
    name_parts = item_info['item_category_name'].str.split('-')
    item_info['item_category_group'] = name_parts.map(lambda parts: parts[0])
    drop_columns_in_df(item_info, ['item_category_name', 'item_name'])
    return pd.merge(matrix, item_info, on=['item_id'], how='left')

## Add shop-based features 

In [None]:
def remove_duplicated_shops(df, duplicated_shops):
    """Collapse duplicated shops onto a single ``shop_id``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``shop_id`` column. It is modified in place.
    duplicated_shops : iterable of sequences
        Each element lists the shop ids that denote the same shop. Every id of
        a group is replaced by the last id of that group. For a pair
        ``(a, b)`` this maps ``a`` to ``b``. Groups with fewer than two ids
        are skipped.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object.
    """
    for shop_ids in duplicated_shops:
        shop_ids = list(shop_ids)
        if len(shop_ids) < 2:
            continue
        *aliases, canonical_id = shop_ids
        df.loc[df['shop_id'].isin(aliases), 'shop_id'] = canonical_id
    return df

In [None]:
def plot_num_words_in_shop_name(df_shops):
    """Plot a histogram of the number of space-separated words per ``shop_name``."""
    words_per_name = df_shops['shop_name'].str.split(" ").map(len)
    n_words = words_per_name.astype('category')
    fig, ax = plt.subplots()
    ax = n_words.hist()
    ax.set_xlabel('number of words in shop_name')
    # One tick per distinct word count.
    plt.xticks(n_words.cat.categories)

In [None]:
def merge_with_shop_features(matrix, df_shops):
    """Merge the ``shop_city`` feature of each shop into ``matrix``.

    ``shop_city`` is the first space-separated word of ``shop_name``,
    lower-cased and passed through ``clean_word``. ``shop_name`` itself is not
    merged.

    The features are built on a copy, so ``df_shops`` keeps its ``shop_name``
    column and the function can be called repeatedly with the same frame.
    """
    shop_features = df_shops.copy()
    first_words = shop_features['shop_name'].str.split(" ").map(lambda words: words[0].lower())
    shop_features['shop_city'] = first_words.apply(clean_word)
    drop_columns_in_df(shop_features, ['shop_name'])
    return pd.merge(matrix, shop_features, on=['shop_id'], how='left')

In [None]:
def get_duplicated_shops(df_shops, n_extra_info=1):
    '''
    Return the groups of shop_ids whose shop_name starts with the same words.

    The first 2 + n_extra_info space-separated words of shop_name are
    lower-cased and cleaned with clean_word (a missing word counts as empty).
    They are stored as shop_city, shop_category and extra_info_<n>. Shops that
    agree on all of these columns form one group. The result is a Series,
    indexed by these columns, whose values are the lists of shop_ids.
    '''
    df = df_shops.copy()
    key_cols = ['shop_city', 'shop_category']
    key_cols += [f'extra_info_{n}' for n in range(n_extra_info)]

    name_words = df['shop_name'].str.split(' ')
    for position, key_col in enumerate(key_cols):
        # ``i=position`` pins the word position for this column.
        word_at_position = name_words.map(
            lambda words, i=position: words[i].lower() if (len(words) > i) else ' '
        )
        df[key_col] = word_at_position.apply(clean_word)

    is_duplicated = df.duplicated(keep=False, subset=key_cols)
    duplicated_shops = df[is_duplicated].groupby(key_cols)['shop_id'].apply(list)
    return duplicated_shops

## Add first-sold features

In [None]:
def merge_with_first_sold_features(matrix):
    """Add flags that relate each row to the first sale of its item.

    A sale is a row with ``item_cnt_month > 0``. The added int8 columns are:

    * ``item_first_sold`` - 1 if the row's month is the first month in which
      the item was sold in any shop.
    * ``item_sold_before`` - 1 if the item was sold in any shop in an earlier
      month.
    * ``item_sold_before_in_shop`` - 1 if the item was sold in the row's shop
      in an earlier month.

    Items (or shop-item pairs) without any sale get 0 in the respective flags.
    A new frame is returned; ``matrix`` itself is not modified.
    """
    first_item_col = 'date_block_num_item_first_sold'
    first_pair_col = 'date_block_num_item_first_sold_in_shop'

    sold = matrix.query('item_cnt_month > 0')
    first_by_item = sold \
        .groupby('item_id') \
        .agg({'date_block_num': 'min'}) \
        .rename(columns={'date_block_num': first_item_col}) \
        .reset_index()
    first_by_pair = sold \
        .groupby(['shop_id', 'item_id']) \
        .agg({'date_block_num': 'min'}) \
        .rename(columns={'date_block_num': first_pair_col}) \
        .reset_index()

    matrix = pd.merge(matrix, first_by_item, on=['item_id'], how='left')
    matrix = pd.merge(matrix, first_by_pair, on=['shop_id', 'item_id'], how='left')

    # Rows without a first-sale month hold NaN there; every comparison with
    # NaN is False, which yields the flag value 0.
    block = matrix['date_block_num']
    matrix['item_first_sold'] = (block == matrix[first_item_col]).astype(np.int8)
    matrix['item_sold_before'] = (block > matrix[first_item_col]).astype(np.int8)
    matrix['item_sold_before_in_shop'] = (block > matrix[first_pair_col]).astype(np.int8)
    return matrix.drop(columns=[first_item_col, first_pair_col])

## Add price-related features


In [None]:
class PriceRelatedFeatures(object):
    """Builds price features for the grid from a frame of sales records."""

    def __init__(self, df_sales):
        # Sales records; every average price is computed from their item_price.
        self.df_sales = df_sales

    def merge_with_avg_price_wrt_merge_cols(self, matrix, merge_cols,
                                            dtype=None, fillna_value=None):
        '''
        Merge the mean item_price per merge_cols group into matrix.

        The new column is named 'avg_price_wrt_' + the joined merge_cols.
        It is optionally NaN-filled with fillna_value and cast to dtype.
        '''
        avg_col = get_column_name_from_merge_cols(merge_cols, column_name='avg_price_wrt_')
        avg_price = self.df_sales.groupby(merge_cols).agg({'item_price': 'mean'})
        avg_price = avg_price.rename(columns={'item_price': avg_col}).reset_index()
        matrix = pd.merge(matrix, avg_price, on=merge_cols, how='left')
        filled = fillna_column(matrix[avg_col], fillna_value)
        matrix[avg_col] = transform_column(filled, dtype)
        return matrix

    def merge_with_rel_price_wrt_merge_cols(self, matrix, merge_cols_1, merge_cols_2,
                                            dtype=None, fillna_value=None):
        '''
        Add the relative average price (col_1 - col_2) / col_2 to matrix.

        col_1 and col_2 are the 'avg_price_wrt_' columns of merge_cols_1 and
        merge_cols_2; both have to be present in matrix already. The result is
        stored as 'rel_avg_price_' + the joined merge_cols_2.
        '''
        col_1, col_2 = (
            get_column_name_from_merge_cols(cols, column_name='avg_price_wrt_')
            for cols in (merge_cols_1, merge_cols_2)
        )
        col_3 = get_column_name_from_merge_cols(merge_cols_2, column_name='rel_avg_price_')
        relative_price = matrix.eval(f'({col_1} - {col_2}) / {col_2}')
        matrix[col_3] = relative_price
        matrix[col_3] = transform_column(fillna_column(matrix[col_3], fillna_value), dtype)
        return matrix

    def merge_with_rel_avg_price_lag(self, matrix, merge_cols_1, merge_cols_2, lags=[1,2,3]):
        '''
        Add the lags of the relative average price as float16 columns.

        Both average prices and the relative price are only intermediate
        columns; they are dropped again, so just the lag columns remain.
        '''
        avg_col_1 = get_column_name_from_merge_cols(merge_cols_1, 'avg_price_wrt_')
        avg_col_2 = get_column_name_from_merge_cols(merge_cols_2, 'avg_price_wrt_')
        rel_col = get_column_name_from_merge_cols(merge_cols_2, 'rel_avg_price_')

        for cols in (merge_cols_1, merge_cols_2):
            matrix = self.merge_with_avg_price_wrt_merge_cols(
                matrix, merge_cols=cols, dtype=np.float16, fillna_value=None)

        matrix = self.merge_with_rel_price_wrt_merge_cols(
            matrix, merge_cols_1=merge_cols_1, merge_cols_2=merge_cols_2,
            dtype=np.float16, fillna_value=0)

        matrix = lag_feature(matrix, lags, rel_col, dtype=np.float16, fillna_value=0)
        drop_columns_in_df(matrix, [avg_col_1, avg_col_2, rel_col])
        return matrix

## Add revenue-related features

In [None]:
class RevenueRelatedFeatures(object):
    """Builds shop revenue features for the grid from a frame of sales records."""

    def __init__(self, df_sales):
        self.df_sales = df_sales
        # NOTE: df_sales is not copied, so the 'revenue' column is also added
        # to the frame the caller passed in.
        self.df_sales['revenue'] = self.df_sales.eval('item_price * item_cnt_day')

    def merge_with_tot_revenue_wrt_date_block_num_shop_id(self, matrix,
                                                          dtype=None, fillna_value=None):
        '''
        Merge the summed revenue per (date_block_num, shop_id) into matrix.

        The new column 'tot_revenue_wrt_date_block_num_shop_id' is optionally
        NaN-filled with fillna_value and cast to dtype.
        '''
        merge_cols = ['date_block_num', 'shop_id']
        col = 'tot_revenue_wrt_date_block_num_shop_id'
        group = self.df_sales \
                .groupby(merge_cols) \
                .agg({'revenue': 'sum'}) \
                .rename(columns={'revenue': col}) \
                .reset_index()
        matrix = pd.merge(matrix, group, on=merge_cols, how='left')
        matrix[col] = fillna_column(matrix[col], fillna_value)
        matrix[col] = transform_column(matrix[col], dtype)
        return matrix

    def merge_with_avg_tot_revenue_wrt_shop_id(self, matrix,
                                               dtype=None, fillna_value=None,
                                               drop_tot_revenue=True):
        '''
        Merge the mean monthly revenue per shop_id into matrix.

        The monthly totals are merged first when matrix does not hold them yet.
        The mean is taken over the rows of matrix, so each month enters with
        the number of grid rows the shop has in that month. With
        drop_tot_revenue the monthly totals are removed again afterwards.
        '''
        tot_col = 'tot_revenue_wrt_date_block_num_shop_id'
        if tot_col not in matrix.columns:
            matrix = self.merge_with_tot_revenue_wrt_date_block_num_shop_id(matrix, dtype, fillna_value)
        col = 'avg_tot_revenue_wrt_shop_id'
        merge_cols = ['shop_id']
        group = matrix \
            .groupby(merge_cols) \
            .agg({tot_col: 'mean'}) \
            .rename(columns={tot_col: col}) \
            .reset_index()
        matrix = pd.merge(matrix, group, on=merge_cols, how='left')
        matrix[col] = fillna_column(matrix[col], fillna_value)
        matrix[col] = transform_column(matrix[col], dtype)
        if drop_tot_revenue:
            drop_columns_in_df(matrix, [tot_col])
        return matrix

    def merge_with_rel_shop_revenue(self, matrix,
                                    dtype=None, fillna_value=None):
        '''
        col_1 = 'tot_revenue_wrt_date_block_num_shop_id'
        col_2 = 'avg_tot_revenue_wrt_shop_id'
        Relative shop revenue: col_3 = (col_1 - col_2) / col_2

        Both input columns have to be present in matrix. The result is stored
        as 'rel_avg_tot_revenue_wrt_shop_id'.
        '''
        col_1 = 'tot_revenue_wrt_date_block_num_shop_id'
        col_2 = 'avg_tot_revenue_wrt_shop_id'
        col_3 = 'rel_avg_tot_revenue_wrt_shop_id'
        matrix[col_3] = matrix.eval(f'({col_1} - {col_2}) / {col_2}')
        matrix[col_3] = fillna_column(matrix[col_3], fillna_value)
        matrix[col_3] = transform_column(matrix[col_3], dtype)
        return matrix

    def merge_with_rel_avg_tot_shop_revenue_lag(self, matrix, lags=[1,2,3]):
        '''
        Add the lags of the relative shop revenue as float16 columns.

        The monthly totals, their per-shop mean and the relative revenue are
        only intermediate columns and are dropped again.
        '''
        col_1 = 'tot_revenue_wrt_date_block_num_shop_id'
        col_2 = 'avg_tot_revenue_wrt_shop_id'
        col_3 = 'rel_avg_tot_revenue_wrt_shop_id'
        # The revenue totals are kept in float32: float16 cannot represent
        # values above 65504, so larger totals would turn into inf and the
        # relative revenue into NaN. Only the ratio itself is stored as float16.
        matrix = self.merge_with_avg_tot_revenue_wrt_shop_id(matrix,
                                                             dtype=np.float32, fillna_value=None,
                                                             drop_tot_revenue=False)
        matrix = self.merge_with_rel_shop_revenue(matrix, dtype=np.float16, fillna_value=0)
        matrix = lag_feature(matrix, lags, col_3,
                             dtype=np.float16, fillna_value=0)
        drop_columns_in_df(matrix, [col_1, col_2, col_3])
        return matrix

## Add monthly item count lags

In [None]:
def merge_with_avg_item_cnt_wrt_merge_cols(matrix, merge_cols,
                                           dtype=None, fillna_value=None):
    '''
    Merge the mean item_cnt_month per merge_cols group into matrix.

    The new column is named 'avg_item_cnt_wrt_' + the joined merge_cols.
    It is optionally NaN-filled with fillna_value and cast to dtype.
    '''
    avg_col = get_column_name_from_merge_cols(merge_cols, column_name='avg_item_cnt_wrt_')
    avg_cnt = matrix.groupby(merge_cols).agg({'item_cnt_month': 'mean'})
    avg_cnt = avg_cnt.rename(columns={'item_cnt_month': avg_col}).reset_index()
    matrix = pd.merge(matrix, avg_cnt, on=merge_cols, how='left')
    filled = fillna_column(matrix[avg_col], fillna_value)
    matrix[avg_col] = transform_column(filled, dtype)
    return matrix

def merge_with_avg_item_cnt_lag(matrix, merge_cols, lags=[1,2,3]):
    """Add the lags of the mean ``item_cnt_month`` per ``merge_cols`` group.

    The un-lagged average is only an intermediate column and is dropped again;
    the lag columns are float16 with missing values set to 0.
    """
    avg_col = get_column_name_from_merge_cols(merge_cols, 'avg_item_cnt_wrt_')
    matrix = merge_with_avg_item_cnt_wrt_merge_cols(
        matrix, merge_cols=merge_cols, dtype=np.float16, fillna_value=None)
    matrix = lag_feature(matrix, lags, avg_col, dtype=np.float16, fillna_value=0)
    drop_columns_in_df(matrix, [avg_col])
    return matrix

## Add item count for new items

In [None]:
def merge_with_avg_item_cnt_of_new_items_wrt_merge_cols(matrix, merge_cols, 
                                                        dtype=None, fillna_value=None):
    '''
    Merge the mean item_cnt_month of new items per merge_cols group into matrix.

    Only rows with item_first_sold == 1 enter the mean. The new column is
    named 'avg_item_cnt_new_wrt_' + the joined merge_cols. It is optionally
    NaN-filled with fillna_value and cast to dtype.
    '''
    avg_col = get_column_name_from_merge_cols(merge_cols, column_name='avg_item_cnt_new_wrt_')
    new_items = matrix[matrix['item_first_sold'] == 1]
    avg_cnt = new_items.groupby(merge_cols).agg({'item_cnt_month': 'mean'})
    avg_cnt = avg_cnt.rename(columns={'item_cnt_month': avg_col}).reset_index()
    matrix = pd.merge(matrix, avg_cnt, on=merge_cols, how='left')
    filled = fillna_column(matrix[avg_col], fillna_value)
    matrix[avg_col] = transform_column(filled, dtype)
    return matrix

def merge_with_avg_item_cnt_of_new_items_lag(matrix, merge_cols, lags=[1,2,3]):
    """Add the lags of the mean ``item_cnt_month`` of new items per group.

    The un-lagged average is only an intermediate column and is dropped again;
    the lag columns are float16 with missing values set to 0.
    """
    avg_col = get_column_name_from_merge_cols(merge_cols, 'avg_item_cnt_new_wrt_')
    matrix = merge_with_avg_item_cnt_of_new_items_wrt_merge_cols(
        matrix, merge_cols=merge_cols, dtype=np.float16, fillna_value=None)
    matrix = lag_feature(matrix, lags, avg_col, dtype=np.float16, fillna_value=0)
    drop_columns_in_df(matrix, [avg_col])
    return matrix

## Lag functions

In [None]:
def lag_feature(df, lags, col,
                merge_cols=['date_block_num','shop_id','item_id'],
                dtype=None, fillna_value=None):
    '''
    Merge lagged copies of col into df and return the result.

    For every lag a column '<col>_lag_<lag>' is added. A row receives the
    value that col has in the row with the same merge_cols but a
    date_block_num that is lag smaller. Each new column is optionally
    NaN-filled with fillna_value and cast to dtype.
    '''
    base = df[merge_cols + [col]].copy()
    for lag in lags:
        lagged_name = f'{col}_lag_{lag}'
        lagged = base.copy()
        # Relabel month m as m + lag, so the merge pairs it with the later month.
        lagged['date_block_num'] = lagged['date_block_num'].map(lambda block: block + lag)
        lagged = lagged.rename(columns={col: lagged_name})
        df = pd.merge(df, lagged, on=merge_cols, how='left')
        filled = fillna_column(df[lagged_name], fillna_value)
        df[lagged_name] = transform_column(filled, dtype)
    return df

In [None]:
def lag_avg_feature(df, lag_delta, col,
                    merge_cols=['date_block_num','shop_id','item_id'],
                    dtype=None, fillna_value=None):
    '''
    Merge the row-wise mean of the lags of col into df and return the result.

    The lags -lag_delta ... +lag_delta (including 0) of col are computed with
    respect to date_block_num, averaged per row and stored as 'lag_avg_<col>'.
    A negative lag takes the value from a later month. The individual lag
    columns are only intermediate and are dropped again. The averaged column
    is optionally NaN-filled with fillna_value and cast to dtype.
    '''
    tmp = df[merge_cols + [col]].copy()
    index_col = 'date_block_num'
    lag_avg_col = f'lag_avg_{col}'
    # Only the columns created here are averaged and dropped, so other lag
    # columns of col that df already holds are left alone.
    lag_cols = []
    for lag in range(-lag_delta, lag_delta + 1):
        shifted = tmp.copy()
        col_lag = f'{col}_lag_{lag}'
        shifted[index_col] = shifted[index_col].map(lambda x: x+lag)
        shifted.rename(columns={col: col_lag}, inplace=True)
        df = pd.merge(df, shifted, on=merge_cols, how='left')
        df[col_lag] = transform_column(df[col_lag], dtype)
        lag_cols.append(col_lag)
    df[lag_avg_col] = df[lag_cols].mean(axis=1)
    df[lag_avg_col] = fillna_column(df[lag_avg_col], fillna_value)
    df[lag_avg_col] = transform_column(df[lag_avg_col], dtype)
    drop_columns_in_df(df, lag_cols)
    return df

In [None]:
# Advanced lag feature motivated by the Kaggle kernel:
# https://www.kaggle.com/uladzimirkapeika/feature-engineering-lightgbm-top-1
def lag_feature_adv(df, lags, col,
                    shift=-1,
                    merge_cols=['date_block_num','shop_id','item_id'],
                    dtype=None, fillna_value=None):
    '''
    Merge lags of col taken from a neighboring item_id into df.

    For every lag a column '<col>_adv_lag_<lag>' is added. A row receives the
    value that col has in the row whose date_block_num is lag smaller and
    whose item_id is smaller by shift (shift=-1: the item with the next
    higher id). Each new column is optionally NaN-filled with fillna_value
    and cast to dtype.
    '''
    neighbour = df[merge_cols + [col]].copy()
    # Relabel every item as item_id + shift, so the merge pairs neighbors.
    neighbour['item_id'] = neighbour['item_id'].map(lambda item: item + shift)
    for lag in lags:
        lagged_name = f'{col}_adv_lag_{lag}'
        lagged = neighbour.copy()
        lagged['date_block_num'] = lagged['date_block_num'].map(lambda block: block + lag)
        lagged = lagged.rename(columns={col: lagged_name})
        df = pd.merge(df, lagged, on=merge_cols, how='left')
        filled = fillna_column(df[lagged_name], fillna_value)
        df[lagged_name] = transform_column(filled, dtype)
    return df

# Execute functions

## Preprocessing

In [None]:
# import csv files
df_sales = read_csv_from_input('sales_train.csv')
df_item_cat = read_csv_from_input('item_categories.csv')
df_items = read_csv_from_input('items.csv')
df_shops = read_csv_from_input('shops.csv')
df_test = read_csv_from_input('test.csv')

# preprocessing
df_sales['date'] = pd.to_datetime(df_sales['date'], format='%d.%m.%Y')
drop_columns_in_df(df_test, ['ID'])

# remove duplicated shops
duplicated_shops = get_duplicated_shops(df_shops, n_extra_info=1)
df_sales = remove_duplicated_shops(df_sales, duplicated_shops)
df_test = remove_duplicated_shops(df_test, duplicated_shops)

# remove outliers 
df_sales = delete_outliers_from_quantiles(df_sales)

# monthly shop-item pairs
matrix = create_shop_item_grid(df_sales)

# add test data
matrix = concat_with_unlabeled_data(matrix, df_test, date_block_num=34)

# add bucket for item_id
matrix['item_id_bucket'] = get_buckets_from_continuous_feature(matrix['item_id'], n_bins=25).astype('object')

## Basic feature engineering

In [None]:
# shop features
matrix = merge_with_shop_features(matrix, df_shops)

# item features
matrix = merge_with_item_features(matrix, df_items, df_item_cat)
# Category groups that cover less than 0.5 % of the rows are merged into 'other'.
matrix['item_category_group'] = merge_rare_cat_values(matrix['item_category_group'], threshold=0.005, replace_by='other')

# categorical feature encoding
# One LabelEncoder per categorical column; the fitted encoders are kept in
# `encoder` under the column name.
encoder = {}
for cat_feature in ['shop_city', 'item_category_group', 'item_id_bucket']:
    encoder[cat_feature] = LabelEncoder()
    matrix[cat_feature] = encoder[cat_feature].fit_transform(matrix[cat_feature]).astype(np.int32)

# first-sold features
matrix = merge_with_first_sold_features(matrix)

# date features
matrix = merge_with_date_features(matrix, df_sales)

# reduce size
# Casts the columns matched by DTYPES_FOR_COLUMN_NAMES to their compact dtypes.
matrix = transform_dtypes_in_df(matrix)

## Advanced feature engineering
### Price- and revenue related features

In [None]:
# price-related features
# Each pair (m_1, m_2) yields the lags of the relative price
# (avg price per m_1 - avg price per m_2) / avg price per m_2.
rel_avg_price_merge_cols_1 = [
    ['date_block_num', 'item_id'],
    ['date_block_num', 'item_category_id'],
]
rel_avg_price_merge_cols_2 = [
    ['item_id'],
    ['item_category_id'],
]
# The sales records are enriched with item_category_id so that prices can be
# averaged per category as well.
priceRelatedFeatures = PriceRelatedFeatures(merge_with_item_features(df_sales, df_items, df_item_cat))
for (m_1, m_2) in zip(rel_avg_price_merge_cols_1, rel_avg_price_merge_cols_2):
    matrix = priceRelatedFeatures.merge_with_rel_avg_price_lag(matrix,
                                                               merge_cols_1=m_1,
                                                               merge_cols_2=m_2,
                                                               lags=[1,2,3])
matrix = priceRelatedFeatures.merge_with_avg_price_wrt_merge_cols(matrix,
                                                                  ['item_id'],
                                                                  dtype=np.float16)
matrix = priceRelatedFeatures.merge_with_avg_price_wrt_merge_cols(matrix,
                                                                  ['item_category_id'],
                                                                  dtype=np.float16)
# Items without a price of their own fall back to the average price of their
# category (not to the category id).
matrix['avg_price_wrt_item_id'] = matrix['avg_price_wrt_item_id'] \
    .combine_first(matrix['avg_price_wrt_item_category_id'])

# revenue-related features
revenueRelatedFeatures = RevenueRelatedFeatures(df_sales)
matrix = revenueRelatedFeatures.merge_with_rel_avg_tot_shop_revenue_lag(matrix, lags=[1,2,3])

### Average item cnt features

In [None]:
# item cnt lag feature
# item_cnt_month of the same shop-item pair 1, 2 and 3 months earlier.
matrix = lag_feature(matrix, [1, 2, 3], 'item_cnt_month', dtype=np.float16, fillna_value=0)

# average item cnt lag
# For each grouping the mean item_cnt_month is computed and only its lags are
# kept as features.
avg_item_cnt_merge_cols = [
    ['date_block_num', 'item_id'],
    ['date_block_num', 'item_id_bucket'],
    ['date_block_num', 'item_id', 'shop_city'],
    ['date_block_num', 'item_id', 'shop_id'],
    ['date_block_num', 'item_category_id', 'shop_id'],
    ['date_block_num', 'item_category_group', 'shop_id'],
]
for m in avg_item_cnt_merge_cols:
    matrix = merge_with_avg_item_cnt_lag(matrix, merge_cols=m, lags=[1,2,3])

# average item cnt for new items
# Same as above, but the mean only uses rows with item_first_sold == 1.
avg_item_cnt_of_new_items_merge_cols = [
    ['date_block_num', 'item_category_id', 'shop_id'],
    ['date_block_num', 'shop_id'],
]
for m in avg_item_cnt_of_new_items_merge_cols:
    matrix = merge_with_avg_item_cnt_of_new_items_lag(matrix, merge_cols=m, lags=[1,2,3])

# advanced item cnt lag feature
# Lags of item_cnt_month taken from the neighboring item_id (default shift=-1).
matrix = lag_feature_adv(matrix, [1, 2, 3], 'item_cnt_month', dtype=np.float16, fillna_value=0)

## Postprocessing
* Reduce memory usage.
* Delete missing data from dataframe.
* Clip data to range.
* Data set contains lagged features, such that the first months cannot be used as training data. 

In [None]:
# Replace all remaining missing values by 0.
matrix.fillna(0, inplace=True)
# Cast the columns matched by DTYPES_FOR_COLUMN_NAMES to their compact dtypes.
matrix = transform_dtypes_in_df(matrix)
# Clip the target to the [0, 20] range.
# NOTE(review): transform_dtypes_in_df casts item_cnt_month to uint8 before
# this clip runs - confirm that no monthly count lies outside 0..255, since
# such a value would be altered by the cast before it is clipped.
matrix['item_cnt_month'] = matrix['item_cnt_month'].clip(0, 20)
# Drop date_block_num 0..2: with lags of up to 3 months these rows have no
# complete lag history.
matrix = matrix.query('date_block_num > 2').copy()

Save data set and encoder

In [None]:
# Persist the engineered data set in the working directory.
matrix.to_pickle('data_after_feature_eng.pkl')
# Persist the fitted label encoders (dict: column name -> LabelEncoder).
with open('cat_encoder.pkl', 'wb') as handle:
    pickle.dump(encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Print the column dtypes and the memory usage of the final data set.
matrix.info()