In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import gc

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

import pickle

from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import AgglomerativeClustering
import Levenshtein
import xgboost as xgb
from scipy.spatial.distance import squareform
from scipy.spatial.distance import pdist

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)


# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# gc.set_debug(gc.DEBUG_LEAK)

In [None]:
def import_data(data_dir="../input/competitive-data-science-predict-future-sales"):
    """Load the competition CSV files into DataFrames.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory that contains the five CSV files. Defaults to the Kaggle
        input directory of the competition.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(sales, item_cat, items, shops, test)`` read from ``sales_train.csv``,
        ``item_categories.csv``, ``items.csv``, ``shops.csv`` and ``test.csv``.
    """
    file_names = (
        "sales_train.csv",
        "item_categories.csv",
        "items.csv",
        "shops.csv",
        "test.csv",
    )
    sales, item_cat, items, shops, test = (
        pd.read_csv(os.path.join(data_dir, file_name)) for file_name in file_names
    )
    return sales, item_cat, items, shops, test


def downcast_dtypes(df):
    """Downcast numeric columns of ``df`` in place to save memory.

    ``float64`` columns become ``float32``. ``int64``/``int32`` columns become
    ``int16`` when every value fits the int16 range, otherwise ``int32`` when
    the values fit there, otherwise they are left untouched. The range check
    prevents the silent wrap-around an unconditional int16 cast would cause.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to downcast; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]

    if float_cols:
        df[float_cols] = df[float_cols].astype(np.float32)

    for col in int_cols:
        # Try the narrowest type first and stop at the first one that holds all values.
        for target in (np.int16, np.int32):
            limits = np.iinfo(target)
            if df[col].empty or (df[col].min() >= limits.min and df[col].max() <= limits.max):
                df[col] = df[col].astype(target)
                break
    return df

In [None]:
# Load the five raw competition tables into notebook-level DataFrames.
SALES, ITEM_CAT, ITEMS, SHOPS, TEST = import_data()

Note that some items appear to be the same despite having slightly different names. Special characters are removed from the item names, and item_id is replaced by a new label, item_lab, which is shared by items whose cleaned names are identical.

In [None]:
def pre_process_item_data(df):
    """Add a cleaned item name and a duplicate-aware item label to ``df`` in place.

    Creates two columns:

    * ``name_pre``: ``item_name`` with the characters ``* ! . / ,`` removed,
      lower-cased, a trailing ``d`` removed and surrounding whitespace stripped.
    * ``item_lab``: integer label of ``name_pre`` (sorted factorization), so
      items whose cleaned names coincide share one label. Of 22k items there
      are roughly 100 duplicates.

    Parameters
    ----------
    df : pandas.DataFrame
        Item table with an ``item_name`` column; modified in place.

    Returns
    -------
    None
    """
    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # literally by default, which would leave the names unchanged.
    # Bracketed parts of the names are kept on purpose.
    name_pre = (
        df.item_name
        .str.replace(r'[\*\!\./,]', '', regex=True)
        .str.lower()
        .str.replace(r'd$', '', regex=True)
        .str.strip()
    )
    df['name_pre'] = name_pre
    df['item_lab'] = name_pre.factorize(sort=True)[0]

In [None]:
# Add name_pre / item_lab to ITEMS (in place), then attach the category and the
# new item label to every sales row via item_id.
pre_process_item_data(ITEMS)
SALES = SALES.merge(ITEMS[['item_id', 'item_category_id', 'item_lab']], how='left', on='item_id')
# SALES.head()

In [None]:
# Keep only what the monthly aggregation needs; the category is merged back in later.
SALES = SALES.drop(columns=['item_id', 'date', 'item_category_id'])

Deal with missing values, aggregate the item count per day to item count per month and add zeros for missing shop / item pairs. 

The test set contains 5100 items * 42 shops = 214200 pairs, which suggests that entries with item_cnt_month == 0 are also included. The train set, however, does not contain any zeros.

In [None]:
def clip_fillna_prices(df, negative_price_fill=(2499. + 1249.) / 2.):
    """Clip extreme item prices and replace negative prices, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Sales table with an ``item_price`` column; modified in place.
    negative_price_fill : float, optional
        Value written wherever ``item_price`` is negative. The default is the
        constant the notebook uses for the single ``-1`` price in the training
        data (the average of 2499 and 1249).

    Returns
    -------
    None
    """
    # Clip item prices at the 99th percentile. Assign the result back rather
    # than clipping the attribute Series in place, which does not reach the
    # frame under pandas copy-on-write.
    quantile_99 = df.item_price.quantile(0.99)
    df['item_price'] = df.item_price.clip(upper=quantile_99)

    # A mask assignment is a no-op when there is no negative price, so data
    # without the faulty row no longer raises IndexError.
    df.loc[df.item_price < 0, 'item_price'] = negative_price_fill


def aggregate_item_count(df):
    """Aggregate daily sales to one row per month, shop and item.

    Parameters
    ----------
    df : pandas.DataFrame
        Daily sales with columns ``date_block_num``, ``shop_id``, ``item_lab``,
        ``item_price`` and ``item_cnt_day``.

    Returns
    -------
    pandas.DataFrame
        Columns ``date_block_num``, ``shop_id``, ``item_lab``, ``item_price``
        (mean over the group) and ``item_cnt_month`` (sum of ``item_cnt_day``,
        clipped to the range [0, 20]).
    """
    train = (
        df.groupby(['date_block_num', 'shop_id', 'item_lab'])
        .agg(item_price=('item_price', 'mean'), item_cnt_month=('item_cnt_day', 'sum'))
        .reset_index()
    )

    # Clip item_cnt_month to the min/max of the test set. Assigning the result
    # back makes sure the clip reaches the frame under copy-on-write.
    train['item_cnt_month'] = train.item_cnt_month.clip(lower=0, upper=20)

    return train


def add_missing_item_shop_pairs(df, price):
    """Add zero-sales rows for every shop/item pair missing within a month.

    For each month, the cross product of the shops and the items that occur in
    that month is built; pairs without a row get one with all value columns set
    to 0. Prices that are not positive afterwards are replaced by the per-item
    price from ``price``.

    Parameters
    ----------
    df : pandas.DataFrame
        Monthly data with columns ``date_block_num``, ``shop_id``,
        ``item_lab``, ``item_price`` and further value columns. It is not
        modified.
    price : pandas.DataFrame
        Per-item prices with an ``item_price`` column, joinable on
        ``item_lab`` (as a column or as the index name).

    Returns
    -------
    pandas.DataFrame
        A new frame containing the original and the added rows.
    """
    keys = ['date_block_num', 'shop_id', 'item_lab']
    if df.empty:
        return df.copy()

    # Work on an indexed copy so the caller's frame keeps its columns.
    indexed = df.set_index(keys)

    # set item count for missing shop / item pairs per month to zero
    monthly_grids = []
    for month in indexed.index.unique('date_block_num'):
        month_index = indexed.loc[month].index
        monthly_grids.append(pd.MultiIndex.from_product(
            [[month], month_index.unique('shop_id'), month_index.unique('item_lab')],
            names=keys))

    full_index = monthly_grids[0].append(monthly_grids[1:])
    filled = indexed.reindex(full_index, fill_value=0.).reset_index()

    # fill missing item prices (0 after the reindex) with the per-item price
    filled = pd.merge(filled, price, how='left', on='item_lab', suffixes=('', '_'))
    filled['item_price'] = np.where(filled.item_price > 0, filled.item_price, filled.item_price_)
    del filled['item_price_']
    return filled

In [None]:
# Clean the prices in SALES (in place).
clip_fillna_prices(SALES)
# Mean positive price per item_lab; the resulting frame is indexed by item_lab.
PRICE = SALES[['item_lab', 'item_price']][SALES.item_price > 0].groupby('item_lab').agg('mean')
# One row per month / shop / item, then add the zero rows for missing pairs.
MATRIX = aggregate_item_count(SALES)
MATRIX = add_missing_item_shop_pairs(MATRIX, PRICE)

# Re-attach the item category that was dropped from SALES before aggregating.
MATRIX = MATRIX.merge(ITEMS[['item_category_id', 'item_lab']].drop_duplicates(), how='left', on='item_lab')

In [None]:
# MATRIX.head()

Include test set into matrix, so that feature generation will be consistent over train and test set.

In [None]:
# Shape the test set like MATRIX (month 34, unknown target) so that feature
# generation treats train and test rows the same way.
TEST['date_block_num'] = 34
TEST.drop('ID', axis=1, inplace=True)
TEST['item_cnt_month'] = np.nan

# Swap item_id for item_lab / category and attach the per-item mean price.
TEST = TEST.merge(ITEMS[['item_id', 'item_lab', 'item_category_id']], how='left', on='item_id')
TEST = pd.merge(TEST.drop('item_id', axis=1), PRICE, how='left', on='item_lab')

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
MATRIX = pd.concat([MATRIX, TEST], ignore_index=True)
MATRIX['item_category_id'] = MATRIX.item_category_id.astype('int16')
# MATRIX.head()

Fill missing item prices with mean values

In [None]:
def fillna_means(df, col):
    """Fill NaNs in ``df[col]`` in place with group means.

    Missing values are first replaced by the mean of the row's ``item_lab``
    group; values that are still missing are then replaced by the mean of the
    row's ``item_category_id`` group.
    """

    def _fill_with_group_mean(values):
        return values.fillna(values.mean())

    for group_key in ('item_lab', 'item_category_id'):
        df[col] = df.groupby(group_key)[col].transform(_fill_with_group_mean)

In [None]:
# Fill prices that are still missing (modifies MATRIX in place).
fillna_means(MATRIX, 'item_price')

Do some preprocessing of item category data. Category name is split into category 1 and 2 names.

In [None]:
def pre_process_item_cat_data(df):
    """Split category names into two levels and label-encode them, in place.

    ``item_category_name`` is split at the first ``-``. The stripped,
    lower-cased parts are stored as ``cat1`` and ``cat2`` (empty string where a
    part is missing), and their sorted factorizations as ``cat1_lab`` and
    ``cat2_lab``.

    Parameters
    ----------
    df : pandas.DataFrame
        Category table with an ``item_category_name`` column; modified in place.

    Returns
    -------
    None
    """
    # split name at '-' to separate categories and sub-categories
    cat = df.item_category_name.str.split('-', n=1, expand=True)

    # fillna is chained into the assignment: an in-place fillna on df.cat1 /
    # df.cat2 does not reach the frame under pandas copy-on-write.
    df['cat1'] = cat[0].str.strip().str.lower().fillna('')
    if 1 in cat.columns:
        df['cat2'] = cat[1].str.strip().str.lower().fillna('')
    else:
        # No name contains '-', so the split produced no second column.
        df['cat2'] = ''

    df['cat1_lab'] = df.cat1.factorize(sort=True)[0]
    df['cat2_lab'] = df.cat2.factorize(sort=True)[0]

Using the Levenshtein distance, the category names are clustered into groups of similar names. The new features cat1_lev and cat2_lev are created; they contain the labels of these clusters.

In [None]:
def get_string_correlation(array):
    """Return the square matrix of pairwise normalized Levenshtein distances.

    ``array`` is reshaped to a single column, so each observation handed to
    the metric is a one-element row holding a string. The distance between two
    strings is their Levenshtein distance divided by the mean of their lengths.
    """
    def normalized_distance(row_a, row_b):
        first, second = row_a[0], row_b[0]
        edits = Levenshtein.distance(first, second)
        average_length = np.mean([len(first), len(second)])
        return edits / average_length

    column = array.reshape(-1, 1)
    condensed = pdist(column, normalized_distance)
    return squareform(condensed)


def agglomerative_clustering(distance_matrix, num_clusters=None, threshold=None):
    """Cluster items from a precomputed distance matrix with average linkage.

    Parameters
    ----------
    distance_matrix : array-like
        Square matrix of pairwise distances.
    num_clusters : int or None, optional
        Passed to ``AgglomerativeClustering`` as ``n_clusters``.
    threshold : float or None, optional
        Passed to ``AgglomerativeClustering`` as ``distance_threshold``.

    Returns
    -------
    numpy.ndarray
        The fitted model's ``labels_``.
    """
    common_args = dict(n_clusters=num_clusters, linkage='average', distance_threshold=threshold)
    try:
        # Newer scikit-learn names this parameter `metric`; `affinity` was removed.
        model = AgglomerativeClustering(metric='precomputed', **common_args)
    except TypeError:
        # Older scikit-learn releases only know `affinity`.
        model = AgglomerativeClustering(affinity='precomputed', **common_args)
    model.fit(distance_matrix)
    return model.labels_


def add_levenshtein_feature(df, cols):
    """Add a ``<col>_lev`` cluster-label column for every column in ``cols``.

    For each column, its unique values are clustered by normalized Levenshtein
    distance (average linkage, distance threshold 0.5) and the cluster label of
    each value is merged back as an ``int32`` column named ``<col>_lev``.
    Returns the merged frame.
    """
    for col in cols:
        lev_col = col + '_lev'
        categories = df[col].unique()
        distances = get_string_correlation(categories)
        clusters = agglomerative_clustering(distances, threshold=0.5)
        lookup = pd.DataFrame({col: categories, lev_col: clusters})
        df = df.merge(lookup, how='left', on=col)
        df[lev_col] = df[lev_col].astype('int32')
    return df

In [None]:
# Build cat1/cat2 and their labels (in place), add the Levenshtein cluster
# labels, then attach all category features to MATRIX via item_category_id.
pre_process_item_cat_data(ITEM_CAT)
ITEM_CAT = add_levenshtein_feature(ITEM_CAT, ['cat1', 'cat2'])
MATRIX = MATRIX.merge(ITEM_CAT[['item_category_id', 'cat1_lab', 'cat2_lab', 'cat1_lev', 'cat2_lev']], how='left',
                      on='item_category_id')


In [None]:
# MATRIX.head()

Preprocessing of shop data. The features city_id and shopping_center are created.

In [None]:
def pre_process_shop_data(df):
    """Derive city and shopping-center features from ``shop_name``, in place.

    The shop name is split at the first run of whitespace. The lower-cased
    first part becomes ``city`` and its sorted factorization ``city_id``.
    ``shopping_center`` is True when the lower-cased remainder contains the
    substring 'тц', and False otherwise, including for names with no
    remainder.

    Parameters
    ----------
    df : pandas.DataFrame
        Shop table with a ``shop_name`` column; modified in place.

    Returns
    -------
    None
    """
    split = df.shop_name.str.split(n=1, expand=True)
    df['city'] = split[0].str.lower().str.strip()
    df['city_id'] = df.city.factorize(sort=True)[0]
    if 1 in split.columns:
        # na=False keeps the column strictly boolean for one-word shop names,
        # where the remainder is missing; regex=False matches the plain substring.
        df['shopping_center'] = split[1].str.lower().str.contains('ТЦ'.lower(), regex=False, na=False)
    else:
        # No shop name has a second part at all.
        df['shopping_center'] = False
    

In [None]:
# Add city / city_id / shopping_center to SHOPS (in place) and attach the
# two numeric shop features to MATRIX via shop_id.
pre_process_shop_data(SHOPS)
MATRIX = MATRIX.merge(SHOPS[['shop_id', 'city_id', 'shopping_center']], how='left', on='shop_id')

The shop revenue per month is added. Since the shop revenue will not be available for the 34th month, only lag features of the shop revenue will be used.

In [None]:
def add_shop_revenue(df):
    """Add ``shop_revenue_month`` to ``df`` in place.

    Revenue per row is ``item_price * item_cnt_month``; ``shop_revenue_month``
    is the sum of that revenue over all rows with the same ``shop_id`` and
    ``date_block_num``. The helper ``revenue`` column is removed again.
    """
    df['revenue'] = df['item_price'].mul(df['item_cnt_month'])
    revenue_by_shop_month = df.groupby(['shop_id', 'date_block_num'])['revenue']
    df['shop_revenue_month'] = revenue_by_shop_month.transform('sum')
    df.drop(columns='revenue', inplace=True)

# Delete this feature later on, only use lag feature (since revenue will not be available for test set)
    

In [None]:
# Adds shop_revenue_month to MATRIX in place; only its lags are kept later on.
add_shop_revenue(MATRIX)
# MATRIX.head()

Add lag features for 1, 2 and 12 months.

In [None]:
def lag_feature(df, lags, col):
    """Add lagged copies of ``col`` to ``df``.

    For every lag ``i`` in ``lags`` a column ``<col>_lag_<i>`` is added that
    holds the value of ``col`` for the same ``shop_id`` / ``item_lab`` pair
    ``i`` months (``date_block_num`` steps) earlier, or 0 where no such row
    exists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``date_block_num``, ``shop_id``, ``item_lab`` and ``col``.
    lags : iterable of int
        Lags in months.
    col : str
        Column to lag.

    Returns
    -------
    pandas.DataFrame
        A new, merged frame; the input frame is not modified.
    """
    keys = ['date_block_num', 'shop_id', 'item_lab']
    base = df[keys + [col]]

    for lag in lags:
        lag_col = col + '_lag_' + str(lag)
        shifted = base.rename(columns={col: lag_col})
        # Moving the month forward by `lag` lines month m up with month m + lag.
        shifted['date_block_num'] = shifted['date_block_num'] + lag

        df = pd.merge(df, shifted, on=keys, how='left')
        # Assign back: an in-place fillna on a column selection does not reach
        # the frame under pandas copy-on-write.
        df[lag_col] = df[lag_col].fillna(0)

    return df

In [None]:
# Lag the target and the shop revenue by 1, 2 and 12 months. The un-lagged
# revenue is dropped afterwards; it is computed from item_cnt_month, which is
# NaN for the test rows.
MATRIX = lag_feature(MATRIX, (1, 2, 12), 'item_cnt_month')
MATRIX = lag_feature(MATRIX, (1, 2, 12), 'shop_revenue_month')
del MATRIX['shop_revenue_month']
# MATRIX.head()

Add month

In [None]:
# date_block_num modulo 12, i.e. a value in 0..11.
MATRIX['month'] = MATRIX.date_block_num.mod(12)
# MATRIX.head()

# Add some advanced features

Time since item was first released

In [None]:
# 'release' is the first month in which an item has a positive monthly count.
MATRIX['release'] = MATRIX.item_lab.map(
    MATRIX.loc[MATRIX.item_cnt_month > 0].groupby('item_lab')['date_block_num'].min()
)

# Months since release; NaN (item never seen with a positive count) becomes 0.
MATRIX['since_release'] = (MATRIX.date_block_num - MATRIX.release).fillna(0)

# Drop rows that lie before the item's release (they stem from the added
# zero-count item/shop pairs) together with the helper column.
MATRIX = MATRIX.loc[MATRIX.since_release >= 0].drop(columns='release')


Add the target mean as a feature. To get the correct value for the test set, use leaderboard probing:
(https://www.kaggle.com/c/competitive-data-science-predict-future-sales/discussion/79142)

* N=214200
* Predict 0: mse_0 = 1.25011 ** 2
* Predict 1: mse_1 = 1.41241 ** 2

sum_y_test = N * (mse_1 - mse_0 - 1) / -2

mean_y_test = sum_y_test / N = (mse_1 - mse_0 - 1) / -2

mean_y_test = 0.28394

In [None]:
# Mean target per month. The target of the test month (34) is unknown, so the
# value obtained from leaderboard probing is written there instead.
MATRIX['mean_item_cnt_month'] = MATRIX.groupby('date_block_num')['item_cnt_month'].transform('mean')
# Select the test month via date_block_num and assign through .loc: assigning
# to an attribute of a filtered copy never reaches MATRIX.
MATRIX.loc[MATRIX.date_block_num == 34, 'mean_item_cnt_month'] = 0.28394
# MATRIX.head()

Split data into train, validation and test set. Since the 12 month lag features are only available starting in date_block_num == 12, the previous data is not used for training.
Validation is done with month number 33, the test set is month number 34.

In [None]:
# Boolean row masks over MATRIX: months 12..32 train, 33 validation, 34 test.
IDX_TRAIN = (MATRIX.date_block_num > 11) & (MATRIX.date_block_num < 33)
IDX_VAL = MATRIX.date_block_num == 33
IDX_TEST = MATRIX.date_block_num == 34

Maximum monthly count of each item / shop pair within the training months

In [None]:
def add_max_cnt(df, idx_train=None):
    """Add ``max_cnt``: the maximum ``item_cnt_month`` per item/shop pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``item_lab``, ``shop_id`` and ``item_cnt_month``.
    idx_train : boolean mask over the rows of ``df``, optional
        Rows from which the maximum is computed. Defaults to the
        notebook-level ``IDX_TRAIN`` mask.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-merged with the per-pair maximum. Pairs that do not occur
        in the selected rows get NaN.
    """
    if idx_train is None:
        idx_train = IDX_TRAIN

    keys = ['item_lab', 'shop_id']
    max_cnt = (
        df.loc[idx_train, keys + ['item_cnt_month']]
        .groupby(keys)
        .agg(max_cnt=('item_cnt_month', 'max'))
        .reset_index()
    )
    return df.merge(max_cnt, how='left', on=keys)

In [None]:
# Attach max_cnt, then fill pairs without a value using the item_lab /
# item_category_id group means (in place).
MATRIX = add_max_cnt(MATRIX)
fillna_means(MATRIX, 'max_cnt')

# add_max_cnt returned a newly merged frame, so rebuild the masks from it.
IDX_TRAIN = (MATRIX.date_block_num > 11) & (MATRIX.date_block_num < 33)
IDX_VAL = MATRIX.date_block_num == 33
IDX_TEST = MATRIX.date_block_num == 34

In [None]:
# Sanity check: size of the full feature matrix.
MATRIX.shape

In [None]:
# Sanity check: number of rows selected by the training mask.
MATRIX.loc[IDX_TRAIN, 'max_cnt'].shape

Add mean encoding with K-Fold

In [None]:
def get_mean_encodings_kfold(df, idx_train, idx_val, idx_test, feature_names, target_name, folds=5,
                             random_state=0):
    """Add ``<feature>_mean`` target-mean encodings to ``df``.

    Training rows are encoded out-of-fold: the rows are split into ``folds``
    stratified folds on the feature itself, and each fold receives the target
    means computed on the remaining folds. Validation and test rows receive
    the target means of the complete training set. Categories without a mean
    get the global training mean in all three sets. Rows outside
    train/validation/test keep NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; the new columns are added to it in place.
    idx_train, idx_val, idx_test : boolean masks over the rows of ``df``
        Row selections for training, validation and test.
    feature_names : iterable of str
        Categorical columns to encode.
    target_name : str
        Target column.
    folds : int, optional
        Number of stratified folds.
    random_state : int or None, optional
        Seed for the fold shuffling, so repeated runs give the same encodings.
        Pass None for unseeded shuffling.

    Returns
    -------
    pandas.DataFrame
        ``df`` with one added ``<feature>_mean`` column per feature.
    """
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)

    train_mask = np.asarray(idx_train, dtype=bool)
    holdout_mask = np.logical_or(np.asarray(idx_val, dtype=bool), np.asarray(idx_test, dtype=bool))
    train_pos = np.flatnonzero(train_mask)

    train_target = df[target_name].iloc[train_pos]
    global_mean = train_target.mean()

    for feature_name in feature_names:
        print(feature_name)

        train_feature = df[feature_name].iloc[train_pos]
        encoded = np.full(len(df), np.nan)

        # Out-of-fold encoding of the training rows: means from the fit part
        # are applied to the held-out part of every split.
        for fit_pos, apply_pos in skf.split(train_feature.to_frame(), train_feature):
            fold_means = train_target.iloc[fit_pos].groupby(train_feature.iloc[fit_pos].to_numpy()).mean()
            encoded[train_pos[apply_pos]] = train_feature.iloc[apply_pos].map(fold_means).to_numpy()

        # Validation and test rows use the means of the complete training set.
        full_means = train_target.groupby(train_feature.to_numpy()).mean()
        encoded[holdout_mask] = df.loc[holdout_mask, feature_name].map(full_means).to_numpy()

        # Unseen categories fall back to the global training mean in all three sets.
        unseen = (train_mask | holdout_mask) & np.isnan(encoded)
        encoded[unseen] = global_mean

        df[feature_name + '_mean'] = encoded

    return df

In [None]:
# Add a '<feature>_mean' target encoding for each listed categorical feature.
MATRIX = get_mean_encodings_kfold(
    MATRIX, IDX_TRAIN, IDX_VAL, IDX_TEST,
    ['shop_id', 'city_id', 'item_lab', 'cat1_lab', 'cat2_lab', 'cat1_lev', 'cat2_lev', 'month'],
    'item_cnt_month')

# MATRIX[IDX_TRAIN].head()

Downcast types to save some RAM.

In [None]:
# Shrink float64 / int64 / int32 columns, then print dtypes and memory usage.
MATRIX = downcast_dtypes(MATRIX)
MATRIX.info()

Define train, validation and test matrices and target vectors.

In [None]:
# Feature matrices hold every column except the target; the test month has
# no target vector.
TRAIN_X = MATRIX.loc[IDX_TRAIN].drop(columns='item_cnt_month')
TRAIN_Y = MATRIX.loc[IDX_TRAIN, 'item_cnt_month']
VAL_X = MATRIX.loc[IDX_VAL].drop(columns='item_cnt_month')
VAL_Y = MATRIX.loc[IDX_VAL, 'item_cnt_month']
TEST_X = MATRIX.loc[IDX_TEST].drop(columns='item_cnt_month')

In [None]:
# Release the raw tables; TEST and ITEMS are read from CSV again further
# down when the submission is built.
del SALES, ITEM_CAT, ITEMS, SHOPS, TEST

In [None]:
# Run a garbage collection pass after deleting the raw tables.
gc.collect()

Use XGBRegressor for predictions. Hyperparameters were slowly adapted over several runs:

* n_estimators: started with 50, increased to 100
* max_depth: started with 3, increased to 5
* learning_rate: started with 0.1, decreased to 0.08

In [None]:

# fit xgb regressor
# Since xgboost 1.6, eval_metric and early_stopping_rounds belong on the
# estimator; passing them to fit() is deprecated there and rejected by later
# releases. Older releases only accept them in fit(), so the call form is
# chosen from the installed version.
early_stopping_args = dict(eval_metric='rmse', early_stopping_rounds=8)
xgb_major_minor = tuple(int(part) for part in xgb.__version__.split('.')[:2])
if xgb_major_minor >= (1, 6):
    constructor_args, fit_args = early_stopping_args, {}
else:
    constructor_args, fit_args = {}, early_stopping_args

regressor = xgb.XGBRegressor(n_estimators=100,
                             learning_rate=0.08,
                             max_depth=5,
                             subsample=0.8,
                             colsample_bytree=0.8,
                             n_jobs=8,
                             **constructor_args
                             )
# The second eval_set entry (validation month) is listed last, so it is the
# one used for early stopping.
reg = regressor.fit(
    X=TRAIN_X,
    y=TRAIN_Y,
    eval_set=[(TRAIN_X, TRAIN_Y), (VAL_X, VAL_Y)],
    verbose=True,
    **fit_args
)

Save model

In [None]:
# Use a context manager so the file handle is flushed and closed right away
# instead of being left open.
with open("xgb.pickle.dat", "wb") as model_file:
    pickle.dump(reg, model_file)

Plot RMSE for train and validation set

In [None]:
# validation_0 is the first eval_set entry (training data), validation_1 the
# second (validation month).
evals = reg.evals_result()
plt.plot(evals['validation_0']['rmse'], label='validation_0')
plt.plot(evals['validation_1']['rmse'], label='validation_1')
plt.xlabel('boosting round')
plt.ylabel('rmse')
# Without a legend the labels above are never shown.
plt.legend()
plt.show()

Check importance of features

In [None]:
# Pair every feature importance with its column name.
importance = pd.Series(reg.feature_importances_, index=TRAIN_X.columns)

In [None]:
# Most important features first.
importance.sort_values(ascending=False)

Create submission file. Some merging needs to be done since the item_id was changed to item_lab.

In [None]:
TEST = pd.read_csv("../input/competitive-data-science-predict-future-sales/test.csv")
# .copy() so that adding the prediction column does not write into a slice of TEST_X.
PRED = TEST_X[['item_lab', 'shop_id']].copy()
PRED['item_cnt_month'] = reg.predict(TEST_X)
# Keep exactly one prediction per (item_lab, shop_id): several test item_ids can
# share an item_lab, and de-duplicating on all columns would keep multiple rows
# whenever their predictions differ, which would multiply rows in the merge back
# onto TEST. Predictions of such duplicates are averaged.
PRED = PRED.groupby(['item_lab', 'shop_id'], as_index=False)['item_cnt_month'].mean()
print(TEST.shape, PRED.shape)

In [None]:
# ITEMS was deleted above; reload it and recompute item_lab so that the test
# item_ids can be mapped to the labels used by the model.
ITEMS = pd.read_csv("../input/competitive-data-science-predict-future-sales/items.csv")
pre_process_item_data(ITEMS)
TEST = TEST.merge(ITEMS[['item_id', 'item_lab']], how='left', on='item_id')
print(TEST.shape)

In [None]:
# Attach the predictions to the original test rows; the row count shown below
# should equal the one printed in the previous cell.
TEST = TEST.merge(PRED, how='left', on=['item_lab', 'shop_id'])
# TEST.drop_duplicates(ignore_index=True, inplace=True)
TEST.shape

In [None]:
# Mean prediction; the notebook's leaderboard-probing estimate of the test mean is 0.28394.
TEST.item_cnt_month.mean()

In [None]:
# Clip predictions to the target range [0, 20]. Assign the result back: an
# in-place clip on the attribute Series does not reach TEST under pandas
# copy-on-write.
TEST['item_cnt_month'] = TEST.item_cnt_month.clip(lower=0, upper=20)

In [None]:
# Write the ID / prediction pairs, ordered by ID, without the index column.
TEST[['ID', 'item_cnt_month']].sort_values('ID').to_csv('submission.csv', index=False, header=True)