In [1]:
# Shell escape: list the contents of every dataset directory under ../input
!ls ../input/*

../input/competitive-data-science-predict-future-sales:
item_categories.csv  sales_train.csv	    shops.csv
items.csv	     sample_submission.csv  test.csv

../input/pfs-dataprep:
__notebook__.ipynb  __results__.html   custom.css  mean_benchmark.csv
__output__.json     __results___files  data.h5	   prev_month_benchmark.csv


In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import mean_squared_error
from math import sqrt
import gc
import itertools
import time

from catboost import CatBoostRegressor, Pool

In [3]:
# Seed shared by the row sampling and by CatBoost
SEED = 42
# HDF5 store read by the loading cell
DATA_PATH = '../input/pfs-dataprep/data.h5'
# Size of the random row sample used for tuning and training (~10 months)
TRAIN_ROWS = 3_000_000
# Number of months reserved for ensembling methods
META_MONTHS = 4

# Columns dropped from the feature matrix (item_cnt_month is used as the label)
EXCL_COLS = ['item_cnt_month']
# Keyword arguments passed unchanged to every CatBoostRegressor
FIXED_PARAMS = dict(
    iterations=1000,
    learning_rate=0.1,
    random_state=SEED,
    eval_metric='RMSE',
    od_type='Iter',
    od_wait=30,
)
# Columns that are categorical in every run
FIXED_CAT_COLS = ['month', 'city_id', 'type_id', 'subtype_id']
# Columns whose categorical treatment is decided by the validation score:
# every combination of them is tried
DYNAMIC_CAT_COLS = ['shop_id', 'item_category_id', 'item_id']

In [4]:
%%time
# Read the table stored under key 'traintest' in the HDF5 file at DATA_PATH.
# NOTE(review): presumably the combined train + test feature table - confirm.
traintest = pd.read_hdf(DATA_PATH, key='traintest')

# Sanity check: (rows, columns) of the loaded frame
print(traintest.shape)

(7393103, 59)
CPU times: user 9.94 s, sys: 4.58 s, total: 14.5 s
Wall time: 15 s


In [5]:
# Training rows: a seeded random sample of TRAIN_ROWS rows taken from the rows
# with date_block_num < 34 - META_MONTHS.
train_idx = traintest[traintest.date_block_num<34-META_MONTHS].sample(n=TRAIN_ROWS, random_state=SEED).index
# Validation rows: month 33 only. With META_MONTHS = 4 this month also falls
# inside test_idx below.
valid_idx = traintest[traintest.date_block_num==33].index
# Prediction rows: every row with date_block_num >= 34 - META_MONTHS.
# NOTE(review): 34 looks like the block number of the competition test month - confirm.
test_idx = traintest[traintest.date_block_num>=34-META_MONTHS].index

# Row counts of the three index sets
print(len(train_idx), len(valid_idx), len(test_idx))

3000000 238172 1114452


In [6]:
# Feature matrix for the prediction rows: select test_idx and drop EXCL_COLS.
# Built once here and reused by split_data for every test pool.
X_test = traintest.loc[test_idx].drop(EXCL_COLS, axis=1)

# Shape of the feature matrix and the distinct months it covers
print(X_test.shape, sorted(X_test.date_block_num.unique()))

(1114452, 58) [30, 31, 32, 33, 34]


### Brute-force search over categorical column combinations

In [7]:
def split_data(cat_cols):
    """Build the CatBoost train, validation and test pools for one categorical setup.

    Pools are built up front because building them dynamically takes longer.

    Args:
        cat_cols: Names of the feature columns CatBoost should treat as
            categorical. May be empty.

    Returns:
        Tuple ``(train_pool, valid_pool, test_pool)``. The train and validation
        pools use ``item_cnt_month`` as label; the test pool wraps the global
        ``X_test`` and has no label.

    Raises:
        ValueError: If a name in ``cat_cols`` is not one of the feature columns.
    """
    # Positions must refer to the matrix actually handed to Pool, i.e. the
    # columns that remain after dropping EXCL_COLS. Looking the names up in the
    # full traintest column list would shift the position of every column
    # located after an excluded one.
    feature_cols = [col for col in traintest.columns if col not in EXCL_COLS]
    cat_features = [feature_cols.index(col) for col in cat_cols]

    # No dtype cast is applied: the pools are built from the raw traintest
    # values and cat_features alone marks the categorical columns. (A full
    # copy of traintest with 'category' dtypes used to be created here, but it
    # was never read.)

    # Select each row subset once, then separate features from the label
    train_rows = traintest.loc[train_idx]
    train_pool = Pool(data=train_rows.drop(EXCL_COLS, axis=1),
                      label=train_rows.item_cnt_month,
                      cat_features=cat_features)

    valid_rows = traintest.loc[valid_idx]
    valid_pool = Pool(data=valid_rows.drop(EXCL_COLS, axis=1),
                      label=valid_rows.item_cnt_month,
                      cat_features=cat_features)

    test_pool = Pool(data=X_test, cat_features=cat_features)

    return train_pool, valid_pool, test_pool

In [8]:
def setup_and_fit_model(train_pool, valid_pool, silent=True):
    """Fit a CatBoostRegressor configured with FIXED_PARAMS and report its best scores.

    Args:
        train_pool: Pool used for training.
        valid_pool: Pool passed as ``eval_set``; with ``use_best_model=True``
            the returned model is cut back to its best iteration on this set.
        silent: Forwarded to ``fit`` to suppress the per-iteration log.

    Returns:
        The fitted CatBoostRegressor.
    """
    model = CatBoostRegressor(**FIXED_PARAMS)
    model.fit(train_pool, silent=silent, eval_set=valid_pool, use_best_model=True)

    best_score = model.best_score_
    # The label of a single eval set is not the same in every CatBoost release
    # ('validation_0' vs. 'validation'); prefer the original key and fall back.
    valid_key = 'validation_0' if 'validation_0' in best_score else 'validation'
    train_score = best_score['learn']['RMSE']
    valid_score = best_score[valid_key]['RMSE']
    print("train_score {:.4f}, valid_score {:.4f}".format(train_score, valid_score))
    return model

In [9]:
def get_feature_importance(catboost, train_pool):
    """Return the model's feature importances, highest first.

    Args:
        catboost: Fitted model exposing ``get_feature_importance``.
        train_pool: Pool forwarded as ``data`` to ``get_feature_importance``.

    Returns:
        pandas Series of importance scores indexed by feature name, sorted in
        descending order.
    """
    # Only the column labels are needed, so drop the excluded names from the
    # column Index instead of copying the whole multi-million-row frame.
    # EXCL_COLS is what the pools are built without, keeping names and scores aligned.
    feature_names = traintest.columns.drop(EXCL_COLS)
    feature_scores = pd.Series(catboost.get_feature_importance(data=train_pool),
                               index=feature_names)
    return feature_scores.sort_values(ascending=False)

In [10]:
def save_preds(preds, fname):
    """Write predictions to a CSV file and print a short summary of them.

    Args:
        preds: Predicted values, one per entry of the global ``test_idx``.
        fname: Path of the CSV file to write.
    """
    # Attach the prediction rows' index so every value stays traceable to its row
    indexed = pd.Series(preds, index=test_idx)
    indexed.to_csv(fname)

    # Quick look: first few values, then selected distribution statistics
    print(indexed.head())
    stats = indexed.describe()
    print(stats[['mean', 'std', 'min', 'max']])

In [11]:
def all_list_combinations(lst):
    """Return every combination of the elements of ``lst``, each as a list.

    Combinations are ordered by size (starting with the empty one) and, within
    a size, in the order produced by ``itertools.combinations``.
    """
    sizes = range(len(lst) + 1)
    return [
        list(combo)
        for size in sizes
        for combo in itertools.combinations(lst, size)
    ]

In [12]:
%%time
# Try all possible combinations of the DYNAMIC_CAT_COLS (including the empty one)
# as additional categorical columns, fitting one CatBoost model per combination
all_combs = all_list_combinations(DYNAMIC_CAT_COLS)

for i, subset in enumerate(all_combs):
    # Choose a name for the file: s[:-3] strips the trailing '_id' of each
    # column name; the empty combination is saved as 'none'
    join_name = '-'.join([s[:-3] for s in subset])
    join_name = join_name if len(join_name) > 0 else 'none'
    config_name = "{}_preds.csv".format(join_name)
    print(f"{i+1}/{len(all_combs)}: {config_name}")
    
    # split data and create pool objects; FIXED_CAT_COLS are always categorical
    start_time = time.time()
    train_pool, valid_pool, test_pool = split_data(subset + FIXED_CAT_COLS)
    print("Pools built - {:.2f}s".format(time.time() - start_time))
    
    # fit regressor
    catboost = setup_and_fit_model(train_pool, valid_pool)
    # start_time is not reset, so this figure includes the pool-building time
    print("Model fitted - {:.2f}s".format(time.time() - start_time))
    
    # show the most important features
    print(get_feature_importance(catboost, train_pool).head())
    
    # predict for the test pool and save the predictions under config_name
    preds = catboost.predict(test_pool)
    save_preds(preds, config_name)
    
    # force the Garbage Collector to release unreferenced memory; the pools and
    # the model stay bound to their names until the next iteration reassigns them
    gc.collect()

1/8: none_preds.csv
Pools built - 21.96s
train_score 0.8958, valid_score 0.9247
Model fitted - 603.45s
Dataset is provided, but PredictionValuesChange feature importance don't use it, since non-empty LeafWeights in model.
item_cnt_month_lag_1       20.922557
item_id_sales_sum_lag_1    12.896804
subtype_id                  9.140871
num_pca_0                   6.866665
item_price_lag_1            5.269512
dtype: float64
6278651    0.160887
6278652    0.379191
6278653    0.022483
6278654    0.090597
6278655    0.134129
dtype: float64
mean     0.266209
std      0.705307
min     -0.437275
max     21.154886
dtype: float64
2/8: shop_preds.csv
Pools built - 22.18s
train_score 0.8907, valid_score 0.9231
Model fitted - 812.62s
Dataset is provided, but PredictionValuesChange feature importance don't use it, since non-empty LeafWeights in model.
item_cnt_month_lag_1       20.263913
item_id_sales_sum_lag_1    13.429879
subtype_id                  8.387386
num_pca_0                   5.729955
shop_i