In [1]:
# Show the datasets mounted under ../input (competition data plus the outputs of the upstream notebooks)
!ls ../input/*

../input/competitive-data-science-predict-future-sales:
item_categories.csv  sales_train.csv	    shops.csv
items.csv	     sample_submission.csv  test.csv

../input/pfs-catboost:
__notebook__.ipynb	      item_preds.csv
__output__.json		      none_preds.csv
__results__.html	      shop-item_category-item_preds.csv
catboost_info		      shop-item_category_preds.csv
custom.css		      shop-item_preds.csv
item_category-item_preds.csv  shop_preds.csv
item_category_preds.csv

../input/pfs-dataprep:
__notebook__.ipynb  __results__.html   custom.css  mean_benchmark.csv
__output__.json     __results___files  data.h5	   prev_month_benchmark.csv

../input/pfs-lightgbm:
__notebook__.ipynb  baseline_preds.csv	    tuned_seed_bagging_preds.csv
__output__.json     custom.css		    tuned_trial_bagging_preds.csv
__results__.html    seed_bagging_preds.csv
__results___files   tuned_preds.csv

../input/pfs-linreg:
__notebook__.ipynb  __results__.html   custom.css
__output__.json     __r

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from math import sqrt
import gc
from pathlib import Path
import random
import time
import math

from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV, cross_val_score, validation_curve
from sklearn.base import TransformerMixin

from fastai import tabular

In [3]:
# Seed shared by sampling, sklearn estimators and the NN seeding helper
SEED = 42

# Locations: prepared features, fastai working dir, and directories holding base-model prediction CSVs
DATA_PATH = '../input/pfs-dataprep/data.h5'
TEMP_PATH = Path('temp')
BASE_DIRS = [
    '../input/pfs-lightgbm',
    '../input/pfs-catboost',
    '../input/pfs-linreg',
    '../input/pfs-nn',
]

# Columns dropped from the stacking dataset before training (none by default)
EXCL_COLS = list()

# NN training setup; FIT_ARGS / FIT_KWARGS are forwarded to learn.fit_one_cycle
# (presumably cycle length and max learning rate — confirm against the fastai signature)
BATCH_SIZE = 256
FIT_ARGS = (1, 1e-2)
FIT_KWARGS = dict(wd=0.2)

# Meta-models are usually simpler than base models
NN_PARAMS = dict(layers=[50, 20], ps=[0.2, 0.5])

In [4]:
%%time
# Load the prepared train+test feature table stored under the 'traintest' key of the HDF5 file
traintest = pd.read_hdf(DATA_PATH, key='traintest')

print(traintest.shape)

(7393103, 59)
CPU times: user 12.3 s, sys: 11.9 s, total: 24.2 s
Wall time: 24.8 s


In [5]:
%%time
# Gather every '*_preds.csv' found in BASE_DIRS into one frame, one column per file.
# Column name = directory name without its first 4 chars ('pfs-') + '_' + file stem
# without its last 6 chars ('_preds').
#
# `pd.Series.from_csv` was deprecated in pandas 0.21 and removed in 1.0. The documented
# equivalent is `read_csv` with from_csv's defaults (no header, first column as index,
# parse_dates=True), taking the single data column and clearing the index name.
base_preds = pd.DataFrame({
    fpath.parents[0].stem[4:] + '_' + fpath.stem[:-6]: (
        pd.read_csv(fpath, header=None, index_col=0, parse_dates=True)
        .iloc[:, 0]
        .rename_axis(None)
    )
    for base_dir in BASE_DIRS
    for fpath in Path(base_dir).glob('*_preds.csv')
})
base_preds_cols = base_preds.columns

print(base_preds.shape)

(1114452, 18)
CPU times: user 32.1 s, sys: 1.34 s, total: 33.4 s
Wall time: 33.6 s


In [6]:
# Features carried over from the prepared data: every column whose name contains 'pca',
# plus the month index and the target
cols = [name for name in traintest.columns if 'pca' in name] + ['date_block_num', 'item_cnt_month']

In [7]:
# Stacking dataset: take the selected feature columns for exactly the rows that have
# base-model predictions, then attach those predictions column-wise by index
data = traintest.loc[base_preds.index, cols].merge(base_preds, how='outer', left_index=True, right_index=True)

print(data.shape)

(1114452, 30)


In [8]:
def rmse(y_true, y_pred):
    """Root mean squared error after clipping both inputs to [0, 20].

    The clipping is required by the competition.
    """
    clipped_true, clipped_pred = (np.clip(values, 0, 20) for values in (y_true, y_pred))
    return sqrt(mean_squared_error(clipped_true, clipped_pred))


# Scorer for sklearn model selection; greater_is_better=False means the reported scores are negated
rmse_scoring = make_scorer(rmse, greater_is_better=False)

In [9]:
# Per-column correlation, R2 and clipped RMSE against the target, best RMSE first
print(
    pd.DataFrame({
        'corr': data.corrwith(data['item_cnt_month']).sort_values(ascending=False),
        'r2': data.apply(lambda column: r2_score(data['item_cnt_month'], column)),
        'rmse': data.apply(lambda column: rmse(data['item_cnt_month'], column)),
    }).sort_values('rmse')
)

                                      corr           r2       rmse
item_cnt_month                    1.000000     1.000000   0.000000
lightgbm_tuned_trial_bagging      0.577723     0.311322   0.816117
lightgbm_tuned_seed_bagging       0.586693     0.303778   0.820436
lightgbm_seed_bagging             0.577826     0.300427   0.822661
lightgbm_baseline                 0.575957     0.296173   0.825103
lightgbm_tuned                    0.579606     0.291631   0.827279
catboost_item_category            0.556358     0.279770   0.834416
catboost_item                     0.558809     0.279852   0.834425
catboost_none                     0.554672     0.278651   0.835214
catboost_item_category-item       0.553222     0.276031   0.836559
catboost_shop-item                0.554308     0.273424   0.838194
catboost_shop-item_category       0.553042     0.269674   0.840360
catboost_shop                     0.549713     0.266774   0.841943
catboost_shop-item_category-item  0.553776     0.262611   0.84

In [10]:
# Remove any columns explicitly excluded in the config
data = data.drop(columns=EXCL_COLS)

In [11]:
# Blocks before 34 form the (shuffled) training set, block 34 is the test set.
# .copy() detaches both frames from `data` so it can be freed afterwards.
train = data.loc[data['date_block_num'] < 34].sample(frac=1, random_state=SEED).copy()
test = data.loc[data['date_block_num'] == 34].copy()

X_train, y_train = train.drop(columns='item_cnt_month'), train['item_cnt_month']
X_test = test.drop(columns='item_cnt_month')

print(X_train.shape, X_test.shape)

(900252, 29) (214200, 29)


In [12]:
# The source frames are no longer needed; drop the references and force a collection
del traintest, base_preds, data
gc.collect()

109

In [13]:
def split_by_date_block(df):
    """Build TimeSeriesSplit-like folds keyed on `date_block_num` instead of timestamps.

    Every distinct block except the earliest becomes a validation fold; its
    training part is all rows from strictly earlier blocks. The returned indexes
    are row positions in `df` (the frame's own index is discarded first).

    Prints `block, n_train, n_valid` for each fold and returns a list of
    `(train_idxs, valid_idxs)` tuples.
    """
    blocks = df.reset_index()['date_block_num']
    folds = []
    for block in sorted(blocks.unique())[1:]:
        earlier_rows = blocks[blocks < block].index
        current_rows = blocks[blocks == block].index
        print(block, len(earlier_rows), len(current_rows))
        folds.append((earlier_rows, current_rows))
    return folds

In [14]:
# Positional (train, valid) folds over the shuffled training rows; reused by every CV run below
block_split = split_by_date_block(X_train)

31 228889 214536
32 443425 218655
33 662080 238172


### Linear regression

In [15]:
def cross_validate(model):
    """Return the per-fold clipped RMSE of `model` on the date-block folds.

    The scorer yields negated RMSE, so the absolute value is taken to report
    positive errors.
    """
    negated_scores = cross_val_score(
        model, X_train, y_train,
        scoring=rmse_scoring, cv=block_split, n_jobs=-1,
    )
    return np.abs(negated_scores)

In [16]:
# Compare plain and regularised linear meta-models on the same folds
for candidate in (
    LinearRegression(),
    Ridge(random_state=SEED),
    Lasso(random_state=SEED),
    ElasticNet(random_state=SEED),
):
    print(cross_validate(candidate))

[0.756481 0.869849 0.891928]
[0.756488 0.869848 0.891899]
[1.06302  1.154245 1.136213]
[0.988856 1.091881 1.080974]


In [17]:
# Fit the chosen linear meta-model on the full training set (fit returns the estimator itself)
lr = Ridge(random_state=SEED).fit(X_train, y_train)
lr

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='auto', tol=0.001)

In [18]:
# Feature importance: Ridge coefficients per input column, largest first
# NOTE(review): inputs are not scaled in this notebook, so coefficient magnitudes are only a rough guide
pd.Series(lr.coef_, index=X_train.columns).sort_values(ascending=False)

lightgbm_tuned_seed_bagging         0.476372
lightgbm_baseline                   0.390427
cat_pca_2                           0.387303
nn_bin                              0.344304
catboost_item_category              0.334648
catboost_item                       0.301433
lightgbm_tuned                      0.252630
lightgbm_tuned_trial_bagging        0.245174
catboost_shop-item_category-item    0.043903
nn_sigmoid                          0.041288
catboost_shop-item_category         0.034666
num_pca_2                           0.022826
num_pca_0                           0.019902
nn_reg                              0.014434
num_pca_3                           0.006136
date_block_num                      0.001123
num_pca_1                           0.000029
num_pca_4                          -0.009087
linreg_vw                          -0.054901
catboost_item_category-item        -0.084929
catboost_shop-item                 -0.110520
nn_bin_weighted                    -0.112905
catboost_n

In [19]:
# Linear meta-model predictions for the test block
lr_preds = lr.predict(X_test)

print(lr_preds.shape)

(214200,)


### Neural network

In [20]:
# Keyword arguments shared by every TabularList.from_df call:
# working directory plus all columns except the target as continuous features
LIST_ARGS = dict(
    path=TEMP_PATH,
    cont_names=train.columns.difference(['item_cnt_month']),
)

In [21]:
def set_random_seed():
    """Seed Python, NumPy and torch with SEED to make results repeatable.

    When CUDA is available the CUDA generators are seeded as well and cuDNN is
    switched to deterministic mode with benchmarking disabled.
    """
    torch = tabular.torch
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [22]:
def setup_and_fit_nn(data, use_sigmoid=False):
    """Create a tabular learner on the given fastai data bunch and train it.

    Seeds all RNGs first. With `use_sigmoid=True` the learner is given a
    `y_range` of [0, 20]; otherwise no output range is set. Training uses
    `fit_one_cycle` with FIT_ARGS / FIT_KWARGS. Returns the fitted learner.
    """
    set_random_seed()
    range_kwargs = {}
    if use_sigmoid:
        range_kwargs['y_range'] = tabular.torch.tensor([0, 20], device=tabular.defaults.device)
    learn = tabular.tabular_learner(data, **NN_PARAMS, **range_kwargs)
    learn.fit_one_cycle(*FIT_ARGS, **FIT_KWARGS)
    return learn

In [23]:
def cross_validate_nn(use_sigmoid=False):
    """Cross-validate the NN meta-model on the date-block folds.

    For each fold the held-out rows are attached as the fastai *test* set, a
    fresh learner is trained on the remaining rows, and the clipped RMSE of the
    test-set predictions is recorded. Returns the fold scores as an array.

    (skorch was tried as an sklearn bridge but failed to import:
    https://github.com/skorch-dev/skorch)
    """
    def score_fold(fit_idx, holdout_idx):
        fit_rows = train.iloc[fit_idx]
        holdout_rows = train.iloc[holdout_idx]
        holdout_list = tabular.TabularList.from_df(holdout_rows, **LIST_ARGS)
        bunch = (tabular.TabularList.from_df(fit_rows, **LIST_ARGS)
                 .split_none()
                 .label_from_df('item_cnt_month', label_cls=tabular.FloatList)
                 .add_test(holdout_list)
                 .databunch(bs=BATCH_SIZE))
        fitted = setup_and_fit_nn(bunch, use_sigmoid=use_sigmoid)
        holdout_preds = fitted.get_preds(tabular.DatasetType.Test)[0].squeeze()
        return rmse(holdout_rows['item_cnt_month'], holdout_preds)

    return np.array([score_fold(fit_idx, holdout_idx) for fit_idx, holdout_idx in block_split])

In [24]:
# CV of the NN without an output range
# NOTE(review): the first fold (trained on a single date block) scores far worse than the others — worth investigating
cross_validate_nn()

array([8.151894, 0.863121, 0.884979])

In [25]:
# CV of the NN with outputs constrained to y_range [0, 20]
cross_validate_nn(use_sigmoid=True)

array([19.740632,  0.897649,  0.89988 ])

In [26]:
# Final data bunch: train on all training rows (no validation split) and attach the
# real test block as the fastai test set
test_data = tabular.TabularList.from_df(test, **LIST_ARGS)
train_data = (tabular.TabularList.from_df(train, **LIST_ARGS)
              .split_none()
              .label_from_df('item_cnt_month', label_cls=tabular.FloatList)
              .add_test(test_data)
              .databunch(bs=BATCH_SIZE))

In [27]:
# Train the final NN meta-model (no y_range, since use_sigmoid defaults to False)
learn = setup_and_fit_nn(train_data)

epoch,train_loss,valid_loss,time
0,0.714119,#na#,00:52


In [28]:
# Predict the attached test set; keep only the predictions (first element) and flatten them to 1-D
nn_preds = learn.get_preds(tabular.DatasetType.Test)[0].data.numpy().T[0]

print(nn_preds.shape)

(214200,)


### Post-processing tweaks: matching the target distribution

In [29]:
def apply_target_dist(preds, target):
    """Replace predictions by target values so that they mimic the target distribution.

    Predictions are ranked ascending and the ranking is cut into consecutive
    chunks, one per distinct target value (also ascending). Each chunk is sized
    by that value's relative frequency in `target` and filled with the value.

    Parameters
    ----------
    preds : array-like
        1-D predictions. Only the values and their order are used; any pandas
        index is ignored.
    target : array-like
        Observed target values that define the distribution.

    Returns
    -------
    numpy.ndarray
        Transformed predictions, in the same order as `preds`.

    Notes
    -----
    Every value except the largest receives `floor(freq * n)` slots but at
    least one; the largest value receives whatever remains (possibly nothing).
    """
    norm_dist = pd.Series(target).value_counts(normalize=True).sort_index()
    # Drop any incoming index so that positions identify the input order. Previously
    # a Series with an unsorted index came back in index order rather than input order.
    ranked = pd.Series(np.asarray(preds)).sort_values()
    n_preds = len(ranked)
    last_pos = len(norm_dist) - 1
    left_idx = 0
    for i, (value, norm_count) in enumerate(norm_dist.items()):
        if i == last_pos:
            right_idx = n_preds
        else:
            right_idx = left_idx + max(math.floor(norm_count * n_preds), 1)
        # Explicit positional assignment: plain [] slicing is label-based on some index types
        ranked.iloc[left_idx:right_idx] = value
        left_idx = right_idx
    # Sorting by the positional index restores the original input order
    return ranked.sort_index().values

class RankingTransformer(TransformerMixin):
    """Map predictions onto the distribution of a reference target by rank.

    `fit` only stores the target; `transform` delegates to `apply_target_dist`,
    column by column for a DataFrame and directly for anything else.
    Note that `fit` takes the target alone, unlike the usual sklearn `fit(X, y)`.
    """

    def __init__(self):
        pass

    def fit(self, y):
        # Remember the target whose distribution will be imposed
        self.y = y
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            return apply_target_dist(X, self.y)
        return X.apply(lambda column: apply_target_dist(column, self.y), axis=0)

    def fit_transform(self, X, y):
        self.fit(y)
        return self.transform(X)

In [30]:
# We can try to treat our predictions as ranks for true target distribution
# Knowing that 85% of labels are zero, we can set lowest 85% predictions as zero
# Here the linear meta-model predictions are mapped onto the training target distribution
tweak1_preds = RankingTransformer().fit_transform(lr_preds, y_train)

print(tweak1_preds.shape)

(214200,)


In [31]:
# We can also convert base-model predictions to ranks, apply target distribution, and average
# Each base-model column is transformed independently; the row-wise mean is the final prediction
tweak2_preds = RankingTransformer().fit_transform(X_test[base_preds_cols], y_train).mean(axis=1).values

print(tweak2_preds.shape)

(214200,)


### Save predictions

In [32]:
def submit(preds, fname):
    """Write predictions to `fname` as a submission CSV and print a short summary.

    The file has the columns `item_cnt_month` (predictions clipped to [0, 20])
    and `ID` (0..len(preds)-1), without the frame index. Afterwards the first
    rows and the mean/std/min/max of the clipped predictions are printed.
    """
    submission = (
        pd.DataFrame({"item_cnt_month": preds})
        .assign(ID=range(len(preds)))
        .assign(item_cnt_month=lambda frame: frame['item_cnt_month'].clip(0, 20))
    )
    submission.to_csv(fname, index=False)
    print(submission.head())
    summary = submission['item_cnt_month'].describe()
    print(summary[['mean', 'std', 'min', 'max']])

In [33]:
# Linear (Ridge) meta-model submission
submit(lr_preds, 'lr_preds.csv')

   item_cnt_month  ID
0        0.432425   0
1        0.313191   1
2        1.066025   2
3        0.361724   3
4        2.540140   4
mean     0.305499
std      0.795801
min      0.000000
max     20.000000
Name: item_cnt_month, dtype: float64


In [34]:
# Neural-network meta-model submission
submit(nn_preds, 'nn_preds.csv')

   item_cnt_month  ID
0        0.487601   0
1        0.368858   1
2        1.012811   2
3        0.353027   3
4        2.181544   4
mean     0.315731
std      0.737215
min      0.000000
max     18.507410
Name: item_cnt_month, dtype: float64


In [35]:
# Ridge predictions mapped onto the training target distribution
submit(tweak1_preds, 'tweak1_preds.csv')

   item_cnt_month  ID
0             0.0   0
1             0.0   1
2             1.0   2
3             0.0   3
4             3.0   4
mean     0.267418
std      1.093086
min      0.000000
max     20.000000
Name: item_cnt_month, dtype: float64


In [36]:
# Average of the distribution-matched base-model predictions
submit(tweak2_preds, 'tweak2_preds.csv')

   item_cnt_month  ID
0        0.611111   0
1        0.055556   1
2        1.055556   2
3        0.000000   3
4        3.444444   4
mean     0.267418
std      1.015246
min      0.000000
max     20.000000
Name: item_cnt_month, dtype: float64
