In [None]:
# 1/N ~ 1 in 100k

Chart performance on cold customers is 0.006, let's try to build a better model. We are mainly interested on improving performance on cold-customers and old customers that didn't buy something in last 3 weeks. To goal is to beat simple benchmark of top12 items sold last week.

features we may want to consider

customer:
- age
- postal-code
- FN
- frequency

item-features:
- color
- product_code
- product_group_name
- department
- popularity (#users bought it, #times sold)
- #this week popularity (normalized / quantile)
- hotness - #days since first transactions
- super-hotness - first transaction was on monday
- days since last sale (for filtering mainly)

time features:
- month

for customers with history
- items was bought
- color preference
- category preference

take all sales in test-period (target) generate random pairs

COLD CUSTOMER INDICATOR / COLD ITEM INDICATOR

take all items customer bought + random 20 items he didn't buy
- (from all items)
- (from latest items)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from kaggle_hm.chart_model import filter_data
from kaggle_hm.utils import init_nb, plot_item, visualize_items
from kaggle_hm.config import data_root, test_dates

from kaggle_hm.chart_model import compute_chart
from kaggle_hm.evaluation import compute_precision

init_nb()
sns.set_style('white')

In [3]:
df = pd.read_parquet(data_root / 'clean' / 'transactions.parquet')
customers = pd.read_parquet(data_root / 'clean' / 'customers.parquet')
items = pd.read_parquet(data_root / 'clean' / 'articles.parquet')

In [17]:
train = filter_data(df, to_date='2020-09-08')
test = filter_data(df, test_dates['start'], test_dates['end'])

In [18]:
test['buy'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['buy'] = 1


In [19]:
neg_samples = pd.DataFrame()
neg_samples['article_id'] = np.random.choice(items['article_id'], 1_000_000)
neg_samples['customer_id'] = np.random.choice(test['customer_id'].unique(), 1_000_000)
neg_samples['buy'] = 0

In [24]:
X = pd.concat([test[['article_id', 'customer_id', 'buy']], neg_samples[['article_id', 'customer_id', 'buy']]]).drop_duplicates(subset=['article_id', 'customer_id']).sample(frac=1.0)

In [35]:
from catboost import CatBoostClassifier, Pool

model = CatBoostClassifier(learning_rate=0.01, metric_period=100, task_type="GPU", devices='0:1')

In [36]:
features = X[['article_id']]
y = X['buy']

In [37]:
p = Pool(features, y, cat_features=['article_id'])

In [38]:
model.fit(p)

0:	learn: 0.6779531	total: 20.4ms	remaining: 20.4s
100:	learn: 0.2174210	total: 1.84s	remaining: 16.3s
200:	learn: 0.1885548	total: 3.62s	remaining: 14.4s
300:	learn: 0.1850515	total: 5.41s	remaining: 12.6s
400:	learn: 0.1844184	total: 7.17s	remaining: 10.7s
500:	learn: 0.1842528	total: 8.94s	remaining: 8.9s
600:	learn: 0.1841859	total: 10.7s	remaining: 7.12s
700:	learn: 0.1841436	total: 12.5s	remaining: 5.33s
800:	learn: 0.1841181	total: 14.3s	remaining: 3.55s
900:	learn: 0.1840983	total: 16.1s	remaining: 1.77s
999:	learn: 0.1840828	total: 17.9s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f2db2d00e50>

In [43]:
from sklearn.metrics import roc_auc_score, confusion_matrix

In [51]:
predictions = model.predict_proba(features)[:, 1]

In [52]:
roc_auc_score(y, predictions)

0.9765634870581436

In [53]:
confusion_matrix(y,  predictions >= 0.5)

array([[957990,  41922],
       [ 41639, 186271]])

In [54]:
confusion_matrix(y, np.ones(X.shape[0]), normalize='all')

array([[0.    , 0.8144],
       [0.    , 0.1856]])

In [58]:
confusion_matrix(y, predictions >= 0.5, normalize='pred')

array([[0.9583, 0.1837],
       [0.0417, 0.8163]])