In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from kaggle_hm.chart_model import filter_data
from kaggle_hm.utils import init_nb, plot_item, visualize_items
from kaggle_hm.config import data_root, test_dates

# Project-level notebook setup imported from kaggle_hm.utils; presumably
# configures display/plot defaults — TODO confirm against the helper.
init_nb()

In [2]:
# Load the cleaned transactions table from the project's data directory.
df = pd.read_parquet(data_root.joinpath('clean', 'transactions.parquet'))

In [3]:
# Load the cleaned customers table from the project's data directory.
customers = pd.read_parquet(data_root.joinpath('clean', 'customers.parquet'))

In [4]:
# Whole days between the 2020-09-08 cutoff and each purchase date (t_dat).
# Purchases dated after the cutoff therefore get a negative value.
df['delta_days'] = (pd.Timestamp('2020-09-08') - df['t_dat']).dt.days

In [5]:
# 0/1 flag per look-back window: 1 when the purchase happened strictly between
# 0 and `window` days before the cutoff. Both bounds are strict, so purchases on
# the cutoff day itself (delta_days == 0) and later ones (negative delta_days)
# are flagged 0.
# NOTE(review): confirm that excluding the cutoff day is intended.
for window in (30, 60, 90, 120, 180, 360):
    df[f'is_last_{window}d'] = (
        df['delta_days'].gt(0) & df['delta_days'].lt(window)
    ).astype('int')

In [6]:
# Training window, bounded by 2020-08-01 and the 2020-09-08 cutoff.
# NOTE(review): filter_data presumably keeps rows whose t_dat lies between the
# two dates; whether the bounds are inclusive is not visible here — confirm.
train = filter_data(df, '2020-08-01', '2020-09-08')
# Hold-out window; its bounds come from the project config (test_dates).
test = filter_data(df, test_dates['start'], test_dates['end'])

In [23]:
# History that precedes the training window (bounds 2018-09-01 and 2020-08-01).
# NOTE(review): the name `last_90` suggests a 90-day window, but the dates
# passed here span roughly two years — consider renaming.
last_90 = filter_data(df, '2018-09-01', '2020-08-01')

In [24]:
# Purchase count per article in the pre-training history.
s = last_90['article_id'].value_counts()

# "Old" items: articles bought at least 50 times in that history.
old_items = s.loc[s.ge(50)]

In [25]:
# Number of articles that reached the 50-purchase threshold in the history.
old_items.shape

(53883,)

In [20]:
# Purchase count per article inside the training window.
s = train['article_id'].value_counts()

# "New" items: articles bought at least 50 times in the training window.
new_items = s.loc[s.ge(50)]

In [22]:
# Number of articles that reached the 50-purchase threshold in the training window.
new_items.shape

(6624,)

In [26]:
# How many articles popular in the training window were also popular in the
# earlier history.
len(set(new_items.index).intersection(old_items.index))

4890

In [31]:
# Shape of the hold-out rows whose article is one of the popular
# training-window articles.
test.loc[test['article_id'].isin(set(new_items.index))].shape

(186841, 12)

In [44]:
# Peek at the hold-out rows; delta_days is negative here because these
# purchases are dated after the 2020-09-08 cutoff.
test.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,is_last_30d,is_last_60d,is_last_90d,is_last_120d,is_last_180d,is_last_360d,delta_days
31292772,2020-09-09,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,399136061,0.0834,2,0,0,0,0,0,0,-1
31292773,2020-09-09,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,732842014,0.0667,2,0,0,0,0,0,0,-1
31292774,2020-09-09,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,556255001,0.01,2,0,0,0,0,0,0,-1
31292775,2020-09-09,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,852219003,0.0083,2,0,0,0,0,0,0,-1
31292776,2020-09-09,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,732842021,0.0667,2,0,0,0,0,0,0,-1


In [32]:
# Customer id sets for the training and hold-out windows.
train_customers = set(train['customer_id'])
test_customers = set(test['customer_id'])

# Every customer with at least one row selected by filter_data(to_date='2020-09-08').
full_ds = set(filter_data(df, to_date='2020-09-08')['customer_id'])

# Cold-start customers: seen in the hold-out window but absent from full_ds.
cold_customers = test_customers.difference(full_ds)

In [42]:
# Hold-out rows from customers who have earlier history but no purchase in the
# training window, restricted to the popular training-window articles.
cond = (
    test['customer_id'].isin(test_customers.difference(cold_customers, train_customers))
    & test['article_id'].isin(set(new_items.index))
)

test.loc[cond].shape

(77127, 12)

In [101]:
# One row per customer over the rows selected by filter_data(to_date='2020-09-08'):
# total purchase count, purchases inside each look-back window (the is_last_*
# flags are 0/1, so their sum is a count), and the first/last purchase dates.
customer_stats = (
    filter_data(df, to_date='2020-09-08')
    .groupby('customer_id', observed=True)
    .agg(
        transactions=('article_id', 'count'),
        **{
            f't_{days}': (f'is_last_{days}d', 'sum')
            for days in (30, 60, 90, 120, 180, 360)
        },
        first_t=('t_dat', 'min'),
        last_t=('t_dat', 'max'),
    )
    .reset_index()
)

In [102]:
# Days from each customer's first / most recent purchase to the 2020-09-08 cutoff.
customer_stats['delta_first'] = (
    pd.Timestamp('2020-09-08') - customer_stats['first_t']
).dt.days
customer_stats['delta_last'] = (
    pd.Timestamp('2020-09-08') - customer_stats['last_t']
).dt.days

In [46]:
# Summary statistics of the per-customer aggregates across all customers.
customer_stats.describe()

Unnamed: 0,transactions,t_30,t_60,t_90,t_120,t_180,t_360
count,1362281.0,1362281.0,1362281.0,1362281.0,1362281.0,1362281.0,1362281.0
mean,23.3346,0.8169,1.7477,3.067,4.1257,5.7926,10.7392
std,39.2423,2.7331,4.634,7.0471,8.8087,11.6507,19.5747
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,9.0,0.0,0.0,0.0,0.0,1.0,4.0
75%,27.0,0.0,1.0,3.0,5.0,7.0,13.0
max,1895.0,132.0,294.0,339.0,442.0,548.0,1014.0


In [98]:
# Per-customer aggregates restricted to customers who appear in the hold-out window.
# NOTE(review): the saved output lists FN/Active/age columns and negative
# delta_last values, so it appears to come from a different state of
# customer_stats than the preceding cells build — re-run top to bottom to confirm.
customer_stats.loc[customer_stats['customer_id'].isin(test_customers)].describe()

Unnamed: 0,transactions,t_30,t_60,t_90,t_120,t_180,t_360,delta_first,delta_last,FN,Active,age
count,72019.0,72019.0,72019.0,72019.0,72019.0,72019.0,72019.0,72019.0,72019.0,32949.0,32489.0,71698.0
mean,64.1581,2.7836,5.6727,9.5385,12.5956,17.1513,30.6665,529.7363,-5.0647,1.0,1.0,35.686
std,79.0748,5.5698,9.6896,14.7666,18.3742,24.2426,40.4193,238.6228,3.3453,0.0,0.0,14.0428
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-7.0,-14.0,1.0,1.0,16.0
25%,16.0,0.0,0.0,0.0,1.0,3.0,6.0,432.0,-7.0,1.0,1.0,24.0
50%,40.0,0.0,2.0,5.0,7.0,10.0,18.0,655.0,-4.0,1.0,1.0,30.0
75%,82.0,4.0,7.0,12.0,17.0,22.0,40.0,703.0,-2.0,1.0,1.0,48.0
max,1895.0,132.0,294.0,339.0,442.0,548.0,1014.0,719.0,-1.0,1.0,1.0,96.0


In [99]:
# Per-customer aggregates for customers present in both the hold-out and the
# training window.
# NOTE(review): the saved output includes FN/Active/age columns, which the
# preceding cells do not add to customer_stats — it looks stale; re-run to confirm.
customer_stats.loc[
    customer_stats['customer_id'].isin(test_customers.intersection(train_customers))
].describe()

Unnamed: 0,transactions,t_30,t_60,t_90,t_120,t_180,t_360,delta_first,delta_last,FN,Active,age
count,38168.0,38168.0,38168.0,38168.0,38168.0,38168.0,38168.0,38168.0,38168.0,18927.0,18693.0,38035.0
mean,89.2758,5.2524,9.8147,15.2918,19.5834,25.9584,44.7561,587.4293,-5.498,1.0,1.0,36.0095
std,93.4291,6.7505,11.5671,17.6499,21.8635,28.8399,47.8571,192.1685,3.6369,0.0,0.0,13.9315
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-14.0,1.0,1.0,16.0
25%,31.0,1.0,3.0,5.0,7.0,9.0,16.0,560.0,-7.0,1.0,1.0,24.0
50%,62.0,3.0,6.0,10.0,13.0,18.0,31.0,681.0,-5.0,1.0,1.0,31.0
75%,114.0,7.0,12.0,19.0,25.0,33.0,57.0,709.0,-3.0,1.0,1.0,48.0
max,1895.0,132.0,294.0,339.0,442.0,548.0,1014.0,719.0,-1.0,1.0,1.0,96.0


In [103]:
# Per-customer aggregates for hold-out customers who have earlier history but
# made no purchase in the training window (neither in train nor cold-start).
customer_stats.loc[
    customer_stats['customer_id'].isin(
        test_customers.difference(train_customers, cold_customers)
    )
].describe()

Unnamed: 0,transactions,t_30,t_60,t_90,t_120,t_180,t_360,delta_first,delta_last
count,28456.0,28456.0,28456.0,28456.0,28456.0,28456.0,28456.0,28456.0,28456.0
mean,37.9693,0.0,1.1926,3.6298,5.611,8.5899,17.5822,553.5073,138.2101
std,44.2599,0.0,2.9963,6.1209,8.3035,11.7775,21.1439,188.623,126.9646
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,39.0,39.0
25%,10.0,0.0,0.0,0.0,0.0,1.0,4.0,461.0,57.0
50%,24.0,0.0,0.0,1.0,3.0,5.0,11.0,644.0,83.0
75%,49.0,0.0,1.0,5.0,8.0,12.0,23.0,697.0,170.0
max,1196.0,0.0,55.0,129.0,172.0,297.0,678.0,719.0,719.0


In [91]:
# 20 most purchased articles in the hold-out window.
# value_counts() already returns counts in descending order by default, so the
# explicit sort_values(ascending=False) is redundant (kept as-is to avoid
# changing how ties are ordered).
test['article_id'].value_counts().sort_values(ascending=False).head(20)

0909370001    1283
0865799006     768
0918522001     729
0924243001     704
0448509014     609
0751471001     607
0809238001     563
0918292001     546
0762846027     539
0809238005     503
0673677002     463
0923758001     457
0706016001     453
0915529003     450
0863646001     446
0805947001     445
0915526001     443
0751471043     435
0850917001     434
0929165002     433
Name: article_id, dtype: int64

In [95]:
# 12 most purchased hold-out articles among customers who have earlier history
# but made no purchase in the training window.
test.loc[
    test['customer_id'].isin(test_customers.difference(train_customers, cold_customers)),
    'article_id',
].value_counts().sort_values(ascending=False).head(12)

0909370001    356
0918522001    315
0865799006    300
0751471001    278
0448509014    271
0918292001    254
0924243001    242
0762846027    213
0706016001    205
0809238001    196
0715624001    193
0809238005    192
Name: article_id, dtype: int64

In [86]:
# Attach the customer attributes (every column of `customers` other than the
# join key) to the per-customer aggregates.
#
# A plain `customer_stats = customer_stats.merge(customers, ...)` is not
# idempotent: running the cell a second time merges the attributes again and
# silently yields duplicated `_x` / `_y` columns. Dropping any attribute columns
# left by an earlier run first makes the cell safe to re-execute; on a first
# run the drop is a no-op (errors='ignore'), so the result is unchanged.
customer_stats = (
    customer_stats
    .drop(columns=customers.columns.difference(['customer_id']), errors='ignore')
    .merge(customers, on='customer_id')
)

In [73]:
from kaggle_hm.chart_model import compute_chart
from kaggle_hm.evaluation import compute_precision

In [75]:
# Chart baseline: compute one chart from the training window and use it as the
# prediction for every cold-start customer.
top_12 = compute_chart(train)

# Set of articles each cold-start customer actually bought in the hold-out window.
results = (
    test.loc[test['customer_id'].isin(cold_customers)]
    .groupby('customer_id', observed=True)
    .agg(bought=('article_id', set))
)

# Every row receives the same chart object as its prediction.
results['prediction'] = [top_12] * len(results)

In [76]:
# Project helper from kaggle_hm.evaluation; presumably scores each row's
# `prediction` against `bought` and returns the frame with a `precision`
# column — TODO confirm against the helper.
results = compute_precision(results)

In [77]:
# Average of the per-customer `precision` column for cold-start customers
# under the chart baseline.
results['precision'].mean()

0.006861369315569231