In [1]:
import os 

os.environ['KAGGLE_HM_DATA'] = '/data/sstamenov/kaggle-data/'

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from kaggle_hm.config import train_dates, test_dates, data_root
from kaggle_hm.utils import init_nb, plot_item, visualize_items
import numpy as np
import json
from kaggle_hm.evaluation import compute_precision, enrich_data, plot_precision_at_k, plot_precision_at_k_segments, precision_by_age, precision_by_usage

init_nb()

In [3]:
print(data_root)

/data/sstamenov/kaggle-data


Cosine similarity on items works nicely.
There are very few items with very high similarity >= .4. which are usually bought together.

Items with similarity >.1 provide reasonable similarity to customer's items and are OK to be recommended.
Items had to have at least 10 customers that bought both items to be considered a candidate.

Similarity rec. system looks at items that user bought in the past, finds all items that are similar to it and appends to the candidate list.
Then the list is sorted by similarity and first 12 items are returned. If there are not enough items, the rest is filled with top items.

KNN recs provide marginal improvement over top-12 or top-12 by age group category and reach 0.0041 on public leaderboard.
They provide more consistent precision over age groups, but provide worse precision for customers with high number of purchased items.

In [4]:
customers = pd.read_parquet(data_root / 'clean/customers.parquet').set_index('customer_id')
customers['age_group'] = pd.cut(customers['age'], bins=[16, 21, 26, 30, 40, 50, 60, 100])
items = pd.read_parquet(data_root / 'clean/articles.parquet')
t = pd.read_parquet(data_root / 'clean/transactions.parquet')

In [5]:
# _ = (
#     (t['t_dat'] >= train_dates['start']) &
#     (t['t_dat'] <= train_dates['end'])
# )
# train = t[_]
_ = (
    (t['t_dat'] >= test_dates['start']) &
    (t['t_dat'] <= test_dates['end'])
)
test = t[_]

In [6]:
customers.shape

(1371980, 7)

In [7]:
items.shape

(105542, 25)

In [8]:
t['customer_id'].nunique()

1362281

In [9]:
t['article_id'].nunique()

104547

In [10]:
def pad_rec(recommended, top_items):
    try:
        if np.isnan(recommended):
            recommended = []
    except:
        pass
    n = len(recommended)
    if n == 12:
        return recommended
    m = 12 - n
    return recommended + top_items[:m]
# results['prediction']

In [83]:
similarity_df = pd.read_parquet(data_root / 'clean' / 'similarity_table.parquet')

In [85]:
similarity_df.head()

Unnamed: 0,simil,len_b,len_common,len_a,b,a
22,0.1682,72,13,83,912574002,912574001
26,0.1324,44,8,83,911056002,912574001
36,0.1332,133,14,83,917056001,912574001
88,0.206,92,18,83,894210001,912574001
123,0.1399,104,13,83,911034002,912574001


In [86]:
similarity_df.shape

(184304, 6)

# Knn predictions

In [68]:
_ = (
    (t['t_dat'] >= '2020-08-15') &
    (t['t_dat'] <= train_dates['end'])
)
train = t[_]

In [69]:
train.shape

(923967, 5)

In [70]:
top12 = (
    train
    .groupby('article_id')
    .agg(total_count=('customer_id', 'count'))
    .sort_values('total_count', ascending=False)[:12].reset_index()['article_id'].tolist()
)

In [38]:
# ground truth
test_items = test.groupby('customer_id', observed=True).agg(bought=('article_id', set)).reset_index()

In [39]:
def get_knn_recs(data, similarity_df, top12, test_items):
    nodup = data[['customer_id', 'article_id']].drop_duplicates()
    print(nodup.shape)
    
    m = nodup.merge(similarity_df, left_on='article_id', right_on='a')
    print(m.shape)

    # exclude rec items that customer already has
    _ = m.merge(nodup, left_on=['customer_id', 'b'], right_on=['customer_id', 'article_id'], how='outer')
    _ = _[_['article_id_y'].isna()]
    print(_.shape)
    _['r'] = _.groupby('customer_id')['simil'].rank(method='first', ascending=False)

    # leave up to 12 most similar items
    candidates = _[_['r'] <= 12]

    preds = candidates.sort_values(['customer_id', 'r']).groupby('customer_id',  observed=True).agg(recs=('b', list)).reset_index()
    preds['prediction'] = preds['recs'].apply(lambda recs: pad_rec(recs, top12))

    # test_items -> replace with submission
    results = test_items.merge(preds, on='customer_id', how='left')
    
    # fill cold-start users 
    f = results['prediction'].isna()
    results.loc[f, 'prediction'] = results.loc[f, 'prediction'].apply(lambda _: top12)
    
    return results 

In [79]:
results = get_knn_recs(train, similarity_df.query('len_common >= 5'), top12, test_items)

(821941, 2)
(795804, 8)
(669336, 9)


In [80]:
similarity_df.query('len_common >= 5')['simil'].describe()

count   8378.0000
mean       0.1841
std        0.1160
min        0.1000
25%        0.1174
50%        0.1449
75%        0.1980
max        0.8715
Name: simil, dtype: float64

In [78]:
similarity_df.query('len_common >= 5')['len_common'].describe()

count   8378.0000
mean      29.2667
std       44.9544
min        5.0000
25%        7.0000
50%       14.0000
75%       32.0000
max      609.0000
Name: len_common, dtype: float64

In [81]:
results = compute_precision(results)

In [82]:
results['precision'].mean()

0.009837033519230796

# sanity check

In [None]:
match = test[['customer_id', 'article_id']].drop_duplicates().merge(candidates, left_on=['customer_id', 'article_id'], right_on=['customer_id', 'b'])

In [None]:
match['r'].value_counts().sort_index()

In [None]:
match.shape

# test

In [None]:
item_stats = (
    t[t['t_dat'] <= train_dates['end']]
    .groupby('article_id', observed=True)
    .agg(
        total=('customer_id', 'count'),
        first_date=('t_dat', 'min'),
        last_date=('t_dat', 'max')
    )
)

In [None]:
global_top12 = item_stats.sort_values('total', ascending=False)[:12].index.tolist()

In [None]:
top12

In [None]:
global_top12

In [None]:
len(
    set(top12) & set(global_top12)
)

In [None]:
test['hot'] = test['article_id'].isin(top12)
test['oldy'] = test['article_id'].isin(global_top12)

In [None]:
test['hot'].mean() * 100

In [None]:
test.drop_duplicates(subset=['customer_id', 'article_id'])['hot'].mean()

In [None]:
test[test['hot']]

In [None]:
test['oldy'].mean() * 100

In [None]:
_ = test.groupby('article_id', observed=True).agg(test_count=('customer_id', 'count')).join(item_stats)

In [None]:
_['d'] = pd.to_datetime('2020-09-15') - _['first_date']

In [None]:
# unseen items
_[_['total'].isna()]['test_count'].sum()

In [None]:
plot_item('0805947001')

In [None]:
_.sort_values('test_count', ascending=False)[:20]

In [None]:
_.sort_values('test_count', ascending=True)[:20]

In [None]:
test.shape

# explore Knns

In [None]:
_ = (
        (t['t_dat'] >= '2020-08-01') &
        (t['t_dat'] <= train_dates['end'])
)
train = t[_]
_ = (
        (t['t_dat'] >= test_dates['start']) &
        (t['t_dat'] <= test_dates['end'])
)
test = t[_]

In [None]:
item_counts = train['article_id'].value_counts().reset_index()

item_counts = item_counts[item_counts['article_id'] > 1]

In [None]:
row = item_counts.sample(n=1)

print(f"{row['index'].iloc[0]} - {row['article_id'].iloc[0]}")

item_id = row['index'].iloc[0]
s = get_similar_items([item_id])

s = s[s['len_common'] > 5]

In [None]:
plot_item(item_id)

In [None]:
s.sort_values('simil', ascending=False).reset_index()[:20]

In [None]:
visualize_items(s.sort_values('simil', ascending=False).reset_index()['b'])

In [None]:
recs = t.merge(similarity_df, left_on='article_id', right_on='a')

In [None]:
cand_recs = recs.sort_values('simil', ascending=False).drop_duplicates(subset=['customer_id', 'b']).groupby('customer_id', observed=True).agg(top_k=('b', list)).reset_index()

# make submission

In [44]:
submission = pd.read_csv(data_root / 'sample_submission.csv')

In [50]:
_ = (
    (t['t_dat'] >= '2020-09-01')
)
train = t[_]
print(train.shape)
top12 = (
    train
    .groupby('article_id')
    .agg(total_count=('customer_id', 'count'))
    .sort_values('total_count', ascending=False)[:12].reset_index()['article_id'].tolist()
)
print(top12)
submission = get_knn_recs(train, similarity_df, top12, submission[['customer_id']])

(798269, 5)
['0751471001', '0909370001', '0918522001', '0924243001', '0918292001', '0915526001', '0915529003', '0448509014', '0751471043', '0706016001', '0865799006', '0863595006']
(705504, 2)
(1445354, 8)
(1318436, 9)


In [54]:
submission['recs'].isna().sum() / submission.shape[0]  # low coverage

0.8882855435210426

In [55]:
submission['prediction'] = submission['prediction'].apply(lambda _: ' '.join(_))

In [56]:
submission.head()

Unnamed: 0,customer_id,recs,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,[0858856005],0858856005 0751471001 0909370001 0918522001 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,0751471001 0909370001 0918522001 0924243001 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,0751471001 0909370001 0918522001 0924243001 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,0751471001 0909370001 0918522001 0924243001 09...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,,0751471001 0909370001 0918522001 0924243001 09...


In [57]:
submission.shape

(1371980, 3)

In [58]:
submission[['customer_id', 'prediction']].to_csv('knn_submission.csv', index=False)