In [1]:
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

In [111]:
import os
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, IterableDataset
from tqdm import tqdm

In [3]:
os.chdir('..')
# os.chdir('drive/My Drive/Colab Notebooks/Github/fashion-recommendations') 

In [4]:
from fashion_recommendations.metrics.average_precision import mapk

In [5]:
pd.options.display.max_columns = None

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


### Load transactions

In [126]:
# ' '.join(transactions_train['article_id'].value_counts().head(12).index.tolist())

'0751471001 0706016001 0372860002 0610776002 0448509014 0730683050 0918292001 0760084003 0866731001 0372860001 0827968001 0706016003'

In [7]:
transactions_train = pd.read_csv('data/transactions_train.csv', dtype={'article_id': str})
print(transactions_train.shape)
transactions_train.head()

(31788324, 5)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [8]:
transactions_train['t_dat'] = pd.to_datetime(transactions_train['t_dat'])

In [9]:
transactions_train.groupby('customer_id')['article_id'].count().describe()

count    1.362281e+06
mean     2.333463e+01
std      3.924225e+01
min      1.000000e+00
25%      3.000000e+00
50%      9.000000e+00
75%      2.700000e+01
max      1.895000e+03
Name: article_id, dtype: float64

### Load articles

In [10]:
articles_df = pd.read_csv(
    'data/articles.csv', 
    dtype={'article_id': str},
    usecols=['article_id', 'garment_group_name', 'detail_desc']
)
print(articles_df.shape)
articles_df.head()

(105542, 3)


Unnamed: 0,article_id,garment_group_name,detail_desc
0,108775015,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [11]:
articles_df.isnull().sum()

article_id              0
garment_group_name      0
detail_desc           416
dtype: int64

In [12]:
articles_df['detail_desc'] = np.where(
    articles_df['detail_desc'].isnull(), 
    articles_df['garment_group_name'], 
    articles_df['detail_desc']
)

### Find cut-offs

In [13]:
end = transactions_train['t_dat'].max()
endm1 = end - datetime.timedelta(days=7)
endm2 = endm1 - datetime.timedelta(days=7)
endm3 = endm2 - datetime.timedelta(days=7)

In [14]:
end, endm1, endm2, endm3

(Timestamp('2020-09-22 00:00:00'),
 Timestamp('2020-09-15 00:00:00'),
 Timestamp('2020-09-08 00:00:00'),
 Timestamp('2020-09-01 00:00:00'))

train
- inputs: start - endm3
- labels: emdm3 - endm2

dev
- inputs: start - endm2
- labels: emdm2 - endm1

test 
- inputs: start - endm1
- labels: endm1 - end

In [15]:
transactions_train = transactions_train.copy()[transactions_train['t_dat'] > '2020-07-01']

In [16]:
transactions_train['customer_id'].nunique()

484944

In [17]:
# % of items in last week which didn't appear in prior weeks' transactions
len(set(transactions_train[transactions_train['t_dat'] > endm1]['article_id']) - set(transactions_train[transactions_train['t_dat'] <= endm1]['article_id'])) / transactions_train[transactions_train['t_dat'] > endm1]['article_id'].nunique()

0.052763260313577226

In [18]:
transactions_train.shape

(3327520, 5)

In [19]:
transactions_train['article_id'].nunique() / articles_df.shape[0]

0.3840935362225465

### Filter out unnecessary articles

In [20]:
articles_df = articles_df.copy()[
    articles_df['article_id'].isin(transactions_train['article_id'].unique())
]

articles_df.reset_index(drop=True, inplace=True)
articles_df.drop(columns=['garment_group_name'], inplace=True)
print(articles_df.shape)
articles_df.head()

(40538, 2)


Unnamed: 0,article_id,detail_desc
0,108775015,Jersey top with narrow shoulder straps.
1,108775044,Jersey top with narrow shoulder straps.
2,110065001,"Microfibre T-shirt bra with underwired, moulde..."
3,110065002,"Microfibre T-shirt bra with underwired, moulde..."
4,110065011,"Microfibre T-shirt bra with underwired, moulde..."


In [21]:
articles_df['article_id_idx'] = articles_df.index

In [22]:
article_id_to_idx = dict(zip(articles_df['article_id'], articles_df['article_id_idx']))

In [23]:
transactions_train['article_id_idx'] = transactions_train['article_id'].map(article_id_to_idx)
transactions_train['article_id_idx'] = transactions_train['article_id_idx'].astype(str)

### Create splits

#### Illustrative steps:

In [24]:
dev_labels = transactions_train.copy()[transactions_train['t_dat'] > endm1]

In [25]:
dev_inputs = transactions_train.copy()[transactions_train['t_dat'] <= endm1]

In [26]:
print(dev_inputs.shape)
dev_inputs.head()

(3087209, 6)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,article_id_idx
28460804,2020-07-02,0007ee0394b764776edf7f49926235094702acddedc24b...,801068005,0.022017,1,20144
28460805,2020-07-02,0007ee0394b764776edf7f49926235094702acddedc24b...,888728007,0.018627,1,35804
28460806,2020-07-02,0007ee0394b764776edf7f49926235094702acddedc24b...,837981001,0.018627,1,26485
28460807,2020-07-02,0008a2dd68b9a347b6f6b6d567b48684d4a11e05a8b7cc...,730683001,0.042356,1,11603
28460808,2020-07-02,0008d30a148478dc88c69af6c51230ad5802590afc8488...,888843001,0.05422,2,35817


In [27]:
dev_inputs.sort_values(['customer_id', 't_dat'], inplace=True)
dev_inputs.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,article_id_idx
31140481,2020-09-05,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601043,0.050831,1,2964
28738780,2020-07-08,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,826211002,0.027102,1,24822
31521960,2020-09-15,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,794321007,0.061,2,19269
30223079,2020-08-12,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,896152002,0.027102,2,36886
30223080,2020-08-12,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,730683050,0.033881,2,11614


In [28]:
# dev_inputs['total_purchases'] = dev_inputs.groupby('customer_id')['article_id'].transform('count')

In [29]:
# print(dev_inputs.shape)
# dev_inputs = dev_inputs.copy()[dev_inputs['total_purchases'] >= 10]
# print(dev_inputs.shape)

In [30]:
dev_inputs['article_id_idx'] = dev_inputs['article_id_idx'].astype(str)

In [31]:
dev_inputs = (
    dev_inputs
        .groupby('customer_id')
        .tail(10)  # Most recent articles only
        .groupby('customer_id')['article_id_idx']
        .apply(lambda x: ','.join(x))
        .reset_index()
)
print(dev_inputs.shape)
dev_inputs.head()

(463088, 2)


Unnamed: 0,customer_id,article_id_idx
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2964
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,24822
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,19269
3,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,36886116143994918973
4,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,10788915


In [32]:
dev_labels.shape

(240311, 6)

In [33]:
dev_labels = dev_labels[['customer_id', 'article_id_idx']].drop_duplicates()  # unique purchases
dev_labels.shape

(213728, 2)

In [34]:
dev_labels = (
    dev_labels
        .groupby('customer_id')['article_id_idx']
        .apply(lambda x: ','.join(x))
        .reset_index()
)
print(dev_labels.shape)
dev_labels.head()

(68984, 2)


Unnamed: 0,customer_id,article_id_idx
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,4930
1,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,24971
2,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,15029186095646
3,000525e3fe01600d717da8423643a8303390a055c578ed...,33241
4,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"37826,34169,41,31785,39073,40201,39074,1243,39..."


In [35]:
dev_set = dev_inputs.merge(dev_labels, on='customer_id', how='outer', indicator=True, suffixes=('_last10', '_label'))
dev_set.shape

(484944, 4)

In [36]:
dev_set['_merge'].value_counts() / dev_set.shape[0]

left_only     0.857749
both          0.097182
right_only    0.045069
Name: _merge, dtype: float64

`left_only` = customers who did not make purchases in last week

`both` = customers who did make purchases in last week and made purchases in the past

`right_only` = customers who did make purchases in last week but made no purchases in the past (cold start)

Keep `both` only. For cold start we can just predict top-12 and `left_only` won't contribute to MAP

In [37]:
dev_set = dev_set.copy()[dev_set['_merge'] == 'both']
dev_set.drop('_merge', inplace=True, axis=1)
print(dev_set.shape)
dev_set.head()

(47128, 3)


Unnamed: 0,customer_id,article_id_idx_last10,article_id_idx_label
26,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,"39441,36213,38872,37897,38110,5642,4783,34346,...",24971
33,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,"11936,16155,685,11824,2456,28519,11826,3269,23...",15029186095646
54,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"35875,24399,21130,21130,21130,21130,40107,3128...","37826,34169,41,31785,39073,40201,39074,1243,39..."
107,000fb6e772c5d0023892065e659963da90b1866035558e...,"24455,19793,37728,25547,35072,15616,39388,3938...",1834538864359413266339388
113,0010e8eb18f131e724d6997909af0808adbba057529edb...,26967156112506934918703,1832689435210113895137019375114332546


#### Function

train
- inputs: start - endm3
- labels: emdm3 - endm2

dev
- inputs: start - endm2
- labels: emdm2 - endm1

test 
- inputs: start - endm1
- labels: endm1 - end

In [38]:
def create_article_input_output_df(inputs, labels, max_purchase_history=10):
    
    inputs = inputs.copy()
    labels = labels.copy()
    
    inputs.sort_values(['customer_id', 't_dat'], inplace=True)

    inputs['article_id_idx'] = inputs['article_id_idx'].astype(str)

    inputs = (
        inputs
            .groupby('customer_id')
            .tail(max_purchase_history)  # Most recent articles only
            .groupby('customer_id')['article_id_idx']
            .apply(lambda x: ','.join(x))
            .reset_index()
    )

    labels = labels[['customer_id', 'article_id_idx']].drop_duplicates()  # unique purchases

    labels = (
        labels
            .groupby('customer_id')['article_id_idx']
            .apply(lambda x: ','.join(x))
            .reset_index()
    )

    data_set = inputs.merge(labels, on='customer_id', how='outer', indicator=True, suffixes=('_last10', '_label'))

    data_set = data_set.copy()[data_set['_merge'] == 'both']
    data_set.drop('_merge', inplace=True, axis=1)

    return data_set

In [39]:
train_set = create_article_input_output_df(
    inputs=transactions_train.copy()[transactions_train['t_dat'] <= endm3], 
    labels=transactions_train.copy()[
        (transactions_train['t_dat'] > endm3) &
        (transactions_train['t_dat'] < endm2)
    ]
)

print(train_set.shape)
train_set.head()

(43108, 3)


Unnamed: 0,customer_id,article_id_idx_last10,article_id_idx_label
10,000172a9c322560c849754ffbdfdb2180d408aa7176b94...,15861,7705
13,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,"38882,3152,31459,3157,39673,39510,17255,26222,...",316038515396424034035654
14,0001f8cef6b9702d54abf66fd89eb21014bf98567065a9...,216272117523351,23816
22,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,180993309733097399423944136213,388723789738110
33,0005340aa69bb5a28d98712a36d8f669024bce137e3c82...,"23950,25602,10293,29865,19920,34107,25721,2152...",2393015224211413773310396378271522037735


In [40]:
dev_set = create_article_input_output_df(
    inputs=transactions_train.copy()[transactions_train['t_dat'] <= endm2], 
    labels=transactions_train.copy()[
        (transactions_train['t_dat'] > endm2) &
        (transactions_train['t_dat'] < endm1)
    ]
)

print(dev_set.shape)
dev_set.head()

(44153, 3)


Unnamed: 0,customer_id,article_id_idx_last10,article_id_idx_label
14,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,"39510,17255,26222,39644,31316,3160,38515,39642...",12038
24,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,"18099,33097,33097,39942,39441,36213,38872,3789...",564247833434635841
30,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,32860328601193616155,685118242456285191182632692308811825
50,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"38758,37070,21131,23337,35875,24399,21130,2113...",4010731282101140263
58,0008968c0d451dbc5a9968da03196fe20051965edde741...,332863359296443289930360173302151531861,36503


In [41]:
test_set = create_article_input_output_df(
    inputs=transactions_train.copy()[transactions_train['t_dat'] <= endm1], 
    labels=transactions_train.copy()[
        (transactions_train['t_dat'] > endm1)
    ]
)

print(test_set.shape)
test_set.head()

(47128, 3)


Unnamed: 0,customer_id,article_id_idx_last10,article_id_idx_label
26,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,"39441,36213,38872,37897,38110,5642,4783,34346,...",24971
33,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,"11936,16155,685,11824,2456,28519,11826,3269,23...",15029186095646
54,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"35875,24399,21130,21130,21130,21130,40107,3128...","37826,34169,41,31785,39073,40201,39074,1243,39..."
107,000fb6e772c5d0023892065e659963da90b1866035558e...,"24455,19793,37728,25547,35072,15616,39388,3938...",1834538864359413266339388
113,0010e8eb18f131e724d6997909af0808adbba057529edb...,26967156112506934918703,1832689435210113895137019375114332546


Inputs for submission:

In [42]:
submission_inputs = transactions_train.copy()

submission_inputs.sort_values(['customer_id', 't_dat'], inplace=True)

submission_inputs['article_id_idx'] = submission_inputs['article_id_idx'].astype(str)

submission_inputs = (
    submission_inputs
        .groupby('customer_id')
        .tail(10)  # Most recent articles only
        .groupby('customer_id')['article_id_idx']
        .apply(lambda x: ','.join(x))
        .reset_index()
)

In [43]:
submission_inputs.head()

Unnamed: 0,customer_id,article_id_idx
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2964
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,24822
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,19269
3,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,36886116143994918973
4,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,10788915


In [44]:
submission_inputs.rename(columns={'article_id_idx': 'article_id_idx_last10'}, inplace=True)
submission_inputs['article_id_idx_label'] = '0'
print(submission_inputs.shape)
submission_inputs.head()

(484944, 3)


Unnamed: 0,customer_id,article_id_idx_last10,article_id_idx_label
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2964,0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,24822,0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,19269,0
3,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,36886116143994918973,0
4,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,10788915,0


In a given week what proportion of total customers make a purchase?

In [45]:
all_customers = pd.read_csv('data/sample_submission.csv').shape[0]

In [46]:
submission_inputs.shape[0]

484944

In [47]:
print(dev_set['customer_id'].shape[0] / all_customers)
print(test_set['customer_id'].shape[0] / all_customers)

0.03218195600518958
0.034350354961442585


In [48]:
print(dev_set['customer_id'].shape[0] / submission_inputs.shape[0])
print(test_set['customer_id'].shape[0] / submission_inputs.shape[0])

0.09104762611765482
0.09718235507605001


Proportion of customers in dev/test sets who appear in training

In [49]:
dev_set[dev_set['customer_id'].isin(train_set['customer_id'])].shape[0] / dev_set.shape[0]

0.20655448100921794

In [50]:
test_set[test_set['customer_id'].isin(train_set['customer_id'])].shape[0] / dev_set.shape[0]

0.1977895046769189

Filter out customers who don't appear in training

In [51]:
dev_set = dev_set.copy()[dev_set['customer_id'].isin(train_set['customer_id'])]
test_set = test_set.copy()[test_set['customer_id'].isin(train_set['customer_id'])]

In [52]:
train_set.shape

(43108, 3)

In [53]:
dev_set.shape

(9120, 3)

In [54]:
test_set.shape

(8733, 3)

### Filter customers and get customer features

In [55]:
customers = pd.read_csv('data/customers.csv', usecols=['customer_id', 'FN', 'Active', 'age'])
print(customers.shape)
customers.head()

(1371980, 4)


Unnamed: 0,customer_id,FN,Active,age
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,49.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,25.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,24.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,54.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,52.0


In [56]:
customers_to_keep = set(train_set['customer_id'])
len(customers_to_keep)

43108

Filter out other customers from submission inputs (for other customers we'll just predict top-12)

In [57]:
len(customers_to_keep) / len(submission_inputs)

0.08889273813058828

In [58]:
submission_inputs = submission_inputs.copy()[submission_inputs['customer_id'].isin(customers_to_keep)]
submission_inputs.shape

(43108, 3)

In [59]:
customers = customers.copy()[customers['customer_id'].isin(customers_to_keep)]
customers.reset_index(drop=True, inplace=True)
customers.head()

Unnamed: 0,customer_id,FN,Active,age
0,000172a9c322560c849754ffbdfdb2180d408aa7176b94...,,,45.0
1,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,1.0,1.0,44.0
2,0001f8cef6b9702d54abf66fd89eb21014bf98567065a9...,,,21.0
3,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,1.0,1.0,33.0
4,0005340aa69bb5a28d98712a36d8f669024bce137e3c82...,,,21.0


In [60]:
customers.isnull().sum() / customers.shape[0]

customer_id    0.000000
FN             0.512689
Active         0.519277
age            0.003340
dtype: float64

In [61]:
customers[['FN', 'Active']] = customers[['FN', 'Active']].fillna(0)

Despite potential data leakage we impute the nulls for age here:

In [62]:
customers.shape

(43108, 4)

In [63]:
from sklearn.impute import SimpleImputer

In [64]:
simple_imputer = SimpleImputer()

In [65]:
customers[['age']] = simple_imputer.fit_transform(customers[['age']])

In [66]:
customers['age'] /= 100

In [67]:
customers.isnull().sum() / customers.shape[0]

customer_id    0.0
FN             0.0
Active         0.0
age            0.0
dtype: float64

In [68]:
customers['customer_id_idx'] = customers.index

In [69]:
customers.head()

Unnamed: 0,customer_id,FN,Active,age,customer_id_idx
0,000172a9c322560c849754ffbdfdb2180d408aa7176b94...,0.0,0.0,0.45,0
1,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,1.0,1.0,0.44,1
2,0001f8cef6b9702d54abf66fd89eb21014bf98567065a9...,0.0,0.0,0.21,2
3,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,1.0,1.0,0.33,3
4,0005340aa69bb5a28d98712a36d8f669024bce137e3c82...,0.0,0.0,0.21,4


In [70]:
train_set = train_set.merge(customers, on=['customer_id'])
dev_set = dev_set.merge(customers, on=['customer_id'])
test_set = test_set.merge(customers, on=['customer_id'])

In [118]:
submission_inputs = submission_inputs.merge(customers, on=['customer_id'])

In [71]:
train_set.head()

Unnamed: 0,customer_id,article_id_idx_last10,article_id_idx_label,FN,Active,age,customer_id_idx
0,000172a9c322560c849754ffbdfdb2180d408aa7176b94...,15861,7705,0.0,0.0,0.45,0
1,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,"38882,3152,31459,3157,39673,39510,17255,26222,...",316038515396424034035654,1.0,1.0,0.44,1
2,0001f8cef6b9702d54abf66fd89eb21014bf98567065a9...,216272117523351,23816,0.0,0.0,0.21,2
3,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,180993309733097399423944136213,388723789738110,1.0,1.0,0.33,3
4,0005340aa69bb5a28d98712a36d8f669024bce137e3c82...,"23950,25602,10293,29865,19920,34107,25721,2152...",2393015224211413773310396378271522037735,0.0,0.0,0.21,4


In [72]:
train_set = train_set[['customer_id_idx', 'article_id_idx_last10', 'article_id_idx_label', 'FN', 'Active', 'age']]
dev_set = dev_set[['customer_id_idx', 'article_id_idx_last10', 'article_id_idx_label', 'FN', 'Active', 'age']]
test_set = test_set[['customer_id_idx', 'article_id_idx_last10', 'article_id_idx_label', 'FN', 'Active', 'age']]

In [119]:
submission_inputs = submission_inputs[['customer_id_idx', 'article_id_idx_last10', 'article_id_idx_label', 'FN', 'Active', 'age']]

In [73]:
train_set.head()

Unnamed: 0,customer_id_idx,article_id_idx_last10,article_id_idx_label,FN,Active,age
0,0,15861,7705,0.0,0.0,0.45
1,1,"38882,3152,31459,3157,39673,39510,17255,26222,...",316038515396424034035654,1.0,1.0,0.44
2,2,216272117523351,23816,0.0,0.0,0.21
3,3,180993309733097399423944136213,388723789738110,1.0,1.0,0.33
4,4,"23950,25602,10293,29865,19920,34107,25721,2152...",2393015224211413773310396378271522037735,0.0,0.0,0.21


In [74]:
dev_set.head()

Unnamed: 0,customer_id_idx,article_id_idx_last10,article_id_idx_label,FN,Active,age
0,1,"39510,17255,26222,39644,31316,3160,38515,39642...",12038,1.0,1.0,0.44
1,3,"18099,33097,33097,39942,39441,36213,38872,3789...",564247833434635841,1.0,1.0,0.33
2,13,"26177,31831,31831,31831,32833,34362,20231,7682...",150572023215514136682023834203,0.0,0.0,0.41
3,15,"39072,36872,38476,37455,27914,4387,24455,19793...",35072156163938839975,1.0,1.0,0.42
4,17,152702258226023407402644026215389,3651238312,1.0,1.0,0.23


In [120]:
submission_inputs.head()

Unnamed: 0,customer_id_idx,article_id_idx_last10,article_id_idx_label,FN,Active,age
0,0,15861770577057705,0,0.0,0.0,0.45
1,1,"17255,26222,39644,31316,3160,38515,39642,40340...",0,1.0,1.0,0.44
2,2,21627211752335123816,0,0.0,0.0,0.21
3,3,"36213,38872,37897,38110,5642,4783,34346,34346,...",0,1.0,1.0,0.33
4,4,"21522,31083,23930,15224,21141,37733,10396,3782...",0,0.0,0.0,0.21


### Save files

In [75]:
train_set.to_csv('data/final_train_set.tsv', sep='\t', index=False)
dev_set.to_csv('data/final_dev_set.tsv', sep='\t', index=False)
test_set.to_csv('data/final_test_set.tsv', sep='\t', index=False)
submission_inputs.to_csv('data/final_submission_inputs.tsv', sep='\t', index=False)

articles_df.to_csv('data/articles_df_filt.csv', index=False)
customers.to_csv('data/customers_filt.csv', index=False)

### IterableDataset

In [112]:
class RecommendationDatasetMultiLabel(IterableDataset):

    def __init__(self, dataset_filepath, article_emb_bag, total_articles):
        
        self.dataset_itr = open(dataset_filepath, 'r')
        next(self.dataset_itr)  # skip header
        
        self.article_emb_bag = article_emb_bag
        
        self.total_articles = total_articles
    
    def process_label(self, label_str: str):
        
        labels = torch.tensor([int(v) for v in label_str.split(',')])
        
        target = torch.zeros(self.total_articles).scatter_(0, labels, 1.)

        return target
    
    def mean_historical_purchases_embedding(self, input_str: str):
        
        indices = torch.tensor([int(v) for v in input_str.split(',')])

        mean_emb = self.article_emb_bag(indices.unsqueeze(0)).flatten()
            
        return mean_emb
    
    def process_numeric_features(self, fn: str, active: str, age: str):
        numeric_features_tensor = torch.tensor([
            float(fn),
            float(active),
            float(age)
        ])

        return numeric_features_tensor
    
    def parse_itr(self, dataset_itr):
        
        for line in dataset_itr:
        
            line_items = line.rstrip('\n').split('\t')
            
            customer_id_idx, article_id_idx_last10, article_id_idx_label, fn, active, age = line_items
            
            customer_id_idx = int(customer_id_idx)
            
            numeric_features_tensor = self.process_numeric_features(fn, active, age)
            
            mean_emb = self.mean_historical_purchases_embedding(article_id_idx_last10)

            label = self.process_label(article_id_idx_label)    
            
            inputs = torch.concat((mean_emb, numeric_features_tensor))

            yield customer_id_idx, inputs, label
        
    def get_stream(self, dataset_itr):
        
        return self.parse_itr(dataset_itr)

    def __iter__(self):
        
        return self.get_stream(self.dataset_itr)

In [113]:
dataset = RecommendationDatasetMultiLabel(dataset_filepath='data/final_train_set.tsv', article_emb_bag=article_emb_bag, total_articles=total_articles)

In [115]:
train_loader = DataLoader(dataset, batch_size=4)  
    
for idx, data in enumerate(train_loader):
    
    if idx == 5:
        break
    else:
        customer_id_idx, inputs, label = data
        print(customer_id_idx)
        print(inputs)
        print(label)
        print('\n')

tensor([0, 1, 2, 3])
tensor([[ 0.6880,  0.7994,  0.1675,  ...,  0.0000,  0.0000,  0.4500],
        [-0.5039,  0.2605, -0.6052,  ...,  1.0000,  1.0000,  0.4400],
        [ 0.0408, -0.3197, -1.0627,  ...,  0.0000,  0.0000,  0.2100],
        [ 0.2613, -0.0234,  0.5033,  ...,  1.0000,  1.0000,  0.3300]],
       grad_fn=<StackBackward0>)
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


tensor([4, 5, 6, 7])
tensor([[ 0.0690,  0.0961, -0.5127,  ...,  0.0000,  0.0000,  0.2100],
        [ 0.6053, -0.6315, -0.3857,  ...,  0.0000,  0.0000,  0.1900],
        [-0.0239,  0.2512, -0.0091,  ...,  0.0000,  0.0000,  0.3600],
        [ 0.1321, -0.2569,  0.0335,  ...,  0.0000,  0.0000,  0.4000]],
       grad_fn=<StackBackward0>)
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


tens

### TODO

- Wrap processing steps in a function and apply to train, test sets  [DONE]
- Add customer features here (FN, Active, Age)  [DONE]
- Save sets as CSVs  [DONE]
- Save reduced articles df - use to generate correct embeddings by index  [DONE]
- Save customers df - use to generate correct embeddings by index  [DONE]
- Write iterable dataset which processes dataset  [DONE]
    - Get mean for each of inputs and take average to use as input
    - Multi-hot encode label

Checks of proecessing steps:

Multi-hot encoding:

https://discuss.pytorch.org/t/what-kind-of-loss-is-better-to-use-in-multilabel-classification/32203/3

In [101]:
article_id_idx_label = dev_set.iloc[3]['article_id_idx_label']
article_id_idx_label

'18345,38864,35941,32663,39388'

In [104]:
total_articles = articles_df.shape[0]

In [112]:
labels = torch.tensor([int(v) for v in article_id_idx_label.split(',')])
labels

tensor([18345, 38864, 35941, 32663, 39388])

In [111]:
target = torch.zeros(total_articles).scatter_(0, labels, 1.)
target

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [114]:
target[labels]

tensor([1., 1., 1., 1., 1.])

In [115]:
target.sum()

tensor(5.)

Getting mean encoding of historical basket

In [117]:
article_emb_bag = nn.EmbeddingBag(num_embeddings=total_articles, embedding_dim=384)

In [135]:
article_id_idx_last10 = dev_set.iloc[5]['article_id_idx_last10']
article_id_idx_last10

'37757,34393,5646,38997'

In [136]:
indices = torch.tensor([int(v) for v in article_id_idx_last10.split(',')])
indices

tensor([37757, 34393,  5646, 38997])

In [137]:
indices.unsqueeze(0)

tensor([[37757, 34393,  5646, 38997]])

In [138]:
mean_emb = article_emb_bag(indices.unsqueeze(0))
mean_emb.shape

torch.Size([1, 384])

In [139]:
torch.testing.assert_allclose(
    mean_emb.flatten(),
    article_emb_bag.weight[indices].mean(dim=0)
)

In [141]:
mean_emb = article_emb_bag(indices, offsets=torch.tensor([0]))
mean_emb.shape

torch.Size([1, 384])

In [142]:
torch.testing.assert_allclose(
    mean_emb.flatten(),
    article_emb_bag.weight[indices].mean(dim=0)
)