In [1]:
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

In [2]:
import os
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, IterableDataset
from tqdm import tqdm

In [3]:
# Move up to the parent directory so that the `fashion_recommendations` package
# and the relative `data/` paths used below resolve.
# Guarded so that re-running this cell does not climb one directory higher on
# every execution.
# NOTE(review): assumes the `fashion_recommendations` package directory sits in
# the target working directory - confirm against the repo layout.
if not os.path.isdir('fashion_recommendations'):
    os.chdir('..')
# os.chdir('drive/My Drive/Colab Notebooks/Github/fashion-recommendations')

In [4]:
from fashion_recommendations.metrics.average_precision import mapk

In [5]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [6]:
# Use the first CUDA device when torch reports one as available, else the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

cpu


### Load transactions

In [7]:
# ' '.join(transactions_train['article_id'].value_counts().head(12).index.tolist())

In [8]:
# Read the raw transactions; article_id is kept as text rather than parsed as a number.
transactions_train = pd.read_csv(
    'data/transactions_train.csv',
    dtype={'article_id': str},
)
print(transactions_train.shape)
transactions_train.head()

(31788324, 5)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [9]:
# Convert the transaction date column from text to datetimes so it can be
# compared against the cut-off timestamps below.
transactions_train['t_dat'] = transactions_train['t_dat'].pipe(pd.to_datetime)

In [10]:
# Summary statistics of the number of (non-null) article rows per customer.
purchases_per_customer = transactions_train.groupby('customer_id')['article_id'].count()
purchases_per_customer.describe()

count    1.362281e+06
mean     2.333463e+01
std      3.924225e+01
min      1.000000e+00
25%      3.000000e+00
50%      9.000000e+00
75%      2.700000e+01
max      1.895000e+03
Name: article_id, dtype: float64

### Load articles

In [11]:
# Only the id, the garment group and the free-text description are loaded.
article_columns = ['article_id', 'garment_group_name', 'detail_desc']
articles_df = pd.read_csv('data/articles.csv', dtype={'article_id': str}, usecols=article_columns)
print(articles_df.shape)
articles_df.head()

(105542, 3)


Unnamed: 0,article_id,garment_group_name,detail_desc
0,108775015,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [12]:
# Number of missing values in each article column.
articles_df.isna().sum()

article_id              0
garment_group_name      0
detail_desc           416
dtype: int64

In [13]:
# Where an article has no description, fall back to its garment group name.
articles_df['detail_desc'] = articles_df['detail_desc'].fillna(articles_df['garment_group_name'])

### Find cut-offs

In [14]:
# Weekly cut-offs counted backwards from the most recent transaction date.
end = transactions_train['t_dat'].max()
one_week = datetime.timedelta(days=7)
endm1 = end - one_week
endm2 = endm1 - one_week
endm3 = endm2 - one_week

In [15]:
# Display the four cut-off timestamps, most recent first.
end, endm1, endm2, endm3

(Timestamp('2020-09-22 00:00:00'),
 Timestamp('2020-09-15 00:00:00'),
 Timestamp('2020-09-08 00:00:00'),
 Timestamp('2020-09-01 00:00:00'))

train (old)
- inputs: start - endm3
- labels: endm3 - endm2

train (new)
- inputs: start - endm1
- labels: endm1 - end

In [16]:
# Keep only transactions dated strictly after HISTORY_START.
# Filter first and copy the (much smaller) result, instead of deep-copying the
# whole transactions frame before filtering it.
HISTORY_START = '2020-07-01'
transactions_train = transactions_train[transactions_train['t_dat'] > HISTORY_START].copy()

In [17]:
# Number of distinct customers left after the date filter.
transactions_train['customer_id'].nunique()

484944

In [18]:
# Fraction of the last week's distinct articles that never appear in the
# transactions up to and including endm1.
is_last_week = transactions_train['t_dat'] > endm1
last_week_article_ids = transactions_train.loc[is_last_week, 'article_id']
earlier_article_ids = transactions_train.loc[transactions_train['t_dat'] <= endm1, 'article_id']
unseen_articles = set(last_week_article_ids) - set(earlier_article_ids)
len(unseen_articles) / last_week_article_ids.nunique()

0.052763260313577226

In [19]:
# (rows, columns) of the filtered transactions frame.
transactions_train.shape

(3327520, 5)

In [20]:
# Share of catalogue rows whose article appears in the filtered transactions.
n_purchased_articles = transactions_train['article_id'].nunique()
n_purchased_articles / len(articles_df)

0.3840935362225465

### Filter out unnecessary articles

In [21]:
# Keep only articles that occur in the filtered transactions, renumber the rows
# from zero and drop the garment group column.
purchased_article_ids = transactions_train['article_id'].unique()
articles_df = (
    articles_df
        .loc[articles_df['article_id'].isin(purchased_article_ids)]
        .reset_index(drop=True)
        .drop(columns=['garment_group_name'])
)
print(articles_df.shape)
articles_df.head()

(40538, 2)


Unnamed: 0,article_id,detail_desc
0,108775015,Jersey top with narrow shoulder straps.
1,108775044,Jersey top with narrow shoulder straps.
2,110065001,"Microfibre T-shirt bra with underwired, moulde..."
3,110065002,"Microfibre T-shirt bra with underwired, moulde..."
4,110065011,"Microfibre T-shirt bra with underwired, moulde..."


In [22]:
# Expose the frame's current index as an explicit article_id_idx column.
articles_df = articles_df.assign(article_id_idx=articles_df.index)

In [23]:
# Lookup table: raw article_id -> article_id_idx.
article_id_to_idx = {
    article_id: article_idx
    for article_id, article_idx in zip(articles_df['article_id'], articles_df['article_id_idx'])
}

In [24]:
# Translate each transaction's raw article_id into its article_id_idx.
mapped_article_idx = transactions_train['article_id'].map(article_id_to_idx)

# Series.map yields NaN for ids absent from the lookup. Casting such a column to
# str would silently write 'nan' (and turn the other ids into float-looking text
# such as '12.0'), so stop here instead of corrupting the index strings.
n_unmapped = int(mapped_article_idx.isna().sum())
if n_unmapped:
    raise ValueError(f'{n_unmapped} transaction rows have an article_id missing from article_id_to_idx')

# Stored as text because the indices are later joined into comma-separated strings.
transactions_train['article_id_idx'] = mapped_article_idx.astype(str)

### Create splits

#### Function

train (old)
- inputs: start - endm3
- labels: endm3 - endm2

train (new)
- inputs: start - endm1
- labels: endm1 - end

In [25]:
def create_article_input_output_df(inputs, labels, max_purchase_history=10):
    """Pair each customer's recent purchase history with their label purchases.

    Parameters
    ----------
    inputs : pd.DataFrame
        Transactions from the input window. Must contain the columns
        ``customer_id``, ``t_dat`` and ``article_id_idx``.
    labels : pd.DataFrame
        Transactions from the label window. Must contain the columns
        ``customer_id`` and ``article_id_idx``.
    max_purchase_history : int, default 10
        Number of most recent input rows kept per customer.

    Returns
    -------
    pd.DataFrame
        One row per customer present in BOTH ``inputs`` and ``labels`` with the
        columns ``customer_id``, ``article_id_idx_last10`` (comma-separated
        history ordered by ``t_dat``, repeats kept) and ``article_id_idx_label``
        (comma-separated distinct label articles). The column is named
        ``..._last10`` regardless of ``max_purchase_history``.

    Notes
    -----
    Neither argument is modified. ``article_id_idx`` is cast to ``str`` in both
    frames, so integer indices are accepted for the labels as well as the inputs.
    """
    # sort_values returns a new frame, so the caller's data is left untouched.
    history = inputs.sort_values(['customer_id', 't_dat'])
    history['article_id_idx'] = history['article_id_idx'].astype(str)

    history = (
        history
            .groupby('customer_id')
            .tail(max_purchase_history)  # Most recent articles only
            .groupby('customer_id')['article_id_idx']
            .apply(','.join)
            .reset_index()
    )

    # Cast the labels too: ','.join only accepts strings.
    targets = labels[['customer_id', 'article_id_idx']].copy()
    targets['article_id_idx'] = targets['article_id_idx'].astype(str)
    targets = targets.drop_duplicates()  # unique purchases

    targets = (
        targets
            .groupby('customer_id')['article_id_idx']
            .apply(','.join)
            .reset_index()
    )

    # Outer merge + indicator filter is kept (rather than an inner merge) so the
    # returned rows carry the same index labels as before.
    merged = history.merge(
        targets,
        on='customer_id',
        how='outer',
        indicator=True,
        suffixes=('_last10', '_label'),
    )

    data_set = merged[merged['_merge'] == 'both'].drop(columns='_merge')

    return data_set

In [26]:
# Inputs: every transaction up to and including endm1.
# Labels: the transactions after endm1 (the final week).
# The helper does not modify the frames it receives, so the filtered selections
# are passed directly.
in_label_week = transactions_train['t_dat'] > endm1
in_input_window = transactions_train['t_dat'] <= endm1

train_set = create_article_input_output_df(
    inputs=transactions_train[in_input_window],
    labels=transactions_train[in_label_week],
)

print(train_set.shape)
train_set.head()

(47128, 3)


Unnamed: 0,customer_id,article_id_idx_last10,article_id_idx_label
26,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,"39441,36213,38872,37897,38110,5642,4783,34346,...",24971
33,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,"11936,16155,685,11824,2456,28519,11826,3269,23...",15029186095646
54,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"35875,24399,21130,21130,21130,21130,40107,3128...","37826,34169,41,31785,39073,40201,39074,1243,39..."
107,000fb6e772c5d0023892065e659963da90b1866035558e...,"24455,19793,37728,25547,35072,15616,39388,3938...",1834538864359413266339388
113,0010e8eb18f131e724d6997909af0808adbba057529edb...,26967156112506934918703,1832689435210113895137019375114332546


Inputs for submission:

In [27]:
# For the submission, the history is built from ALL filtered transactions:
# each customer's 10 latest rows (ordered by date), joined into one
# comma-separated string of article indices.
submission_inputs = (
    transactions_train
        .sort_values(['customer_id', 't_dat'])
        .assign(article_id_idx=lambda frame: frame['article_id_idx'].astype(str))
        .groupby('customer_id')
        .tail(10)  # Most recent articles only
        .groupby('customer_id')['article_id_idx']
        .apply(','.join)
        .reset_index()
)

In [28]:
# Preview the first rows of the per-customer submission history.
submission_inputs.head()

Unnamed: 0,customer_id,article_id_idx
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2964
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,24822
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,19269
3,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,36886116143994918973
4,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,10788915


In [29]:
# Give the submission frame the same column names as the training set; the
# label column is filled with the constant string '0'.
submission_inputs = (
    submission_inputs
        .rename(columns={'article_id_idx': 'article_id_idx_last10'})
        .assign(article_id_idx_label='0')
)
print(submission_inputs.shape)
submission_inputs.head()

(484944, 3)


Unnamed: 0,customer_id,article_id_idx_last10,article_id_idx_label
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2964,0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,24822,0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,19269,0
3,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,36886116143994918973,0
4,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,10788915,0


### Filter customers and get customer features

In [30]:
# Load only the customer id and the three feature columns used below.
customer_columns = ['customer_id', 'FN', 'Active', 'age']
customers = pd.read_csv('data/customers.csv', usecols=customer_columns)
print(customers.shape)
customers.head()

(1371980, 4)


Unnamed: 0,customer_id,FN,Active,age
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,49.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,25.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,24.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,54.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,52.0


In [31]:
# Distinct customers that appear in the training set.
customers_to_keep = set(train_set['customer_id'].tolist())
len(customers_to_keep)

47128

Filter out other customers from submission inputs (for other customers we'll just predict top-12)

In [32]:
# Fraction of submission customers that are also training customers.
len(customers_to_keep) / submission_inputs.shape[0]

0.09718235507605001

In [33]:
# Restrict the submission inputs to the training customers.
is_kept_customer = submission_inputs['customer_id'].isin(customers_to_keep)
submission_inputs = submission_inputs[is_kept_customer].copy()
submission_inputs.shape

(47128, 3)

In [34]:
# Restrict the customer table to the training customers and renumber its rows.
customers = (
    customers
        .loc[customers['customer_id'].isin(customers_to_keep)]
        .reset_index(drop=True)
)
customers.head()

Unnamed: 0,customer_id,FN,Active,age
0,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,1.0,1.0,33.0
1,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,,,29.0
2,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,,,23.0
3,000fb6e772c5d0023892065e659963da90b1866035558e...,1.0,1.0,42.0
4,0010e8eb18f131e724d6997909af0808adbba057529edb...,1.0,1.0,25.0


In [35]:
# Fraction of missing values per customer column.
customers.isna().sum() / len(customers)

customer_id    0.000000
FN             0.518779
Active         0.525717
age            0.003268
dtype: float64

In [36]:
# Missing FN / Active flags are replaced with 0.
customers = customers.fillna({'FN': 0, 'Active': 0})

Despite potential data leakage we impute the nulls for age here:

In [37]:
# (rows, columns) of the filtered customer table.
customers.shape

(47128, 4)

In [38]:
from sklearn.impute import SimpleImputer

In [39]:
# Mean imputation, spelled out explicitly ('mean' is SimpleImputer's default strategy).
simple_imputer = SimpleImputer(strategy='mean')

In [40]:
# Fit the imputer on the age column and write the filled values back.
imputed_age = simple_imputer.fit_transform(customers[['age']])
customers['age'] = imputed_age[:, 0]

In [41]:
# Rescale age by a factor of 1/100.
# NOTE: this overwrites the column, so re-running the cell divides it again.
customers['age'] = customers['age'] / 100

In [42]:
# Re-check the fraction of missing values per column after filling.
customers.isnull().sum().div(customers.shape[0])

customer_id    0.0
FN             0.0
Active         0.0
age            0.0
dtype: float64

In [43]:
# Expose the frame's current index as an explicit customer_id_idx column.
customers = customers.assign(customer_id_idx=customers.index)

In [44]:
# Preview the customer features together with the new customer_id_idx column.
customers.head()

Unnamed: 0,customer_id,FN,Active,age,customer_id_idx
0,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,1.0,1.0,0.33,0
1,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,0.0,0.0,0.29,1
2,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,0.0,0.0,0.23,2
3,000fb6e772c5d0023892065e659963da90b1866035558e...,1.0,1.0,0.42,3
4,0010e8eb18f131e724d6997909af0808adbba057529edb...,1.0,1.0,0.25,4


In [45]:
# Attach the customer features to each training row (default inner join on customer_id).
train_set = pd.merge(train_set, customers, on='customer_id')

In [46]:
# Attach the customer features to each submission row (default inner join on customer_id).
submission_inputs = pd.merge(submission_inputs, customers, on='customer_id')

In [47]:
# Preview the training rows after the customer features were merged in.
train_set.head()

Unnamed: 0,customer_id,article_id_idx_last10,article_id_idx_label,FN,Active,age,customer_id_idx
0,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,"39441,36213,38872,37897,38110,5642,4783,34346,...",24971,1.0,1.0,0.33,0
1,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,"11936,16155,685,11824,2456,28519,11826,3269,23...",15029186095646,0.0,0.0,0.29,1
2,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"35875,24399,21130,21130,21130,21130,40107,3128...","37826,34169,41,31785,39073,40201,39074,1243,39...",0.0,0.0,0.23,2
3,000fb6e772c5d0023892065e659963da90b1866035558e...,"24455,19793,37728,25547,35072,15616,39388,3938...",1834538864359413266339388,1.0,1.0,0.42,3
4,0010e8eb18f131e724d6997909af0808adbba057529edb...,26967156112506934918703,1832689435210113895137019375114332546,1.0,1.0,0.25,4


In [48]:
# Keep the integer customer index instead of the raw customer_id, in a fixed column order.
train_columns = [
    'customer_id_idx',
    'article_id_idx_last10',
    'article_id_idx_label',
    'FN',
    'Active',
    'age',
]
train_set = train_set.loc[:, train_columns]

In [49]:
# Same column selection and order as the training set.
submission_columns = [
    'customer_id_idx',
    'article_id_idx_last10',
    'article_id_idx_label',
    'FN',
    'Active',
    'age',
]
submission_inputs = submission_inputs.loc[:, submission_columns]

In [50]:
# Preview the final training columns.
train_set.head()

Unnamed: 0,customer_id_idx,article_id_idx_last10,article_id_idx_label,FN,Active,age
0,0,"39441,36213,38872,37897,38110,5642,4783,34346,...",24971,1.0,1.0,0.33
1,1,"11936,16155,685,11824,2456,28519,11826,3269,23...",15029186095646,0.0,0.0,0.29
2,2,"35875,24399,21130,21130,21130,21130,40107,3128...","37826,34169,41,31785,39073,40201,39074,1243,39...",0.0,0.0,0.23
3,3,"24455,19793,37728,25547,35072,15616,39388,3938...",1834538864359413266339388,1.0,1.0,0.42
4,4,26967156112506934918703,1832689435210113895137019375114332546,1.0,1.0,0.25


In [51]:
# Preview the final submission columns.
submission_inputs.head()

Unnamed: 0,customer_id_idx,article_id_idx_last10,article_id_idx_label,FN,Active,age
0,0,"36213,38872,37897,38110,5642,4783,34346,34346,...",0,1.0,1.0,0.33
1,1,"11824,2456,28519,11826,3269,23088,11825,15029,...",0,0.0,0.0,0.29
2,2,"39073,40201,39074,1243,39277,40370,38239,40344...",0,0.0,0.0,0.23
3,3,"39975,18345,38864,35941,38864,35941,32663,3266...",0,1.0,1.0,0.42
4,4,7031832689435210113895137019375114332546,0,1.0,1.0,0.25


### Save files

In [52]:
# Write the prepared frames to disk without the index: the two model-input
# tables as tab-separated files, the two lookup tables as comma-separated files.
outputs = (
    (train_set, 'data/final_train_set_for_submission.tsv', '\t'),
    (submission_inputs, 'data/final_submission_inputs_for_submission.tsv', '\t'),
    (articles_df, 'data/articles_df_filt_for_submission.csv', ','),
    (customers, 'data/customers_filt_for_submission.csv', ','),
)
for frame, path, separator in outputs:
    frame.to_csv(path, sep=separator, index=False)