Aim: Create smaller subset which is easier to experiment with

In [2]:
import os
import datetime

import pandas as pd
import numpy as np

In [3]:
os.chdir('..')

In [4]:
from fashion_recommendations.metrics.average_precision import mapk

### Load data

In [5]:
transactions_train = pd.read_csv('data/transactions_train.csv', dtype={'article_id': str})
print(transactions_train.shape)
transactions_train.head()

(31788324, 5)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [6]:
transactions_train['t_dat'] = pd.to_datetime(transactions_train['t_dat'])

In [7]:
articles_df = pd.read_csv('data/articles.csv', dtype={'article_id': str})
print(articles_df.shape)
articles_df.head()

(105542, 25)


Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


### Last week as test

In [8]:
end = transactions_train['t_dat'].max()
endm1 = end - datetime.timedelta(days=7)

endm1, end

(Timestamp('2020-09-15 00:00:00'), Timestamp('2020-09-22 00:00:00'))

In [9]:
transactions_train['last_week'] = np.where(
    transactions_train['t_dat'].between(endm1, end, inclusive='both'),
    1,
    0
)

In [10]:
transactions_train[transactions_train['last_week'] == 1]['article_id'].nunique()

18684

In [11]:
transactions_train['article_id'].nunique()

104547

In [12]:
transactions_train[transactions_train['last_week'] == 1]['customer_id'].nunique()

75481

In [13]:
transactions_train['customer_id'].nunique()

1362281

Sample 10k users who made purchases in the last week

In [14]:
np.random.seed(3)
selected_customers = np.random.choice(transactions_train[transactions_train['last_week'] == 1]['customer_id'].unique(), size=10000, replace=False)
selected_customers

array(['d1d203046658c8638beb20f021445d73c0cad2c3d63e3650d1d9797be05c9eea',
       '0bf708430c392f7e79596fed48e94524091476a4dd602dedcd3f58a2a58dc631',
       '42dcc3c8cbfeffb08989ba0370e71fb262e9fd5928d5c72176d473b90587ec79',
       ...,
       'a2e8fe66b192d6d6b5501f26cbb213b5822375c0e69a81d2616f8bb955ab8f4b',
       '667adc48bda4fd1ab45ac200344b18fe4a3aafe7f6da3d295ae1177ca29b7b2e',
       '56f29cd4b59e0a75b8697d898028a4175c4d8651e59c69f566a46082657060da'],
      dtype=object)

In [15]:
test_set = transactions_train.copy()[
    (transactions_train['last_week'] == 1) & 
    (transactions_train['customer_id'].isin(selected_customers))
].drop(columns='last_week')

In [16]:
test_set['customer_id'].nunique()

10000

In [17]:
test_set.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
31521960,2020-09-15,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,794321007,0.061,2
31521967,2020-09-15,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,778745010,0.033881,2
31521968,2020-09-15,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,778745010,0.033881,2
31521969,2020-09-15,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,610776105,0.008458,2
31521970,2020-09-15,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,929745001,0.050831,2


Take previous transactions for these customers. This is our training data

In [18]:
train_set = transactions_train.copy()[
    (transactions_train['last_week'] == 0) & 
    (transactions_train['customer_id'].isin(selected_customers))
].drop(columns='last_week')

In [19]:
train_set['customer_id'].nunique()

9208

Remove test set customers not in training (in actual problem we are not predicting for many cold start users)

In [20]:
test_set = test_set[test_set['customer_id'].isin(train_set['customer_id'].unique())]

In [21]:
test_set['customer_id'].nunique()

9208

In [22]:
train_set.shape, test_set.shape

((588758, 5), (32995, 5))

In [23]:
train_set.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
265,2018-09-20,016d8f0519d9e0572b3abebeab87408bad7a5c3a284016...,650193004,0.050831,1
266,2018-09-20,016d8f0519d9e0572b3abebeab87408bad7a5c3a284016...,527687006,0.101678,1
546,2018-09-20,02bfe1a5248beb9cd28ad4ac630a6d75e78d9a3e14551a...,668767002,0.016932,2


### Top 12 baseline

In [24]:
top_12_article_id = train_set['article_id'].value_counts().head(12).index.tolist()
top_12_article_id

['0706016001',
 '0706016002',
 '0372860001',
 '0610776002',
 '0759871002',
 '0448509014',
 '0673677002',
 '0610776001',
 '0751471001',
 '0372860002',
 '0720125001',
 '0399223001']

In [25]:
test_set_by_customer = test_set.groupby('customer_id').apply(lambda x: list(x['article_id'])).reset_index().rename(columns={0: 'article_id'})
test_set_by_customer.head()

Unnamed: 0,customer_id,article_id
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,[0794321007]
1,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,"[0778745010, 0778745010, 0610776105, 092974500..."
2,00282135561702f5b3b750fa3382d8fd83ce5d761a507e...,"[0855249006, 0822171001]"
3,00356a94bb9bed341f6dba58ad722974b01a1cbd9f06ef...,"[0920012003, 0572797002, 0572797002, 056860104..."
4,00462904b288681a9facb555f75dd2cf4d0f730a6e0ea7...,"[0827968001, 0887757001, 0673677002, 079936500..."


In [26]:
actuals = test_set_by_customer['article_id'].to_list()

In [27]:
predictions = [top_12_article_id for _ in range(test_set_by_customer.shape[0])]

In [28]:
mapk(actuals, predictions, k=12)

0.0031954765745193844

### Save splits for later

In [29]:
train_set.to_csv('data/splits/train_subset.tsv',sep='\t', index=False)

In [30]:
test_set.to_csv('data/splits/test_subset.tsv',sep='\t', index=False)

Models to try:
- Multi-class prediction (single-label)
- Binary prediction with negative sampling
- Multi-class prediction (multi-label)