In [1]:
import os
import datetime

import pandas as pd
import numpy as np

In [2]:
# Move up one directory so the relative 'data/...' paths used below resolve.
# Guarded on the presence of the data folder so that re-running this cell
# does not climb one more level each time.
if not os.path.isdir('data'):
    os.chdir('..')

In [3]:
from fashion_recommendations.data.constants import ARTICLE_ID_TO_IDX
from fashion_recommendations.data.prepare_splits import prepare_splits

#### Create splits

In [4]:
# Read article_id as a string rather than letting pandas infer a numeric dtype
transactions_train_df = pd.read_csv(
    'data/transactions_train.csv',
    dtype={'article_id': str},
)
print(transactions_train_df.shape)
transactions_train_df.head(n=5)

(31788324, 5)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [5]:
# Convert t_dat to datetimes so it can be compared against the Timestamp fold boundaries
transactions_train_df['t_dat'] = transactions_train_df['t_dat'].pipe(pd.to_datetime)

In [6]:
# Order rows by customer, then by transaction date within each customer
transactions_train_df = transactions_train_df.sort_values(by=['customer_id', 't_dat'])

In [7]:
# Look up the index of every article_id via the ARTICLE_ID_TO_IDX mapping
transactions_train_df['article_id_idx'] = (
    transactions_train_df['article_id'].map(ARTICLE_ID_TO_IDX)
)

In [8]:
# Spot-check the sorted frame and the new article_id_idx column
transactions_train_df.head(n=5)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,article_id_idx
4212358,2018-12-27,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,625548001,0.044051,1,29518
4212359,2018-12-27,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,176209023,0.035576,1,101
4212360,2018-12-27,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,627759010,0.030492,1,30329
9663224,2019-05-02,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,697138006,0.010153,2,50726
10754876,2019-05-25,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601006,0.050831,2,16005


\#__cv1__

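_train_: start to t-1, where t-k is the date k weeks before the last transaction date (the cut-off day itself is excluded from train)

_eval_: t-1 to end (both end dates included)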

\#__cv2__

_train_: start to t-2

_eval_: t-2 to t-1


\#__cv3__

_train_: start to t-3

_eval_: t-3 to t-2

\#__submission__

_train_: start to end

_eval_: submission customers

In [10]:
# Anchor dates for the folds: the first and last transaction dates, plus
# cut-offs one, two and three weeks before the last transaction date.
start = transactions_train_df['t_dat'].min()
end = transactions_train_df['t_dat'].max()
tm1 = end - datetime.timedelta(weeks=1)
tm2 = end - datetime.timedelta(weeks=2)
tm3 = end - datetime.timedelta(weeks=3)

end, tm1, tm2, tm3, start

(Timestamp('2020-09-22 00:00:00'),
 Timestamp('2020-09-15 00:00:00'),
 Timestamp('2020-09-08 00:00:00'),
 Timestamp('2020-09-01 00:00:00'),
 Timestamp('2018-09-20 00:00:00'))

In [12]:
# cv1 fold: train on [start, tm1) and evaluate on [tm1, end], both end dates included.
# Filter first and copy only the selected rows; copying the whole frame before
# filtering duplicates every row just to throw most of the copy away.
# NOTE(review): the eval window covers 8 calendar days (tm1 through end) — confirm this is intended.
cv1_train = transactions_train_df.loc[
    transactions_train_df['t_dat'].between(start, tm1, inclusive='left')
].copy()

cv1_eval = transactions_train_df.loc[
    transactions_train_df['t_dat'].between(tm1, end, inclusive='both')
].copy()

In [14]:
# Run prepare_splits for the cv1 fold
prepare_splits(
    train_df=cv1_train,
    dev_df=cv1_eval,
    name='cv1',
)

Train no history customers prop: 0.0070691172939952515
Dev no history customers prop: 0.007062733693394266


In [15]:
# Drop the references to the cv1 frames now that the fold has been processed
del cv1_train
del cv1_eval

In [16]:
# cv2 fold: train on [start, tm2) and evaluate on [tm2, tm1], both end dates included.
# Filter first and copy only the selected rows; copying the whole frame before
# filtering duplicates every row just to throw most of the copy away.
# NOTE(review): inclusive='both' puts tm1 in this eval window as well as in the
# cv1 eval window — confirm the one-day overlap is intended.
cv2_train = transactions_train_df.loc[
    transactions_train_df['t_dat'].between(start, tm2, inclusive='left')
].copy()

cv2_eval = transactions_train_df.loc[
    transactions_train_df['t_dat'].between(tm2, tm1, inclusive='both')
].copy()

prepare_splits(train_df=cv2_train, dev_df=cv2_eval, name='cv2')

Train no history customers prop: 0.007069340535889285
Dev no history customers prop: 0.007055429395718182


In [17]:
# Drop the references to the cv2 frames now that the fold has been processed
del cv2_train
del cv2_eval

In [18]:
# cv3 fold: train on [start, tm3) and evaluate on [tm3, tm2], both end dates included.
# Filter first and copy only the selected rows; copying the whole frame before
# filtering duplicates every row just to throw most of the copy away.
# NOTE(review): inclusive='both' puts tm2 in this eval window as well as in the
# cv2 eval window — confirm the one-day overlap is intended.
cv3_train = transactions_train_df.loc[
    transactions_train_df['t_dat'].between(start, tm3, inclusive='left')
].copy()

cv3_eval = transactions_train_df.loc[
    transactions_train_df['t_dat'].between(tm3, tm2, inclusive='both')
].copy()

prepare_splits(train_df=cv3_train, dev_df=cv3_eval, name='cv3')

Train no history customers prop: 0.00706852995524994
Dev no history customers prop: 0.007058568506049234


In [19]:
# Drop the references to the cv3 frames now that the fold has been processed
del cv3_train
del cv3_eval

### Submission data

#### Train

In [9]:
# Submission run: every transaction goes into train and there is no dev set
prepare_splits(
    train_df=transactions_train_df,
    dev_df=None,
    name='for_submission',
)

Train no history customers prop: 0.007068431214131552
No dev data, ending


In [None]:
# Scratch note: three bare string literals, so running this cell only echoes the last one.
# NOTE(review): presumably the split files produced for the cv1 fold — confirm against prepare_splits.
'data/splits/train_single_purchase_label_cv1.tsv'
'data/splits/dev_all_purchase_label_cv1.tsv'
'data/splits/dev_single_purchase_label_cv1.tsv'

In [31]:
# Read the 'for_submission' training split file to inspect its shape and first rows
df = pd.read_csv(
    'data/splits/train_single_purchase_label_for_submission.tsv',
    sep='\t',
)
print(df.shape)
df.head(n=5)

(1054548, 3)


Unnamed: 0,customer_id,article_id_idx_label,article_id_idx_historical
0,460d77739e77188377bee03ff0588e826b4ce7f151e50f...,74105,"25865,28076,67151,43972,76154,70223,69226,7949..."
1,dc5630be6cd7f6aa04a197d2162a12c8116db93333cf31...,48943,6100748943
2,669f44d134feefdbb2345e47f1499f964bce762d0c69f3...,67912,81898
3,7bf52bb2bea24c2b3922073dfdeba90f31666385628d5d...,58493,"44488,66580,66580,57052,57052,67705,67705,6522..."
4,7b05e7568e8bf83b82fe6db98a03ed5746cb5f0b7de186...,85671,4404031031


#### Prepare submission data

In [58]:
# Only the customer_id column of the sample submission is needed here
submission_customers = pd.read_csv(
    'data/sample_submission.csv',
    usecols=['customer_id'],
)
print(submission_customers.shape)
submission_customers.head(n=5)

(1371980, 1)


Unnamed: 0,customer_id
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...


In [68]:
# Reload the raw transactions (article_id kept as a string) and attach the article index.
# The lookup is ARTICLE_ID_TO_IDX, the name imported from fashion_recommendations.data.constants;
# the lowercase `article_id_to_idx` is never defined in this notebook and raises
# NameError on a fresh kernel.
all_historical_transactions = pd.read_csv('data/transactions_train.csv', dtype={'article_id': str})
all_historical_transactions['article_id_idx'] = all_historical_transactions['article_id'].map(ARTICLE_ID_TO_IDX)
all_historical_transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,article_id_idx
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,40181
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,10522
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,6389
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,46306
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,46307


In [69]:
# Order rows by customer, then by t_dat within each customer
all_historical_transactions = all_historical_transactions.sort_values(by=['customer_id', 't_dat'])

In [70]:
# Cast the indices to str; the later ','.join aggregation only accepts strings
all_historical_transactions['article_id_idx'] = (
    all_historical_transactions['article_id_idx'].astype(str)
)

In [71]:
# One row per customer: their article indices joined into a single
# comma-separated string, in the frame's current row order.
all_historical_transactions_by_cust = (
    all_historical_transactions
    .groupby('customer_id')['article_id_idx']
    .agg(','.join)
    .reset_index()
)

all_historical_transactions_by_cust.head(n=5)

Unnamed: 0,customer_id,article_id_idx
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"29518,101,30329,50726,16005,16005,23998,65669,..."
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"19335,33750,33993,8218,41026,19335,42628,41026..."
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"40181,10522,40181,18199,59460,1471,1471,60255,..."
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,6452761177
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"32249,43444,51126,54463,2183,2183,20519,87478,..."


In [72]:
# (rows, columns) of the per-customer frame; one row per customer_id group
all_historical_transactions_by_cust.shape

(1362281, 2)

In [94]:
# Left join keeps every submission customer; indicator=True adds a `_merge`
# column showing which customers found a purchase history.
submission_inputs = pd.merge(
    submission_customers,
    all_historical_transactions_by_cust,
    on='customer_id',
    how='left',
    indicator=True,
)

In [95]:
# Share of submission customers per merge outcome (left_only = no purchase history found)
submission_inputs['_merge'].value_counts().div(len(submission_inputs))

both          0.992931
left_only     0.007069
right_only    0.000000
Name: _merge, dtype: float64

In [96]:
# Spot-check the merged frame
submission_inputs.head(n=5)

Unnamed: 0,customer_id,article_id_idx,_merge
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"29518,101,30329,50726,16005,16005,23998,65669,...",both
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"19335,33750,33993,8218,41026,19335,42628,41026...",both
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"40181,10522,40181,18199,59460,1471,1471,60255,...",both
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,6452761177,both
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"32249,43444,51126,54463,2183,2183,20519,87478,...",both


In [98]:
# Customers without any transactions have NaN here after the left merge; give them the
# NO_HISTORY_ARTICLE_ID_IDX placeholder. The result is assigned back rather than calling
# fillna(inplace=True) on the selected column: that form is chained assignment, which
# pandas warns about and which does not update the frame under Copy-on-Write.
# NOTE(review): NO_HISTORY_ARTICLE_ID_IDX is not imported in any cell shown here — confirm
# where it is defined so this cell runs on a fresh kernel.
submission_inputs['article_id_idx'] = submission_inputs['article_id_idx'].fillna(NO_HISTORY_ARTICLE_ID_IDX)

In [99]:
# The merge indicator has served its purpose; remove it
submission_inputs = submission_inputs.drop(columns='_merge')

In [100]:
# Use the same column name as the split files (article_id_idx_historical)
submission_inputs = submission_inputs.rename(
    columns={'article_id_idx': 'article_id_idx_historical'},
)

In [101]:
# Submission rows have no real label; add a constant placeholder so Dataset can process them
submission_inputs = submission_inputs.assign(dummy_label='999')

In [102]:
# Put the columns in the order customer_id, label, history.
# Select the columns first and copy the result, instead of copying the whole
# frame only to select from the copy.
submission_inputs = submission_inputs[['customer_id', 'dummy_label', 'article_id_idx_historical']].copy()
submission_inputs.head()

Unnamed: 0,customer_id,dummy_label,article_id_idx_historical
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,999,"29518,101,30329,50726,16005,16005,23998,65669,..."
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,999,"19335,33750,33993,8218,41026,19335,42628,41026..."
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,999,"40181,10522,40181,18199,59460,1471,1471,60255,..."
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,999,6452761177
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,999,"32249,43444,51126,54463,2183,2183,20519,87478,..."


In [103]:
# Write the submission inputs as a tab-separated file, without the index column
submission_inputs.to_csv(
    'data/splits/submission_inputs.tsv',
    sep='\t',
    index=False,
)