In [1]:
!pip install -U sentence-transformers



In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
import os
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, IterableDataset
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [4]:
# os.chdir('..')
os.chdir('drive/My Drive/Colab Notebooks/Github/fashion-recommendations') 

In [5]:
from fashion_recommendations.metrics.average_precision import mapk

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [7]:
class RecommendationDatasetMultiLabel(IterableDataset):

    def __init__(self, dataset_filepath, article_emb_bag, total_articles):
        
        self.dataset_itr = open(dataset_filepath, 'r')
        next(self.dataset_itr)  # skip header
        
        self.article_emb_bag = article_emb_bag
        
        self.total_articles = total_articles
    
    def process_label(self, label_str: str, num_purchases_str: str):
        
        labels = torch.tensor([int(v) for v in label_str.split(',')])
        
        num_purchases = torch.tensor([float(v) for v in num_purchases_str.split(',')])
        
        target = torch.zeros(self.total_articles).scatter_(0, labels, num_purchases)
        
        target = target / target.sum()  # Normalise

        return target
    
    def mean_historical_purchases_embedding(self, input_str: str):
        
        indices = torch.tensor([int(v) for v in input_str.split(',')])

        mean_emb = self.article_emb_bag(indices.unsqueeze(0)).flatten()
            
        return mean_emb
    
    def process_numeric_features(self, fn: str, active: str, age: str, example_age: str, number_of_inputs: float):
        numeric_features_tensor = torch.tensor([
            float(fn),
            float(active),
            float(age),
            float(example_age),
            number_of_inputs  # NEW
        ])

        return numeric_features_tensor
    
    def parse_itr(self, dataset_itr):
        
        for line in dataset_itr:
        
            line_items = line.rstrip('\n').split('\t')
            
            customer_id_idx, article_id_idx_last10, article_id_idx_label, fn, active, age, num_purchases, example_age, fashion_news_frequency_idx, club_member_status_idx, postal_code_idx = line_items
            
            customer_id_idx = int(customer_id_idx)
            fashion_news_frequency_idx = int(fashion_news_frequency_idx)
            club_member_status_idx = int(club_member_status_idx)
            postal_code_idx = int(club_member_status_idx)

            number_of_inputs = len(article_id_idx_last10.split(',')) / 100 # NEW
            
            numeric_features_tensor = self.process_numeric_features(fn, active, age, example_age, number_of_inputs)
            
            mean_emb = self.mean_historical_purchases_embedding(article_id_idx_last10)

            label = self.process_label(article_id_idx_label, num_purchases)    
            
            inputs = torch.concat((mean_emb, numeric_features_tensor))

            yield customer_id_idx, fashion_news_frequency_idx, club_member_status_idx, postal_code_idx, inputs, label
        
    def get_stream(self, dataset_itr):
        
        return self.parse_itr(dataset_itr)

    def __iter__(self):
        
        return self.get_stream(self.dataset_itr)

In [8]:
articles_df = pd.read_csv('data/articles_df_filt_with_counts_and_age_extended_100_hist_with_more_cust_feat_for_submission.csv', dtype={'article_id': str})
print(articles_df.shape)
articles_df.head()

(104547, 3)


Unnamed: 0,article_id,detail_desc,article_id_idx
0,108775015,Jersey top with narrow shoulder straps.,0
1,108775044,Jersey top with narrow shoulder straps.,1
2,108775051,Jersey top with narrow shoulder straps.,2
3,110065001,"Microfibre T-shirt bra with underwired, moulde...",3
4,110065002,"Microfibre T-shirt bra with underwired, moulde...",4


In [9]:
customers = pd.read_csv('data/customers_filt_with_counts_and_age_extended_100_hist_with_more_cust_feat_for_submission.csv')
print(customers.shape)
customers.head()

(63412, 11)


Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,customer_id_idx,fashion_news_frequency_idx,club_member_status_idx,postal_code_idx
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,0.0,0.0,ACTIVE,MISSING,0.27,43cbf97df3d118b937551fb21a08d513bfb2e58223315f...,0,0,0,0
1,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,1.0,1.0,ACTIVE,Regularly,0.33,d647e4ede3d0eb4ce0750440a110350b5f4c758165d89d...,1,1,0,1
2,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,0.0,0.0,ACTIVE,MISSING,0.29,72afbb92c0200628bfa8f983c241eb0dc14e107f87d95b...,2,0,0,2
3,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,0.0,0.0,ACTIVE,MISSING,0.23,3119ea10ffe5ac3419b9127589a61b33e1ae38ecbb997b...,3,0,0,3
4,000fb6e772c5d0023892065e659963da90b1866035558e...,1.0,1.0,ACTIVE,Regularly,0.42,68ca4d9d6051d9c10b917d36bf9cb4afbadc551f7e4feb...,4,1,0,4


### Create article EmbeddingBag

In [10]:
model = SentenceTransformer('all-distilroberta-v1')

In [11]:
article_descriptions = articles_df['detail_desc'].to_list()
article_descriptions[:5]

['Jersey top with narrow shoulder straps.',
 'Jersey top with narrow shoulder straps.',
 'Jersey top with narrow shoulder straps.',
 'Microfibre T-shirt bra with underwired, moulded, lightly padded cups that shape the bust and provide good support. Narrow adjustable shoulder straps and a narrow hook-and-eye fastening at the back. Without visible seams for greater comfort.',
 'Microfibre T-shirt bra with underwired, moulded, lightly padded cups that shape the bust and provide good support. Narrow adjustable shoulder straps and a narrow hook-and-eye fastening at the back. Without visible seams for greater comfort.']

In [12]:
model.encode(article_descriptions[0]).shape

(768,)

In [13]:
embeddings = model.encode(article_descriptions)

In [14]:
embeddings = torch.tensor(embeddings)

In [15]:
embeddings.shape

torch.Size([104547, 768])

In [16]:
ARTICLE_EMBEDDING_BAG = nn.EmbeddingBag.from_pretrained(embeddings=embeddings, freeze=True)

### Model

In [17]:
class FashionRecV3(nn.Module):

    def __init__(self, user_embedding_dim):
        super(FashionRecV3, self).__init__()
        
        # Embeddings
        self.user_embeddings = nn.Embedding(num_embeddings=len(customers), embedding_dim=user_embedding_dim)   
        self.postal_code_embeddings = nn.Embedding(num_embeddings=54126, embedding_dim=384)
        self.fashion_news_frequency_embeddings = nn.Embedding(num_embeddings=3, embedding_dim=32)
        self.club_member_status_embeddings = nn.Embedding(num_embeddings=4, embedding_dim=32)
        
        # Initialise embeddings
        torch.nn.init.xavier_uniform_(self.user_embeddings.weight)
        torch.nn.init.xavier_uniform_(self.postal_code_embeddings.weight)
        torch.nn.init.xavier_uniform_(self.fashion_news_frequency_embeddings.weight)
        torch.nn.init.xavier_uniform_(self.club_member_status_embeddings.weight)
        
        # FC layers
        self.fc_1 = nn.Linear(in_features=user_embedding_dim+768+4+1+384+32*2, out_features=2048)
        self.fc_2 = nn.Linear(in_features=2048, out_features=1024)
        self.fc_3 = nn.Linear(in_features=1024, out_features=len(articles_df))
        
        # Activation functions
        self.relu = nn.ReLU()
        
        # Dropout
        # self.dropout = nn.Dropout(0.01)
        
    def forward(
        self, 
        customer_id_idx,
        fashion_news_frequency_idx, 
        club_member_status_idx, 
        postal_code_idx,
        inputs
    ):

        user_emb = self.user_embeddings(customer_id_idx)
        pc_emb = self.postal_code_embeddings(postal_code_idx)
        fash_news_emb = self.fashion_news_frequency_embeddings(fashion_news_frequency_idx)
        mem_status_emb = self.club_member_status_embeddings(club_member_status_idx)
              
        concatenated_features = torch.concat(
            [
                user_emb, 
                pc_emb,
                fash_news_emb,
                mem_status_emb,
                inputs
            ], 
            dim=1
        )
        
        x = self.fc_1(concatenated_features)
        x = self.relu(x)
        # x = self.dropout(x)
        
        x = self.fc_2(x)
        x = self.relu(x)
        # x = self.dropout(x)

        x = self.fc_3(x)

        return x

In [18]:
fashion_rec_v3 = FashionRecV3(user_embedding_dim=384)

In [19]:
fashion_rec_v3.to(device)

FashionRecV3(
  (user_embeddings): Embedding(63412, 384)
  (postal_code_embeddings): Embedding(54126, 384)
  (fashion_news_frequency_embeddings): Embedding(3, 32)
  (club_member_status_embeddings): Embedding(4, 32)
  (fc_1): Linear(in_features=1605, out_features=2048, bias=True)
  (fc_2): Linear(in_features=2048, out_features=1024, bias=True)
  (fc_3): Linear(in_features=1024, out_features=104547, bias=True)
  (relu): ReLU()
)

In [20]:
BATCH_SIZE = 1024

In [21]:
total_training_examples = pd.read_csv('data/final_train_set_with_counts_and_age_extended_100_hist_with_more_cust_feat_for_submission.tsv', sep='\t').shape[0]
total_batches = np.ceil(total_training_examples/BATCH_SIZE)
total_batches

62.0

In [22]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=fashion_rec_v3.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)

In [91]:
MAX_EPOCHS = 15

# training_losses = []

for epoch in range(MAX_EPOCHS):
    
    # Since we use an IterableDataset we need to reinstaniate the dataset since file end will have been reached:
    train_dataset = RecommendationDatasetMultiLabel(dataset_filepath='data/final_train_set_with_counts_and_age_extended_100_hist_with_more_cust_feat_for_submission.tsv', article_emb_bag=ARTICLE_EMBEDDING_BAG, total_articles=len(articles_df))    
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)  
    
    for data in tqdm(train_loader, total=total_batches):
        
        customer_id_idx, fashion_news_frequency_idx, club_member_status_idx, postal_code_idx, inputs, label = data
        customer_id_idx, fashion_news_frequency_idx, club_member_status_idx, postal_code_idx, inputs, label = customer_id_idx.to(device), fashion_news_frequency_idx.to(device), club_member_status_idx.to(device), postal_code_idx.to(device), inputs.to(device), label.to(device)

        optimizer.zero_grad()  # Set gradients to 0 otherwise will accumulate

        y_pred = fashion_rec_v3(customer_id_idx, fashion_news_frequency_idx, club_member_status_idx, postal_code_idx, inputs)
        
        loss = criterion(y_pred, label)

        loss.backward()    
        
        optimizer.step()
    
    scheduler.step()

    # if ((epoch + 1) % 200 == 0) | (epoch == 0):
    if epoch == 39:

        print(f"Learning rate: {scheduler.get_last_lr()}")

        # Compute train loss:
        
        total_train_loss = 0

        fashion_rec_v3.eval()
        
        train_dataset = RecommendationDatasetMultiLabel(dataset_filepath='data/final_train_set_with_counts_and_age_extended_100_hist_with_more_cust_feat_for_submission.tsv', article_emb_bag=ARTICLE_EMBEDDING_BAG, total_articles=len(articles_df))    
        
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)  
        
        with torch.no_grad():
            for data in tqdm(train_loader, total=total_batches):

                customer_id_idx, fashion_news_frequency_idx, club_member_status_idx, postal_code_idx, inputs, label = data
                customer_id_idx, fashion_news_frequency_idx, club_member_status_idx, postal_code_idx, inputs, label = customer_id_idx.to(device), fashion_news_frequency_idx.to(device), club_member_status_idx.to(device), postal_code_idx.to(device), inputs.to(device), label.to(device)

                y_pred = fashion_rec_v3(customer_id_idx, fashion_news_frequency_idx, club_member_status_idx, postal_code_idx, inputs)

                loss = nn.CrossEntropyLoss(reduction='sum')(y_pred, label)

                total_train_loss += loss.item()

            mean_train_loss = total_train_loss / total_training_examples
            print(f"Training loss: {mean_train_loss}")
            training_losses.append(mean_train_loss)            
            
        fashion_rec_v3.train()

100%|██████████| 62/62.0 [00:43<00:00,  1.41it/s]
100%|██████████| 62/62.0 [00:43<00:00,  1.42it/s]
100%|██████████| 62/62.0 [00:43<00:00,  1.42it/s]
100%|██████████| 62/62.0 [00:43<00:00,  1.42it/s]
100%|██████████| 62/62.0 [00:43<00:00,  1.43it/s]
100%|██████████| 62/62.0 [00:43<00:00,  1.43it/s]
100%|██████████| 62/62.0 [00:43<00:00,  1.44it/s]
100%|██████████| 62/62.0 [00:43<00:00,  1.43it/s]
100%|██████████| 62/62.0 [00:43<00:00,  1.44it/s]
100%|██████████| 62/62.0 [00:42<00:00,  1.46it/s]
100%|██████████| 62/62.0 [00:43<00:00,  1.44it/s]
100%|██████████| 62/62.0 [00:42<00:00,  1.46it/s]
100%|██████████| 62/62.0 [00:42<00:00,  1.45it/s]
100%|██████████| 62/62.0 [00:43<00:00,  1.42it/s]
100%|██████████| 62/62.0 [00:42<00:00,  1.46it/s]


In [59]:
training_losses

[2.5214108386671685, 1.235813842303729]

### MAP@12

#### Train

In [60]:
fashion_rec_v3.eval()

predictions = []

dev_dataset = RecommendationDatasetMultiLabel(dataset_filepath='data/final_train_set_with_counts_and_age_extended_100_hist_with_more_cust_feat_for_submission.tsv', article_emb_bag=ARTICLE_EMBEDDING_BAG, total_articles=len(articles_df))    

dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE)  

with torch.no_grad():
    for data in tqdm(dev_loader, total=total_batches):

        customer_id_idx, fashion_news_frequency_idx, club_member_status_idx, postal_code_idx, inputs, label = data
        customer_id_idx, fashion_news_frequency_idx, club_member_status_idx, postal_code_idx, inputs, label = customer_id_idx.to(device), fashion_news_frequency_idx.to(device), club_member_status_idx.to(device), postal_code_idx.to(device), inputs.to(device), label.to(device)

        y_pred = fashion_rec_v3(customer_id_idx, fashion_news_frequency_idx, club_member_status_idx, postal_code_idx, inputs)
        
        predictions.append(y_pred.topk(1000, dim=1).indices)

100%|██████████| 62/62.0 [00:43<00:00,  1.44it/s]


In [61]:
predictions = torch.concat(predictions)

In [62]:
predictions.shape

torch.Size([63412, 1000])

In [63]:
actuals = pd.read_csv('data/final_train_set_with_counts_and_age_extended_100_hist_with_more_cust_feat_for_submission.tsv', sep='\t')['article_id_idx_label'].apply(lambda x: [int(i) for i in x.split(',')]).tolist()
len(actuals)

63412

In [64]:
mapk(actuals, predictions.tolist(), 12)

0.984268504274005

🤯

In [65]:
def precision(a: list, b: list):
    a = set(a)
    b = set(b)
    num_common = len(a.intersection(b))
    return num_common / len(a)

In [66]:
np.mean(
    [precision(act, pred) for act, pred in zip(actuals, predictions.tolist())]
)

0.9999391461114415

In [67]:
number_of_positive_examples = [len(set(act).intersection(set(pred))) for act, pred in zip(actuals, predictions.tolist())]

In [68]:
np.mean(np.array(number_of_positive_examples ) > 0)

1.0

Model is a strong top-1000 candidate generator:
- 99+% precision
- Can use all customers in training set since each has at least 1 positive example 

This could then facilitate a 2-stage approach like YouTube

#### Submission

In [92]:
total_sub_batches = np.ceil(pd.read_csv('data/final_submission_inputs_with_counts_and_age_extended_for_submission.tsv', sep='\t').shape[0]/BATCH_SIZE)
total_sub_batches

62.0

In [93]:
fashion_rec_v3.eval()

predictions = []

dev_dataset = RecommendationDatasetMultiLabel(dataset_filepath='data/final_submission_inputs_with_counts_and_age_extended_100_hist_with_more_cust_feat_for_submission.tsv', article_emb_bag=ARTICLE_EMBEDDING_BAG, total_articles=len(articles_df))    

dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE)  

with torch.no_grad():
    for data in tqdm(dev_loader, total=total_sub_batches):

        customer_id_idx, fashion_news_frequency_idx, club_member_status_idx, postal_code_idx, inputs, label = data
        customer_id_idx, fashion_news_frequency_idx, club_member_status_idx, postal_code_idx, inputs, label = customer_id_idx.to(device), fashion_news_frequency_idx.to(device), club_member_status_idx.to(device), postal_code_idx.to(device), inputs.to(device), label.to(device)

        y_pred = fashion_rec_v3(customer_id_idx, fashion_news_frequency_idx, club_member_status_idx, postal_code_idx, inputs)
        
        predictions.append(y_pred.topk(12, dim=1).indices)

100%|██████████| 62/62.0 [00:42<00:00,  1.46it/s]


In [94]:
predictions = torch.concat(predictions)

In [95]:
predictions.shape

torch.Size([63412, 12])

In [96]:
sample_submission = pd.read_csv('data/sample_submission.csv')
print(sample_submission.shape)
sample_submission.head()

(1371980, 2)


Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0706016002 0372860001 0610776002 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0706016002 0372860001 0610776002 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0706016002 0372860001 0610776002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0372860001 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0706016002 0372860001 0610776002 07...


In [97]:
article_idx_to_id = dict(zip(articles_df['article_id_idx'], articles_df['article_id']))

In [98]:
predictions_article_id = [[article_idx_to_id[i] for i in l] for l in predictions.tolist()]
predictions_article_id[0]

['0624486001',
 '0624486064',
 '0763988006',
 '0820484007',
 '0624486069',
 '0902388003',
 '0826508003',
 '0579541077',
 '0797988001',
 '0763988001',
 '0880312004',
 '0824995006']

In [99]:
predictions_strings = [' '.join(l) for l in predictions_article_id]
predictions_strings[0]

'0624486001 0624486064 0763988006 0820484007 0624486069 0902388003 0826508003 0579541077 0797988001 0763988001 0880312004 0824995006'

In [100]:
submission_set_filt = pd.read_csv('data/final_submission_inputs_with_counts_and_age_extended_for_submission.tsv', sep='\t')
print(submission_set_filt.shape)
submission_set_filt.head()

(63412, 8)


Unnamed: 0,customer_id_idx,article_id_idx_last10,article_id_idx_label,FN,Active,age,num_purchases,example_age
0,0,"74711,2861,92243,42086,42086,59253,53988,73222...",0,0.0,0.0,0.27,1,-0.05
1,1,"100157,102878,101898,102115,33833,27875,98220,...",0,1.0,1.0,0.33,1,-0.07
2,2,"61234,13023,91804,61236,17917,84492,61235,6961...",0,0.0,0.0,0.29,1,-0.06
3,3,"103079,104210,103080,4736,103285,104379,102244...",0,0.0,0.0,0.23,1,-0.06
4,4,"103984,76557,102870,99876,102870,99876,96449,9...",0,1.0,1.0,0.42,1,-0.06


In [101]:
len(predictions_strings)

63412

In [102]:
customer_idx_to_id = dict(zip(customers['customer_id_idx'], customers['customer_id']))

In [103]:
submission_set_filt['customer_id'] = submission_set_filt['customer_id_idx'].map(customer_idx_to_id)

In [104]:
submission_set_filt['prediction_model'] = predictions_strings

In [105]:
submission_set_filt.isnull().sum()

customer_id_idx          0
article_id_idx_last10    0
article_id_idx_label     0
FN                       0
Active                   0
age                      0
num_purchases            0
example_age              0
customer_id              0
prediction_model         0
dtype: int64

In [106]:
submission_set_filt.head()

Unnamed: 0,customer_id_idx,article_id_idx_last10,article_id_idx_label,FN,Active,age,num_purchases,example_age,customer_id,prediction_model
0,0,"74711,2861,92243,42086,42086,59253,53988,73222...",0,0.0,0.0,0.27,1,-0.05,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,0624486001 0624486064 0763988006 0820484007 06...
1,1,"100157,102878,101898,102115,33833,27875,98220,...",0,1.0,1.0,0.33,1,-0.07,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,0827487003 0870611002 0795243002 0868054004 09...
2,2,"61234,13023,91804,61236,17917,84492,61235,6961...",0,0.0,0.0,0.29,1,-0.06,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,0640021019 0757926001 0788575004 0788575002 07...
3,3,"103079,104210,103080,4736,103285,104379,102244...",0,0.0,0.0,0.23,1,-0.06,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,0915529005 0935892001 0903762001 0879189005 08...
4,4,"103984,76557,102870,99876,102870,99876,96449,9...",0,1.0,1.0,0.42,1,-0.06,000fb6e772c5d0023892065e659963da90b1866035558e...,0871519008 0889669006 0913272003 0786022008 09...


In [107]:
sample_submission = sample_submission.merge(submission_set_filt[['customer_id', 'prediction_model']], on='customer_id', how='left')
sample_submission.head()

Unnamed: 0,customer_id,prediction,prediction_model
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0706016002 0372860001 0610776002 07...,
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0706016002 0372860001 0610776002 07...,
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0706016002 0372860001 0610776002 07...,
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0372860001 0610776002 07...,
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0706016002 0372860001 0610776002 07...,


In [108]:
sample_submission.isnull().sum() / sample_submission.shape[0]

customer_id         0.000000
prediction          0.000000
prediction_model    0.953781
dtype: float64

In [109]:
sample_submission['prediction_model'].fillna(
    '0924243001 0924243002 0918522001 0923758001 0866731001 0909370001 0751471001 0915529003 0915529005 0448509014 0762846027 0714790020',
    inplace=True
)

In [110]:
sample_submission.drop(columns=['prediction'], inplace=True)
sample_submission.rename(columns={'prediction_model': 'prediction'}, inplace=True)

In [111]:
sample_submission

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0924243001 0924243002 0918522001 0923758001 08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243001 0924243002 0918522001 0923758001 08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0924243001 0924243002 0918522001 0923758001 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243001 0924243002 0918522001 0923758001 08...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243001 0924243002 0918522001 0923758001 08...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0924243001 0924243002 0918522001 0923758001 08...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0924243001 0924243002 0918522001 0923758001 08...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0924243001 0924243002 0918522001 0923758001 08...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0924243001 0924243002 0918522001 0923758001 08...


In [112]:
sample_submission.to_csv('data/20220509-fashionrecv3_FINAL.csv', index=False)