In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

'''
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
'''

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import pandas as pd 

# Load the training metadata; the cell output shows the number of rows together
# with the number of postings in each label group.
df = pd.read_csv('/kaggle/input/shopee-product-matching/train.csv')
label_group_sizes = df['label_group'].value_counts()
len(df), label_group_sizes

# Product matching using contrastive learning

<img src="https://lh6.googleusercontent.com/UVN-ArRhK7YeFcmWaTyPE8Qzqmt1cU_9Krnupz77OIW0qu3cT8GLuDvQEzuwGbN4kWLEwfkkdkaSAyhJPJUC2oY1MvmkU-Ghitj4XzRCvMCnUNgkpfVKphXTLyMhc4tqyF6OhnTB" width="750" align="center">

## install 3rd party pip packages

In [None]:
#!pip install pytorch-lightning pytorch-metric-learning torchvision faiss-cpu scikit-learn
!pip install ../input/wheels/faiss_cpu-1.7.0-cp37-cp37m-manylinux2014_x86_64.whl
!pip install ../input/wheels/pytorch_metric_learning-0.9.98-py3-none-any.whl
!pip install ../input/wheels/timm-0.4.5-py3-none-any.whl

## define custom dataset

In [None]:
# custom dataset
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

from PIL import Image
from tqdm import tqdm

import torch
import os, sys
import json
import pandas as pd
import PIL
import argparse
import random
import matplotlib.pyplot as plt

class ProductPairDataset(Dataset):
    """Dataset that yields transformed product images, optionally with their label group.

    Each item is read from the row at position ``index`` of the dataframe
    (``DataFrame.iloc``), so callers must pass positional indices.
    """

    def __init__(self, df, root_dir, img_size=260, train_mode=True, test_mode=False, transform=None):
        """
        Args:
            df (DataFrame): part of entire dataframe; needs an "image" column and,
                unless ``test_mode`` is set, a "label_group" column.
            root_dir (str): root image path
            img_size (int): side length of the square image produced by the default transforms.
            train_mode (bool): selects the augmenting default transform (resize, random crop,
                random horizontal flip) instead of the plain resize one.
            test_mode (bool): when True, ``__getitem__`` returns only the image tensor.
            transform (callable, optional): Optional transform to be applied on a sample.
                When given it replaces the default transforms entirely.
        """
        self.products_frame = df
        self.root_dir = root_dir
        self.train_mode = train_mode
        self.test_mode = test_mode

        # Not populated by this class; kept so existing attribute access keeps working.
        self.images = []
        self.labels = []

        if transform is not None:
            self.transform = transform
            return

        normalize = transforms.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
        )
        if self.train_mode:  # set default image transform
            # Resize to img_size / 0.9 first so the random crop sees a slightly larger image.
            enlarged_size = int(img_size // 0.9)
            self.transform = transforms.Compose(
                [
                    transforms.Resize([enlarged_size, enlarged_size]),
                    transforms.RandomCrop([img_size, img_size]),
                    transforms.RandomHorizontalFlip(),
                    transforms.ToTensor(),
                    normalize,
                ]
            )
        else:
            self.transform = transforms.Compose(
                [
                    transforms.Resize([img_size, img_size]),
                    transforms.ToTensor(),
                    normalize,
                ]
            )

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.products_frame)

    def __getitem__(self, index):
        """Return ``(image_tensor, label_tensor)``, or only ``image_tensor`` in test mode.

        ``label_tensor`` is a ``LongTensor`` holding the single "label_group" value of the row.
        """
        row = self.products_frame.iloc[index]
        image_path = os.path.join(self.root_dir, row["image"])

        # Close the file handle deterministically and force three channels: the
        # default Normalize step has 3-channel statistics and fails on
        # grayscale / RGBA / CMYK inputs.
        with PIL.Image.open(image_path) as image:
            image_tensor = self.transform(image.convert("RGB"))

        if self.test_mode:
            return image_tensor

        label = int(row["label_group"])
        label_tensor = torch.LongTensor([label])

        return image_tensor, label_tensor

## implementation background of this

## ContrastiveLoss based on cosine similarity

![](https://kevinmusgrave.github.io/pytorch-metric-learning/imgs/contrastive_loss_similarity_equation.png)

With cosine similarity, appropriate values are **pos_margin = 1 and neg_margin = 0**.

## using in-batch contrastive loss

![In-batch contrastive learning](https://d3i71xaburhd42.cloudfront.net/bbe55736e6f4681c54ec4a889b9b12b6e4c25b56/2-Figure1-1.png)

## This only covers in-batch pair relations! How can we expand it?

![](https://i.ytimg.com/vi/SDKDSvv9oTk/maxresdefault.jpg)

## model definition

In [None]:
## model definition
from torchvision import transforms
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data.sampler import BatchSampler

from pytorch_metric_learning import miners, losses
from pytorch_metric_learning.distances import CosineSimilarity

import torch
import torch.nn as nn
import torch.nn.functional as F

import pytorch_lightning as pl

import PIL
import pandas as pd
import random

class ProductFeatureNet(nn.Module):
    """Backbone followed by a bias-free linear projection and L2 normalisation.

    Args:
        backbone_net (nn.Module): network whose output is fed to the projection.
            Its output width is read from the ``out_features`` of its ``fc``
            head or, when there is no such head, of its ``classifier`` head.
        feature_dim (int): size of the produced embedding.

    Raises:
        AttributeError: if the backbone exposes neither ``fc.out_features`` nor
            ``classifier.out_features``.
    """

    def __init__(self, backbone_net, feature_dim=768):
        super(ProductFeatureNet, self).__init__()

        self.feature_dim = feature_dim
        self.backbone_net = backbone_net
        self.feature_layer = nn.Linear(
            self._backbone_out_features(backbone_net), self.feature_dim, bias=False
        )
        nn.init.xavier_uniform_(self.feature_layer.weight)

    @staticmethod
    def _backbone_out_features(backbone_net):
        """Return the output width of the backbone's final linear head."""
        # `fc` is checked first so backbones that worked before behave the same;
        # `classifier` covers backbones that name their head differently.
        for head_name in ("fc", "classifier"):
            head = getattr(backbone_net, head_name, None)
            out_features = getattr(head, "out_features", None)
            if out_features is not None:
                return out_features
        raise AttributeError(
            "backbone_net must expose `fc.out_features` or `classifier.out_features`"
        )

    def forward(self, images):
        """Return one unit-length embedding per row of the backbone output."""
        features = self.backbone_net(images)
        features = self.feature_layer(features)
        # dim=1 is F.normalize's default; spelled out to make the axis explicit.
        features = F.normalize(features, dim=1)

        return features

class ProductFeatureEncoder(pl.LightningModule):
    """LightningModule that trains an embedding model with a cross-batch-memory contrastive loss.

    Args:
        model: embedding network; must expose ``feature_dim``, ``backbone_net``
            and ``feature_layer`` (all three are read by this class).
        margin (float): passed as ``epsilon`` to the ``MultiSimilarityMiner``.
        lr (float): learning rate of the feature layer; the backbone uses ``lr * 0.1``.
        lr_patience (int): ``patience`` of the ``ReduceLROnPlateau`` scheduler.
        lr_decay_ratio (float): ``factor`` of the ``ReduceLROnPlateau`` scheduler.
        memory_batch_max_num (int): ``memory_size`` of the ``CrossBatchMemory`` loss.
    """

    def __init__(
        self,
        model,
        margin=0.5,
        lr=1e-3,
        lr_patience=2,
        lr_decay_ratio=0.5,
        memory_batch_max_num=1024,
    ):
        super().__init__()

        self.save_hyperparameters()

        self.model = model
        self.margin = margin
        self.lr = lr
        self.lr_patience = lr_patience
        self.lr_decay_ratio = lr_decay_ratio

        self.memory_batch_max_num = memory_batch_max_num

        # Contrastive loss on cosine similarity, wrapped in a cross-batch memory
        # whose pairs are selected by a multi-similarity miner.
        self.loss_func = losses.CrossBatchMemory(
            losses.ContrastiveLoss(pos_margin=1, neg_margin=0, distance=CosineSimilarity()),
            self.model.feature_dim,
            memory_size=self.memory_batch_max_num,
            miner=miners.MultiSimilarityMiner(epsilon=self.margin)
        )

    def forward(self, images):
        """Return the embeddings produced by the wrapped model."""
        features = self.model(images)

        return features

    def configure_optimizers(self):
        """Build Adam with a 10x smaller backbone learning rate and a plateau scheduler on ``val_loss``."""
        optim = torch.optim.Adam(
            [
                {"params": self.model.backbone_net.parameters(), "lr": self.lr * 0.1},
                {"params": self.model.feature_layer.parameters()},
            ],
            lr=self.lr,
        )

        return {
            "optimizer": optim,
            "lr_scheduler": ReduceLROnPlateau(
                optim,
                patience=self.lr_patience,
                threshold=1e-8,
                factor=self.lr_decay_ratio,
            ),
            # Must match the key logged in validation_step.
            "monitor": "val_loss",
        }

    def training_step(self, train_batch, batch_idx):
        """Compute and log the training loss for one batch."""
        self.model.train()

        images, labels = train_batch
        features = self.model(images)

        # Labels arrive with a trailing dimension of size 1; the loss is given them without it.
        xbm_loss = self.loss_func(features, labels.squeeze(1))
        self.log("train/loss", xbm_loss, prog_bar=True)

        return xbm_loss

    def validation_step(self, validation_batch, batch_idx):
        """Compute the validation loss for one batch and log it as ``val_loss``."""
        self.model.eval()

        images, labels = validation_batch

        with torch.no_grad():
            features = self.model(images)
            # NOTE(review): this is the same CrossBatchMemory instance used for
            # training, so calling it here presumably also feeds validation
            # embeddings into the shared memory - confirm this is intended.
            xbm_loss = self.loss_func(features, labels.squeeze(1))
            # Logged under the key that configure_optimizers monitors; it was
            # previously logged as "train/loss", which left "val_loss" unlogged.
            self.log("val_loss", xbm_loss, prog_bar=True)

            return {
                "features": features,
                "labels": labels,
                "val_loss": xbm_loss,
            }


## define a custom batch sampler that ensures every batch includes positive pairs

In [None]:
## positive pair augment custom batch sampler
class PositivePairAugBatchSampler(BatchSampler):
    """Batch sampler that draws several instances of each selected label group.

    Every batch is built from ``num_labels_per_batch`` labels drawn with
    replacement, each contributing ``min_positive_instances`` row positions
    (also drawn with replacement), so a batch always has
    ``num_labels_per_batch * min_positive_instances`` entries.

    The yielded values are row *positions* (0 .. len(dataset_df) - 1), i.e. what
    a dataset indexing with ``DataFrame.iloc`` expects. Index labels are not
    used, because they stop matching positions once a frame has been shuffled
    or sliced.

    Args:
        dataset_df (DataFrame): frame with a "label_group" column.
        min_positive_instances (int): rows drawn per selected label.
        num_labels_per_batch (int): labels drawn per batch.
    """

    def __init__(self, dataset_df, min_positive_instances=4, num_labels_per_batch=16):
        self.max_iter = len(dataset_df) // (min_positive_instances * num_labels_per_batch)
        self.min_positive_instances = min_positive_instances
        self.num_labels_per_batch = num_labels_per_batch

        # key: label, value: positions of the rows carrying that label.
        # Built in a single pass; every stored list has at least one entry.
        self.label_index_dict = {}
        for position, label in enumerate(dataset_df["label_group"]):
            self.label_index_dict.setdefault(label, []).append(position)

    def __len__(self):
        """Return the number of batches produced per iteration."""
        return self.max_iter

    def __iter__(self):
        """Yield ``max_iter`` lists of row positions."""
        candidate_labels = list(self.label_index_dict.keys())

        for _ in range(self.max_iter):
            batch_indices = []

            selected_labels = random.choices(candidate_labels, k=self.num_labels_per_batch)

            for label in selected_labels:
                batch_indices.extend(
                    random.choices(
                        self.label_index_dict[label], k=self.min_positive_instances
                    )
                )

            yield batch_indices

## import pre-trained backbone model for extracting image feature

In [None]:
import timm
import torch

'''
backbone_net = timm.create_model('tf_efficientnet_b0')
backbone_net.load_state_dict(torch.load('../input/timm-pretrained-efficientnet/efficientnet/tf_efficientnet_b0_aa-827b6e33.pth'))


backbone_net = timm.create_model('tf_efficientnet_es')
backbone_net.load_state_dict(torch.load('../input/timm-pretrained-efficientnet/efficientnet/tf_efficientnet_es-ca1afbfe.pth'))

backbone_net = timm.create_model('tf_efficientnet_b0_ap')
backbone_net.load_state_dict(torch.load('../input/timm-pretrained-efficientnet/efficientnet/tf_efficientnet_b0_ap-f262efe1.pth'))

backbone_net = timm.create_model('tf_efficientnet_b1_ns')
backbone_net.load_state_dict(torch.load('../input/timm-pretrained-efficientnet/efficientnet/tf_efficientnet_b1_ns-99dd0c41.pth'))

backbone_net = timm.create_model('tf_efficientnet_b2_ns')
backbone_net.load_state_dict(torch.load('../input/timm-pretrained-efficientnet/efficientnet/tf_efficientnet_b2_ns-00306e48.pth'))
'''

# Read the locally stored ResNet-101 checkpoint, instantiate the matching timm
# architecture and copy the checkpoint weights into it.
resnet101_state_dict = torch.load('../input/timm-pretrained-resnet/resnet/resnet101-5d3b4d8f.pth')
backbone_net = timm.create_model('resnet101')
backbone_net.load_state_dict(resnet101_state_dict)

# Display the module structure as the cell output.
backbone_net

## start model training (the PyTorch Lightning way)

In [None]:
%%time

from pytorch_lightning.metrics.functional import f1, accuracy
from pytorch_lightning.callbacks import EarlyStopping
from torch.utils.data import DataLoader, random_split
from sklearn.utils import shuffle
from tqdm import tqdm

import pandas as pd
import multiprocessing
import numpy as np
import faiss

# Single configuration mapping read by the cells below.
args = dict(
    # model parameters
    feature_dim=512,

    # training parameters
    epochs=20,
    margin=0.4,
    lr=4e-4,
    lr_patience=2,
    early_stop_patience=4,
    lr_decay_ratio=0.1,
    batch=128,
    memory_batch_max_num=2048,

    # dataset parameters
    train_portion=0.95,
    train_csv_file="/kaggle/input/shopee-product-matching/train.csv",
    train_root_dir="/kaggle/input/shopee-product-matching/train_images",
)

# Init model: wrap the backbone in the embedding head, then in the Lightning module.
embedding_net = ProductFeatureNet(
    backbone_net=backbone_net,
    feature_dim=args['feature_dim']
)
product_encoder = ProductFeatureEncoder(
    model=embedding_net, lr=args['lr'], margin=args['margin'], memory_batch_max_num=args['memory_batch_max_num']
)

# Init DataLoader from Custom Dataset
dataset_df = pd.read_csv(args['train_csv_file'])
dataset_df = shuffle(dataset_df)

# Split the shuffled frame, then renumber each part from 0. The batch sampler
# collects values from the frame's index while the dataset reads rows with
# .iloc (positions); without the reset the shuffled index labels would point at
# unrelated rows, so the sampled "positive" instances would not share a label.
num_train_rows = int(len(dataset_df) * args['train_portion'])
train_df = dataset_df[:num_train_rows].reset_index(drop=True)
valid_df = dataset_df[num_train_rows:].reset_index(drop=True)

train_batch_sampler = PositivePairAugBatchSampler(train_df)
train_dataset = ProductPairDataset(
    df=train_df,
    root_dir=args['train_root_dir'],
    train_mode=True,
)
train_loader = DataLoader(
    train_dataset,
    num_workers=multiprocessing.cpu_count(),
    pin_memory=True,
    batch_sampler = train_batch_sampler
)

valid_batch_sampler = PositivePairAugBatchSampler(valid_df)
valid_dataset = ProductPairDataset(
    df=valid_df,
    root_dir=args['train_root_dir'],
    train_mode=False,
)
valid_loader = DataLoader(
    valid_dataset,
    num_workers=multiprocessing.cpu_count(),
    batch_sampler = valid_batch_sampler
)

# Plain sequential loader over the validation split (fixed batch size, no custom sampler).
test_loader = DataLoader(
    valid_dataset,
    num_workers=multiprocessing.cpu_count(),
    batch_size=args['batch']
)

# Early-stopping callback on the validation loss. It is only constructed here:
# the `callbacks` entry of the trainer options below is commented out.
early_stopping = EarlyStopping(monitor="val_loss", patience=args['early_stop_patience'])

# Initialize a trainer
# NOTE(review): `gpus`, `tpu_cores=8` and accelerator="ddp" are all requested at
# once - confirm this combination is valid for the runtime this notebook targets.
trainer_options = dict(
    gpus=torch.cuda.device_count(),
    progress_bar_refresh_rate=1,
    accelerator="ddp",
    max_epochs=args['epochs'],
    #callbacks=[early_stopping],
    tpu_cores=8,
    replace_sampler_ddp=False,
)
trainer = pl.Trainer(**trainer_options)

# Train the model
print ('training start!')
trainer.fit(product_encoder, train_loader, valid_loader)
print ('training end!')

## pulling image features using trained model

In [None]:
#del train_dataset
#del train_loader

# Re-read the full training csv (original row order) and embed every image with
# the trained model, using the non-augmenting transform.
dataset_df = pd.read_csv(args['train_csv_file'])

valid_dataset = ProductPairDataset(
    df=dataset_df,
    root_dir=args['train_root_dir'],
    train_mode=False,
)
valid_loader = DataLoader(
    valid_dataset, batch_size=args['batch'], num_workers=multiprocessing.cpu_count(), shuffle=False
)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
product_encoder.model = product_encoder.model.to(device)
product_encoder.model.eval()

# Collect the per-batch features and concatenate once at the end; concatenating
# inside the loop re-copies everything gathered so far on every batch.
feature_batches = []

# store image feature embedding iterating over data
for images, labels in tqdm(valid_loader, desc='storing image features ...'):
    images = images.to(device)
    with torch.no_grad():
        features = product_encoder.model(images)

    feature_batches.append(features.cpu())

# Stays None when the loader produced no batch.
embeddings = torch.cat(feature_batches) if feature_batches else None

## find similarity threshold for optimal f1 score

In [None]:
df = pd.read_csv(args['train_csv_file'])

# Ground truth for every row: the space-joined posting ids of all rows sharing
# its label group (the row itself included, in dataframe order). One groupby
# pass replaces filtering the whole frame once per row.
matches_by_label = df.groupby('label_group')['posting_id'].agg(' '.join)
df['matches'] = df['label_group'].map(matches_by_label)

df.head()

In [None]:
# Inner-product index over the embeddings, queried with the embeddings
# themselves; the cell output shows the returned scores and row indices.
embedding_matrix = embeddings.numpy()

index = faiss.IndexFlatIP(args['feature_dim'])
index.add(embedding_matrix)

# search max 50 candidates
distances, indices = index.search(embedding_matrix, k=50)
distances, indices

In [None]:
# search similarity threshold for optimal f1 score
def f1_score(y_true, y_pred):
    '''
    F1 score between the true and the predicted matches of one row.

    precision = len(intersection(y_true, y_pred)) / len(y_pred)
    recall = len(intersection(y_true, y_pred)) / len(y_true)
    f1 = (2 * (precision * recall)) / (precision + recall) => (2 * len(intersection(y_true, y_pred))) / (len(y_true) + len(y_pred))

    Args:
        y_true: true matches, either a whitespace-separated string of ids or an
            iterable of ids.
        y_pred: predicted matches, in the same form.

    Returns:
        float: the F1 score; 0.0 when both inputs are empty.
    '''
    # The callers pass space-joined strings. Split them into ids first:
    # set()/len() applied to the raw string would compare single characters.
    true_items = y_true.split() if isinstance(y_true, str) else list(y_true)
    pred_items = y_pred.split() if isinstance(y_pred, str) else list(y_pred)

    total = len(true_items) + len(pred_items)
    if total == 0:
        return 0.0

    intersection = set(true_items) & set(pred_items)

    return (2 * len(intersection)) / float(total)


max_f1 = 0.0
similarity_threshold = 0.0

# Look posting ids up by column name instead of relying on posting_id being the
# first column, and fetch the column once instead of materialising a dataframe
# row for every neighbour.
posting_ids = df['posting_id'].to_numpy()

for threshold in tqdm(np.arange(0.6, 1.0, 0.05), desc='searching similarity threshold for optimal f1 score ...'):
    # For every query row keep the neighbours whose score reaches the threshold,
    # in the order the search returned them. The loop variables are named so
    # they do not overwrite the `index` search object.
    matches_pred = [
        ' '.join(posting_ids[neighbor_ids[neighbor_scores >= threshold]])
        for neighbor_scores, neighbor_ids in zip(distances, indices)
    ]

    df['matches_pred'] = matches_pred

    df['f1'] = [
        f1_score(true_matches, pred_matches)
        for true_matches, pred_matches in zip(df['matches'], df['matches_pred'])
    ]
    mean_f1 = df['f1'].mean()

    print (f"f1 score of similarity threshold({threshold}): {mean_f1}")

    # Strict comparison: on a tie the lower threshold found first is kept.
    if mean_f1 > max_f1:
        similarity_threshold = threshold
        max_f1 = mean_f1

In [None]:
# generate submission file
# Embed every test image with the trained model. test_mode=True makes the
# dataset return only the image tensor.
test_dataset = ProductPairDataset(
    df=pd.read_csv('../input/shopee-product-matching/test.csv'),
    root_dir='../input/shopee-product-matching/test_images',
    train_mode=False,
    test_mode=True
)
test_loader = DataLoader(
    test_dataset, batch_size=args['batch'], num_workers=multiprocessing.cpu_count(), shuffle=False
)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
product_encoder.model = product_encoder.model.to(device)
product_encoder.model.eval()

# Collect the per-batch features and concatenate once at the end; concatenating
# inside the loop re-copies everything gathered so far on every batch.
feature_batches = []

# store image feature embedding iterating over data
for images in tqdm(test_loader, desc='storing image features ...'):
    images = images.to(device)
    with torch.no_grad():
        features = product_encoder.model(images)

    feature_batches.append(features.cpu())

# Stays None when the loader produced no batch.
embeddings = torch.cat(feature_batches) if feature_batches else None
            
# Inner-product index over the test embeddings, queried with the same
# embeddings; the cell output also shows the threshold that will be applied.
test_embedding_matrix = embeddings.numpy()

index = faiss.IndexFlatIP(args['feature_dim'])
index.add(test_embedding_matrix)

# search max 50 candidates
distances, indices = index.search(test_embedding_matrix, k=50)

distances, indices, similarity_threshold

## generate submission file

In [None]:
df = pd.read_csv('../input/shopee-product-matching/test.csv')

# Look posting ids up by column name instead of relying on posting_id being the
# first column, and fetch the column once instead of materialising a dataframe
# row for every neighbour.
posting_ids = df['posting_id'].to_numpy()

# For every test row keep the neighbours whose score reaches the selected
# threshold, in the order the search returned them.
matches_pred = [
    ' '.join(posting_ids[neighbor_ids[neighbor_scores >= similarity_threshold]])
    for neighbor_scores, neighbor_ids in zip(distances, indices)
]

df['matches_pred'] = matches_pred

df

In [None]:
'''
torch.mm(embeddings[0].unsqueeze(0), embeddings[33231].unsqueeze(0).t())

img_array = np.array(Image.open(f"{args['train_root_dir']}/{df.iloc[0]['image']}"))
plt.imshow(img_array)

img_array = np.array(Image.open(f"{args['train_root_dir']}/{df.iloc[33231]['image']}"))
plt.imshow(img_array)
'''

# Write the submission by hand: a header line, then one "posting_id,matches"
# line per row. Each row is prefixed with the newline, so the file does not end
# with a trailing newline.
with open('submission.csv', 'w') as submission_file:
    submission_file.write('posting_id,matches')
    for posting_id, predicted_matches in zip(df['posting_id'], df['matches_pred']):
        submission_file.write(f"\n{posting_id},{predicted_matches}")
        
!cat submission.csv