# Shopee Product Match

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.nn import functional as F
from torchvision import transforms
import transformers

import os
import sys
import gc

import math
import cv2
from torch.utils.data import DataLoader,Dataset
from tqdm.notebook import tqdm

if torch.cuda.is_available():
    import cuml
    import cudf
    import cupy

sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import timm
import torchvision

In [None]:
# Root directory of the competition data on Kaggle.
PATH = "/kaggle/input/shopee-product-matching/"
# List the entries under PATH (displayed as the cell output).
os.listdir(PATH)

In [None]:
# Load the test split; a test file with 3 rows or fewer switches the notebook to TRAIN mode.
test = pd.read_csv(PATH + 'test.csv')
TRAIN = len(test) <= 3

In [None]:
# Manual mode switches.
# Before submitting, set TRAIN = False to check that the notebook runs on the test set.
# NOTE: this assignment overrides the TRAIN value derived from len(test) earlier in the file.

TRAIN = False
DEBUG = False  # when True, the loaded frame is duplicated with pd.concat further below

In [None]:
def read_dataset(name="train"):
    """Load one split's CSV and attach the full path of every image.

    name: split name; it is substituted into both the CSV file name and the
        image folder name.
    Returns the DataFrame with an extra "image_path" column.
    """
    csv_file = '/kaggle/input/shopee-product-matching/{}.csv'.format(name)
    image_dir = '/kaggle/input/shopee-product-matching/{}_images/'.format(name)

    frame = pd.read_csv(csv_file)
    frame["image_path"] = image_dir + frame['image']
    return frame

In [None]:
# Load the working frame: the labelled train split in TRAIN mode, the test split otherwise.
if TRAIN:
    train = read_dataset("train")
    # Ground truth for each row: every posting_id that shares its label_group.
    label_group_dict = train.groupby("label_group").posting_id.agg("unique").to_dict()
    train['target'] = train.label_group.map(label_group_dict)
else:
    train = read_dataset("test")

if DEBUG:
    # ignore_index keeps the row labels unique (0..2n-1). Duplicated labels would make
    # label-based lookups such as train.posting_id[int_array] return extra rows.
    train = pd.concat([train] * 2, ignore_index=True)

if torch.cuda.is_available():
    # GPU copy of the frame, consumed by the cuML TF-IDF step.
    train_cu = cudf.DataFrame(train)

train.head()

In [None]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device('cpu')
device

In [None]:
def euclidean_dist(x, y, norm=False):
    """Pairwise Euclidean distance between the rows of x and the rows of y.

    x: 2-D tensor of shape (m, d).
    y: 2-D tensor of shape (n, d).
    norm: when True, every row is scaled to unit L2 norm first.
    Returns an (m, n) tensor. Squared distances are floored at 1e-12 before
    the square root, so the smallest value returned is 1e-6.
    """
    if norm:
        # Floor the norm so an all-zero row stays zero instead of becoming NaN.
        x = x / x.norm(p=2, dim=1, keepdim=True).clamp_min(1e-12)
        y = y / y.norm(p=2, dim=1, keepdim=True).clamp_min(1e-12)

    xx = torch.pow(x, 2).sum(dim=1, keepdim=True)      # (m, 1)
    yy = torch.pow(y, 2).sum(dim=1, keepdim=True).t()  # (1, n)
    # ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x.y
    # Keyword beta/alpha replaces the deprecated positional
    # addmm_(beta, alpha, mat1, mat2) overload.
    dist = torch.addmm(xx + yy, x, y.t(), beta=1, alpha=-2)
    # Clamp before sqrt: rounding can push a squared distance slightly below zero.
    return dist.clamp(min=1e-12).sqrt()

def cosine_dist(x, y):
    """Pairwise cosine similarity between the rows of x and the rows of y.

    Despite the name, the value is a similarity: larger means closer, and a
    row compared with itself gives 1.

    x: 2-D tensor of shape (m, d).
    y: 2-D tensor of shape (n, d).
    Returns an (m, n) tensor. A pair involving an all-zero row yields 0
    rather than NaN.
    """
    norm_x = x.norm(p=2, dim=1, keepdim=True)      # (m, 1)
    norm_y = y.norm(p=2, dim=1, keepdim=True).t()  # (1, n)
    # Floor the denominator so zero-norm rows do not divide by zero.
    denom = (norm_x * norm_y).clamp_min(1e-12)
    return torch.matmul(x, y.t()) / denom

def DistancePredict(features, threshold = 0.9, chunk = 1024, distance_type="cosine", max_matches=50):
    """Predict, for every row of `features`, the posting ids of its matches.

    Rows are compared against all rows in chunks of `chunk` to bound memory.

    features: 2-D tensor with one row per row of the global `train` frame.
    threshold: for "cosine" a match needs similarity > threshold; for
        "euclidean" a match needs distance < threshold (on unit-normalised rows).
    chunk: number of query rows scored per step.
    distance_type: "cosine" or "euclidean".
    max_matches: maximum number of matches kept per row; when more rows pass
        the threshold, only the closest `max_matches` are kept.
    Returns a list with one numpy array of posting ids per row.
    Raises ValueError for an unknown distance_type.
    """
    if distance_type not in ("cosine", "euclidean"):
        raise ValueError("distance_type must be 'cosine' or 'euclidean', got {!r}".format(distance_type))

    # Built once and indexed positionally, so the result does not depend on
    # the row labels of `train`.
    posting_ids = train.posting_id.to_numpy()
    higher_is_closer = distance_type == "cosine"
    total = features.size(0)

    predict = []
    with torch.no_grad():
        for start in tqdm(range(0, total, chunk)):
            stop = min(start + chunk, total)
            x = features[start:stop]

            if higher_is_closer:
                scores = cosine_dist(x, features)
            else:
                scores = euclidean_dist(x, features, norm=True)
            # .cpu() is required on both paths before converting a CUDA tensor.
            scores = scores.detach().cpu().numpy()

            for row in scores:
                if higher_is_closer:
                    hits = np.flatnonzero(row > threshold)
                else:
                    hits = np.flatnonzero(row < threshold)

                if hits.size > max_matches:
                    # Closest first: descending similarity or ascending distance.
                    keys = -row[hits] if higher_is_closer else row[hits]
                    hits = hits[np.argsort(keys)[:max_matches]]

                predict.append(posting_ids[hits])
            del x, scores

    return predict

In [None]:
def f1(target, predict):
    """F1 of one prediction: 2 * |target & predict| / (|target| + |predict|)."""
    shared = np.intersect1d(target, predict)
    total = len(target) + len(predict)
    return 2 * len(shared) / total

def precision(target, predict):
    """Fraction of predicted ids that are in the target.

    target, predict: array-likes of ids.
    Returns |target & predict| / |predict|, or 0.0 when predict is empty
    (instead of raising ZeroDivisionError).
    """
    n_predicted = len(predict)
    if n_predicted == 0:
        return 0.0
    n = len(np.intersect1d(target, predict))
    return n / n_predicted
    
def recall(target, predict):
    """Fraction of target ids that were predicted: |target & predict| / |target|."""
    found = np.intersect1d(target, predict)
    return len(found) / len(target)

def get_metric(target, predict):
    """Print the mean per-row F1, precision and recall of predict against target.

    target, predict: objects supporting reset_index(drop=True) (pandas Series
    in this file); they are paired row by row after their indexes are reset.
    Returns None.
    """
    pairs = pd.DataFrame({
        "target": target.reset_index(drop=True),
        "predict": predict.reset_index(drop=True),
    })

    def score_rows(metric):
        # Apply one metric to every (target, predict) pair.
        return pairs.apply(lambda r: metric(r['target'], r["predict"]), axis=1)

    # All three are computed before anything is printed.
    f1_rows = score_rows(f1)
    precision_rows = score_rows(precision)
    recall_rows = score_rows(recall)

    reports = (
        ("Mean F1: {:f}", f1_rows),
        ("Mean Precision: {:f}", precision_rows),
        ("Mean Recall: {:f}", recall_rows),
    )
    for template, rows in reports:
        print(template.format(rows.mean()))

## Image

In [None]:
class ShopeeImageDataset(Dataset):
    """Dataset that loads product images from the paths stored in a frame.

    dataset: frame with an "image_path" column (and "label_group" when
        train is True).
    transform: optional callable applied to the resized RGB image.
    train: when True, items are (image, label_group); otherwise just image.
    resize: images are resized to (resize, resize).
    """

    def __init__(self, dataset, transform=None, train=True, resize = 256):
        self.dataset = dataset
        self.transform = transform
        self.train = train
        self.resize = resize

    def __len__(self):
        return self.dataset.shape[0]

    def __getitem__(self, index):
        image_path = self.dataset.image_path.iloc[index]
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals a missing or undecodable file by returning None;
            # fail here with the path instead of an opaque cvtColor error.
            raise FileNotFoundError("Could not read image: {}".format(image_path))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (self.resize, self.resize))
        if self.transform:
            image = self.transform(image)
        if self.train:
            label_group = self.dataset.label_group.iloc[index]
            return image, label_group
        return image

In [None]:
# Preprocessing shared by both loaders: ToTensor followed by a per-channel
# Normalize with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Loader settings common to both input sizes; shuffle stays off so the feature
# rows keep the order of the frame.
_loader_options = dict(batch_size=64, shuffle=False, num_workers=2, prefetch_factor=8)

# Default input size of the dataset class (resize = 256).
shopee_image_dataset = ShopeeImageDataset(train, transform=transform, train=TRAIN)
shopee_image_dataloader = DataLoader(shopee_image_dataset, **_loader_options)

# 224x224 inputs, selected for model names containing "swin".
shopee_swin_dataset = ShopeeImageDataset(train, transform=transform, train=TRAIN, resize=224)
shopee_swin_dataloader = DataLoader(shopee_swin_dataset, **_loader_options)

In [None]:
def get_image_feature(model_name):
    """Return the embeddings of every image for one pretrained model.

    The result is cached in the working directory as
    "image_features_<model_name>.pt"; when that file exists it is loaded
    instead of running the model again.

    model_name: checkpoint name understood by get_model; names containing
        "swin" use the 224x224 dataloader.
    Returns the 2-D tensor produced by concatenating the model outputs of all
    batches along dim 0.
    """
    cache_file = "image_features_{}.pt".format(model_name)

    if os.path.exists(cache_file):
        image_features = torch.load(cache_file, map_location=device)
    else:
        dataloader = shopee_swin_dataloader if "swin" in model_name else shopee_image_dataloader
        model = get_model(model_name)
        batches = []
        with torch.no_grad():
            for batch in tqdm(dataloader):
                # In TRAIN mode the dataset yields (images, labels); the labels
                # are not needed for feature extraction, so they stay on the CPU.
                images = batch[0] if TRAIN else batch
                batches.append(model(images.to(device)))
        image_features = torch.cat(batches, dim=0)
        del batches
        # save
        torch.save(image_features, cache_file)

        # Release the model before the next one is loaded.
        del model
        gc.collect()
        torch.cuda.empty_cache()

    print(image_features.shape)
    return image_features

In [None]:
class ArcFace(nn.Module):
    """Additive angular margin head producing scaled cosine logits.

    in_features: size of the incoming embedding.
    out_features: number of classes (one weight row per class).
    s: scale applied to the logits.
    m: angular margin added to the target-class angle.
    """

    def __init__(self, in_features, out_features, s=30, m=0.5):
        super(ArcFace, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Below this cosine, cos(theta + m) is replaced by a linear penalty.
        self.arc_min = math.cos(math.pi - m)
        self.margin_min = math.sin(math.pi - m) * m
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, embedding, label):
        """Return (batch, out_features) logits with the margin applied to `label`.

        embedding: (batch, in_features) tensor.
        label: tensor of class indices, one per batch row.
        """
        cos = F.linear(F.normalize(embedding), F.normalize(self.weight))
        # Clamp before sqrt: rounding can make cos^2 exceed 1, and the sqrt of
        # a negative number is NaN, which a later clamp would not repair.
        sin = torch.sqrt((1.0 - torch.pow(cos, 2)).clamp(0, 1))
        # cos(theta + m) = cos(theta) cos(m) - sin(theta) sin(m)
        phi = cos * self.cos_m - sin * self.sin_m
        phi = torch.where(cos > self.arc_min, phi, cos - self.margin_min)

        # Build the mask on the same device as the logits (works on CPU and GPU).
        one_hot = torch.zeros(cos.size(), device=cos.device)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        logits = one_hot * phi + (1.0 - one_hot) * cos
        return logits * self.s

In [None]:
class Model(nn.Module):
    """timm backbone followed by a linear embedding layer and an ArcFace head.

    model_name: timm model name; it must contain one of "eca_nfnet",
        "efficientnet", "dm_nfnet" or "swin".
    n_classes: number of classes of the ArcFace head.
    fc_dim: size of the embedding produced by the linear layer.
    Raises ValueError for an unsupported model name.
    """

    def __init__(self, model_name, n_classes, fc_dim=512):
        super().__init__()
        print("Building Model Backbone for {} model".format(model_name))

        # (substring of the model name, attribute path of its classifier layer),
        # checked in this order.
        head_layouts = (
            ("eca_nfnet", ("head", "fc")),
            ("efficientnet", ("classifier",)),
            ("dm_nfnet", ("head", "fc")),
            ("swin", ("head",)),
        )
        head_path = None
        for key, path in head_layouts:
            if key in model_name:
                head_path = path
                break
        if head_path is None:
            raise ValueError("Invalid model name: {}".format(model_name))

        self.backbone = timm.create_model(model_name)

        # Walk to the module that owns the classifier layer, record the
        # classifier's input width, then replace it with a pass-through.
        owner = self.backbone
        for attr in head_path[:-1]:
            owner = getattr(owner, attr)
        feat_size = getattr(owner, head_path[-1]).in_features
        setattr(owner, head_path[-1], nn.Identity())

        self.fc = nn.Linear(feat_size, fc_dim)
        self.margin = ArcFace(fc_dim, n_classes)

    def forward(self, x, labels=None):
        """Return ArcFace logits when labels are given, else L2-normalised embeddings."""
        embedding = self.fc(self.backbone(x))
        if labels is None:
            return F.normalize(embedding, dim=1)
        return self.margin(embedding, labels)

In [None]:
def get_model(model_name):
    """Build a Model, load its checkpoint and return it in eval mode on `device`.

    model_name: checkpoint name; the backbone name is model_name with its last
        two "_"-separated parts removed, and the checkpoint is read from
        "../input/arcface-pretrained-model/<model_name>.pt".
    """
    parts = model_name.split("_")
    name = "_".join(parts[:-2])

    # Backbones whose name contains "swin" are built with 9977 classes, all others with 9499.
    n_classes = 9977 if "swin" in name else 9499
    model = Model(name, n_classes)

    weights = torch.load("../input/arcface-pretrained-model/{}.pt".format(model_name), map_location=device)
    model.load_state_dict(weights)
    model.to(device)

    # eval
    model.eval()
    return model

### Single net

In [None]:
"""THRESHOLD = 0.55
image_features = get_image_feature("swin_small_patch4_window7_224_arcface_25")
image_pred = DistancePredict(image_features, threshold= THRESHOLD)
train["image_pred"] = image_pred
if TRAIN:
    get_metric(train["target"], train["image_pred"])
    
del image_features
gc.collect()
torch.cuda.empty_cache()"""

### UnionEnsemble

In [None]:
# Union/Intersect Ensemble
def EasyEnsemblePredict(model_names, thresholds, ensemble_type="union"):
    """Predict with several image models and merge their per-row matches.

    model_names: checkpoint names passed to get_image_feature.
    thresholds: one similarity threshold per model, in the same order.
    ensemble_type: "union" keeps ids predicted by any model, "intersect"
        keeps ids predicted by every model.
    Returns a Series with one array of posting ids per row.
    Raises ValueError for an unknown ensemble_type or when the two lists
    differ in length.
    """
    from functools import reduce

    combiners = {"union": np.union1d, "intersect": np.intersect1d}
    if ensemble_type not in combiners:
        raise ValueError("ensemble_type must be 'union' or 'intersect', got {!r}".format(ensemble_type))
    if len(model_names) != len(thresholds):
        raise ValueError("model_names and thresholds must have the same length")

    tmp = pd.DataFrame()
    # Iterate the arguments, not the notebook globals MODEL_NAME / THRESHOLD.
    for model_name, threshold in zip(model_names, thresholds):
        column = "{}_pred".format(model_name)

        # extract feature
        image_features = get_image_feature(model_name)

        # distance-based prediction
        tmp[column] = DistancePredict(image_features, threshold=threshold)

        del image_features
        gc.collect()
        torch.cuda.empty_cache()

        # metric
        if TRAIN:
            print("MODEL: {} THRESHOLD: {}".format(model_name, threshold))
            get_metric(train["target"], tmp[column])

    # ensemble: fold the per-model predictions of each row into one array
    combine = combiners[ensemble_type]
    return tmp.apply(lambda row: reduce(combine, row.to_numpy()), axis=1)

In [None]:
"""MODEL_NAME = ["eca_nfnet_l0_arcface_7", "eca_nfnet_l1_arcface_8" , "efficientnet_b4_arcface_13"]
THRESHOLD = [0.55, 0.55, 0.6]

ensemble_pred = EasyEnsemblePredict(MODEL_NAME, THRESHOLD, ensemble_type="union")
train["image_pred"] = ensemble_pred
    
if TRAIN:
    print("\nMODEL:  UnionEnsemble")
    get_metric(train["target"], train["image_pred"])"""

### ConcatEnsemble

In [None]:
def ConcatEnsemblePredict(model_names, threshold, normalize = True, distance_type="cosine"):
    """Predict matches from the concatenated embeddings of several models.

    model_names: checkpoint names passed to get_image_feature.
    threshold: threshold forwarded to DistancePredict.
    normalize: when True, the concatenated rows are L2-normalised.
    distance_type: forwarded to DistancePredict.
    Returns the list of per-row predictions from DistancePredict.
    """
    per_model = [get_image_feature(model_name) for model_name in model_names]
    stacked = torch.hstack(per_model)
    del per_model
    print(stacked.shape)

    # normalize
    if normalize:
        stacked = F.normalize(stacked, dim=1)

    # predict
    concat_pred = DistancePredict(stacked, threshold=threshold, distance_type=distance_type)

    # Drop the large tensor before asking torch to release cached memory.
    del stacked
    gc.collect()
    torch.cuda.empty_cache()

    return concat_pred

In [None]:
MODEL_NAME = ["eca_nfnet_l0_arcface_7", "eca_nfnet_l1_arcface_8" , "efficientnet_b4_arcface_13"]

# Two passes over the same ensemble: the stricter threshold fills "image_pred",
# the looser one fills "image_pred_wait".
for THRESHOLD, _pred_column in ((0.65, "image_pred"), (0.46, "image_pred_wait")):
    concat_pred = ConcatEnsemblePredict(MODEL_NAME, THRESHOLD)
    train[_pred_column] = concat_pred

    if TRAIN:
        print("\nMODEL:  ConcatEnsemble THRESHOLD: {}".format(THRESHOLD))
        get_metric(train["target"], train[_pred_column])

    torch.cuda.empty_cache()

## Text

### TF-IDF

In [None]:
def TFIDFExtractFeature(df, max_features):
    """Fit a TF-IDF vectorizer on df.title and return the dense feature matrix.

    The cuML vectorizer is used when CUDA is available, the scikit-learn one
    otherwise.

    df: frame with a "title" column.
    max_features: vocabulary size limit passed to the vectorizer.
    Returns the result of .toarray() on the transformed titles.
    """
    use_gpu = torch.cuda.is_available()
    if not use_gpu:
        from sklearn.feature_extraction.text import TfidfVectorizer
    else:
        from cuml.feature_extraction.text import TfidfVectorizer

    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
    titles = df.title
    vectorizer.fit(titles)

    dense = vectorizer.transform(titles).toarray()
    print(dense.shape)
    return dense

In [None]:
MAX_FEATURES = 25000

# The cuML vectorizer receives the cudf copy of the frame, scikit-learn the pandas one.
text_features = TFIDFExtractFeature(train_cu if torch.cuda.is_available() else train, MAX_FEATURES)
text_features = torch.Tensor(text_features).to(device)

# The stricter threshold fills "text_pred", the looser one fills "text_pred_wait".
for THRESHOLD, _pred_column in ((0.75, "text_pred"), (0.51, "text_pred_wait")):
    text_pred = DistancePredict(text_features, threshold=THRESHOLD, distance_type="cosine")
    train[_pred_column] = text_pred

    if TRAIN:
        get_metric(train["target"], train[_pred_column])

del text_features
gc.collect()
torch.cuda.empty_cache()

### Bert

In [None]:
class ShopeeTextDataset(Dataset):
    """Dataset over the "title" column of a frame.

    dataset: frame with a "title" column (and "label_group" when train is truthy).
    train: when truthy, items are (title, label_group); otherwise just title.
    """

    def __init__(self, dataset, train):
        self.train = train
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        title = self.dataset.title.iloc[index]
        if not self.train:
            return title
        return title, self.dataset.label_group.iloc[index]
    
# Batches of 64 titles in frame order; labels are included only when TRAIN is set.
shopee_text_dataset = ShopeeTextDataset(dataset=train, train=TRAIN)
shopee_text_dataloader = DataLoader(shopee_text_dataset, batch_size=64, num_workers=2, shuffle=False)

In [None]:
"""from transformers import RobertaTokenizer, RobertaModel
bert = RobertaModel.from_pretrained("../input/roberta-base/pytorch_model.bin",
                                         config = "../input/roberta-base/config.json")
bert.load_state_dict"""

In [None]:
def get_bert_feature(model_name, max_length):
    """Return text embeddings of every title for one model.

    In TRAIN mode the result is cached as "text_features_<model_name>.pt" and
    reloaded when that file exists.

    model_name: checkpoint name passed to get_model.
    max_length: maximum token length; longer titles are truncated.
    Returns the tensor obtained by concatenating all batch outputs along dim 0.
    """
    cache_file = "text_features_{}.pt".format(model_name)

    # load
    if TRAIN and os.path.exists(cache_file):
        # Bound to the same name that is printed and returned below.
        text_features = torch.load(cache_file, map_location = device)
    else:
        from transformers import RobertaTokenizer, RobertaModel
        tokenizer = RobertaTokenizer.from_pretrained("../input/roberta-base/vocab.json")
        # RobertaModel is the class imported above (BertModel was never imported).
        bert = RobertaModel.from_pretrained("../input/roberta-base/pytorch_model.bin",
                                            config = "../input/roberta-base/config.json").to(device)
        # NOTE(review): `bert` is loaded but the features come from
        # get_model(model_name), which builds the image Model whose forward
        # takes (x, labels). Confirm which text model is intended here.
        model = get_model(model_name) # e.g. roberta-based
        batches = []
        with torch.no_grad():
            for batch in tqdm(shopee_text_dataloader):
                # In TRAIN mode the dataset yields (titles, labels); only titles are used.
                texts = list(batch[0] if TRAIN else batch)
                encoded = tokenizer(texts, max_length = max_length, truncation=True, padding=True, return_tensors="pt")
                # Forward only the tensors the tokenizer actually produced;
                # token_type_ids may be absent for RoBERTa tokenizers.
                inputs = {
                    key: encoded[key].to(device)
                    for key in ("input_ids", "token_type_ids", "attention_mask")
                    if key in encoded
                }
                features = model(**inputs)
                batches.append(features.data)
                del encoded, inputs
        text_features = torch.cat(batches, dim=0)
        del batches
        # save
        if TRAIN:
            torch.save(text_features, cache_file)

        # Release both networks before asking torch to free cached memory.
        del model, bert
        gc.collect()
        torch.cuda.empty_cache()

    print(text_features.shape)
    return text_features

In [None]:
"""MAX_LENGTH = 30
THRESHOLD = 0.7
text_features = get_text_feature(model_name, max_length=MAX_LENGTH) # roberta-based
text_pred = DistancePredict(text_features, threshold = THRESHOLD, distance_type="cosine")

if TRAIN:
    get_metric(train["target"], train["text_pred"])
    
del text_features
gc.collect()
torch.cuda.empty_cache()    
"""

## MultiModal Fusion

In [None]:
def union(x,y):
    """Return the sorted unique values found in x or y."""
    merged = np.union1d(x, y)
    return merged

def intersect(x,y):
    """Return the sorted unique values found in both x and y."""
    common = np.intersect1d(x, y)
    return common

# Step 1: a posting matches when either the image model or the text model says so.
train["pred"] = train.apply(lambda row: union(row['image_pred'], row["text_pred"]),axis=1)
# Step 2: "wait" candidates come from the lower-threshold runs and count only
# when both modalities agree on them.
train["wait"] = train.apply(lambda row: intersect(row['image_pred_wait'], row["text_pred_wait"]),axis=1)
# Step 3: add the agreed candidates to the final prediction (overwrites "pred").
train["pred"] = train.apply(lambda row: union(row['pred'], row["wait"]),axis=1)

# Score the fused prediction when ground truth is available.
if TRAIN:
    get_metric(train["target"], train["pred"])

In [None]:
def submission(row):
    """Join the strings in row into one space-separated string."""
    separator = ' '
    return separator.join(row)

# One space-separated string of matched posting ids per row.
train["matches"] = train["pred"].apply(submission)
# submit
train[['posting_id','matches']].to_csv('submission.csv',index=False)
# Read the file back under its own name: rebinding `submission` would shadow the
# submission() helper and make this cell fail when it is run a second time.
submission_df = pd.read_csv('submission.csv')
submission_df.head()