Big thanks to Chris for his [kernel](https://www.kaggle.com/cdeotte/part-2-rapids-tfidfvectorizer-cv-0-700)

In [None]:
import numpy as np
import cupy, cudf
import gc
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import random
import torch
import torchvision
from torchvision import  models, transforms
from transformers import BertTokenizer, BertModel
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
import torch.nn as nn
import torch.nn.functional as F
import os
import glob
from PIL import Image
import seaborn as sns
import cv2, matplotlib.pyplot as plt
import matplotlib.image as mpimg
from textwrap import wrap

# Data import

In [None]:
device = 'cuda'if torch.cuda.is_available() else 'cpu'
device

In [None]:
PATH = '../input/shopee-product-matching/'
PATH_TO_IMG = '../input/shopee-product-matching/train_images/'
PATH_TO_TEST = '../input/shopee-product-matching/test_images/'
os.listdir(PATH)

In [None]:
# Run in cross-validation mode only while test.csv has at most 3 rows;
# a longer test.csv switches the notebook to inference on the test set.
COMPUTE_CV = len(pd.read_csv(PATH + 'test.csv')) <= 3

In [None]:
# Train data is used in CV mode, test data otherwise.
dataset = pd.read_csv(PATH + ('train.csv' if COMPUTE_CV else 'test.csv'))
if COMPUTE_CV:
    # Ground truth per row: every posting_id that shares the row's label_group.
    tmp = dataset.groupby('label_group')['posting_id'].unique().to_dict()
    dataset['target'] = dataset['label_group'].map(tmp)

In [None]:
dataset.head()

In [None]:
def show_random_img():
    """Plot a 4x6 grid of random training images, two per label group.

    Reads the global ``dataset`` and loads image files from ``PATH_TO_IMG``.
    Each sampled label group contributes two distinct rows, so neighbouring
    grid cells show a pair of items sharing the same label. Each image is
    titled with its (wrapped) product title.
    """
    n_rows, n_cols = 4, 6
    # Two images per label, so only half as many labels as grid cells are needed.
    n_labels = (n_rows * n_cols) // 2

    # Only groups with at least two rows can provide a pair.
    group_sizes = dataset.label_group.value_counts()
    eligible = group_sizes[group_sizes >= 2].index.values
    labels_to_show = np.random.choice(eligible, replace=False,
                                      size=min(n_labels, len(eligible)))

    img_to_show = []
    for label in labels_to_show:
        rows = dataset[dataset.label_group == label]
        pair = np.random.choice(len(rows), replace=False, size=2)
        img_to_show.extend(rows.iloc[pair][['image', 'title']].values)

    fig, axes = plt.subplots(figsize=(18, 12), nrows=n_rows, ncols=n_cols)
    for (image_name, image_title), ax in zip(img_to_show, axes.ravel()):
        # cv2 loads pixels in BGR order while matplotlib expects RGB.
        img = cv2.cvtColor(cv2.imread(PATH_TO_IMG + image_name), cv2.COLOR_BGR2RGB)
        ax.set_title('\n'.join(wrap(image_title, 20)))
        ax.imshow(img)
        ax.axis('off')

    fig.tight_layout()

In [None]:
if COMPUTE_CV:
    show_random_img()

# ResNet block

In [None]:
class ResNetEmbedder(nn.Module):
    """Frozen ResNet-50 that maps a PIL image to a 1-D output vector.

    The network is loaded from a local weights file and applied unmodified,
    so the returned vector is the output of the full model (the final layer
    is not removed).
    """

    def __init__(self, device='cpu'):
        """Load the weights, freeze them and move the model to *device*."""
        super(ResNetEmbedder, self).__init__()
        self.model = models.resnet50(pretrained=False)
        self.device = device
        path = '../input/pretrained-model-weights-pytorch/resnet50-19c8e357.pth'
        # map_location lets the checkpoint load regardless of the device it
        # was saved from.
        self.model.load_state_dict(torch.load(path, map_location=device))
        # Freeze weights: the model is only used for inference.
        for param in self.model.parameters():
            param.requires_grad = False
        self.model.to(device)
        # Build the preprocessing pipeline once instead of once per image.
        self._image_transform = transforms.Compose(
            [
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=(0.485, 0.456, 0.406),
                    std=(0.229, 0.224, 0.225)
                ),
            ]
        )

    def transform(self, img):
        """Resize, center-crop, tensorize and normalize a PIL image."""
        return self._image_transform(img)

    def forward(self, img):
        """Return the model output for a single PIL image as a squeezed tensor."""
        # Add a batch dimension of one before feeding the network.
        img_tr = self.transform(img).unsqueeze(0)
        img_tr = img_tr.to(self.device)
        features = self.model(img_tr).squeeze()
        return features

In [None]:
model_img = ResNetEmbedder(device)

In [None]:
def vectorize_img(img_path):
    """Return the output vector of ``model_img`` for the image at *img_path*.

    The image is converted to RGB, passed through the global ``model_img``
    in eval mode with gradients disabled, and returned as a NumPy array.
    """
    # Open inside a context manager so the file handle is released promptly;
    # convert() yields a new in-memory image that outlives the file.
    with Image.open(img_path) as src:
        img = src.convert('RGB')
    model_img.eval()
    with torch.no_grad():
        output = model_img(img).cpu().numpy()
    return output

In [None]:
%%time
# Training images are embedded in CV mode, test images otherwise.
img_dir = PATH_TO_IMG if COMPUTE_CV else PATH_TO_TEST
dataset['resnet_v'] = dataset['image'].progress_apply(lambda name: vectorize_img(img_dir + name))

In [None]:
del model_img

Compute the cosine similarity between image vectors. The vectors must be L2-normalized first so that their dot product equals the cosine similarity.

In [None]:
# Stack the per-row ResNet vectors into one matrix, move it to `device` and
# L2-normalize every row (F.normalize defaults to p=2, dim=1) so that dot
# products between rows are cosine similarities.
vectors = np.stack(dataset.resnet_v)
vectors = torch.Tensor(vectors).to(device)
vectors = F.normalize(vectors)

In [None]:
# For every image, collect the posting_ids whose ResNet vectors have a cosine
# similarity above 0.9 with it. Rows are processed CHUNK at a time so that
# only a (chunk x N) similarity matrix is held at once.
preds = []
CHUNK = 1024

# This block matches images, so the progress message says so.
print('Finding similar images...')
CTS = len(dataset)//CHUNK
if len(dataset)%CHUNK!=0: CTS += 1
for j in range( CTS ):

    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(dataset))
    print('chunk',a,'to',b)

    # COSINE SIMILARITY DISTANCE
    # `vectors` rows are already normalized, so the dot product is the cosine.
    cts = torch.matmul( vectors, vectors[a:b].T).T
    cts = cts.cpu().numpy()

    for k in range(b-a):
        IDX = np.where(cts[k,]>0.9)[0]
        o = dataset.iloc[IDX].posting_id.values
        preds.append(o)

# Drop the large intermediates before the next stage.
del vectors, cts, IDX, o
_ = gc.collect()

In [None]:
dataset['preds_resnet'] = preds
dataset.head()

In [None]:
def getMetric(col):
    """Build a row-wise F1 scorer comparing ``row.target`` with ``row[col]``.

    The returned function computes ``2 * |common| / (|target| + |pred|)``,
    where ``common`` is the set of values found in both collections.
    """
    def f1score(row):
        truth = row.target
        predicted = row[col]
        shared = np.intersect1d(truth, predicted)
        return 2 * len(shared) / (len(truth) + len(predicted))
    return f1score

In [None]:
# Ground-truth targets exist only in CV mode, so the score is computed only then.
if COMPUTE_CV:
    # Row-wise F1 of the ResNet-based predictions against `target`.
    dataset['f1_resnet'] = dataset.apply(getMetric('preds_resnet'), axis=1)
    print('CV score for baseline =', dataset.f1_resnet.mean())

# Sentence Bert block

To familiarize yourself with the implementation details, please see the [documentation](https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens).

In [None]:
class BERTEmbedder(nn.Module):
    """Frozen BERT encoder that returns one mean-pooled embedding per text."""

    def __init__(self, device='cpu'):
        """Load model and tokenizer from ``bert_path``, freeze the weights
        and move the model to *device*."""
        super(BERTEmbedder, self).__init__()
        self.bert_path = "../input/sentence-transformer/"
        # Remember the device so forward() does not depend on a global.
        self.device = device
        self.model = BertModel.from_pretrained(self.bert_path)
        # Load the tokenizer once; re-reading it from disk for every text is
        # needlessly slow.
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        # Freeze weights: the model is only used for inference.
        for param in self.model.parameters():
            param.requires_grad = False
        self.model.to(device)

    def transform(self, txt):
        """Tokenize *txt* (truncated to 128 tokens).

        Returns:
            Tuple ``(input_ids, token_type_ids, attention_mask)`` of tensors.
        """
        encoded_input = self.tokenizer.encode_plus(txt,
                                                   truncation=True,
                                                   max_length=128,
                                                   add_special_tokens=True,
                                                   padding=True,
                                                   return_tensors='pt')
        # Pick tensors by key instead of relying on the order of dict values.
        return (encoded_input['input_ids'],
                encoded_input['token_type_ids'],
                encoded_input['attention_mask'])

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings, ignoring masked-out positions.

        ``model_output[0]`` is taken as the per-token embeddings; positions
        where ``attention_mask`` is 0 contribute nothing to the mean.
        """
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp to avoid division by zero when a mask is entirely empty.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def forward(self, txt):
        """Return the mean-pooled embedding tensor for a single text."""
        inputs_ids, token_type_ids, attention_mask = self.transform(txt)
        inputs_ids = inputs_ids.to(self.device)
        token_type_ids = token_type_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)
        with torch.no_grad():
            encoded_layers = self.model(inputs_ids,
                                        attention_mask=attention_mask,
                                        token_type_ids=token_type_ids)
        features = self.mean_pooling(encoded_layers, attention_mask)
        return features

In [None]:
model_txt = BERTEmbedder(device)

In [None]:
def vectorize_txt(txt):
    """Embed *txt* with the global ``model_txt`` and return a NumPy array.

    The model is put in eval mode and run with gradients disabled.
    """
    model_txt.eval()
    with torch.no_grad():
        embedding = model_txt(txt)
        as_numpy = embedding.cpu().numpy()
    return as_numpy

In [None]:
%%time
dataset['sbert_v'] = dataset['title'].progress_apply(lambda x: vectorize_txt(x))

In [None]:
del model_txt

In [None]:
# Stack the per-row text embeddings; squeeze(1) removes the size-one axis
# (presumably the per-text batch dimension — confirm against sbert_v shapes).
vectors = np.stack(dataset.sbert_v).squeeze(1)
vectors = torch.Tensor(vectors).to(device)
# L2-normalize rows so that dot products are cosine similarities.
vectors = F.normalize(vectors)

In [None]:
# For every title, collect the posting_ids whose text embeddings have a
# cosine similarity above 0.95 with it, processing CHUNK rows at a time.
preds = []
CHUNK = 1024

print('Finding similar titles...')
# Number of chunks, rounded up to cover the final partial chunk.
CTS = len(dataset)//CHUNK
if len(dataset)%CHUNK!=0: CTS += 1
for j in range( CTS ):

    # Row range [a, b) of the current chunk.
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(dataset))
    print('chunk',a,'to',b)

    # COSINE SIMILARITY DISTANCE
    # Result has one row per chunk item and one column per dataset row.
    cts = torch.matmul( vectors, vectors[a:b].T).T
    cts = cts.cpu().numpy()

    for k in range(b-a):
        # Positions of all rows similar enough to chunk item k.
        IDX = np.where(cts[k,]>0.95)[0]
        o = dataset.iloc[IDX].posting_id.values
        preds.append(o)

# Drop the large intermediates before the next stage.
del vectors, cts, IDX, o
_ = gc.collect()

In [None]:
dataset['preds_sbert'] = preds
dataset.head()

In [None]:
del preds

In [None]:
# Ground-truth targets exist only in CV mode, so the score is computed only then.
if COMPUTE_CV:
    # Row-wise F1 of the text-embedding predictions against `target`.
    dataset['f1_sbert'] = dataset.apply(getMetric('preds_sbert'), axis=1)
    print('CV score for baseline =', dataset.f1_sbert.mean())

# Concatenation block

In [None]:
def concat():
    """Build a row function that joins the image and text vectors end to end.

    The returned function reads ``row.resnet_v`` and ``row.sbert_v`` (the
    latter with size-one axes removed) and returns a single 1-D array.
    """
    def cat(row):
        text_vec = np.squeeze(row.sbert_v)
        return np.concatenate((row.resnet_v, text_vec))
    return cat

In [None]:
dataset['concat_v'] = dataset.progress_apply(concat(), axis=1)

In [None]:
vectors = np.stack(dataset.concat_v)

In [None]:
# Fit a nearest-neighbour index over the concatenated image+text vectors;
# each later query returns up to KNN neighbours.
KNN = 50
model = NearestNeighbors(n_neighbors=KNN)
model.fit(vectors)

In [None]:
# For every row, query the fitted neighbour model in chunks and keep the
# posting_ids of the neighbours closer than the distance threshold.
preds = []
CHUNK = 1024*4

print('Finding similar images...')
# Number of chunks, rounded up to cover the final partial chunk.
CTS = len(vectors)//CHUNK
if len(vectors)%CHUNK!=0: CTS += 1
for j in range( CTS ):

    # Row range [a, b) of the current chunk.
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(vectors))
    print('chunk',a,'to',b)
    # One row of distances/indices per queried vector.
    # NOTE(review): distances are in the model's metric (presumably Euclidean,
    # the cuML default) — confirm before tuning the 35.0 threshold.
    distances, indices = model.kneighbors(vectors[a:b,])

    for k in range(b-a):
        # Keep only the neighbours within the distance threshold.
        IDX = np.where(distances[k,]<35.0)[0]
        IDS = indices[k,IDX]
        o = dataset.iloc[IDS].posting_id.values
        preds.append(o)

# Drop the model and large intermediates before the next stage.
del model, distances, indices, vectors, IDX, o, IDS
_ = gc.collect()

In [None]:
dataset['preds_concat'] = preds
dataset.head()

In [None]:
del preds

In [None]:
# Ground-truth targets exist only in CV mode, so the score is computed only then.
if COMPUTE_CV:
    # Row-wise F1 of the concatenated-vector predictions against `target`.
    dataset['f1_concat'] = dataset.apply(getMetric('preds_concat'), axis=1)
    print('CV score for baseline =', dataset.f1_concat.mean())

# Phash block

In [None]:
# Map every image_phash to the unique posting_ids that share it, then give
# each row the posting_ids with the same hash as its prediction.
tmp = dataset.groupby('image_phash').posting_id.agg('unique').to_dict()
dataset['preds_phash'] = dataset.image_phash.map(tmp)
dataset.head()

In [None]:
del tmp

# TF-IDF block

In [None]:
dataset_gf = cudf.DataFrame(dataset[['posting_id', 'title']])

In [None]:
# Fit a binary TF-IDF model (English stop words removed, vocabulary capped
# at 25,000 features) on the titles and embed them in one step.
model = TfidfVectorizer(stop_words='english', binary=True, max_features=25_000)
text_embeddings = model.fit_transform(dataset_gf.title)

In [None]:
del model

In [None]:
# For every title, collect the posting_ids whose TF-IDF vectors have a dot
# product above 0.7 with it, processing CHUNK rows at a time.
preds = []
CHUNK = 1024

print('Finding similar titles...')
# Number of chunks, rounded up to cover the final partial chunk.
CTS = len(dataset)//CHUNK
if len(dataset)%CHUNK!=0: CTS += 1
for j in range( CTS ):

    # Row range [a, b) of the current chunk.
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(dataset))
    print('chunk',a,'to',b)

    # COSINE SIMILARITY DISTANCE
    # NOTE(review): the dot product equals cosine similarity only if the
    # vectorizer L2-normalizes rows (presumably its default) — confirm.
    cts = text_embeddings.dot(text_embeddings[a:b].T).T.toarray()

    for k in range(b-a):
        IDX = cupy.where(cts[k,]>0.7)[0]
        # Indices are moved to host memory before indexing the pandas frame.
        o = dataset.iloc[cupy.asnumpy(IDX)].posting_id.values
        preds.append(o)

# Drop the large intermediates before the next stage.
del text_embeddings, IDX, o, cts
_ = gc.collect()

In [None]:
dataset['preds_tfidf'] = preds

In [None]:
del preds

In [None]:
# Ground-truth targets exist only in CV mode, so the score is computed only then.
if COMPUTE_CV:
    # Row-wise F1 of the TF-IDF predictions against `target`.
    dataset['f1_tfidf'] = dataset.apply(getMetric('preds_tfidf'), axis=1)
    print('CV score for baseline =', dataset.f1_tfidf.mean())

# Submission block

In [None]:
def combine_for_sub(row):
    """Merge the row's three prediction arrays into one space-separated string.

    Duplicates are removed and the ids appear in sorted order.
    """
    sources = (row.preds_concat, row.preds_phash, row.preds_tfidf)
    unique_ids = np.unique(np.concatenate(sources))
    return ' '.join(unique_ids)

def combine_for_train(row):
    """Merge the row's three prediction arrays into a sorted list of unique ids."""
    sources = (row.preds_concat, row.preds_phash, row.preds_tfidf)
    unique_ids = np.unique(np.concatenate(sources))
    return list(unique_ids)

In [None]:
# CV mode keeps the matches as a list; submission mode needs a
# space-separated string.
combiner = combine_for_train if COMPUTE_CV else combine_for_sub
dataset['matches'] = dataset.apply(combiner, axis=1)

In [None]:
dataset.to_pickle('train_data.pkl')

In [None]:
dataset[['posting_id', 'matches']].to_csv('submission.csv',index=False)

In [None]:
subm = pd.read_csv('submission.csv')
subm.head()

In [None]:
# Ground-truth targets exist only in CV mode, so the score is computed only then.
if COMPUTE_CV:
    # Row-wise F1 of the combined `matches` against `target`.
    dataset['f1_final'] = dataset.apply(getMetric('matches'), axis=1)
    print('CV score for baseline =', dataset.f1_final.mean())