# Load Libraries

In [None]:
# Kaggle input locations used throughout the notebook.
test_path = '../input/shopee-product-matching/test.csv'  # CSV first read into test_data
train_path = '../input/shopee-product-matching/train.csv'  # CSV read into train_data
train_fold_path = '../input/shopee-folds/train_fold.csv'  # read later in place of the test CSV when computing CV
geffnet_path = '../input/geffnet-20200820'  # prepended to sys.path so `import geffnet` resolves
root_of_shopee = '../input/shopee-product-matching/'  # joined with 'train_images' to build image file paths
model_path = '../input/shopee-b0-bert/b0ns_256_bert_20ep_fold0_epoch27.pth'  # checkpoint handed to load_model
pretrain_path = '../input/bert-base-uncased'  # directory given to AutoTokenizer / AutoModel.from_pretrained

# import geffnet package
import sys
sys.path = [geffnet_path] + sys.path

import os

# math calculations
import math
from tqdm import tqdm
import numpy as np, pandas as pd, gc
import cv2, matplotlib.pyplot as plt
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

# image handling
import albumentations
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F

import geffnet
from transformers import *

# Load Test and Train Data

In [None]:
# Read the test CSV. Note that `test_data` is re-assigned from `train_fold_path`
# in a later cell, so this frame is only in effect until then.
test_data = pd.read_csv(test_path)

In [None]:
# Attach to every training row the array of posting_ids that share its
# label_group; that array is the ground truth for "same product".
train_data = pd.read_csv(train_path)
label_group = (
    train_data.groupby('label_group')['posting_id']
    .agg('unique')
    .to_dict()
)
train_data['target'] = train_data['label_group'].map(label_group)
print('We could find that the train shape is', train_data.shape)
print("The first few elements in the train data are:")
train_data.head()

# Compute Baseline CV Score

In [None]:
# For each image_phash value, collect the posting_ids that carry it, then
# store that array on every row as the phash-based prediction.
phash = (
    train_data.groupby('image_phash')['posting_id']
    .agg('unique')
    .to_dict()
)
train_data['phash_map'] = train_data['image_phash'].map(phash)

In [None]:
def get_metric(col):
    """Build a row-wise F1 scorer for the prediction column ``col``.

    The returned function takes one row, compares ``row.target`` with
    ``row[col]`` and returns ``2 * |intersection| / (|target| + |prediction|)``.
    """
    def _row_f1(row):
        truth = row.target
        guess = row[col]
        shared = np.intersect1d(truth, guess)
        return 2 * len(shared) / (len(truth) + len(guess))

    return _row_f1

In [None]:
# Score the phash-only baseline: per-row F1 between target and phash_map,
# averaged over all training rows.
baseline_scorer = get_metric('phash_map')
train_data['f1_score'] = train_data.apply(baseline_scorer, axis=1)
print('We could get the CV score for baseline is', train_data['f1_score'].mean())

# Compute RAPIDS Model CV and Infer Submission

In [None]:
# CV mode: `test_data` is replaced by the fold file read from train_fold_path,
# so the predictions made below can be scored against label_group.
# `test_df` is a cuDF DataFrame built from the same data; its `title` column
# feeds the TF-IDF vectorizer further down.
test_data = pd.read_csv(train_fold_path)
test_df = cudf.DataFrame(test_data)
print('Using train data as test to compute CV. Shape is', test_df.shape )
test_df.head()

# Use Image Embeddings

In [None]:
def get_transforms(image_size=256):
    """Return the albumentations pipeline applied to every image.

    The pipeline resizes to ``image_size`` x ``image_size`` and then applies
    ``albumentations.Normalize()`` with its default arguments.
    """
    steps = [
        albumentations.Resize(image_size, image_size),  # square output
        albumentations.Normalize(),
    ]
    return albumentations.Compose(steps)


class shopee_product_dataset(Dataset):
    """Dataset yielding an image tensor and a tokenized title per frame row.

    Args:
        csv: DataFrame with at least ``title`` and ``filepath`` columns
            (plus ``label_group`` when ``mode`` is not ``'test'``). The index
            is reset, so items are addressed by position.
        split: Stored on the instance; not read by this class.
        mode: ``'test'`` makes ``__getitem__`` return three tensors; any other
            value additionally returns the row's ``label_group`` as a tensor.
        transforms: Callable invoked as ``transforms(image=...)`` that returns
            a mapping with an ``'image'`` entry. When omitted,
            ``get_transforms(image_size=256)`` is built for this instance.
        tokenizer: Callable used to tokenize the title; required by
            ``__getitem__``.
    """

    def __init__(self, csv, split, mode, transforms=None, tokenizer=None):
        # Build the default pipeline per instance rather than once at
        # class-definition time, so instances never share one default object.
        if transforms is None:
            transforms = get_transforms(image_size=256)

        self.csv = csv.reset_index()
        self.split = split
        self.mode = mode
        self.transform = transforms
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the frame."""
        return self.csv.shape[0]

    def __getitem__(self, idx):
        """Return ``(image, input_ids, attention_mask[, label_group])`` for row ``idx``.

        Raises:
            FileNotFoundError: if the image at the row's ``filepath`` cannot
                be read.
        """
        info_row = self.csv.iloc[idx]

        image = cv2.imread(info_row.filepath)
        if image is None:
            # cv2.imread reports a missing/unreadable file by returning None
            # instead of raising; fail here with the offending path.
            raise FileNotFoundError(f"Could not read image: {info_row.filepath}")
        image = image[:, :, ::-1]  # reverse the channel axis (OpenCV loads BGR)

        res = self.transform(image=image)
        # Channels-last -> channels-first, as float32.
        image = res['image'].astype(np.float32).transpose(2, 0, 1)

        # Titles are padded/truncated to exactly 16 tokens.
        encoded = self.tokenizer(
            info_row.title,
            padding='max_length',
            truncation=True,
            max_length=16,
            return_tensors="pt",
        )
        # The tokenizer returns a batch of one; take its single row.
        input_ids = encoded['input_ids'][0]
        attention_masks = encoded['attention_mask'][0]

        if self.mode == 'test':
            return torch.tensor(image), input_ids, attention_masks
        return torch.tensor(image), input_ids, attention_masks, torch.tensor(info_row.label_group)

In [None]:
# Tokenizer loaded from pretrain_path, the same directory Arcface_model passes
# to AutoModel.from_pretrained for its text encoder.
tokenizer = AutoTokenizer.from_pretrained(pretrain_path)

In [None]:
# Build the inference dataset/loader. Image paths point into 'train_images'
# under root_of_shopee.
data_frame_sub = test_data

data_frame_test = data_frame_sub.copy()
image_dir = os.path.join(root_of_shopee, 'train_images')
data_frame_test['filepath'] = data_frame_test['image'].apply(
    lambda name: os.path.join(image_dir, name)
)

dataset_test = shopee_product_dataset(
    data_frame_test,
    'test',
    'test',
    transforms=get_transforms(image_size=256),
    tokenizer=tokenizer,
)
test_loader = DataLoader(dataset_test, batch_size=32, num_workers=8)

# Sanity output: dataset size and the first item.
print("The length of data set is: \n", len(dataset_test))
print("element in it is like: \n", dataset_test[0])

In [None]:
class arcmargin_product(nn.Module):
    """Cosine-similarity head with ``k`` weight rows per output class.

    ``forward`` returns, for every sample and class, the largest cosine
    similarity between the sample and that class's ``k`` weight rows.

    Args:
        in_features: Size of each input feature vector.
        out_features: Number of output classes.
        k: Number of weight rows kept per class.
    """

    def __init__(self, in_features, out_features, k=3):
        super().__init__()
        self.k = k
        self.out_features = out_features
        self.weight = nn.Parameter(torch.FloatTensor(out_features * k, in_features))
        self.reset_parameters()

    def reset_parameters(self):
        """Fill the weights uniformly in ``[-1/sqrt(in_features), 1/sqrt(in_features))``."""
        bound = 1.0 / math.sqrt(self.weight.shape[1])
        with torch.no_grad():
            self.weight.uniform_(-bound, bound)

    def forward(self, features):
        """Return a ``(batch, out_features)`` tensor of per-class max cosines."""
        unit_features = F.normalize(features)
        unit_weight = F.normalize(self.weight)
        similarities = F.linear(unit_features, unit_weight)
        # Group the k rows belonging to each class on the last axis.
        grouped = similarities.view(-1, self.out_features, self.k)
        best, _ = grouped.max(dim=2)
        return best
    
# Element-wise sigmoid as a reusable module-level instance.
sigmoid = torch.nn.Sigmoid()

class Swish(torch.autograd.Function):
    """Swish activation ``x * sigmoid(x)`` with a hand-written backward pass.

    Only the input is saved for backward; the sigmoid is recomputed there
    instead of being stored.
    """

    @staticmethod
    def forward(ctx, i):
        """Return ``i * sigmoid(i)`` and remember ``i`` for the backward pass."""
        ctx.save_for_backward(i)
        return i * torch.sigmoid(i)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient w.r.t. the input: ``grad * s * (1 + i * (1 - s))``."""
        # ``saved_tensors`` is the supported accessor; ``saved_variables`` is
        # a deprecated alias.
        (i,) = ctx.saved_tensors
        sigmoid_i = torch.sigmoid(i)
        return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))

class for_swish(nn.Module):
    """Module wrapper that lets the Swish autograd function be used as a layer."""

    def forward(self, x):
        """Apply Swish element-wise to ``x``."""
        activated = Swish.apply(x)
        return activated

    
 
    
class Arcface_model(nn.Module):
    """Joint image/text embedding network with a cosine classification head.

    Image features from a geffnet backbone and text features from a
    transformer loaded from ``pretrain_path`` are concatenated, projected to
    512 dimensions and passed through Swish.

    Args:
        net_type: Model name handed to ``geffnet.create_model``.
        out_dim: Number of classes for the ``arcmargin_product`` head.
    """

    def __init__(self, net_type, out_dim):
        super().__init__()
        self.bert = AutoModel.from_pretrained(pretrain_path)
        self.enet = geffnet.create_model(net_type, pretrained=None)
        # Read the backbone's feature width before its classifier is replaced below.
        fused_width = self.enet.classifier.in_features + self.bert.config.hidden_size
        self.feat = nn.Linear(fused_width, 512)
        self.swish = for_swish()
        self.dropout = nn.Dropout(0.5)  # registered but not applied in forward()
        self.metric_classify = arcmargin_product(512, out_dim)
        # Replace the backbone's classifier with a pass-through layer.
        self.enet.classifier = nn.Identity()

    def forward(self, info, input_ids, attention_mask):
        """Return ``(L2-normalized embedding, class cosines)`` for a batch."""
        image_feat = self.enet(info)
        text_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Second element of the model output (presumably the pooled output — confirm).
        text_feat = text_out[1]
        fused = torch.cat([image_feat, text_feat], 1)
        embedding = self.swish(self.feat(fused))
        return F.normalize(embedding), self.metric_classify(embedding)
    
def load_model(model_struct, model_path):
    """Load checkpoint weights into ``model_struct`` and put it in eval mode.

    Args:
        model_struct: Module whose parameter names match the checkpoint
            (after an optional leading ``'module.'`` is removed).
        model_path: Path to a file written by ``torch.save``. It holds either
            a state dict or a dict with a ``"model_state_dict"`` entry.

    Returns:
        The same ``model_struct`` instance, with weights loaded, in eval mode.

    Raises:
        RuntimeError: from ``load_state_dict(strict=True)`` when keys or
            shapes do not match.
    """
    # NOTE: torch.load unpickles the file; only load checkpoints from a
    # trusted source.
    # map_location='cpu' makes loading independent of the device the
    # checkpoint was saved from; load_state_dict then copies the values into
    # the model's own parameters on whatever device they live.
    state_dict = torch.load(model_path, map_location='cpu')
    if "model_state_dict" in state_dict:
        state_dict = state_dict["model_state_dict"]

    # Drop the leading 'module.' from keys that carry it.
    prefix = 'module.'
    state_dict = {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in state_dict.items()
    }

    model_struct.load_state_dict(state_dict, strict=True)
    print(f"loaded {model_path}")
    model_struct.eval()
    return model_struct

In [None]:
# Run the network over the loader and collect the normalized embeddings
# (first element of the model output) on the CPU.
model = load_model(
    Arcface_model('tf_efficientnet_b0_ns', out_dim=11014).cuda(),
    model_path,
)

embedings = []

with torch.no_grad():
    for img, input_ids, attention_mask in tqdm(test_loader):
        feat, _ = model(img.cuda(), input_ids.cuda(), attention_mask.cuda())
        embedings.append(feat.detach().cpu().numpy())

# Drop the network before the similarity search allocates its own arrays.
del model
_ = gc.collect()
image_embeddings = np.concatenate(embedings)
print('image embeddings shape', image_embeddings.shape)

In [None]:
# Neighbour count: 2 when test_data has exactly 3 rows, 50 otherwise.
knn_num = 2 if len(test_data) == 3 else 50
# NOTE(review): the fitted `model` is not queried in any visible cell; the
# search below thresholds a chunked matrix product instead.
model = NearestNeighbors(n_neighbors=knn_num)
model.fit(image_embeddings)

In [None]:
# For every row, collect the posting_ids whose embedding has a dot product
# above 0.5 with it. The model output is L2-normalized, so the dot product is
# the cosine similarity. Rows are processed in buckets of 2048.
image_embeddings = cupy.array(image_embeddings)
image_predicts = []
Bucket = 1024 * 2

print('Finding similar images...')
total_rows = len(image_embeddings)
for from_num in range(0, total_rows, Bucket):
    to_num = min(from_num + Bucket, total_rows)
    print('Bucket', from_num, 'to', to_num)

    # One row of similarities per item in the current bucket.
    cts = cupy.matmul(image_embeddings, image_embeddings[from_num:to_num].T).T

    for k in range(to_num - from_num):
        matches = cupy.where(cts[k] > 0.5)[0]
        matched_ids = test_data.iloc[cupy.asnumpy(matches)].posting_id.values
        image_predicts.append(matched_ids)

In [None]:
# Store, per row, the array of posting_ids matched by image-embedding similarity.
test_data['image_predict'] = image_predicts
test_data.head()

# Use Text Embedding

In [None]:
# To prevent memory errors, we will find similar titles in chunks.
# To facilitate this, we will use cosine similarity between text embeddings instead of KNN.
# The vectorizer drops English stop words, uses binary term counts and keeps at
# most 25000 features; the result is converted to a dense array with toarray().
vectorizer = TfidfVectorizer(stop_words="english", 
                        binary=True, 
                        max_features=25000)
text_embeddings = vectorizer.fit_transform(test_df.title).toarray()
print('text embeddings shape',text_embeddings.shape)

In [None]:
# For every title, collect the posting_ids whose TF-IDF vector has a dot
# product above 0.75 with it, processing rows in buckets of 2048.
# NOTE(review): treating the dot product as cosine similarity assumes the
# vectorizer emits unit-length rows — confirm.
text_predicts = []
Bucket = 1024 * 2

print('Finding similar titles...')
total_rows = len(test_data)
for from_num in range(0, total_rows, Bucket):
    to_num = min(from_num + Bucket, total_rows)
    print('Bucket', from_num, 'to', to_num)

    #COSINE SIMILARITY DISTANCE
    cts = cupy.matmul(text_embeddings, text_embeddings[from_num:to_num].T).T

    for k in range(to_num - from_num):
        matches = cupy.where(cts[k] > 0.75)[0]
        matched_ids = test_data.iloc[cupy.asnumpy(matches)].posting_id.values
        text_predicts.append(matched_ids)

In [None]:
# Store, per row, the array of posting_ids matched by title similarity.
test_data['text_predict'] = text_predicts
test_data.head()

# Use Phash Feature


In [None]:
# Every listing sharing an image_phash value is predicted as a duplicate of
# the others carrying that value.
image_phash = (
    test_data.groupby('image_phash')['posting_id']
    .agg('unique')
    .to_dict()
)
test_data['phash_predict'] = test_data['image_phash'].map(image_phash)
test_data.head()

# Compute CV Score

In [None]:
def combine_for_match(row):
    """Return the row's phash, text and image matches as one space-separated string.

    The three prediction arrays are merged, de-duplicated and sorted by
    ``np.unique`` before joining.
    """
    sources = ('phash_predict', 'text_predict', 'image_predict')
    merged = np.concatenate([getattr(row, name) for name in sources])
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)

def combine_for_cal_f1(row):
    """Return the sorted, de-duplicated union of the row's three prediction arrays."""
    sources = ('phash_predict', 'text_predict', 'image_predict')
    merged = np.concatenate([getattr(row, name) for name in sources])
    return np.unique(merged)

In [None]:
# Ground truth: all posting_ids sharing the row's label_group.
label_group = (
    test_data.groupby('label_group')['posting_id']
    .agg('unique')
    .to_dict()
)
test_data['target'] = test_data['label_group'].map(label_group)

# Score the union of the three prediction sources against the ground truth.
test_data['cv'] = test_data.apply(combine_for_cal_f1, axis=1)
cv_scorer = get_metric('cv')
test_data['f1_score'] = test_data.apply(cv_scorer, axis=1)
print('CV Score =', test_data['f1_score'].mean())

# Space-separated form of the same union, written to the submission later.
test_data['result'] = test_data.apply(combine_for_match, axis=1)

In [None]:
# Mean row-wise F1 of each prediction source on its own, rounded to 3 places.
for label, column in (
    ("CV for image :", 'image_predict'),
    ("CV for text  :", 'text_predict'),
    ("CV for phash :", 'phash_predict'),
):
    print(label, round(test_data.apply(get_metric(column), axis=1).mean(), 3))

In [None]:
# Bare expression: in a notebook cell this displays the frame with all
# prediction and score columns added above.
test_data

# Write Submission CSV

In [None]:
# Write posting_id plus the combined matches, then read the file back to
# check what was written.
submission_columns = ['posting_id', 'result']
test_data[submission_columns].to_csv('submission.csv', index=False)
sub = pd.read_csv('submission.csv')
sub.head()