In [None]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import cv2,math,gc,sys

import torch
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from torch.nn import Parameter

!pip install "../input/faissgpuwheel/faiss_gpu-1.7.0-cp37-cp37m-manylinux2014_x86_64.whl"
#!pip install faiss-cpu
import faiss

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import timm


import warnings
warnings.simplefilter('ignore')

torch.backends.cudnn.benchmark = True

import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup

In [None]:
class cfg:
    """Inference-time settings read by the DataLoader construction below."""
    # Backbone name; stored here for reference only (not read in the visible cells).
    transformer_model = 'bert-base-uncased'
    # Number of titles per DataLoader batch.
    BATCH_SIZE = 8
    # Number of DataLoader worker processes.
    NUM_WORKERS = 2

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Tokenizers are loaded from local dataset directories.
# TOKENIZER1 is used by ShopeeDataset1 and TOKENIZER2 by ShopeeDataset2.
TOKENIZER1 = transformers.AutoTokenizer.from_pretrained('../input/bert-base-uncased')
TOKENIZER2 = transformers.AutoTokenizer.from_pretrained('../input/paraphrase-xlm-r-multilingual-v1/0_Transformer')

    
# Keyword arguments shared by every ShopeeNet built in this notebook.
# ('loss_module' is intentionally left out so ShopeeNet's own default applies.)
_SHARED_MODEL_KWARGS = dict(
    pooling='clf',
    use_fc=False,
    fc_dim=1280,
    dropout=0.0,
    s=30.0,
    margin=0.50,
    ls_eps=0.0,
    theta_zero=0.785,
)

_BERT_DIR = "../input/bert-base-uncased"
_PARAPHRASE_DIR = '../input/paraphrase-xlm-r-multilingual-v1/0_Transformer'

# Each dict is an independent copy, so editing one never affects the others.
# n_classes differs between the configurations (8812 vs 8811); an earlier
# value of 11014 was used at some point and is no longer active.
model_params = dict(n_classes=8812, model_name=_BERT_DIR, **_SHARED_MODEL_KWARGS)

model_params2 = dict(n_classes=8811, model_name=_BERT_DIR, **_SHARED_MODEL_KWARGS)

model_params_2 = dict(n_classes=8811, model_name=_PARAPHRASE_DIR, **_SHARED_MODEL_KWARGS)
    
class ShopeeDataset1(Dataset):
    """Map-style dataset that tokenizes the ``title`` column with ``TOKENIZER1``.

    Each item is an ``(input_ids, attention_mask)`` pair produced with
    ``max_length=64`` padding and truncation.
    """

    def __init__(self, csv):
        # reset_index() relabels rows 0..n-1 and keeps the old index as a column.
        self.csv = csv.reset_index()

    def __len__(self):
        n_rows, _ = self.csv.shape
        return n_rows

    def __getitem__(self, index):
        title = self.csv.iloc[index].title
        encoded = TOKENIZER1(
            title,
            padding='max_length',
            truncation=True,
            max_length=64,
            return_tensors="pt",
        )
        # Index 0 selects the entry for the single title that was encoded.
        return encoded['input_ids'][0], encoded['attention_mask'][0]
    
class ShopeeDataset2(Dataset):
    """Map-style dataset that tokenizes the ``title`` column with ``TOKENIZER2``.

    Each item is an ``(input_ids, attention_mask)`` pair produced with
    ``max_length=64`` padding and truncation.
    """

    def __init__(self, csv):
        # reset_index() relabels rows 0..n-1 and keeps the old index as a column.
        self.csv = csv.reset_index()

    def __len__(self):
        n_rows, _ = self.csv.shape
        return n_rows

    def __getitem__(self, index):
        title = self.csv.iloc[index].title
        encoded = TOKENIZER2(
            title,
            padding='max_length',
            truncation=True,
            max_length=64,
            return_tensors="pt",
        )
        # Index 0 selects the entry for the single title that was encoded.
        return encoded['input_ids'][0], encoded['attention_mask'][0]

In [None]:
# True  -> score locally against train.csv (ground truth available).
# False -> run on test.csv and write a submission file.
COMPUTE_CV = False

#make target clustering
if COMPUTE_CV:
    df = pd.read_csv("../input/shopee-product-matching/train.csv")
    # Ground truth per row: the space-joined posting_ids that share its label_group.
    tmp = df.groupby('label_group').posting_id.agg('unique').to_dict()
    df['target'] = df.label_group.map(tmp)
    df['target'] = df['target'].apply(lambda x: ' '.join(x))
    #df_cu = cudf.DataFrame(df)
else:
    df = pd.read_csv("../input/shopee-product-matching/test.csv")
    df_cu = cudf.DataFrame(df)
    if len(df)==3:
        # The DataLoaders read cfg.BATCH_SIZE; the previous `cfg.batch = 3`
        # created an attribute that nothing reads, so the override had no effect.
        cfg.BATCH_SIZE = 3

print('df shape is', df.shape )
df.head()

In [None]:
# Disabled experiment: the bare string below is never assigned or executed.
# It holds code that splits the unique label groups into five parts and uses
# the first four for training and the last one for testing.
"""
df_unique = df.label_group.unique()
u0, u1, u2, u3, u4 = np.array_split(df_unique, 5)
u_train = np.concatenate([u0,u1,u2,u3],axis=0)
u_test = u4

df_train = df[df.label_group.isin(u_train)]
df_test = df[df.label_group.isin(u_test)]

print(len(df_train),len(df_test),len(df_train)+len(df_test))
"""

In [None]:
class ArcMarginProduct(nn.Module):
    """Additive angular-margin classification head.

    Computes the cosine between L2-normalized inputs and L2-normalized class
    weights, replaces the target-class cosine with ``cos(theta + m)`` and
    scales the result by ``s``.

    Args:
        in_features: size of each input embedding.
        out_features: number of classes.
        s: scale applied to the output logits.
        m: angular margin in radians added to the target class.
        easy_margin: if True, apply the margin only where ``cosine > 0``.
        ls_eps: label-smoothing factor applied to the one-hot targets.
    """

    def __init__(self, in_features, out_features, s=30.0, m=0.70, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.ls_eps = ls_eps  # label smoothing
        self.weight = Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Below this cosine, theta + m would exceed pi; a linear penalty is used instead.
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        """Return margin-adjusted, scaled logits of shape ``(batch, out_features)``.

        Args:
            input: ``(batch, in_features)`` embeddings.
            label: ``(batch,)`` integer class ids, on the same device as ``input``.
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # Rounding can push cosine**2 marginally above 1; clamp so sqrt never sees a
        # negative value (which would yield NaN).
        sine = torch.sqrt(torch.clamp(1.0 - torch.pow(cosine, 2), min=0.0))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Allocate on the same device as the logits instead of a hard-coded "cuda",
        # so the layer also works on CPU or on a non-default GPU.
        one_hot = torch.zeros(cosine.size(), device=cosine.device)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # Target class takes phi, every other class keeps the plain cosine.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s

        return output
    
class ShopeeNet(nn.Module):
    """Transformer text encoder with a linear projection and an ArcMargin head.

    Args:
        n_classes: number of classes for the ``ArcMarginProduct`` head.
        model_name: name or path passed to ``transformers.AutoModel.from_pretrained``.
        pooling: stored on the instance; the first-token embedding is always used.
        use_fc: if True, the projection is dropout -> fc -> batch-norm -> ReLU
            (applied in ``extract_feat``); if False, ``forward`` applies ``fc`` alone.
        fc_dim: output width of the projection (and of the returned embedding).
        dropout: dropout probability used by the ``use_fc`` projection.
        loss_module: stored on the instance only.
        s, margin, ls_eps, theta_zero: accepted for configuration compatibility.
            NOTE(review): they are not forwarded to ``ArcMarginProduct``, which is
            built with its own defaults — confirm whether that is intended.
    """

    def __init__(self,
                 n_classes,
                 model_name='../input/bert-base-uncased',
                 pooling='mean_pooling',
                 use_fc=False,
                 fc_dim=512,
                 dropout=0.0,
                 loss_module='softmax',
                 s=30.0,
                 margin=0.50,
                 ls_eps=0.0,
                 theta_zero=0.785):

        super(ShopeeNet, self).__init__()

        self.transformer = transformers.AutoModel.from_pretrained(model_name)
        final_in_features = self.transformer.config.hidden_size

        self.pooling = pooling
        self.use_fc = use_fc
        self.fc = nn.Linear(final_in_features, fc_dim)
        # Layers used by the use_fc branch of extract_feat. Dropout and ReLU hold
        # no parameters, and the batch-norm is only created when use_fc is set, so
        # state_dicts saved with use_fc=False keep exactly the same keys.
        self.dropout = nn.Dropout(p=dropout)
        self.relu = nn.ReLU()
        if use_fc:
            self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        self.loss_module = loss_module
        #self.final = ArcMarginProduct(final_in_features, n_classes)
        self.final = ArcMarginProduct(fc_dim, n_classes)

    def _init_params(self):
        """Initialise the projection layer (and the batch-norm when present)."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        if self.use_fc:
            nn.init.constant_(self.bn.weight, 1)
            nn.init.constant_(self.bn.bias, 0)

    def forward(self, input_ids,attention_mask, label=None):
        """Return L2-normalized output rows.

        Without ``label`` the result is the normalized ``fc_dim`` embedding.
        With ``label`` the ArcMargin logits are computed and then normalized.
        NOTE(review): normalizing the logits in the label branch removes their
        scale — confirm this matches how the checkpoints were trained.
        """
        feature = self.extract_feat(input_ids,attention_mask)
        if not self.use_fc:
            # With use_fc the projection already happened inside extract_feat;
            # applying fc again would fail on the fc_dim-wide features.
            feature = self.fc(feature)
        if label is not None:
            logits = self.final(feature, label)
        else:
            logits = feature
        return F.normalize(logits,dim=1)

    def extract_feat(self, input_ids,attention_mask):
        """Run the transformer and return the embedding at sequence position 0."""
        x = self.transformer(input_ids=input_ids,attention_mask=attention_mask)
        features = x[0]
        features = features[:,0,:]

        if self.use_fc:
            features = self.dropout(features)
            features = self.fc(features)
            features = self.bn(features)
            features = self.relu(features)

        return features

In [None]:
def _make_loader(dataset):
    """Wrap ``dataset`` in an ordered, non-dropping DataLoader using the ``cfg`` settings."""
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=cfg.BATCH_SIZE,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
        num_workers=cfg.NUM_WORKERS)


def _load_model(params, weights_path):
    """Build a ShopeeNet from ``params``, load ``weights_path`` and switch to eval mode."""
    model = ShopeeNet(**params).to(device)
    # map_location lets the checkpoint load when `device` is the CPU as well.
    model.load_state_dict(torch.load(weights_path, map_location=device))
    model.eval()
    return model


def _collect_embeddings(loader, models, dim):
    """Average the outputs of ``models`` over every batch of ``loader``.

    Returns a float32 array with one row per dataset item; ``dim`` is only used
    for the shape of the empty result when the loader yields no batches.
    """
    print('start collection')
    batches = []
    n_rows = 0
    with torch.no_grad():
        for idx,d in enumerate(loader):
            input_ids, attention_mask = d[0].to(device),d[1].to(device)
            outputs = sum(m(input_ids,attention_mask) for m in models) / len(models)
            batch = outputs.cpu().numpy()
            # Collect per-batch arrays and concatenate once at the end;
            # np.append would re-copy the whole result on every iteration.
            batches.append(batch)
            n_rows += batch.shape[0]

            if idx%500==0:
                print(idx,len(loader))
                print((n_rows, batch.shape[1]))
    if batches:
        embedded = np.concatenate(batches, axis=0)
    else:
        embedded = np.empty((0,dim),dtype='float32')
    print(embedded.shape)
    return embedded


def get_embedding_fn(df):
    """Embed ``df.title`` with the average of the two bert-base-uncased checkpoints.

    Returns:
        np.ndarray of shape ``(len(df), fc_dim)``, rows in the same order as ``df``.
    """
    loader = _make_loader(ShopeeDataset1(csv=df))
    models = [
        _load_model(model_params, '../input/shopee-weight/bertmodel_acc0.813.pt'),
        _load_model(model_params2, '../input/shopee-weight/bertmodel_fold2_acc0.774.pt'),
    ]
    return _collect_embeddings(loader, models, dim=model_params['fc_dim'])


def get_embedding_para(df):
    """Embed ``df.title`` with the paraphrase-xlm-r checkpoint.

    Returns:
        np.ndarray of shape ``(len(df), fc_dim)``, rows in the same order as ``df``.
    """
    loader = _make_loader(ShopeeDataset2(csv=df))
    models = [
        _load_model(model_params_2, '../input/shopee-weight/paraphrasemodel_fold3_acc0.73.pt'),
    ]
    return _collect_embeddings(loader, models, dim=model_params_2['fc_dim'])

def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of space-separated id strings.

    Each string is split on whitespace into a set; the score for a row is
    ``2 * |true & pred| / (|true| + |pred|)``. Returns a NumPy array with one
    value per row.
    """
    true_sets = [set(ids.split()) for ids in y_true]
    pred_sets = [set(ids.split()) for ids in y_pred]
    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    n_pred = np.array([len(p) for p in pred_sets])
    n_true = np.array([len(t) for t in true_sets])
    return 2 * overlap / (n_pred + n_true)

def predict_bert(df,embeddings,topk=50,threshold=0.63):
    """Attach nearest-neighbour matches to ``df`` as the ``pred_bert`` column.

    For every row the ``topk`` nearest rows are looked up in a FAISS
    ``IndexFlatL2`` built from ``embeddings``; neighbours whose reported
    distance is below ``threshold`` are kept. (Per the FAISS documentation
    ``IndexFlatL2`` reports squared L2 distances.)

    Args:
        df: frame with a ``posting_id`` column, one row per embedding row.
        embeddings: 2-D array of shape ``(len(df), D)``.
        topk: maximum number of neighbours considered per row.
        threshold: keep neighbours with distance strictly below this value.

    Returns:
        The same ``df`` object, modified in place: ``pred_bert`` holds an array
        of posting_ids per row. When ``COMPUTE_CV`` is set, ``pred_bertonly``
        and ``f1_bert`` are added as well and the mean F1 is printed.

    Raises:
        ValueError: if ``embeddings`` and ``df`` have different row counts.
    """
    N,D = embeddings.shape
    if N != len(df):
        raise ValueError(f'embeddings has {N} rows but df has {len(df)}')

    pred = []
    # Never ask for more neighbours than there are indexed rows: FAISS pads the
    # missing slots with id -1, which positional indexing would read as "last row".
    k = min(topk, N)
    if k > 0:
        # FAISS expects C-contiguous float32 input.
        embeddings = np.ascontiguousarray(embeddings, dtype='float32')
        cpu_index = faiss.IndexFlatL2(D)
        gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)
        gpu_index.add(embeddings)
        cluster_distance,cluster_index = gpu_index.search(x=embeddings, k=k)

        posting_id = df['posting_id'].values
        for dist_row, idx_row in zip(cluster_distance, cluster_index):
            keep = (dist_row < threshold) & (idx_row >= 0)
            pred.append(posting_id[idx_row[keep]])
    df['pred_bert'] = pred
    if COMPUTE_CV:
        df['pred_bertonly'] = df.pred_bert.apply(lambda x: ' '.join(x))
        df['f1_bert'] = f1_score(df['target'], df['pred_bertonly'])
        score = df['f1_bert'].mean()
        print(f'Our f1 score for threshold {threshold} is {score}')
    return df

In [None]:
# Embed every title in df with the two-checkpoint BERT ensemble.
bert_embedding = get_embedding_fn(df)

In [None]:
#para_embedding = get_embedding_para(df)

In [None]:
#trans_embedding = bert_embedding

In [None]:
# Candidate thresholds for a sweep; only referenced by the commented-out loop below.
theresholds=np.linspace(0.32,0.38,10)
# Local-validation run only: predict_bert prints the mean F1 for the chosen threshold.
if COMPUTE_CV:
    df = predict_bert(df,bert_embedding,topk=50,threshold=0.346)
    #df = predict_bert(df,para_embedding,topk=50,threshold=0.573)
    #df = predict_bert(df,trans_embedding,topk=50,threshold=0.27)
    #for topk in [49,50,51,60]:
    #for threshold in theresholds:
        #df = predict_bert(df,bert_embedding,topk=50,threshold=threshold)

In [None]:
# Final prediction pass. This runs regardless of COMPUTE_CV and overwrites any
# 'pred_bert' column produced by an earlier predict_bert call.
df = predict_bert(df,bert_embedding,topk=50,threshold=0.24)
df.head()

In [None]:
# Flatten each array of predicted posting_ids into one space-separated string.
df['matches'] = df['pred_bert'].apply(lambda ids: ' '.join(ids))
if not COMPUTE_CV:
    # Submission mode: write a header line, then one "posting_id,matches" line per row.
    with open('submission.csv', 'w') as outf:
        outf.write('posting_id,matches\n')
        for idnum, match in zip(df['posting_id'], df['matches']):
            outf.write(f'{idnum},{match}\n')
else:
    # Validation mode: score the predictions against the ground-truth targets.
    df['f1'] = f1_score(df['target'], df['matches'])
    score = df['f1'].mean()
    print(f'Final f1 score is {score}')

In [None]:
if not COMPUTE_CV:
    # Submission mode: read the written file back and print it as a sanity check.
    df_t = pd.read_csv("submission.csv")
    print(df_t)
else:
    # Validation mode: keep the scored frame for offline inspection.
    df.to_csv('result.csv')