In [None]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import cv2,math,gc

import torch
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from torch.nn import Parameter
import torch.optim as optim

!pip install "../input/efficient-net/dist/efficientnet_pytorch-0.7.0.tar"
from efficientnet_pytorch import EfficientNet

!pip install "../input/faissgpuwheel/faiss_gpu-1.7.0-cp37-cp37m-manylinux2014_x86_64.whl"
import faiss

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

import warnings
warnings.simplefilter('ignore')
    
torch.backends.cudnn.benchmark = True
from transformers import (BertTokenizer, BertModel,
                          DistilBertTokenizer, DistilBertModel)
from sklearn.preprocessing import LabelEncoder
from tqdm.autonotebook import tqdm

In [None]:
# Split selector read by load_image: True -> images come from train_images/, False -> test_images/.
TRAIN = False

In [None]:
class cfg:
    """Settings for the fused EfficientNet + BERT network and its ArcFace head."""

    # --- network definition ---
    mname = 'efficientnet-b3'    # backbone name handed to EfficientNet.from_name
    clsize = 8812                # number of classes of the margin head (clustersize)
    feavec_num1 = 512
    feavec_num2 = 768
    fea_norm = 64                # passed to ArcMarginProduct as the scale `s`
    margin = 0.35                # passed to ArcMarginProduct as the margin `m`

    # --- data pipeline ---
    img_size = (380, 380)        # size given to cv2.resize in load_image
    batch = 16                   # DataLoader batch size used when embedding

    # --- optimisation values (not referenced by the inference cells of this notebook) ---
    lr = 1e-3
    momentum = 0.9
    weight_decay = 5e-4
    log_interval = 1000
    epochs = 10

    # checkpoint restored with torch.load + load_state_dict
    wpath = '../input/my-weight/efficientnet-b3_eff_bert_arcface_epoch_10.pt'

In [None]:
class CFG:
    """Settings for the text (BERT) branch."""

    # --- encoder choice ---
    DistilBERT = True            # False selects the full BERT model instead
    bert_hidden_size = 768       # default `last_hidden_size` of Model
    max_length = 30              # max_length given to the tokenizer via MyDataset
    dropout = 0.5

    # --- loop sizes ---
    batch_size = 64
    epochs = 100
    num_workers = 4

    # --- optimiser / scheduler values (presumably training-only; unused by the cells shown) ---
    learning_rate = 1e-5
    scheduler = "ReduceLROnPlateau"
    step = 'epoch'
    patience = 2
    factor = 0.8

    # --- output location ---
    model_path = "/kaggle/working"
    model_save_name = "bert_model.pt"

    # GPU when one is visible to torch, CPU otherwise
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Read the test-set metadata; later cells use its posting_id, title and image columns.
df = pd.read_csv("../input/shopee-product-matching/test.csv")

# GPU (cuDF) copy of the same table.
df_cu = cudf.DataFrame(df)

# When the CSV holds exactly three rows, shrink the DataLoader batch size to three as well.
if len(df)==3:
    cfg.batch = 3
print('df shape is', df.shape )
df.head()

In [None]:
# Select the tokenizer / encoder pair that feeds the text branch of Model.
if CFG.DistilBERT:
    # model_name is not passed to from_pretrained in this branch: both objects are
    # loaded from the local dataset directory below.
    model_name='cahya/distilbert-base-indonesian'
    tokenizer = DistilBertTokenizer.from_pretrained("../input/distilbertbaseindonesianfte1")
    bert_model = DistilBertModel.from_pretrained("../input/distilbertbaseindonesianfte1")
else:
    # Here the identifier itself is handed to from_pretrained.
    # NOTE(review): presumably resolved from the Hugging Face hub, which needs network access — confirm.
    model_name='cahya/bert-base-indonesian-522M'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    bert_model = BertModel.from_pretrained(model_name)

In [None]:
class ArcMarginProduct(nn.Module):
    """Additive angular margin (ArcFace-style) classification head.

    Computes the cosine between the L2-normalised input rows and the
    L2-normalised class weight rows, replaces the target-class cosine
    ``cos(theta)`` by ``cos(theta + m)`` and scales everything by ``s``.

    Args:
        in_features: size of each input embedding.
        out_features: number of classes.
        s: scale applied to the final logits.
        m: angular margin (radians) added to the target-class angle.
        easy_margin: if True, the margin is only applied where ``cos(theta) > 0``
            and the plain cosine is used elsewhere.
    """

    def __init__(self, in_features, out_features, s=30.0, m=0.30, easy_margin=False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = Parameter(torch.empty(out_features, in_features, dtype=torch.float32))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Below this cosine, theta + m would exceed pi and cos(theta + m) stops
        # being monotonic, so a linear penalty (cosine - mm) is used instead.
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        """Return margin-adjusted, scaled logits of shape ``(batch, out_features)``.

        Args:
            input: ``(batch, in_features)`` embeddings.
            label: ``(batch,)`` integer class indices.

        The result is raw logits; apply softmax / cross-entropy on top.
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # clamp guards against tiny negative values from rounding before the sqrt
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m  # cos(theta + m)
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Allocate the mask next to the logits (same device and dtype) instead of
        # relying on a module-level `device` global.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.to(cosine.device).view(-1, 1).long(), 1)
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        return output * self.s


class Model(nn.Module):
    """Joint image + title encoder with an ArcFace margin head.

    The EfficientNet output and the BERT first-token state are stacked
    side by side, projected to `feavec` dimensions by a linear layer and,
    when labels are supplied, passed through ArcMarginProduct.
    """

    def __init__(self, name, clustersize, feavec, bert_model, last_hidden_size=CFG.bert_hidden_size):
        super().__init__()
        self.eff = EfficientNet.from_name(name)
        self.bert_model = bert_model
        # 1000 is the width assumed for the EfficientNet output
        # (presumably its default classifier logits — TODO confirm).
        fused_width = 1000 + last_hidden_size
        self.out = nn.Linear(fused_width, feavec)
        self.margin = ArcMarginProduct(
            in_features=feavec,
            out_features=clustersize,
            s=cfg.fea_norm,
            m=cfg.margin,
        )

    def get_bert_features(self, batch):
        """Return the hidden state of the first (CLS) token for every sequence in `batch`."""
        encoded = self.bert_model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
        )
        # last_hidden_state is (batch_size, seq_length, bert_hidden_dim); position 0 is the CLS token
        return encoded.last_hidden_state[:, 0, :]

    def forward(self, batch, labels=None):
        """Return ArcFace logits when `labels` is given, otherwise L2-normalised embeddings."""
        image_feats = self.eff(batch['image'])
        text_feats = self.get_bert_features(batch)

        fused = self.out(torch.hstack((image_feats, text_feats)))
        if labels is None:
            return F.normalize(fused, dim=1)
        return self.margin(fused, labels)

In [None]:
# Build the fused image+text network and place it on the selected device.
# NOTE(review): feavec=512 is hard-coded; cfg.feavec_num1 holds the same value and is
# presumably the intended source — confirm before changing.
model = Model(name=cfg.mname,clustersize=cfg.clsize, feavec=512, bert_model=bert_model).to(device)
# Restore the trained checkpoint; map_location remaps the stored tensors onto `device`.
model.load_state_dict(torch.load(cfg.wpath, map_location=device))

In [None]:
def load_image(file_name):
    """Load one competition image as a float tensor.

    Args:
        file_name: image file name relative to the train_images/ (TRAIN is True)
            or test_images/ (TRAIN is False) directory of the competition data.

    Returns:
        A float tensor in channel-first layout (C, H, W) with RGB channel order,
        resized to cfg.img_size and scaled by 1/255.

    Raises:
        FileNotFoundError: if OpenCV cannot read the file.
    """
    split_dir = 'train_images' if TRAIN else 'test_images'
    file_path = f'/kaggle/input/shopee-product-matching/{split_dir}/{file_name}'
    img = cv2.imread(file_path)
    if img is None:
        # cv2.imread reports a missing/corrupt file by returning None rather than
        # raising; fail here with the path instead of inside cvtColor.
        raise FileNotFoundError(f'could not read image: {file_path}')
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, cfg.img_size)
    tensor_img = torch.tensor(img)
    # HWC -> CHW, then scale pixel values by 1/255
    return tensor_img.permute((2, 0, 1)).float() / 255.0

In [None]:
class MyDataset(Dataset):
    """Dataset yielding tokenised titles together with the decoded product image.

    All titles are tokenised once in the constructor; images are read lazily
    per item through load_image.
    """

    def __init__(self, dataframe, tokenizer, mode="train", max_length=None):
        self.dataframe = dataframe
        self.mode = mode
        self.img = dataframe.image.values
        # labels only exist outside of test mode
        if mode != "test":
            self.targets = dataframe['label'].values
        titles = dataframe['title'].apply(str).tolist()
        self.encodings = tokenizer(
            titles,
            padding=True,
            truncation=True,
            max_length=max_length,
        )

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # one tensor per tokenizer output key, exactly as the HuggingFace model expects them
        sample = {}
        for key, values in self.encodings.items():
            sample[key] = torch.tensor(values[idx])
        # targets are attached for every mode except "test"
        if self.mode != "test":
            sample['labels'] = torch.tensor(self.targets[idx]).long()
        sample['image'] = load_image(self.img[idx])
        return sample


In [None]:
def all_embeddings(df):
    """Embed every row of `df` with the global fused image+text `model`.

    Args:
        df: DataFrame with the `title` and `image` columns read by MyDataset.

    Returns:
        float32 array with one row per row of `df`, in the same order
        (the loader is not shuffled and no batch is dropped). Rows are the
        L2-normalised vectors that Model.forward returns when no labels are given.
    """
    dataset = MyDataset(df, tokenizer, mode='test', max_length=CFG.max_length)
    loader = DataLoader(dataset,
                        batch_size=cfg.batch,
                        shuffle=False,
                        num_workers=2,
                        pin_memory=True,
                        drop_last=False)

    model.eval()
    # Send inputs to wherever the model's weights actually live.
    model_device = next(model.parameters()).device
    print('start collection')

    # Collect per-batch arrays and concatenate once at the end: np.append inside
    # the loop would copy the whole accumulated array on every batch.
    chunks = []
    n_done = 0
    with torch.no_grad():
        for idx, batch in enumerate(loader):
            batch = {k: v.to(model_device) for k, v in batch.items()}
            outputs = model(batch).cpu().detach().numpy()
            chunks.append(outputs)
            n_done += outputs.shape[0]

            if idx % 100 == 0:
                print(idx, len(loader))
                print((n_done, outputs.shape[1]))

    if not chunks:
        # nothing to embed: keep the (0, feature_dim) float32 contract
        return np.empty((0, model.out.out_features), dtype='float32')
    return np.concatenate(chunks, axis=0).astype('float32', copy=False)

In [None]:
def predict_img(df,embeddings,topk=50,threshold=0.63):
    """Attach nearest-neighbour matches from the embedding space to `df`.

    Builds a flat L2 faiss index over `embeddings`, queries it with the same
    vectors and keeps, for every row, the neighbours whose returned distance is
    below `threshold`.

    Args:
        df: DataFrame with a `posting_id` column; row i corresponds to embeddings[i].
        embeddings: 2-D array of shape (len(df), dim).
        topk: maximum number of neighbours retrieved per row.
        threshold: distance cut-off applied to the values returned by the index
            (faiss IndexFlatL2 reports squared L2 distances).

    Returns:
        `df` itself, modified in place with a new `pred_images` column holding
        one array of posting_ids per row.
    """
    n_rows, dim = embeddings.shape
    if n_rows == 0:
        df['pred_images'] = []
        return df

    # faiss expects C-contiguous float32 input
    vectors = np.ascontiguousarray(embeddings, dtype='float32')
    # asking for more neighbours than there are vectors only yields padding entries
    k = min(topk, n_rows)

    cpu_index = faiss.IndexFlatL2(dim)
    gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)
    gpu_index.add(vectors)
    cluster_distance, cluster_index = gpu_index.search(x=vectors, k=k)

    posting_ids = df['posting_id'].values
    pred = []
    for dist_row, idx_row in zip(cluster_distance, cluster_index):
        # idx >= 0 drops faiss "no neighbour" slots (-1), which would otherwise
        # index the last row
        keep = (dist_row < threshold) & (idx_row >= 0)
        pred.append(posting_ids[idx_row[keep]])
    df['pred_images'] = pred
    return df

In [None]:
def get_text_predictions(df, max_features = 25000,threshold=0.7):
    """Attach TF-IDF title matches to `df`.

    Titles are vectorised on the GPU with cuML's TfidfVectorizer and compared
    chunk by chunk through a dot product; every row keeps the posting_ids whose
    similarity with it is above `threshold`.

    Args:
        df: DataFrame with `title` and `posting_id` columns.
        max_features: vocabulary size limit handed to TfidfVectorizer.
        threshold: similarity cut-off (strictly greater than).

    Returns:
        `df` itself, modified in place with a new `pred_text` column holding
        one array of posting_ids per row.
    """
    # Vectorise the titles of the frame that was passed in, not a global copy,
    # so the rows of the similarity matrix always line up with `df`.
    titles = cudf.from_pandas(df['title'])
    vectorizer = TfidfVectorizer(stop_words = 'english', binary = True, max_features = max_features)
    text_embeddings = vectorizer.fit_transform(titles).toarray()

    n_rows = len(df)
    posting_ids = df['posting_id'].values
    preds = []
    CHUNK = 1024*4

    print('Finding similar titles...')
    for a in range(0, n_rows, CHUNK):
        b = min(a + CHUNK, n_rows)
        print('chunk',a,'to',b)

        # COSINE SIMILARITY DISTANCE
        # (dot product; equals cosine provided the TF-IDF rows are L2-normalised,
        # which is presumably the vectorizer default — TODO confirm)
        cts = cupy.matmul( text_embeddings, text_embeddings[a:b].T).T

        for k in range(b-a):
            IDX = cupy.where(cts[k,]>threshold)[0]
            preds.append(posting_ids[cupy.asnumpy(IDX)])
    df['pred_text'] = preds

    # release the dense TF-IDF matrix before returning
    del vectorizer,text_embeddings
    gc.collect()

    return df

In [None]:
# Embed every test row with the fused image+text model (one vector per row of df).
all_embeddings1= all_embeddings(df)

In [None]:
# Nearest-neighbour matches in embedding space; 0.73 overrides the function's default threshold of 0.63.
df = predict_img(df,all_embeddings1,topk=50,threshold=0.73)
# Preview only the first rows instead of dumping the whole frame into the cell output.
df.head()

In [None]:
# Add the 'pred_text' column: posting_ids whose title similarity is above 0.75.
df = get_text_predictions(df, max_features = 25000,threshold=0.75)
df.head()

In [None]:
def combine_predictions(row):
    """Merge the image and text matches of one row.

    Returns the sorted, de-duplicated posting_ids from `pred_images` and
    `pred_text` joined by single spaces.
    """
    candidates = (row['pred_images'], row['pred_text'])
    merged = np.unique(np.concatenate(candidates))
    return ' '.join(merged)

In [None]:
# Union of image and text matches per row, as one space-separated string.
df['matches'] = df.apply(combine_predictions, axis=1)
# Let pandas write the two-column CSV (header "posting_id,matches", no index column);
# this also takes care of quoting should a field ever need it.
df[['posting_id', 'matches']].to_csv('submission.csv', index=False)

In [None]:
# Read the file back as a sanity check and preview it with the notebook's rich display.
df_t = pd.read_csv("submission.csv")
df_t.head()