In [None]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import cv2,math,gc

import torch
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from torch.nn import Parameter

!pip install "../input/efficient-net/dist/efficientnet_pytorch-0.7.0.tar"
from efficientnet_pytorch import EfficientNet

!pip install "../input/faissgpuwheel/faiss_gpu-1.7.0-cp37-cp37m-manylinux2014_x86_64.whl"
import faiss

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# RAPIDS libraries (cuDF / cuML / CuPy) used by the text-matching cells.
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

import warnings
# NOTE(review): this silences *every* warning for the rest of the notebook,
# which can hide real problems — consider narrowing the filter.
warnings.simplefilter('ignore')

# Enable the cuDNN auto-tuner (see the PyTorch cuDNN backend notes).
torch.backends.cudnn.benchmark = True

In [None]:
class cfg:
    """Inference settings shared by the cells of this notebook."""

    # --- network / checkpoint -------------------------------------------
    mname = "efficientnet-b3"   # backbone name handed to Model(name=...)
    wpath = "../input/my-weight/efficientnet-b3_arcface_epoch_10.pt"  # checkpoint file
    clsize = 8812               # number of classes of the margin head (clustersize)

    # --- ArcMarginProduct hyper-parameters ------------------------------
    fea_norm = 64               # passed as the scale `s`
    margin = 0.35               # passed as the angular margin `m`

    # --- data loading ---------------------------------------------------
    img_size = (380, 380)       # target size given to cv2.resize
    batch = 50                  # DataLoader batch size

    # --- embedding widths -----------------------------------------------
    # NOTE(review): neither value is referenced by the visible cells.
    feavec_num1 = 512
    feavec_num2 = 1280

In [None]:
# Load the test metadata; the later cells read from and add columns to this frame.
df = pd.read_csv("../input/shopee-product-matching/test.csv")
# GPU (cuDF) copy of the same frame for the RAPIDS-based cells.
df_cu = cudf.DataFrame(df)
# A 3-row frame only needs a batch of 3.
# NOTE(review): presumably the 3-row case is the public sample test set — confirm.
if len(df)==3:
    cfg.batch = 3
    
print('df shape is', df.shape )
df.head()

In [None]:
class ArcMarginProduct(nn.Module):
    """Additive angular margin (ArcFace-style) classification head.

    Computes the cosine between the L2-normalised input and each L2-normalised
    class weight, replaces the cosine of the labelled class with
    ``cos(theta + m)`` and scales all logits by ``s``.

    Args:
        in_features: size of each input embedding.
        out_features: number of classes (rows of the weight matrix).
        s: scale applied to the final logits.
        m: angular margin, in radians, added to the labelled class.
        easy_margin: if True the margin is only applied where ``cosine > 0``;
            otherwise it is applied where ``cosine > cos(pi - m)`` and the
            linear fallback ``cosine - sin(pi - m) * m`` is used elsewhere.
    """

    def __init__(self, in_features, out_features, s=30.0, m=0.30, easy_margin=False):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        # Constants for cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m).
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Threshold / penalty used when theta + m would exceed pi.
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        """Return scaled logits of shape ``cosine.size()`` for ``input`` and ``label``.

        ``label`` holds one class index per input row.
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # clamp guards against tiny negative values caused by rounding
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Allocate next to `cosine` (same device and dtype) instead of relying on
        # a notebook-global `device`, so the head works wherever the module lives.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        # phi for the labelled class, plain cosine for every other class
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

class Model(nn.Module):
    """EfficientNet backbone followed by a linear projection and an ArcMarginProduct head.

    NOTE(review): a second ``class Model`` is defined in the next cell; being
    executed later, that definition replaces this one. Consider keeping a
    single copy.
    """
    def __init__(self,name,clustersize,feavec=512):
        super(Model, self).__init__()
        self.eff = EfficientNet.from_name(name)
        # Projects the 1000 backbone outputs down to `feavec` dimensions.
        self.out = nn.Linear(1000,feavec)
        self.margin = ArcMarginProduct(in_features=feavec, 
                                       out_features = clustersize, 
                                       s=cfg.fea_norm, 
                                       m=cfg.margin)      

    def forward(self, x, labels=None):
        x = self.eff(x)
        x = self.out(x)
        # With labels: margin-adjusted logits from the head.
        if labels is not None:
            return self.margin(x,labels)
        # Without labels: L2-normalised embeddings.
        return F.normalize(x,dim=1)

In [None]:
class Model(nn.Module):
    """Image embedding network: EfficientNet -> Linear -> ArcMarginProduct.

    Args:
        name: EfficientNet variant name given to ``EfficientNet.from_name``.
        clustersize: number of classes of the margin head.
        feavec: width of the embedding produced by the projection layer.
    """

    def __init__(self, name, clustersize, feavec=512):
        super().__init__()
        self.eff = EfficientNet.from_name(name)
        # The backbone is expected to emit 1000 values per image.
        self.out = nn.Linear(1000, feavec)
        self.margin = ArcMarginProduct(
            in_features=feavec,
            out_features=clustersize,
            s=cfg.fea_norm,
            m=cfg.margin,
        )

    def forward(self, x, labels=None):
        """Return unit-length embeddings, or margin logits when ``labels`` is given."""
        embedding = self.out(self.eff(x))
        if labels is None:
            return F.normalize(embedding, dim=1)
        return self.margin(embedding, labels)

In [None]:
# Build the network on `device`, then restore the trained checkpoint into it.
model1 = Model(name=cfg.mname,clustersize=cfg.clsize).to(device)
# map_location remaps the stored tensors onto `device` while loading.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
model1.load_state_dict(torch.load(cfg.wpath, map_location=device))

In [None]:
# make image Datasets
def load_image(file_name, img_dir='/kaggle/input/shopee-product-matching/test_images'):
    """Read one image file and return it as a float CHW tensor scaled to [0, 1].

    Args:
        file_name: image file name relative to ``img_dir``.
        img_dir: directory holding the images (defaults to the test image folder).

    Returns:
        torch.FloatTensor with channels first, RGB order, resized to ``cfg.img_size``.

    Raises:
        FileNotFoundError: if the file is missing or cannot be decoded.
    """
    file_path = f'{img_dir}/{file_name}'

    img = cv2.imread(file_path)
    # cv2.imread signals failure by returning None instead of raising;
    # fail here with the offending path rather than inside cvtColor.
    if img is None:
        raise FileNotFoundError(f'could not read image: {file_path}')
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # cv2.resize takes (width, height); cfg.img_size is square so order is moot.
    img = cv2.resize(img, cfg.img_size)
    tensor_img = torch.tensor(img)
    # HWC uint8 -> CHW float in [0, 1]
    tensor_img = tensor_img.permute(( 2, 0, 1)).float()/255.0
    return tensor_img

class valDataset(Dataset):
    """Dataset yielding one preprocessed image tensor per row of a DataFrame.

    The frame must provide an ``image`` column with image file names.
    """

    def __init__(self, df):
        # Keep only the file names; images are read lazily in __getitem__.
        self.img = df['image'].values

    def __len__(self):
        return self.img.shape[0]

    def __getitem__(self, idx):
        return load_image(self.img[idx])

In [None]:
def image_embeddings(df):
    """Embed every image listed in ``df`` with the notebook-level ``model1``.

    Args:
        df: DataFrame with an ``image`` column (consumed by ``valDataset``).

    Returns:
        float32 numpy array with one embedding row per row of ``df``,
        in the same order (the loader does not shuffle or drop batches).
    """
    dataset = valDataset(df)
    loader = DataLoader(dataset,
                        batch_size=cfg.batch,
                        shuffle=False,
                        num_workers=2,
                        pin_memory=True,
                        drop_last=False)

    model1.eval()
    print('start collection')
    # Collect per-batch arrays and concatenate once at the end: growing a numpy
    # array with np.append re-copies everything gathered so far on every batch.
    batches = []
    n_rows = 0
    with torch.no_grad():
        for idx, images in enumerate(loader):
            images = images.to(device, non_blocking=True)
            outputs = model1(images)
            batch_emb = outputs.cpu().numpy().astype('float32', copy=False)
            batches.append(batch_emb)
            n_rows += batch_emb.shape[0]

            if idx % 100 == 0:
                print(idx, len(loader))
                print((n_rows, batch_emb.shape[1]))

    if not batches:
        # No images: keep the (0, embedding_dim) shape callers unpack.
        return np.empty((0, model1.out.out_features), dtype='float32')
    return np.concatenate(batches, axis=0)

In [None]:
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of space-separated id strings.

    Each string is split on whitespace into a set of ids; the score of a row
    is ``2 * |true & pred| / (|true| + |pred|)``.

    Returns:
        numpy array with one F1 value per row.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))

    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    pred_sizes = pred_sets.apply(len).values
    true_sizes = true_sets.apply(len).values

    return 2 * overlap / (pred_sizes + true_sizes)

def predict_img(df,embeddings,topk=50,threshold=0.63):
    """Attach image-based match candidates to ``df`` as column ``pred_images``.

    For every embedding the ``topk`` nearest neighbours are searched with a
    flat L2 faiss index; neighbours whose distance is below ``threshold``
    are kept and translated to ``posting_id`` values.

    Args:
        df: DataFrame with a ``posting_id`` column, row-aligned with ``embeddings``.
        embeddings: float32 array of shape (N, D).
        topk: maximum number of neighbours considered per row.
        threshold: distance cut-off, compared against the distances faiss returns.
            NOTE(review): IndexFlatL2 reports squared L2 distances per the faiss
            docs, so this is a squared-distance threshold — confirm.

    Returns:
        The same ``df`` object, with ``pred_images`` holding one array of
        posting ids per row.
    """
    N, D = embeddings.shape
    if N == 0:
        # Nothing to index or search.
        df['pred_images'] = []
        return df

    cpu_index = faiss.IndexFlatL2(D)
    gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)
    gpu_index.add(embeddings)
    # Never ask for more neighbours than there are vectors; otherwise faiss
    # pads the result with label -1, which would index the last row.
    k = min(topk, N)
    cluster_distance, cluster_index = gpu_index.search(x=embeddings, k=k)

    posting_ids = df['posting_id'].values  # looked up once, indexed positionally
    pred = []
    for row in range(N):
        keep = (cluster_distance[row] < threshold) & (cluster_index[row] >= 0)
        pred.append(posting_ids[cluster_index[row, keep]])
    df['pred_images'] = pred

    return df

In [None]:
def get_text_predictions(df, max_features = 25000,threshold=0.7):
    """Attach title-based match candidates to ``df`` as column ``pred_text``.

    Titles are vectorised with a binary TF-IDF model on the GPU; rows whose
    pairwise dot product exceeds ``threshold`` are treated as matches.

    Args:
        df: pandas DataFrame with ``title`` and ``posting_id`` columns.
        max_features: vocabulary size limit of the TF-IDF model.
        threshold: minimum similarity for two titles to match.
            NOTE(review): the dot product equals cosine similarity only if the
            vectoriser L2-normalises its rows (presumably its default) — confirm.

    Returns:
        The same ``df`` object, with ``pred_text`` holding one array of
        posting ids per row.
    """
    model = TfidfVectorizer(stop_words = 'english', binary = True, max_features = max_features)
    # Vectorise the titles of the frame that was passed in, not a notebook
    # global, so the result always matches the rows of `df`.
    titles = cudf.from_pandas(df['title'])
    text_embeddings = model.fit_transform(titles).toarray()
    preds = []
    CHUNK = 1024*4
    n_rows = len(df)
    posting_ids = df['posting_id'].values  # looked up once, indexed positionally

    print('Finding similar titles...')
    # Process the similarity matrix in row chunks to bound GPU memory.
    for a in range(0, n_rows, CHUNK):
        b = min(a + CHUNK, n_rows)
        print('chunk',a,'to',b)

        # similarity of rows a..b against every row: shape (b-a, n_rows)
        cts = cupy.matmul( text_embeddings, text_embeddings[a:b].T).T

        for k in range(b-a):
            IDX = cupy.where(cts[k,]>threshold)[0]
            preds.append(posting_ids[cupy.asnumpy(IDX)])
    df['pred_text'] = preds
    # Release the large dense embedding matrix before returning.
    del model,text_embeddings
    gc.collect()

    return df

In [None]:
# Embed every test image; rows follow the order of `df`.
image_embeddings1= image_embeddings(df)

In [None]:
# Image matches: up to 50 neighbours per row, kept when the distance is below 0.88.
df = predict_img(df,image_embeddings1,topk=50,threshold=0.88)
# Show a preview only, rather than dumping the whole frame.
df.head()

In [None]:
# Title matches: TF-IDF with a 25000-term vocabulary, kept when similarity exceeds 0.75.
df = get_text_predictions(df, max_features = 25000,threshold=0.75)
df.head()

In [None]:
def combine_predictions(row):
    """Merge a row's image and text matches into one space-separated string.

    Reads ``row['pred_images']`` and ``row['pred_text']``, removes duplicates
    (``np.unique`` also sorts the ids) and joins them with single spaces.
    """
    merged = np.concatenate((row['pred_images'], row['pred_text']))
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)

In [None]:
# Union of image and text matches per row, as one space-separated string.
df['matches'] = df.apply(combine_predictions, axis=1)
# Let pandas write the two-column CSV (header `posting_id,matches`, no index);
# unlike hand-formatted lines it also quotes values when necessary.
df[['posting_id', 'matches']].to_csv('submission.csv', index=False)

In [None]:
# Read the file back as a sanity check and preview it with the rich display.
df_t = pd.read_csv("submission.csv")
df_t.head()