In [None]:
timm_path = "../input/timm-pytorch-image-models/pytorch-image-models-master"
import sys
sys.path.append(timm_path)
import timm
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import os
from tqdm.notebook import tqdm

import cv2
import albumentations as A
from albumentations.pytorch import ToTensorV2

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
from torch import optim

import numpy as np, pandas as pd, gc
import cv2, matplotlib.pyplot as plt
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
import tensorflow as tf
# from tensorflow.keras.applications import EfficientNetB0
# Report library versions so a run can be reproduced.
print('RAPIDS',cuml.__version__)
print('TF',tf.__version__)
# Distance metric handed to NearestNeighbors when the image-embedding index is built.
metric='cosine'
import warnings
# Suppress every Python warning from here on.
warnings.filterwarnings('ignore')

# Use the GPU for the PyTorch model when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load Train Data

In [None]:
test = pd.read_csv('../input/shopee-product-matching/test.csv')

# Cross-validation is only computed when test.csv holds at most 3 rows;
# a larger test file switches the notebook to pure inference.
COMPUTE_CV = len(test) <= 3
if COMPUTE_CV:
    print('this submission notebook will compute CV score, but commit notebook will not')

In [None]:
train = pd.read_csv('../input/shopee-product-matching/train.csv')

# Ground truth per row: the array of every posting_id that shares the row's label_group.
tmp = train.groupby('label_group')['posting_id'].agg('unique').to_dict()
train['target'] = train['label_group'].map(tmp)

print('train shape is', train.shape )
train.head()

# Compute Baseline CV Score

In [None]:
# Baseline prediction: a posting matches every posting with the same image_phash.
tmp = train.groupby('image_phash')['posting_id'].agg('unique').to_dict()
train['oof'] = train['image_phash'].map(tmp)

In [None]:
def getMetric(col):
    """Build a row-wise F1 scorer for the prediction column ``col``.

    The returned callable takes a row exposing ``row.target`` and ``row[col]``
    (both array-likes of ids) and returns
    ``2 * |intersection| / (len(target) + len(prediction))``.
    """
    def f1score(row):
        truth, predicted = row.target, row[col]
        overlap = np.intersect1d(truth, predicted).size
        return 2 * overlap / (len(truth) + len(predicted))
    return f1score

In [None]:
# Score each row's phash-based match list against its ground truth, then report the mean F1.
baseline_scorer = getMetric('oof')
train['f1'] = train.apply(baseline_scorer, axis=1)
print('CV score for baseline =', train['f1'].mean())

# Compute RAPIDS Model CV and Infer Submission

In [None]:
# In CV mode the training file stands in for the test set; otherwise use the real test file.
if COMPUTE_CV:
    test_csv = '../input/shopee-product-matching/train.csv'
else:
    test_csv = '../input/shopee-product-matching/test.csv'

test = pd.read_csv(test_csv)

# Keep a GPU (cudf) copy of the same table next to the pandas one.
if COMPUTE_CV:
    test_gf = cudf.DataFrame(test)
    print('Using train as test to compute CV (since commit notebook). Shape is', test_gf.shape )
else:
    test_gf = cudf.read_csv(test_csv)
    print('Test shape is', test_gf.shape )
test_gf.head()

# Use Image Embeddings

In [None]:
# Image folder prefix: train images when computing CV, test images otherwise.
BASE = (
    '../input/shopee-product-matching/train_images/'
    if COMPUTE_CV
    else '../input/shopee-product-matching/test_images/'
)

image_size = 192  # 256
valid_batch_size = 64

# Inference-time preprocessing, applied in this order; every step always runs (p=1.0).
valid_steps = [
    A.LongestMaxSize(max_size=image_size, p=1.0),
    A.PadIfNeeded(min_height=image_size, min_width=image_size, border_mode=0, p=1.0),
    A.Normalize(p=1.0),  # library-default normalisation statistics
    ToTensorV2(p=1.0),
]
valid_aug = A.Compose(valid_steps)

class Shopee(Dataset):
    """Dataset that yields one image per row of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``image`` column; each value is appended to the
        module-level ``BASE`` prefix to form the path that is read.
    augs : callable, optional
        Called as ``augs(image=rgb_array)``; the ``'image'`` entry of its
        result is returned. When falsy, the RGB ``uint8`` array is returned
        unchanged.

    Raises
    ------
    FileNotFoundError
        From ``__getitem__`` when the image file cannot be read.
    """

    def __init__(self, df, augs=None):
        self.df = df
        self.augs = augs

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Positional lookup, consistent with __len__, so the dataset also works
        # when ``df`` does not carry a default 0..n-1 index.
        img_src = self.df['image'].iloc[idx]
        img_path = BASE + img_src

        # cv2.imread reports failure by returning None rather than raising;
        # fail here with the offending path instead of inside cvtColor.
        image = cv2.imread(img_path)
        if image is None:
            raise FileNotFoundError(f'Could not read image: {img_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.uint8)

        if self.augs:
            image = self.augs(image=image)['image']

        return image
    

def test_predict(model, dataloader, device):
    model.eval()
    embeds = []
    
    with torch.no_grad():    
        for i, inputs in enumerate(tqdm(dataloader)):
            inputs = inputs.to(device)
            features = model(inputs).detach().cpu().numpy()
            metric = features.reshape(features.shape[0], features.shape[1])
            embeds.append(metric)
            
    return np.concatenate(embeds)

# Wrap a re-indexed copy of ``test`` in the image dataset.
valid_data = Shopee(test.copy().reset_index(drop=True), augs=valid_aug)

# shuffle=False keeps batch order equal to row order, so the embedding rows
# can later be matched back to ``test`` by position.
test_loader = DataLoader(
    valid_data,
    batch_size=valid_batch_size,
    shuffle=False,
    num_workers=2,
)


num_embeddings = 512
weights_path = "../input/shopee-embedding-df/dm_nfnet.pth"

# Build the architecture without pretrained weights and replace its final
# fully-connected layer with one that outputs ``num_embeddings`` features.
model = timm.create_model('dm_nfnet_f0', pretrained=False)
num_features = model.head.fc.in_features
model.head.fc = nn.Linear(num_features, num_embeddings)
_ = model.to(device)

# map_location keeps the checkpoint loadable when ``device`` fell back to CPU
# (tensors saved from a GPU would otherwise be restored onto CUDA).
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
load_weghts = torch.load(weights_path, map_location=device)
model.load_state_dict(load_weghts)

image_embeddings = test_predict(model, test_loader, device)
print('image embeddings shape',image_embeddings.shape)

# Free the network AND the checkpoint dict (it holds its own copy of the
# weights), then hand PyTorch's cached CUDA blocks back to the driver so the
# following GPU stages are not starved of memory.
del model, load_weghts
_ = gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# Request 50 neighbours per image, except for a 3-row test file where only 2 are requested.
KNN = 2 if len(test) == 3 else 50
model = NearestNeighbors(n_neighbors=KNN, metric=metric)
model.fit(image_embeddings)

In [None]:
preds = []
CHUNK = 1024*4
THRESHOLD = 0.65  # neighbours closer than this distance count as matches

print('Finding similar images...')
n_images = len(image_embeddings)
CTS = (n_images + CHUNK - 1) // CHUNK  # number of chunks, rounded up

# Loop-invariant: fetch the id column once instead of going through
# DataFrame.iloc for every single row.
posting_ids = test.posting_id.values

for j in tqdm(range( CTS )):

    # Query the index one chunk of rows at a time.
    a = j*CHUNK
    b = min(a + CHUNK, n_images)
    distances, indices = model.kneighbors(image_embeddings[a:b,])

    # For each row keep the neighbours under the threshold and translate
    # their row positions into posting_ids.
    is_match = distances < THRESHOLD
    for k in range(b-a):
        preds.append(posting_ids[indices[k, is_match[k]]])

_ = gc.collect()

test['image_embeddings'] = preds

In [None]:
if COMPUTE_CV:
    # Ground truth per row: every posting_id that shares the row's label_group.
    tmp = test.groupby('label_group').posting_id.agg('unique').to_dict()
    test['target'] = test.label_group.map(tmp)
    # Row-wise F1 of the image-embedding matches against the ground truth.
    test['f1'] = test.apply(getMetric('image_embeddings'),axis=1)
    # Labelled as the image-embedding score: the phash baseline is reported separately.
    print('CV score for image embeddings =', test.f1.mean())