In [None]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

In [None]:
import numpy as np 
import pandas as pd 

import math
import random
import os 
import cv2
import timm

from tqdm.notebook import tqdm 

import albumentations as A 
from albumentations.pytorch.transforms import ToTensorV2

import torch 
from torch.utils.data import Dataset 
from torch import nn
import torch.nn.functional as F 

import gc
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
class CFG:
    """Inference settings shared by the cells of this notebook."""
    # timm architecture name handed to timm.create_model; create_model() loads
    # a checkpoint file for this specific backbone, so change both together.
    model_name = 'tf_mobilenetv3_small_minimal_100'
    # Device string the model is moved to.
    device = 'cuda'
    # Number of images per DataLoader batch during embedding extraction.
    batch_size = 4
    # Random seed value for the run.
    seed = 1

In [None]:
def create_model(weights_path='../input/timm-pretrained-mobilenetv3/mobilenetv3/tf_mobilenetv3_small_minimal_100-922a7843.pth'):
    """Build the timm backbone named by CFG.model_name and load local weights.

    Args:
        weights_path: Path to the checkpoint (a state dict) to load. Defaults
            to the MobileNetV3 weights file shipped as a notebook input.

    Returns:
        The model in eval mode, moved to CFG.device.
    """
    # pretrained=False: weights come from the local file, not a download.
    model = timm.create_model(CFG.model_name, pretrained=False)
    # Load onto the CPU first so loading does not depend on the device the
    # checkpoint was saved from; the model is moved to CFG.device below.
    state_dict = torch.load(weights_path, map_location='cpu')
    model.load_state_dict(state_dict)
    model.eval()
    model = model.to(CFG.device)
    return model

In [None]:
class ShopeeDataset(Dataset):
    """Dataset that loads images from disk and returns them as RGB arrays.

    Args:
        image_paths: Indexable, sized collection of image file paths
            (e.g. a NumPy array or a list of strings).
        transforms: Optional callable invoked as ``transforms(image=img)``
            that returns a mapping with an ``'image'`` entry.
    """

    def __init__(self, image_paths, transforms=None):
        self.image_paths = image_paths
        self.augmentations = transforms

    def __len__(self):
        # len() works for arrays and plain sequences alike.
        return len(self.image_paths)

    def __getitem__(self, index):
        """Return the (optionally transformed) image at ``index``.

        Raises:
            FileNotFoundError: If the file cannot be read as an image.
        """
        image_path = self.image_paths[index]
        image = cv2.imread(image_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {image_path}')
        # OpenCV decodes to BGR channel order; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.augmentations:
            augmented = self.augmentations(image=image)
            image = augmented['image']
        return image

In [None]:
def get_test_transforms():
    """Build the inference-time image preprocessing pipeline.

    Returns:
        An albumentations ``Compose`` applying, in order: SmallestMaxSize(256),
        CenterCrop(224, 224), Normalize() with the library defaults, and
        conversion to a torch tensor via ToTensorV2.
    """
    steps = [
        A.SmallestMaxSize(256),
        A.CenterCrop(224, 224),
        A.Normalize(),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps)

def get_image_embeddings(image_paths):
    """Compute one embedding per image with the backbone from create_model().

    Args:
        image_paths: Array of image file paths, consumed by ShopeeDataset.

    Returns:
        A 2-D NumPy array with one row per image, in the order of
        ``image_paths``.
    """
    model = create_model()

    image_dataset = ShopeeDataset(image_paths=image_paths, transforms=get_test_transforms())
    image_loader = torch.utils.data.DataLoader(
        image_dataset,
        batch_size=CFG.batch_size,
        pin_memory=True,
        drop_last=False,
        num_workers=4
    )

    embeds = []
    with torch.no_grad():
        for img in tqdm(image_loader):
            # Use the configured device, matching where create_model() put the model.
            img = img.to(CFG.device)
            feat = model.forward_features(img)
            # Flatten each sample instead of calling squeeze(): squeeze() also
            # removes the batch axis when the last batch holds a single image,
            # which makes np.concatenate below fail on mismatched ranks.
            # Assumes forward_features yields one pooled vector per image,
            # e.g. (batch, channels, 1, 1) - TODO confirm for the timm version in use.
            feat = feat.reshape(feat.shape[0], -1)
            embeds.append(feat.cpu().numpy())

    del model
    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings

In [None]:
def read_dataset():
    """Load the test metadata and derive the image file paths.

    Returns:
        A tuple ``(df, df_cu, image_paths)``: the pandas frame read from
        test.csv, a cuDF frame built from it, and a pandas Series holding
        the test-image directory prefix joined with each row's ``image`` value.
    """
    test_df = pd.read_csv('../input/shopee-product-matching/test.csv')
    paths = '../input/shopee-product-matching/test_images/' + test_df['image']
    return test_df, cudf.DataFrame(test_df), paths

In [None]:
def seed_torch(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports the seed as the PYTHONHASHSEED environment variable and
    turns on cuDNN's deterministic flag.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed from the config value (CFG.seed) rather than a duplicated literal.
seed_torch(CFG.seed)

In [None]:
def combine_predictions(row):
    """Merge a row's image and text matches into one submission string.

    Args:
        row: Mapping (e.g. a DataFrame row) with ``'image_predictions'`` and
            ``'text_predictions'`` entries, each a sequence of posting ids.

    Returns:
        The distinct posting ids from both sequences, sorted and joined by
        single spaces.
    """
    merged = np.concatenate((row['image_predictions'], row['text_predictions']))
    distinct_ids = np.unique(merged)
    return ' '.join(distinct_ids)

In [None]:
import math
def get_image_predictions(df, embeddings, threshold=0.2*math.pi):
    """Find, for every image, the postings whose embeddings are close to it.

    Args:
        df: Frame with a ``posting_id`` column, row-aligned with ``embeddings``.
        embeddings: 2-D array with one embedding row per row of ``df``.
        threshold: Neighbours with cosine distance strictly below this value
            are kept.

    Returns:
        A list with one array of posting ids per row of ``embeddings``.
    """
    # Ask for at most 50 neighbours, but never more than there are samples:
    # a nearest-neighbour query cannot return more points than were fitted,
    # which previously broke for frames with 4-49 rows (or fewer than 3).
    KNN = min(50, len(df))

    model = NearestNeighbors(n_neighbors = KNN, metric='cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Loop-invariant: materialise the id column once.
    posting_ids = df['posting_id'].values

    predictions = []
    for k in tqdm(range(embeddings.shape[0])):
        # Positions (within this row's neighbour list) that are close enough.
        idx = np.where(distances[k,] < threshold)[0]
        ids = indices[k, idx]
        predictions.append(posting_ids[ids])

    del model, distances, indices
    gc.collect()
    return predictions

In [None]:
def get_text_predictions(df, max_features = 25_000, threshold = 0.75, chunk_size = 1024*4):
    """Find, for every title, the postings with a similar TF-IDF vector.

    Args:
        df: pandas frame with ``title`` and ``posting_id`` columns.
        max_features: Vocabulary size cap passed to the TF-IDF vectorizer.
        threshold: Titles whose similarity score is strictly above this
            value are reported as matches.
        chunk_size: Number of rows scored per matrix multiplication.

    Returns:
        A list with one array of posting ids per row of ``df``.
    """
    model = TfidfVectorizer(stop_words = 'english', binary = True, max_features = max_features)
    # Build the GPU title column from the ``df`` argument so the function no
    # longer depends on a global ``df_cu`` that happens to mirror it.
    titles = cudf.from_pandas(df['title'])
    text_embeddings = model.fit_transform(titles).toarray()

    # Loop-invariant: materialise the id column once.
    posting_ids = df['posting_id'].values
    n_rows = len(df)
    preds = []

    print('Finding similar titles...')
    for a in range(0, n_rows, chunk_size):
        b = min(a + chunk_size, n_rows)
        print('chunk',a,'to',b)

        # Dot products of every title against the rows of this chunk, one
        # output row per chunk row. Presumably cosine similarity, relying on
        # the vectorizer's default row normalisation - TODO confirm.
        cts = cupy.matmul( text_embeddings, text_embeddings[a:b].T).T

        for k in range(b - a):
            IDX = cupy.where(cts[k,] > threshold)[0]
            preds.append(posting_ids[cupy.asnumpy(IDX)])

    del model,text_embeddings
    gc.collect()
    return preds

In [None]:
# Load the test metadata once: the pandas frame, its cuDF copy, and the
# per-row image file paths. Later cells read these three globals.
df, df_cu, image_paths = read_dataset()

In [None]:
# Image branch: embed every test image, then look up near neighbours.
image_embeddings = get_image_embeddings(image_paths.to_numpy())
image_predictions = get_image_predictions(df=df, embeddings=image_embeddings)
# Text branch: match postings by TF-IDF similarity of their titles.
text_predictions = get_text_predictions(df=df, max_features=25_000)

In [None]:
# Attach both candidate lists to the frame, merge them row by row, and write
# the two submission columns to submission.csv.
df['image_predictions'] = image_predictions
df['text_predictions'] = text_predictions
df['matches'] = df.apply(combine_predictions, axis='columns')
df.loc[:, ['posting_id', 'matches']].to_csv('submission.csv', index=False)

In [None]:
# pd.read_csv('./submission.csv')