In [None]:
import sys

# Directory appended to the module search path so that the `timm` package can
# be imported from it.
TIMM_SRC_DIR = '../input/timm-pytorch-image-models/pytorch-image-models-master'
# Guard against duplicates: re-running this cell must not keep growing sys.path.
if TIMM_SRC_DIR not in sys.path:
    sys.path.append(TIMM_SRC_DIR)

In [None]:
import timm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from tqdm.notebook import tqdm
import numpy as np 
import pandas as pd 
import sqlite3
import warnings
import cv2
warnings.filterwarnings("ignore")
import concurrent
import os
import gc
# Allow cuDNN to benchmark and pick convolution algorithms at run time.
torch.backends.cudnn.benchmark = True

# Release cached GPU memory that PyTorch is holding but not currently using.
torch.cuda.empty_cache()

import matplotlib.pyplot as plt
import torch.nn.functional as F

import albumentations as A


In [None]:
# Per-channel mean / standard deviation passed to A.Normalize (these are the
# commonly used ImageNet statistics, in RGB order).
MEAN = (0.485, 0.456, 0.406)
STD = (0.229, 0.224, 0.225)
# Number of images pushed through the model per forward pass.
BATCH_SIZE = 256
# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# on a machine without CUDA.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
# Preprocessing applied to every image: resize to 384x384, then normalize
# each channel with MEAN / STD (pixel values scaled by max_pixel_value).
transform = A.Compose(
    [
        A.Resize(384, 384, p=1),
        A.Normalize(mean=MEAN, std=STD, max_pixel_value=255.0, always_apply=True),
    ]
)
def load_image(file_path, transform=transform):
    """Read one image from disk and return it as a float, channels-first tensor.

    Args:
        file_path: Path of the image file to read.
        transform: Callable invoked as ``transform(image=img)`` that returns a
            mapping holding the processed array under the ``'image'`` key.
            Defaults to the module-level ``transform``.

    Returns:
        A float tensor obtained by permuting the processed array's axes with
        ``(2, 0, 1)``, i.e. (C, H, W) for an (H, W, C) array.

    Raises:
        FileNotFoundError: If OpenCV could not read ``file_path``.
    """
    img = cv2.imread(file_path)
    if img is None:
        # cv2.imread reports a missing or undecodable file by returning None
        # rather than raising; fail here with the offending path instead of
        # letting cvtColor raise an unrelated-looking error.
        raise FileNotFoundError(f'Could not read image: {file_path}')
    # OpenCV decodes to BGR; convert to RGB before applying the transform.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    tensor_img = transform(image=img)['image']
    tensor_img = torch.tensor(tensor_img)
    tensor_img = tensor_img.permute((2, 0, 1)).float()

    return tensor_img
def load_images(file_paths):
    """Load every path with ``load_image`` and join the results into one batch.

    Args:
        file_paths: Iterable of image file paths.

    Returns:
        A tensor with one leading batch entry per path, in input order.
    """
    # Give each image a leading batch axis, then concatenate along that axis.
    batch = [load_image(path).unsqueeze(0) for path in file_paths]
    return torch.cat(batch)

# Get embedding

In [None]:
# Build the DenseNet-121 architecture without pretrained weights; the
# parameters are taken from the checkpoint loaded below.
model = timm.create_model('densenet121', pretrained=False)
# map_location='cpu' deserializes the tensors onto the CPU regardless of the
# device they were saved from, so loading also works on a machine without a
# GPU. The model is moved to DEVICE in a later step.
state_dict = torch.load(
    '../input/shopee-model-siamese/model_epoch3_loss0.7067989706993103.pt',
    map_location='cpu',
)
model.load_state_dict(state_dict)

In [None]:
# Move the network to the target device and switch it to evaluation mode.
# Both calls return the module itself; binding the result to `_` keeps the
# cell from echoing the model repr.
_ = model.to(DEVICE).eval()

In [None]:
image_path_root = '../input/shopee-product-matching/train_images/'
train = pd.read_csv('../input/shopee-product-matching/train.csv')
# Full path of every training image, in the same row order as train.csv.
images_name = np.array(
    [image_path_root + file_name for file_name in train.image.tolist()]
)

In [None]:
# Number of batches, rounded up so a final partial batch is included.
# Ceil division (instead of `len // BATCH_SIZE + 1` iterations) avoids an
# empty trailing batch when the dataset size is an exact multiple of
# BATCH_SIZE, which would make load_images fail on an empty list.
n_iter = (len(images_name) + BATCH_SIZE - 1) // BATCH_SIZE
embedding = []
for i in tqdm(range(n_iter)):
    file_paths = images_name[BATCH_SIZE*i:BATCH_SIZE*(i+1)]
    images = load_images(file_paths)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output = model(images.to(DEVICE).float()).detach().cpu().numpy()
    embedding.append(output)
    # Drop the batch and release cached GPU memory before loading the next one.
    del images
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Join the per-batch outputs along the first axis into a single array.
# NOTE: this rebinds `embedding` from a list to an array, so the cell is
# meant to run once after the extraction loop.
embedding = np.concatenate(embedding, axis=0)
embedding.shape

## KNN

In [None]:
from sklearn.neighbors import NearestNeighbors

In [None]:
# Query at most 100 neighbours, or every row when the data set is smaller.
k = min(len(train), 100)
nbrs = NearestNeighbors(metric='cosine', n_neighbors=k)
nbrs.fit(embedding)
# For each embedding: cosine distances to, and row positions of, its k neighbours.
distances, indices = nbrs.kneighbors(embedding)

# Make triplets

In [None]:
# Image file names as an array, so they can be fancy-indexed with the
# neighbour row positions returned by the KNN search.
train_image = np.array(list(train['image']))

In [None]:
# Map every image file name to the list of all image names sharing its
# label_group (the image itself included).
# Group once and look each row up, instead of re-scanning the whole frame
# with a boolean mask per row (which is quadratic in the number of rows).
members_by_group = train.groupby('label_group')['image'].agg(list).to_dict()
# Images of the same group share one list object; treat the values as read-only.
# A label with no group entry (e.g. a missing value) maps to an empty list.
neighbors = {
    image: members_by_group.get(label_group, [])
    for image, label_group in zip(train.image, train.label_group)
}

In [None]:
# Build one row per anchor image: its nearest neighbours split into
# same-group images (positives) and other-group images (negatives).
# Rows are collected in a list and turned into a DataFrame once, because
# growing a DataFrame row by row with .loc is very slow.
triplet_rows = []
for i, n_indices in tqdm(enumerate(indices), total=len(indices)):
    anchor = train_image[i]
    # Set for O(1) membership tests against the anchor's label group.
    same_group = set(neighbors[anchor])
    all_neighbor = train_image[n_indices]
    # Filter the neighbour list (rather than using the group list directly)
    # so both columns keep the order returned by the KNN search.
    # NOTE: the anchor itself normally appears among its own positives.
    positives = [name for name in all_neighbor if name in same_group]
    negatives = [name for name in all_neighbor if name not in same_group]
    triplet_rows.append((anchor, ' '.join(positives), ' '.join(negatives)))
df_tripple = pd.DataFrame(triplet_rows, columns=['anchor', 'positives', 'negatives'])

In [None]:
# Write the triplet table to disk without the row index column.
df_tripple.to_csv(path_or_buf='df_tripple.csv', index=False)