In [None]:
!pip install timm

In [None]:
from PIL import Image
import timm
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
import torch
import pandas as pd
import numpy as np
import os
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import math
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize


# Pick the compute device once. Fall back to the CPU when no GPU is visible so
# that later `.to(device)` calls do not fail on a CPU-only session.
if torch.cuda.is_available():
  print('Good to go!')
  device = torch.device('cuda:0')
else:
  print('Please set GPU via Edit -> Notebook Settings.')
  device = torch.device('cpu')

# CPU device handle.
cpu = torch.device('cpu')

In [None]:
# Pretrained ResNet-101 used as a feature extractor: num_classes=0 asks timm
# for a model without a classification head.
model = timm.create_model('resnet101', pretrained=True, num_classes=0)
# Place the model on the compute device and switch it to inference mode.
model = model.to(device).eval()

# Build the preprocessing pipeline from the data config resolved for this model.
config = resolve_data_config({}, model=model)
transform = create_transform(**config)

In [None]:
# Load the training metadata.
data_csv = pd.read_csv('../input/shopee-product-matching/train.csv')

# Split by label group rather than by row, so that every label group lands
# entirely in train or entirely in validation.
# A fixed seed makes the split (and everything computed from it) reproducible
# across kernel restarts.
SPLIT_SEED = 42
label_groups = data_csv['label_group'].unique()
train_label_groups, val_label_groups = train_test_split(
    label_groups, test_size=0.2, random_state=SPLIT_SEED
)

train = data_csv[data_csv['label_group'].isin(train_label_groups)].copy()
val = data_csv[data_csv['label_group'].isin(val_label_groups)].copy()

# index=False: do not write the row index, so that reloading with a plain
# pd.read_csv (see below) yields the same columns as the in-memory frames
# instead of gaining an extra 'Unnamed: 0' column.
train.to_csv('../working/train_batch.csv', index=False)
val.to_csv('../working/val_batch.csv', index=False)

# To reuse a previously saved split instead of recomputing it:
# train = pd.read_csv('../working/train_batch.csv')
# val = pd.read_csv('../working/val_batch.csv')

print('all data: ', data_csv.shape)
print('train data: ', train.shape)
print('validation data: ', val.shape)

In [None]:
# Preview the first rows of the full metadata table.
data_csv.head()

In [None]:
# Directory containing the training images; the file names in the `image`
# column of the CSV are joined onto this path when an image is loaded.
data_jpg_directory = '../input/shopee-product-matching/train_images'

In [None]:
class LoadData(Dataset):
    """Dataset that yields one RGB image per file name (no labels).

    Args:
        filenames: Indexable collection of image file names, relative to ``root``.
        root: Directory the file names are joined onto.
        input_transform: Optional callable applied to the loaded PIL image
            before it is returned.
    """

    def __init__(self, filenames, root, input_transform=None):
        self.root = root
        self.filenames = filenames
        self.input_transform = input_transform

    def __len__(self):
        """Return the number of file names held by the dataset."""
        return len(self.filenames)

    def __getitem__(self, index):
        """Open the image at position ``index``, convert it to RGB and
        return it, passed through ``input_transform`` when one was given."""
        image_path = os.path.join(self.root, self.filenames[index])
        rgb_image = Image.open(image_path).convert('RGB')

        # Guard clause: without a transform the raw PIL image is returned.
        if self.input_transform is None:
            return rgb_image
        return self.input_transform(rgb_image)


In [None]:
batch_size = 64

# Only the training images are wrapped in a dataset here; the validation
# counterpart is kept below, disabled.
train_set = LoadData(
    filenames=train.image.values,
    root=data_jpg_directory,
    input_transform=transform,
)
# val_set = LoadData(val.image.values, data_jpg_directory, transform)

# shuffle=False and drop_last=False: every image is visited exactly once, in
# the row order of `train`.
train_loader = DataLoader(
    dataset=train_set,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=2,
)

# val_loader = DataLoader(
#     dataset=val_set,
#     batch_size=batch_size,
#     shuffle=False,
#     drop_last=False,
# )


In [None]:
def extract_embedding(data_loader):
    """Run every batch of ``data_loader`` through the model and stack the outputs.

    Uses the notebook-level ``model`` and ``device``; both must be defined
    before this function is called. Gradients are disabled while embedding.

    Args:
        data_loader: Iterable yielding batches of images as tensors. When it
            exposes a ``batch_size`` attribute, that value drives the progress
            counter; otherwise 64 is assumed.

    Returns:
        numpy.ndarray: the per-batch model outputs concatenated along axis 0,
        in iteration order.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    # Progress is reported as "images processed so far". Read the batch size
    # from the loader instead of assuming 64, so the counter stays correct
    # when the loader is configured differently.
    images_per_batch = getattr(data_loader, 'batch_size', None) or 64
    log_every = 40  # batches between progress lines (2560 images at batch size 64)

    embeds = []
    with torch.no_grad():
        for i, images in enumerate(data_loader):
            if i % log_every == 0:
                print(i * images_per_batch)
            image_embeddings = model(images.to(device))
            # Copy to host memory and convert explicitly, so the final
            # concatenation works on real NumPy arrays rather than relying on
            # implicit tensor-to-array conversion.
            embeds.append(image_embeddings.cpu().numpy())

    if not embeds:
        raise ValueError('data_loader yielded no batches; nothing to embed')

    image_embeddings = np.concatenate(embeds)
    print('image embeddings shape', image_embeddings.shape)

    return image_embeddings

In [None]:
# Embed the training images, then scale every row to unit L2 norm.
train_embeddings = normalize(extract_embedding(train_loader), norm='l2', axis=1)

In [None]:
# Maximum number of neighbours retrieved per query.
KNN_classes = 50
# Never request more neighbours than there are fitted samples: kneighbors()
# raises when n_neighbors exceeds the number of samples, which happens as soon
# as the notebook is run on a small subset.
KNN_model = NearestNeighbors(n_neighbors=min(KNN_classes, len(train_embeddings)))
KNN_model.fit(train_embeddings)

In [None]:
def predict(image_embeddings, distance_threshold, df, KNN_model, chunk_size=2560):
    """For every embedding, collect the posting ids of its close neighbours.

    The embeddings are queried against ``KNN_model`` in chunks of rows; for each
    query row, the neighbours whose distance is strictly below
    ``distance_threshold`` are kept and translated to ``posting_id`` values.

    Args:
        image_embeddings: Array of query embeddings, one per row.
        distance_threshold: A neighbour is kept only when its distance is
            strictly smaller than this value.
        df: DataFrame with a ``posting_id`` column. The neighbour indices
            returned by ``KNN_model`` are used as row *positions* in ``df``, so
            ``df`` must be in the same order as the data the model was fitted on.
        KNN_model: Object exposing ``kneighbors(X)`` that returns a
            ``(distances, indices)`` pair of 2-D arrays.
        chunk_size: Number of query rows sent to ``kneighbors`` per call.

    Returns:
        list: one array of posting ids per row of ``image_embeddings``.

    Raises:
        ValueError: if ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive integer')

    print(len(df))

    # Hoisted out of the loops: looking ids up in a plain array avoids building
    # a DataFrame slice for every single query row.
    posting_ids = df.posting_id.values
    n_rows = len(image_embeddings)

    preds = []
    for a in range(0, n_rows, chunk_size):
        b = min(a + chunk_size, n_rows)
        print('chunk', a, 'to', b)

        distances, indices = KNN_model.kneighbors(image_embeddings[a:b,])
        # Boolean mask of the neighbours that are close enough, per query row.
        within = distances < distance_threshold

        for k in range(b - a):
            # indices[k, within[k]] are positions in the fitted data; map them
            # to the corresponding posting ids.
            preds.append(posting_ids[indices[k, within[k]]])
    return preds

In [None]:
# Query the k-NN index with the training embeddings themselves.
preds = predict(
    image_embeddings=train_embeddings,
    distance_threshold=0.6,
    df=train,
    KNN_model=KNN_model,
)

# Ground truth for a row = the unique posting ids sharing its label_group.
tmp = train.groupby('label_group')['posting_id'].agg('unique').to_dict()
train['target'] = train['label_group'].map(tmp)
train['pred'] = preds

In [None]:
# F1-Score
def getMetric(col):
    """Build a row-wise F1 scorer comparing ``row[col]`` against ``row.target``.

    Args:
        col: Name of the field holding the predicted ids of a row.

    Returns:
        A function taking one row and returning
        ``2 * |target ∩ pred| / (|target| + |pred|)``, where the intersection
        is computed with ``np.intersect1d``.
    """
    def f1score(row):
        truth = row.target
        guess = row[col]
        shared = np.intersect1d(truth, guess)
        return 2 * len(shared) / (len(truth) + len(guess))
    return f1score

In [None]:
# Score each row against its target, then average the per-row scores.
train['f1score'] = train.apply(getMetric('pred'), axis='columns')
f1_score = train['f1score'].mean()
print('train f1 score: ', f1_score)
train.head()