In [None]:
# Make the offline copy of timm importable before importing it
# (the path must be appended first, so the import order matters here).
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import timm

import torch
# Use the first GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Disabled: offline RAPIDS installation. Kept as comments (not as a string
# literal) so the cell no longer echoes the text as its output.
# if torch.cuda.is_available():
#     !cp ../input/rapids/rapids.0.18.0 /opt/conda/envs/rapids.tar.gz
#     !cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
#     sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
#     sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
#     sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path
#     !cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.auto import tqdm

tqdm.pandas()
import os
import copy
import time
import math
import cv2
import PIL.Image as Image
import random
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm
import gc

from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import StepLR
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, models
import torchvision.transforms as transforms
import torch.optim as optim
from sklearn.metrics.pairwise import cosine_similarity



if torch.cuda.is_available():
    import cudf, cuml, cupy
    from cuml.feature_extraction.text import TfidfVectorizer
    from cuml.neighbors import NearestNeighbors
    print('RAPIDS',cuml.__version__)

import albumentations
import matplotlib.pyplot as plt


In [None]:
# Run configuration. COMPUTE_CV=True evaluates on the labelled train split,
# COMPUTE_CV=False runs on the test split.
image_size = 512
batch_size = 42
data_dir = '../input/shopee-product-matching/'

COMPUTE_CV = False

# Pick the split once; the CSV name and the image folder both derive from it.
_split = 'train' if COMPUTE_CV else 'test'
test_df = pd.read_csv(data_dir + _split + '.csv')
test_df['file_path'] = test_df['image'].map(
    lambda name: os.path.join(data_dir + _split + '_images', name)
)

if COMPUTE_CV:
    # Replace the raw label_group values with integer class ids.
    le = LabelEncoder()
    test_df['label_group'] = le.fit_transform(test_df['label_group'])
    


In [None]:
# Image preprocessing pipelines. Both currently resize to a square of
# image_size pixels and apply albumentations' default Normalize(); the random
# augmentations of the "train" pipeline are disabled (kept below as comments).
transforms_train = albumentations.Compose(
    [
        albumentations.Resize(height=image_size, width=image_size),
        #albumentations.HorizontalFlip(p=0.5),
        #albumentations.RandomBrightnessContrast(p=0.5, brightness_limit=(-0.2, 0.2), contrast_limit=(-0.2, 0.2)),
        #albumentations.HueSaturationValue(p=0.5, hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2),
        #albumentations.ShiftScaleRotate(p=0.5, shift_limit=0.0625, scale_limit=0.2, rotate_limit=20),
        #albumentations.CoarseDropout(p=0.5),
        albumentations.Normalize(),
    ]
)

transforms_valid = albumentations.Compose(
    [
        albumentations.Resize(height=image_size, width=image_size),
        albumentations.Normalize(),
    ]
)

In [None]:
class SHOPEEDataset(Dataset):
    """Dataset that loads product images listed in a dataframe.

    Args:
        df: dataframe with a ``file_path`` column and, for any mode other than
            ``'test'``, a ``label_group`` column.
        mode: ``'test'`` yields only the image tensor; any other value yields
            ``(image, label)``.
        transform: optional albumentations-style callable, invoked as
            ``transform(image=...)`` and expected to return a dict with an
            ``'image'`` entry.
    """

    def __init__(self, df, mode, transform=None):
        # Positional indexing in __getitem__ relies on a clean 0..n-1 index.
        self.df = df.reset_index(drop=True)
        self.mode = mode
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the image tensor (and the label unless mode is 'test').

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        row = self.df.iloc[index]

        image = cv2.imread(row.file_path)
        if image is None:
            # cv2.imread signals failure by returning None; fail with a clear
            # message instead of an opaque error inside cvtColor.
            raise FileNotFoundError('Could not read image: {}'.format(row.file_path))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transform is not None:
            image = self.transform(image=image)['image']

        # HWC -> CHW, as float32.
        image = image.astype(np.float32).transpose(2, 0, 1)
        tensor = torch.tensor(image).reshape(3, image_size, image_size).float()

        if self.mode == 'test':
            return tensor

        # The label is needed exactly when it is returned, so it is keyed on
        # the mode (not on the global COMPUTE_CV flag, which left the label
        # undefined for non-test modes when COMPUTE_CV was False).
        return tensor, torch.tensor(row.label_group).long()

In [None]:
def getMetric(col):
    """Build a row-wise F1 scorer comparing ``row.target`` with ``row[col]``.

    The returned function computes ``2 * |target ∩ pred| / (|target| + |pred|)``
    for a single row and is meant to be used with ``DataFrame.apply(axis=1)``.
    """
    def f1score(row):
        truth = row.target
        predicted = row[col]
        overlap = np.intersect1d(truth, predicted).size
        return 2 * overlap / (len(truth) + len(predicted))
    return f1score

def combine_predictions(row):
    """Merge image, text and phash matches of one row into a unique set.

    Returns the sorted unique ids as an array when COMPUTE_CV is set, and as a
    single space-separated string otherwise.
    """
    merged = np.unique(
        np.concatenate([row['img_preds'], row['txt_preds'], row['phash_predictions']])
    )
    if COMPUTE_CV:
        return merged
    return ' '.join(merged)

In [None]:
class Shopee_model(nn.Module):
    """EfficientNet-B0 backbone with an ArcFace (ArcMarginProduct) head.

    The timm backbone's own pooling and classifier are replaced by identities;
    pooling is done here with AdaptiveAvgPool2d(1), followed (in training mode
    only) by dropout, a linear projection to ``fc_dim`` and batch norm.

    Args:
        fc_dim: size of the projected embedding fed to the ArcFace head.
        num_classes: number of classes of the ArcFace head.
        use_pretrained: forwarded to ``timm.create_model(pretrained=...)``.
    """

    def __init__(self, fc_dim, num_classes, use_pretrained=True):
        super(Shopee_model, self).__init__()

        self.model_ft = timm.create_model('efficientnet_b0', pretrained=use_pretrained)

        in_features = self.model_ft.classifier.in_features
        # Strip the backbone's head so it returns the raw feature map.
        self.model_ft.classifier = nn.Identity()
        self.model_ft.global_pool = nn.Identity()
        self.pooling = nn.AdaptiveAvgPool2d(1)
        self.dropout = nn.Dropout(p=0.1)
        self.classifier = nn.Linear(in_features, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        self.arc_layer = ArcMarginProduct(in_feature=fc_dim, out_feature=num_classes, s=30, m=0.50, easy_margin=False)

    def _init_params(self):
        """Xavier-init the projection and reset batch norm to identity."""
        nn.init.xavier_normal_(self.classifier.weight)
        nn.init.constant_(self.classifier.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, img, labels=None):
        """Return ArcFace logits in training mode, features in eval mode.

        Args:
            img: batch of images.
            labels: class ids; required in training mode, ignored in eval mode.

        Raises:
            ValueError: if called in training mode without ``labels``.
        """
        if self.training and labels is None:
            raise ValueError('labels are required when the model is in training mode')

        features = self.extract_features(img)
        if self.training:
            return self.arc_layer(features, labels)
        return features

    def extract_features(self, x):
        """Return pooled features for a batch of images.

        In training mode the pooled features go through dropout, the linear
        projection and batch norm. In eval mode the pooled backbone features
        are returned as-is.
        """
        batch_size = x.shape[0]
        x = self.model_ft(x)
        x = self.pooling(x).view(batch_size, -1)

        # NOTE(review): classifier and bn are skipped in eval mode, so inference
        # embeddings are the raw pooled backbone features rather than the
        # fc_dim projection. Kept as-is to preserve results; confirm intent.
        if self.training:
            x = self.dropout(x)
            x = self.classifier(x)
            x = self.bn(x)
        return x

In [None]:
class ArcMarginProduct(nn.Module):
    """Additive angular margin (ArcFace-style) classification head.

    Args:
        in_feature: dimensionality of the input embeddings.
        out_feature: number of classes.
        s: scale applied to the output logits.
        m: angular margin (radians) added to the target-class angle.
        easy_margin: selects the fallback rule used where the margin cannot be
            applied directly (see ``forward``).
    """

    def __init__(self, in_feature=128, out_feature=10575, s=30, m=0.50, easy_margin=False):
        super(ArcMarginProduct, self).__init__()
        self.in_feature = in_feature
        self.out_feature = out_feature
        self.s = s
        self.m = m
        # One weight row per class; rows are L2-normalised in forward().
        self.weight = nn.Parameter(torch.empty(out_feature, in_feature))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)

        # make the function cos(theta+m) monotonic decreasing while theta in [0°,180°]
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, x, label):
        """Return scaled logits with the margin applied to the target class.

        Args:
            x: embeddings, one row per sample.
            label: integer class id per sample.
        """
        # cos(theta)
        cosine = F.linear(F.normalize(x), F.normalize(self.weight))
        # sin(theta); clamp because rounding can push cosine**2 slightly above 1,
        # which would otherwise make sqrt return NaN.
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        # cos(theta + m)
        phi = cosine * self.cos_m - sine * self.sin_m

        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where((cosine - self.th) > 0, phi, cosine - self.mm)

        # Use phi for the target class and the plain cosine everywhere else.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1), 1)
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output = output * self.s

        return output

In [None]:
# Ground truth for CV: every row gets the posting_ids sharing its label_group.
if COMPUTE_CV:
    tmp = test_df.groupby('label_group')['posting_id'].unique().to_dict()
    test_df['target'] = test_df['label_group'].map(tmp)

In [None]:
def query_expansion(model, embeds):
    """Replace every embedding by the L2-normalised sum of its neighbours.

    Args:
        model: fitted neighbour model exposing ``kneighbors(X)`` that returns
            ``(distances, indices)``; index rows must support ``.get()``.
        embeds: 2-D array of embeddings, queried in chunks of 2048 rows.

    Returns:
        A list with one 1-D numpy array per input row.
    """
    CHUNK = 1024 * 2
    total = len(embeds)
    expanded = []

    for start in range(0, total, CHUNK):
        stop = min(start + CHUNK, total)
        print('chunk',start,'to',stop)

        _, neighbour_idx = model.kneighbors(embeds[start:stop,])
        for row in range(stop - start):
            ids = neighbour_idx[row,].get()
            # Sum the neighbour embeddings, then rescale to unit length.
            pooled = torch.tensor(embeds[ids].sum(axis=0)).reshape(1, -1)
            expanded.append(F.normalize(pooled)[0].detach().cpu().numpy())
    return expanded

In [None]:
def get_score(df, embeds, threshold, data_type='img', metric='euclidean', KNN=100):
    """Predict matches by thresholded nearest-neighbour search over embeddings.

    For every row, the neighbours whose distance is below ``threshold`` are
    looked up in ``df`` and their ``posting_id`` values are stored in the
    column ``<data_type>_preds``.

    Note: ``df`` is modified in place (and also returned). Pass a copy if the
    original frame must stay untouched.

    Args:
        df: dataframe with a ``posting_id`` column (and ``target`` when
            COMPUTE_CV is set); rows must align with ``embeds``.
        embeds: 2-D embedding array; index rows returned by the neighbour
            search must support ``.get()``.
        threshold: strict upper bound on the neighbour distance.
        data_type: prefix for the output column names.
        metric: distance metric passed to NearestNeighbors.
        KNN: maximum number of neighbours per row (capped at ``len(df)``).

    Returns:
        ``(df, mean_f1)`` when COMPUTE_CV is set, otherwise ``(df, 0)``.
    """
    CHUNK = 1024 * 2
    n = len(df)
    f1_name = data_type + '_f1'
    preds_name = data_type + '_preds'

    # Never ask for more neighbours than there are rows.
    model = NearestNeighbors(n_neighbors=min(KNN, n), metric=metric)
    model.fit(embeds)

    # The search is identical for every data_type; only the column names differ.
    preds = []
    for a in range(0, n, CHUNK):
        b = min(a + CHUNK, n)
        print('chunk',a,'to',b)

        distances, indices = model.kneighbors(embeds[a:b,])
        for k in range(b - a):
            IDX = np.where(distances[k,] < threshold)[0]
            IDS = indices[k,IDX].get()
            preds.append(df.iloc[IDS].posting_id.values)

    df[preds_name] = preds
    del model
    if COMPUTE_CV == True:
        df[f1_name] = df.apply(getMetric(preds_name), axis=1)
        mean = df[f1_name].mean()
        print('CV score for for threshold with {} Dist {} ='.format(metric, threshold), mean)
        return df, mean
    else:
        return df, 0

In [None]:
# Head sizes must match the checkpoint being loaded.
num_classes = 11014
fc_dim = 512

model_ft = Shopee_model(fc_dim, num_classes, use_pretrained=False)
# map_location lets the checkpoint load on whatever `device` was selected,
# including the CPU fallback for a checkpoint saved from a GPU.
model_ft.load_state_dict(
    torch.load('../input/modelstate52v1/model (5).pt', map_location=device)
)
model_ft.eval()
model_ft.to(device)

In [None]:
# Inference must use the deterministic (validation) pipeline, so it stays
# correct even if random augmentations are re-enabled in transforms_train.
dataset_test = SHOPEEDataset(test_df, 'test', transform=transforms_valid)
embeds = []
data_loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=12, shuffle=False, num_workers=4)
with torch.no_grad():
    for img in tqdm(data_loader_test):
        img = img.to(device)
        feats = model_ft.extract_features(img)
        # One (batch, features) numpy array per batch.
        embeds.append(feats.detach().cpu().numpy())
# The model is not needed after this point; drop the reference.
del model_ft

In [None]:
# Stack the per-batch arrays into one (n_images, n_features) array.
# np.concatenate copes with a smaller final batch, whereas np.array on such a
# ragged list fails (or yields an object array) depending on the NumPy version.
img_embeds = np.concatenate(embeds, axis=0)

In [None]:
from sklearn.decomposition import PCA

# PCA cannot keep more components than min(n_samples, n_features).
n_samples, n_features = np.shape(img_embeds)
comp = min(n_samples, n_features, 500)
# Fixed seed: PCA may pick a randomized SVD solver, which would otherwise make
# the reduced embeddings differ from run to run.
pca = PCA(n_components=comp, random_state=0)
pca.fit(img_embeds)
img_embeds_pca = pca.transform(img_embeds)
# Fraction of the variance retained by the kept components.
print(pca.explained_variance_ratio_.sum())

In [None]:
# One round of query expansion: each PCA embedding is replaced by the
# normalised sum of its 2 nearest neighbours.
img_embeds = cupy.asarray(img_embeds_pca)
model = NearestNeighbors(metric='euclidean', n_neighbors=2)
model.fit(img_embeds)
img_embeds = cupy.asarray(query_expansion(model, img_embeds))
del model

In [None]:
if COMPUTE_CV:
    # Threshold sweep: keep the frame produced by the best-scoring threshold.
    # Start below any possible F1 so the first result is always accepted.
    best = -1.0
    img_embeds = cupy.asarray(img_embeds)
    for thresh in np.arange(.9, 1, 0.05):
        # get_score writes its predictions into the frame it is given, so each
        # threshold gets its own copy; otherwise every iteration overwrites the
        # same frame and the last threshold always wins.
        tmp_df, tmp_score = get_score(test_df.copy(), img_embeds, thresh, data_type='img', metric='euclidean')
        if tmp_score > best:
            test_df = tmp_df
            best = tmp_score
else:
    thresh = 0.5
    img_embeds = cupy.asarray(img_embeds)
    # The neighbour count cannot exceed the number of rows.
    KNN = min(100, len(test_df))
    test_df, _ = get_score(test_df, img_embeds, thresh, data_type='img', metric='euclidean', KNN=KNN)
gc.collect()

In [None]:
from cuml.feature_extraction.text import TfidfVectorizer

# Binary TF-IDF over the product titles, computed with cuML and densified.
df_cu = cudf.DataFrame(test_df)
tfidf = TfidfVectorizer(stop_words=None, binary=True, max_features=15000)
text_embeddings = (
    tfidf
    .fit_transform(df_cu['title'])
    .toarray()
)
print('text embeddings shape',text_embeddings.shape)
del tfidf

In [None]:
if COMPUTE_CV:
    # Threshold sweep: keep the frame produced by the best-scoring threshold.
    # Start below any possible F1 so the first result is always accepted.
    best = -1.0
    for thresh in np.arange(0.9, 1, 0.05):
        # get_score writes its predictions into the frame it is given, so each
        # threshold gets its own copy; otherwise every iteration overwrites the
        # same frame and the last threshold always wins.
        tmp_df, tmp_score = get_score(test_df.copy(), text_embeddings, thresh, data_type='txt', metric='euclidean')
        if tmp_score > best:
            test_df = tmp_df
            best = tmp_score
else:
    thresh = .65
    text_embeddings = cupy.asarray(text_embeddings)
    # The neighbour count cannot exceed the number of rows.
    KNN = min(100, len(test_df))
    test_df, _ = get_score(test_df, text_embeddings, thresh, data_type='txt', metric='euclidean', KNN=KNN)
gc.collect()

In [None]:
# Postings with an identical perceptual hash are treated as matches.
tmp = test_df.groupby('image_phash')['posting_id'].unique().to_dict()
test_df['phash_predictions'] = test_df['image_phash'].map(tmp)

In [None]:
# Union of the image, text and phash matches for every posting.
test_df['matches'] = test_df.apply(lambda row: combine_predictions(row), axis=1)
if COMPUTE_CV:
    # Row-wise F1 of the combined matches against the ground truth.
    final_scorer = getMetric('matches')
    test_df['final_f1'] = test_df.apply(final_scorer, axis=1)
    print('CV score =',test_df.final_f1.mean())

In [None]:
# Write the two submission columns, without the dataframe index.
test_df.loc[:, ['posting_id', 'matches']].to_csv('submission.csv', index=False)

In [None]:
# Quick visual check of the first rows of the final frame.
test_df.head(5)