In [None]:
import sys
# Copy the RAPIDS 0.18.0 archive from ../input and unpack it under /opt/conda/envs
# (tar output is discarded to keep the cell output short).
!cp ../input/rapids/rapids.0.18.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Prepend the unpacked environment's directories to sys.path so they are searched first.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the environment's libxgboost shared library into /opt/conda/lib.
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [None]:
# Preliminaries
from tqdm import tqdm

import random
import os


#torch
import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader

import transformers


from sklearn.preprocessing import Normalizer


# image


In [None]:
# Install Keras-Applications 1.0.8 and efficientnet 1.1.1 from local wheel files
# under ../input/download (needed for `efficientnet.tfkeras` imported below).
!pip install ../input/download/Keras_Applications-1.0.8-py3-none-any.whl
!pip install ../input/download/efficientnet-1.1.1-py3-none-any.whl

In [None]:
import numpy as np
import pandas as pd
import gc
import matplotlib.pyplot as plt
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml import PCA
from cuml.neighbors import NearestNeighbors
import tensorflow as tf
import efficientnet.tfkeras as efn
from tqdm.notebook import tqdm
import math
import tensorflow_hub as hub

In [None]:
# tf.data autotune sentinel, used for parallel map calls and prefetching
AUTO = tf.data.experimental.AUTOTUNE

# Configuration for the TensorFlow image pipeline
# NOTE: BATCH_SIZE and SEED are reassigned later in the "Test Configuration" cell.
BATCH_SIZE = 8
# Target size passed to tf.image.resize in decode_image
IMAGE_SIZE = [512, 512]
# Seed
SEED = 42
# Verbosity
VERBOSE = 1
# Number of classes (size of the ArcMarginProduct head)
N_CLASSES = 11014

In [None]:
# Cap the memory TensorFlow may take on the first GPU so the rest stays free for RAPIDS.
# LIMIT is reported as GB in the messages below; memory_limit receives 1024*LIMIT.
LIMIT = 4.0
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        #print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # NOTE(review): presumably raised when the GPU has already been initialised — confirm
        print(e)
# These messages are printed whether or not a GPU was found.
# NOTE(review): the constant 16 assumes a 16GB GPU — confirm for the target hardware.
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

In [None]:
# Flag to compute the cross-validation score on the train set
GET_CV = False
# Flag to check RAM allocations (debug): duplicates the train frame in read_dataset
CHECK_SUB = False

df = pd.read_csv('../input/shopee-product-matching/test.csv')
# If we are committing (test.csv has more than 3 rows), never compute the CV score.
# NOTE(review): GET_CV is already False above, so this branch currently changes nothing.
if len(df) > 3:
    GET_CV = False
del df

In [None]:
# Function to get our f1 score
def f1_score(y_true, y_pred):
    """Row-wise F1 score between two Series of space-separated id strings.

    Args:
        y_true: pandas Series; each entry is a whitespace-separated string of ids.
        y_pred: pandas Series aligned by position with ``y_true``.

    Returns:
        numpy array with ``2 * |true & pred| / (|true| + |pred|)`` for every row.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))
    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    # Denominator: size of the predicted set plus size of the true set.
    total = pred_sets.apply(len).values + true_sets.apply(len).values
    return 2 * overlap / total

# Function to combine predictions
def combine_predictions(row):
    """Join the unique ids in ``row['image_predictions']`` into one string.

    Returns the sorted unique values (as produced by ``np.unique``) separated
    by single spaces.
    """
    merged = np.concatenate((row['image_predictions'],))
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)


# Function to read our dataset
def read_dataset():
    """Load train or test data depending on the global ``GET_CV`` flag.

    With ``GET_CV`` set, train.csv is read and a ``matches`` column is added
    holding the space-joined posting ids that share the row's ``label_group``;
    with ``CHECK_SUB`` also set the frame is concatenated with itself.
    Otherwise test.csv is read unchanged.

    Returns:
        (df, df_cu, image_paths): the pandas frame, a cudf copy of it, and a
        Series of image file paths built from the ``image`` column.
    """
    if not GET_CV:
        df = pd.read_csv('../input/shopee-product-matching/test.csv')
        df_cu = cudf.DataFrame(df)
        image_paths = '../input/shopee-product-matching/test_images/' + df['image']
        return df, df_cu, image_paths

    df = pd.read_csv('../input/shopee-product-matching/train.csv')
    # label_group -> array of posting ids belonging to that group
    group_to_ids = df.groupby(['label_group'])['posting_id'].unique().to_dict()
    df['matches'] = df['label_group'].map(group_to_ids)
    df['matches'] = df['matches'].apply(' '.join)
    if CHECK_SUB:
        df = pd.concat([df, df], axis = 0).reset_index(drop = True)
    df_cu = cudf.DataFrame(df)
    image_paths = '../input/shopee-product-matching/train_images/' + df['image']
    return df, df_cu, image_paths


# Function to decode our images
def decode_image(image_data):
    """Decode JPEG bytes to a 3-channel float32 image.

    The image is resized to ``IMAGE_SIZE`` and then divided by 255.0.
    """
    decoded = tf.image.decode_jpeg(image_data, channels = 3)
    resized = tf.image.resize(decoded, IMAGE_SIZE)
    return tf.cast(resized, tf.float32) / 255.0

# Function to read our test image and return image
def read_image(image):
    """Read the file at path ``image`` and return it decoded by ``decode_image``."""
    raw_bytes = tf.io.read_file(image)
    return decode_image(raw_bytes)

# Function to get our dataset that read images
def get_dataset(image):
    """Build a batched, prefetching ``tf.data`` pipeline over image paths.

    Args:
        image: sequence of image file paths.

    Returns:
        A dataset yielding batches of ``BATCH_SIZE`` images decoded by ``read_image``.
    """
    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(read_image, num_parallel_calls = AUTO)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )


# Arcmarginproduct class keras layer
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Keras layer implementing the additive angular margin (ArcFace) head.

    Called with ``[X, y]`` (features and integer class labels); returns the
    cosine logits between the L2-normalised features and the L2-normalised
    class weights, with the margin ``m`` applied to the target-class entry
    and the result multiplied by the scale ``s``.

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):
        """Store the hyper-parameters and precompute the margin constants.

        Args:
            n_classes: number of classes (columns of the weight matrix).
            s: scale applied to the output logits.
            m: angular margin in radians.
            easy_margin: if True, the margin is only applied where cosine > 0.
            ls_eps: label-smoothing factor; 0 disables smoothing.
            **kwargs: forwarded to ``tf.keras.layers.Layer``.
        """

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # cos(m) and sin(m) are used to expand cos(theta + m) in call()
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        # th: cosine threshold below which the fallback `cosine - mm` is used
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        """Return the layer config extended with this layer's constructor arguments."""

        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        """Create the class weight matrix ``W`` of shape (feature_dim, n_classes).

        ``input_shape`` is a pair of shapes; the first one (features) gives feature_dim.
        """
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        """Compute the scaled margin logits for ``inputs = (X, y)``."""
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        # Cosine similarity between normalised features and normalised class weights
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        sine = tf.math.sqrt(1.0 - tf.math.pow(cosine, 2))
        # cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            # Label smoothing: spread ls_eps uniformly over all classes
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Use phi for the target class and the plain cosine everywhere else
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output



# Function to get the embeddings of our images with the fine-tuned model
def get_image_embeddings(image_paths):
    """Compute image embeddings with the fine-tuned EfficientNet-B3 (TensorFlow).

    The training graph (backbone -> global average pooling -> ArcMarginProduct ->
    softmax) is rebuilt so the saved weights can be loaded, then cut at an
    intermediate layer to obtain embeddings. Images are processed in chunks of
    5000 paths.

    Args:
        image_paths: sliceable sequence (e.g. pandas Series) of image file paths.

    Returns:
        numpy array with one embedding row per path.
    """
    embeds = []

    margin = ArcMarginProduct(
            n_classes = N_CLASSES,
            s = 30,
            m = 0.7,
            name='head/arc_margin',
            dtype='float32'
            )

    inp = tf.keras.layers.Input(shape = (*IMAGE_SIZE, 3), name = 'inp1')
    label = tf.keras.layers.Input(shape = (), name = 'inp2')
    x = efn.EfficientNetB3(weights = None, include_top = False)(inp)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = margin([x, label])

    output = tf.keras.layers.Softmax(dtype='float32')(x)

    model = tf.keras.models.Model(inputs = [inp, label], outputs = [output])
    model.load_weights('../input/b3-try/EfficientNetB3_512_42.h5')
    # Keep only the image input and cut the graph before the margin head.
    # NOTE(review): layers[-4] is presumably the GlobalAveragePooling2D output — confirm.
    model = tf.keras.models.Model(inputs = model.input[0], outputs = model.layers[-4].output)

    chunk = 5000
    # Derive the chunk count from the argument itself rather than the global `df`,
    # so the function also works when called with paths that do not come from `df`.
    n_chunks = int(np.ceil(len(image_paths) / chunk))
    for j in range(n_chunks):
        a = j * chunk
        b = (j + 1) * chunk
        image_dataset = get_dataset(image_paths[a:b])
        embeds.append(model.predict(image_dataset))
    del model
    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings

In [None]:
def get_distances(df, embeddings, KNN = 50):
    """Return k-nearest-neighbour distances and indices (default metric).

    Args:
        df: unused; kept so existing callers keep working.
        embeddings: 2-D array with one row per item.
        KNN: maximum number of neighbours to return per row.

    Returns:
        (distances, indices) as returned by ``NearestNeighbors.kneighbors``.
    """
    # Never request more neighbours than there are rows. This covers the tiny
    # public test set (previously detected by re-reading test.csv on every call)
    # and any other input with fewer than KNN rows.
    n_neighbors = min(KNN, embeddings.shape[0])
    model = NearestNeighbors(n_neighbors = n_neighbors)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)
    del model
    _=gc.collect()
    return distances, indices

In [None]:

    

# Load the dataset and compute the EfficientNet-B3 (TensorFlow) image embeddings.
df, df_cu, image_paths = read_dataset()
image_embeddings = get_image_embeddings(image_paths)
# text_embeddings = get_text_embeddings(df)
gc.collect()

In [None]:
# Nearest neighbours of the EfficientNet-B3 embeddings (NearestNeighbors default metric).
image_distances, image_indices=get_distances(df, image_embeddings, KNN = 50)

In [None]:
# Sanity check: (number of images, embedding size)
image_embeddings.shape

# eca-nfnet-l0 

In [None]:

# Make the timm sources under ../input importable (used by `import timm` below).
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

In [None]:


import math

import cv2
import timm

from tqdm import tqdm 

import albumentations as A 
from albumentations.pytorch.transforms import ToTensorV2

import torch 
from torch.utils.data import Dataset 
from torch import nn
import torch.nn.functional as F 



In [None]:
class CFG:
    """Configuration for the eca_nfnet_l0 (PyTorch) image model.

    NOTE(review): the ``if GET_CV`` block at the end runs when the class is
    defined and leaves ``df``, ``df_cu``, ``image_paths`` (and ``tmp`` in CV mode)
    behind as class attributes. It mirrors ``read_dataset`` and looks like a
    copy-paste leftover — confirm nothing reads ``CFG.df`` before removing it.
    """
    
    # Input resolution used by get_test_transforms, and the DataLoader batch size
    img_size = 512
    batch_size = 12
    seed = 2020
    
    device = 'cuda'
    # Number of classes of the ArcFace head
    classes = 11014
    
    model_name = 'eca_nfnet_l0'
    model_path = '../input/shopee-pytorch-models/arcface_512x512_nfnet_l0 (mish).pt'
    
    # ArcFace scale and margin passed to ShopeeModel
    scale = 30 
    margin = 0.5

    # Executed once at class-definition time (reads a CSV and copies it to the GPU).
    if GET_CV:
        df = pd.read_csv('../input/shopee-product-matching/train.csv')
        tmp = df.groupby(['label_group'])['posting_id'].unique().to_dict()
        df['matches'] = df['label_group'].map(tmp)
        df['matches'] = df['matches'].apply(lambda x: ' '.join(x))
        if CHECK_SUB:
            df = pd.concat([df, df], axis = 0)
            df.reset_index(drop = True, inplace = True)
        df_cu = cudf.DataFrame(df)
        image_paths = '../input/shopee-product-matching/train_images/' + df['image']
    else:
        df = pd.read_csv('../input/shopee-product-matching/test.csv')
        df_cu = cudf.DataFrame(df)
        image_paths = '../input/shopee-product-matching/test_images/' + df['image']
        

def read_dataset():
    """Return ``(df, df_cu, image_paths)`` for the train (CV) or test split.

    This redefines the function of the same name from earlier in the notebook
    with the same behaviour: in CV mode a ``matches`` column is added (posting
    ids sharing the row's ``label_group``, space-joined) and, with ``CHECK_SUB``,
    the frame is doubled.
    """
    if GET_CV:
        df = pd.read_csv('../input/shopee-product-matching/train.csv')
        ids_by_group = df.groupby(['label_group'])['posting_id'].unique().to_dict()
        df['matches'] = df['label_group'].map(ids_by_group)
        df['matches'] = df['matches'].apply(lambda ids: ' '.join(ids))
        if CHECK_SUB:
            doubled = pd.concat([df, df], axis = 0)
            df = doubled.reset_index(drop = True)
        image_dir = '../input/shopee-product-matching/train_images/'
    else:
        df = pd.read_csv('../input/shopee-product-matching/test.csv')
        image_dir = '../input/shopee-product-matching/test_images/'

    df_cu = cudf.DataFrame(df)
    image_paths = image_dir + df['image']
    return df, df_cu, image_paths

def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs with ``seed``.

    Also sets ``PYTHONHASHSEED`` in the environment and switches cuDNN to
    deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed all RNGs with the seed configured for the eca_nfnet_l0 model.
seed_torch(CFG.seed)


# def get_image_predictions(df, embeddings,threshold = 0.0):
    
#     if len(df) > 3:
#         KNN = 50
#     else : 
#         KNN = 3
    
#     model = NearestNeighbors(n_neighbors = KNN, metric = 'cosine')
#     model.fit(embeddings)
#     distances, indices = model.kneighbors(embeddings)
    
#     predictions = []
#     for k in tqdm(range(embeddings.shape[0])):
#         idx = np.where(distances[k,] < threshold)[0]
#         ids = indices[k,idx]
#         posting_ids = df['posting_id'].iloc[ids].values
#         predictions.append(posting_ids)
        
#     del model, distances, indices
#     gc.collect()
#     return predictions


def get_test_transforms():
    """Build the inference-time albumentations pipeline.

    Steps: resize to ``CFG.img_size`` x ``CFG.img_size``, ``A.Normalize()`` with
    its default arguments, then conversion to a torch tensor.
    """
    steps = [
        A.Resize(CFG.img_size, CFG.img_size, always_apply=True),
        A.Normalize(),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps)


class ShopeeDataset(Dataset):
    """Image dataset that loads files with OpenCV and applies optional transforms.

    Args:
        image_paths: indexable sequence of image file paths (list, numpy array, ...).
        transforms: optional callable invoked as ``transforms(image=img)`` that
            returns a mapping with an ``'image'`` entry (albumentations style).
    """

    def __init__(self, image_paths, transforms=None):
        self.image_paths = image_paths
        self.augmentations = transforms

    def __len__(self):
        # len() works for lists as well as arrays, unlike `.shape[0]`.
        return len(self.image_paths)

    def __getitem__(self, index):
        """Return ``(image, dummy_label)`` for the path at ``index``.

        Raises:
            FileNotFoundError: if OpenCV cannot read the file.
        """
        image_path = self.image_paths[index]

        image = cv2.imread(image_path)
        # cv2.imread returns None instead of raising; fail with the offending path
        # rather than with an opaque cvtColor assertion.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {image_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.augmentations:
            image = self.augmentations(image=image)['image']

        # The label is a constant placeholder; the model's forward ignores it.
        return image,torch.tensor(1)
    
    



In [None]:
class ArcMarginProduct(nn.Module):
    """Additive angular margin (ArcFace) classification head.

    Args:
        in_features: size of the input embedding.
        out_features: number of classes.
        scale: factor applied to the output logits.
        margin: angular margin in radians added to the target-class angle.
        easy_margin: if True, the margin is only applied where cosine > 0.
        ls_eps: label-smoothing factor; 0 disables smoothing.
    """

    def __init__(self, in_features, out_features, scale=30.0, margin=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        self.ls_eps = ls_eps  # label smoothing
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        # th: cosine threshold below which the fallback `cosine - mm` is used
        self.th = math.cos(math.pi - margin)
        self.mm = math.sin(math.pi - margin) * margin

    def forward(self, input, label):
        """Return scaled logits of shape (batch, out_features).

        Args:
            input: (batch, in_features) embeddings.
            label: (batch,) integer class ids.
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # Rounding can push cosine**2 slightly above 1; clamp so sqrt never yields NaN.
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(min=0.0))
        # cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Allocate on the same device/dtype as the logits instead of hard-coding
        # 'cuda', so the head also works on CPU or on a non-default GPU.
        one_hot = torch.zeros(cosine.size(), device=cosine.device, dtype=cosine.dtype)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # Use phi for the target class and the plain cosine everywhere else
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale

        return output

class ShopeeModel(nn.Module):
    """timm backbone followed by pooling, an embedding layer and an ArcFace head.

    ``forward`` returns only the embedding from ``extract_feat``; the ArcFace
    head ``self.final`` is constructed but not applied.

    Args:
        n_classes: number of classes of the ArcFace head.
        model_name: timm model name; one of 'resnext50_32x4d', 'efficientnet_b3',
            'tf_efficientnet_b5_ns', 'eca_nfnet_l0'.
        fc_dim: size of the embedding produced by the ``fc``/``bn`` layers.
        margin: ArcFace margin.
        scale: ArcFace scale.
        use_fc: whether ``extract_feat`` applies dropout/fc/bn. The layers are
            created either way.
        pretrained: forwarded to ``timm.create_model``.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """

    def __init__(
        self,
        n_classes = CFG.classes,
        model_name = CFG.model_name,
        fc_dim = 512,
        margin = CFG.margin,
        scale = CFG.scale,
        use_fc = True,
        pretrained = False):


        super(ShopeeModel,self).__init__()
        print('Building Model Backbone for {} model'.format(model_name))

        self.backbone = timm.create_model(model_name, pretrained=pretrained)

        # Replace each backbone's classifier and built-in pooling with Identity so
        # it returns the feature map; pooling is applied by self.pooling instead.
        if model_name == 'resnext50_32x4d':
            final_in_features = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif model_name in ('efficientnet_b3', 'tf_efficientnet_b5_ns'):
            final_in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif model_name == 'eca_nfnet_l0':
            final_in_features = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
            self.backbone.head.global_pool = nn.Identity()

        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on final_in_features.
            raise ValueError('Unsupported model_name: {}'.format(model_name))

        self.pooling =  nn.AdaptiveAvgPool2d(1)

        self.use_fc = use_fc

        self.dropout = nn.Dropout(p=0.0)
        self.fc = nn.Linear(final_in_features, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        final_in_features = fc_dim

        self.final = ArcMarginProduct(
            final_in_features,
            n_classes,
            scale = scale,
            margin = margin,
            easy_margin = False,
            ls_eps = 0.0
        )

    def _init_params(self):
        """Xavier-normal init for fc weights; constant init for fc bias and bn."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, image, label):
        """Return the embedding for ``image``; ``label`` is accepted but ignored."""
        feature = self.extract_feat(image)
        #logits = self.final(feature,label)
        return feature

    def extract_feat(self, x):
        """Run backbone and pooling, then (if ``use_fc``) dropout -> fc -> bn."""
        batch_size = x.shape[0]
        x = self.backbone(x)
        x = self.pooling(x).view(batch_size, -1)

        if self.use_fc:
            x = self.dropout(x)
            x = self.fc(x)
            x = self.bn(x)
        return x
    
    
class Mish_func(torch.autograd.Function):

    """Mish activation, x * tanh(softplus(x)), with a hand-written backward.

    Only the input is saved for backward; intermediates are recomputed there.
    from: https://github.com/tyunist/memory_efficient_mish_swish/blob/master/mish.py
    """

    @staticmethod
    def forward(ctx, i):
        result = i * torch.tanh(F.softplus(i))
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        # `saved_tensors` replaces the deprecated `saved_variables` accessor.
        (i,) = ctx.saved_tensors

        tanh_sp = torch.tanh(F.softplus(i))
        # d/dx tanh(softplus(x)) = (1 - tanh^2(softplus(x))) * sigmoid(x).
        # 1 - tanh^2 equals 1 / cosh^2 but avoids overflowing cosh for large inputs.
        grad_gx = (1.0 - tanh_sp * tanh_sp) * torch.sigmoid(i)

        # Product rule on x * tanh(softplus(x))
        grad_f = tanh_sp + i * grad_gx

        return grad_output * grad_f


class Mish(nn.Module):
    """Module wrapper around ``Mish_func``; keyword arguments are accepted and ignored."""

    def __init__(self, **kwargs):
        super(Mish, self).__init__()

    def forward(self, input_tensor):
        activated = Mish_func.apply(input_tensor)
        return activated


def replace_activations(model, existing_layer, new_layer):

    """Recursively swap every submodule of exact type ``existing_layer``.

    Args:
        model: module to modify in place.
        existing_layer: class to look for (exact type match, subclasses excluded).
        new_layer: module instance put in its place; the same instance is
            reused for every replacement.

    Returns:
        ``model`` itself, after modification.
    """

    # Iterate over a snapshot because entries are reassigned inside the loop.
    for name, module in list(model._modules.items()):
        # Slots registered as None have nothing to replace or recurse into.
        if module is None:
            continue

        if any(True for _ in module.children()):
            replace_activations(module, existing_layer, new_layer)

        if type(module) is existing_layer:
            model._modules[name] = new_layer
    return model


def get_image_embeddings(image_paths, model_name = CFG.model_name):
    """Compute image embeddings with the PyTorch ``ShopeeModel``.

    Redefines the TensorFlow function of the same name from earlier in the notebook.

    Args:
        image_paths: indexable array of image file paths.
        model_name: timm backbone name; for 'eca_nfnet_l0' the SiLU activations
            are replaced by Mish before the weights are loaded.

    Returns:
        numpy array with one embedding row per path.
    """
    embeds = []

    model = ShopeeModel(model_name = model_name)
    model.eval()

    if model_name == 'eca_nfnet_l0':
        model = replace_activations(model, torch.nn.SiLU, Mish())

    # Load straight onto the configured device so the checkpoint does not depend
    # on the device it was saved from.
    model.load_state_dict(torch.load(CFG.model_path, map_location = CFG.device))
    model = model.to(CFG.device)


    image_dataset = ShopeeDataset(image_paths=image_paths,transforms=get_test_transforms())
    image_loader = torch.utils.data.DataLoader(
        image_dataset,
        batch_size=CFG.batch_size,
        pin_memory=True,
        drop_last=False,
        num_workers=4
    )


    with torch.no_grad():
        for img,label in tqdm(image_loader):
            # Follow CFG.device (as the model does) instead of a hard-coded .cuda().
            img = img.to(CFG.device)
            label = label.to(CFG.device)
            feat = model(img,label)
            embeds.append(feat.detach().cpu().numpy())


    del model
    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings

In [None]:
# Reload the dataset with the read_dataset defined in the cell above and preview it.
df,df_cu,image_paths = read_dataset()
df.head()

In [None]:
# eca_nfnet_l0 embeddings; `.values` passes the paths as a numpy array.
image_embeddings1 = get_image_embeddings(image_paths.values)

In [None]:
def get_distances_cosine(df, embeddings, KNN = 50):
    """Return k-nearest-neighbour distances and indices using cosine distance.

    Args:
        df: unused; kept so existing callers keep working.
        embeddings: 2-D array with one row per item.
        KNN: maximum number of neighbours to return per row.

    Returns:
        (distances, indices) as returned by ``NearestNeighbors.kneighbors``.
    """
    # Never request more neighbours than there are rows. This covers the tiny
    # public test set (previously detected by re-reading test.csv on every call)
    # and any other input with fewer than KNN rows.
    n_neighbors = min(KNN, embeddings.shape[0])
    model = NearestNeighbors(n_neighbors = n_neighbors, metric = 'cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)
    del model
    _=gc.collect()
    return distances, indices

In [None]:
# Cosine nearest neighbours of the eca_nfnet_l0 embeddings.
image_distances1, image_indices1=get_distances_cosine(df, image_embeddings1, KNN = 50)

In [None]:
# image_embeddings_tot=np.concatenate((image_embeddings1,image_embeddings),axis=1)

In [None]:
# def get_image_predictions(df, embeddings,threshold0 = 0.0):
    
#     if len(df) > 3:
#         KNN = 50
#     else : 
#         KNN = 3

#     model = NearestNeighbors(n_neighbors = KNN, metric = 'cosine')
#     model.fit(embeddings)
#     distances, indices = model.kneighbors(embeddings)
    
    
#     if GET_CV:

#         thresholds = list(np.arange(0.2, 0.4, 0.01))

#         scores = []
#         for threshold in thresholds:
#             predictions = []
#             for k in range(embeddings.shape[0]):
#                 idx = np.where(distances[k,] < threshold)[0]
#                 ids = indices[k,idx]
#                 posting_ids = ' '.join(df['posting_id'].iloc[ids].values)
#                 predictions.append(posting_ids)
#             df['pred_matches'] = predictions
#             df['f1'] = f1_score(df['matches'], df['pred_matches'])
#             score = df['f1'].mean()
#             print(f'Our f1 score for threshold {threshold} is {score}')
#             scores.append(score)
#         thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
#         max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
#         best_threshold = max_score['thresholds'].values[0]
#         best_score = max_score['scores'].values[0]
#         print(f'Our best score is {best_score} and has a threshold {best_threshold}')
        
        
#     predictions = []
#     for k in tqdm(range(embeddings.shape[0])):
#         idx = np.where(distances[k,] < threshold0)[0]
#         ids = indices[k,idx]
#         posting_ids = df['posting_id'].iloc[ids].values
#         predictions.append(posting_ids)
        
#     del model, distances, indices
#     gc.collect()
#     return predictions

In [None]:
# image_predictions1 = get_image_predictions(df, image_embeddings_tot, threshold0 = 0.36)

# Test Configuration

In [None]:
# DataLoader settings for the text model.
# NOTE: BATCH_SIZE and SEED overwrite the values set in the TensorFlow configuration cell.
NUM_WORKERS = 4
BATCH_SIZE = 16
SEED = 42

device = torch.device('cuda')

################################################  ADJUSTING FOR CV OR SUBMIT ##############################################



test = pd.read_csv('../input/shopee-product-matching/test.csv')
# NOTE(review): GET_CV is only ever set to False here, so the message in the else
# branch is printed even when no CV score will be computed — confirm the intent.
if len(test)>3: GET_CV = False
else: print('this submission notebook will compute CV score, but commit notebook will not')


################################################# MODEL ####################################################################

# Local directory with the transformer weights; also used to load the tokenizer
transformer_model = '../input/sentence-transformer-models/paraphrase-xlm-r-multilingual-v1/0_Transformer'
TOKENIZER = transformers.AutoTokenizer.from_pretrained(transformer_model)

################################################ MODEL PATH ###############################################################

# Fine-tuned checkpoint loaded by get_text_embeddings
TEXT_MODEL_PATH = '../input/only-text-sbert-with-dict/sentence_transfomer_xlm_best_loss_num_epochs_20_arcface.bin'

# Keyword arguments for ShopeeNet(**model_params)
model_params = {
    'n_classes':11014,
    'model_name':transformer_model,
    'use_fc':False,
    'fc_dim':512,
    'dropout':0.3,
}

In [None]:
# Build the lower-cased Indonesian -> English replacement table used by read_dataset.
# Each line is split on single spaces: the first token, minus its first two and
# last two characters, is the key; the second token, minus its first character
# and last three characters, is the value.
# NOTE(review): only the second token is kept, so multi-word translations are
# truncated — confirm against the dictionary file format.
DICT = {}
# `with` guarantees the file is closed even if a malformed line raises.
with open('../input/indonesean-english-dicttxt/indonesean_english_dict.txt','r') as file:
    for line in file:
        parts = line.split(' ')
        k = parts[0][2:-2]
        v = parts[1][1:-3]

        DICT[str(k).lower()] = str(v).lower()

# Reading Data

In [None]:
def read_dataset():
    """Load train (CV) or test data with lower-cased, dictionary-translated titles.

    Every key of the global ``DICT`` is replaced in the ``title`` column via
    ``Series.str.replace``, in dictionary order. In CV mode a ``matches`` column
    (posting ids sharing the row's ``label_group``, space-joined) is added and,
    with ``CHECK_SUB``, the frame is doubled.

    Returns:
        (df, df_cu): the pandas frame and a cudf copy of it.
    """
    if GET_CV:
        csv_path = '../input/shopee-product-matching/train.csv'
    else:
        csv_path = '../input/shopee-product-matching/test.csv'

    df = pd.read_csv(csv_path)
    df["title"] = df["title"].str.lower()
    for key, replacement in DICT.items():
        df['title'] = df['title'].str.replace(key, replacement)

    if GET_CV:
        ids_by_group = df.groupby(['label_group'])['posting_id'].unique().to_dict()
        df['matches'] = df['label_group'].map(ids_by_group)
        df['matches'] = df['matches'].apply(' '.join)
        if CHECK_SUB:
            df = pd.concat([df, df], axis = 0).reset_index(drop = True)

    df_cu = cudf.DataFrame(df)
    return df, df_cu

# Utils

In [None]:
def seed_torch(seed=42):
    """Make runs repeatable by seeding every RNG used in the notebook.

    Seeds ``random``, NumPy and PyTorch (CPU and CUDA), sets ``PYTHONHASHSEED``
    in the environment and enables deterministic cuDNN.
    """
    # Python-level sources of randomness
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    # PyTorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
# Re-seed all RNGs with the seed configured for the text model.
seed_torch(SEED)

In [None]:
def f1_score(y_true, y_pred):
    """Per-row F1 score for Series of space-separated id strings.

    Redefines the function of the same name from earlier in the notebook with
    the same behaviour.

    Returns:
        numpy array of ``2 * |true & pred| / (|pred| + |true|)``, one value per row.
    """
    def to_id_set(ids):
        return set(ids.split())

    y_true = y_true.apply(to_id_set)
    y_pred = y_pred.apply(to_id_set)
    shared = np.array([len(pair[0] & pair[1]) for pair in zip(y_true, y_pred)])
    n_pred = y_pred.apply(len).values
    n_true = y_true.apply(len).values
    return 2 * shared / (n_pred + n_true)

# Generating Embeddings

In [None]:
class ShopeeDataset(Dataset):
    """Text dataset yielding tokenised titles.

    Redefines the image dataset class of the same name from earlier in the notebook.

    Args:
        csv: pandas DataFrame with a ``title`` column; stored after ``reset_index()``.
    """

    def __init__(self, csv):
        self.csv = csv.reset_index()

    def __len__(self):
        return len(self.csv)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask)`` for the title at position ``index``.

        The title is tokenised with the global ``TOKENIZER``, padded/truncated
        to 128 tokens.
        """
        title = self.csv.iloc[index].title
        encoded = TOKENIZER(title, padding='max_length', truncation=True, max_length=128, return_tensors="pt")
        # return_tensors="pt" adds a leading batch axis; take its only element.
        return encoded['input_ids'][0], encoded['attention_mask'][0]

In [None]:
class ShopeeNet(nn.Module):
    """Transformer text encoder that returns L2-normalised embeddings.

    Args:
        n_classes: accepted for compatibility with the training configuration;
            not used by this class.
        model_name: name or path passed to ``transformers.AutoModel.from_pretrained``.
        use_fc: if True, the first-token features go through dropout -> fc -> bn.
        fc_dim: output size of the ``fc`` layer (only used when ``use_fc``).
        dropout: dropout probability (only used when ``use_fc``).
    """

    def __init__(self,
                 n_classes,
                 model_name='bert-base-uncased',
                 use_fc=False,
                 fc_dim=512,
                 dropout=0.0):
        super().__init__()

        self.transformer = transformers.AutoModel.from_pretrained(model_name)
        hidden_size = self.transformer.config.hidden_size

        self.use_fc = use_fc
        if not use_fc:
            return

        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(hidden_size, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()

    def _init_params(self):
        """Xavier-normal init for fc weights; constant init for fc bias and bn."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, input_ids,attention_mask):
        """Return ``F.normalize`` applied to the output of ``extract_feat``."""
        return F.normalize(self.extract_feat(input_ids, attention_mask))

    def extract_feat(self, input_ids,attention_mask):
        """Return the first-token features, optionally projected by fc/bn."""
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)

        # First element of the model output, sliced at sequence position 0
        cls_features = outputs[0][:, 0, :]

        if self.use_fc:
            cls_features = self.bn(self.fc(self.dropout(cls_features)))

        return cls_features

# Generating Submission

In [None]:
def get_text_embeddings(df):
    """Compute ``ShopeeNet`` embeddings for the ``title`` column of ``df``.

    Args:
        df: pandas DataFrame with a ``title`` column.

    Returns:
        numpy array with one embedding row per title.
    """
    embeds = []

    model = ShopeeNet(**model_params)
    model.eval()

    # The last entry of the checkpoint is dropped before loading.
    # NOTE(review): presumably the ArcFace head weight, which ShopeeNet does not
    # define — confirm against the training code.
    # map_location keeps loading independent of the device the checkpoint was saved from.
    state_dict = torch.load(TEXT_MODEL_PATH, map_location = device)
    model.load_state_dict(dict(list(state_dict.items())[:-1]))
    model = model.to(device)

    text_dataset = ShopeeDataset(df)
    text_loader = torch.utils.data.DataLoader(
        text_dataset,
        batch_size=BATCH_SIZE,
        pin_memory=True,
        drop_last=False,
        num_workers=NUM_WORKERS
    )


    with torch.no_grad():
        for input_ids, attention_mask in tqdm(text_loader):
            # Follow the global `device` (as the model does) instead of a hard-coded .cuda().
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            feat = model(input_ids, attention_mask)
            embeds.append(feat.detach().cpu().numpy())


    del model
    text_embeddings = np.concatenate(embeds)
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    del embeds
    gc.collect()
    return text_embeddings

In [None]:
# Reload the data with dictionary-translated titles (read_dataset as redefined above).
df,df_cu = read_dataset()
df.head()

In [None]:
# Transformer (ShopeeNet) title embeddings.
text_embeddings1 = get_text_embeddings(df)

In [None]:
def get_text_distances(df, embeddings, KNN = 50):
    """Return cosine k-nearest-neighbour distances and indices for text embeddings.

    Args:
        df: unused; kept so existing callers keep working.
        embeddings: 2-D array with one row per item.
        KNN: maximum number of neighbours to return per row.

    Returns:
        (distances, indices) as returned by ``NearestNeighbors.kneighbors``.
    """
    # Never request more neighbours than there are rows. This covers the tiny
    # public test set (previously detected by re-reading test.csv on every call)
    # and any other input with fewer than KNN rows.
    n_neighbors = min(KNN, embeddings.shape[0])
    model = NearestNeighbors(n_neighbors = n_neighbors,metric='cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)
    del model
    _=gc.collect()
    return distances, indices

In [None]:
# Cosine nearest neighbours of the transformer title embeddings.
text_distances1, text_indices1=get_text_distances(df, text_embeddings1, KNN = 50)

# TF-IDF

In [None]:
def read_dataset():
    """Load train (CV) or test data with lower-cased titles for TF-IDF.

    No dictionary translation is applied to the titles here. In CV mode a
    ``matches`` column (posting ids sharing the row's ``label_group``,
    space-joined) is added and, with ``CHECK_SUB``, the frame is doubled.

    Returns:
        (df, df_cu): the pandas frame and a cudf copy of it.
    """
    if not GET_CV:
        df = pd.read_csv('../input/shopee-product-matching/test.csv')
        df["title"] = df["title"].str.lower()
        return df, cudf.DataFrame(df)

    df = pd.read_csv('../input/shopee-product-matching/train.csv')
    df["title"] = df["title"].str.lower()
    ids_by_group = df.groupby(['label_group'])['posting_id'].unique().to_dict()
    df['matches'] = df['label_group'].map(ids_by_group)
    df['matches'] = df['matches'].apply(lambda ids: ' '.join(ids))
    if CHECK_SUB:
        df = pd.concat([df, df], axis = 0).reset_index(drop = True)
    return df, cudf.DataFrame(df)




In [None]:
# Reload the data with lower-cased, untranslated titles (read_dataset as redefined above).
df, df_cu = read_dataset()

In [None]:
# Binary TF-IDF features of the titles, computed with cuML on the cudf frame.
# English stop words are removed.
model = TfidfVectorizer(stop_words='english',
                        binary=True)
text_embeddings = model.fit_transform(df_cu['title'])
# The vectorizer is no longer needed once the features are computed.
del model
gc.collect()

In [None]:
# Cosine nearest neighbours of the TF-IDF features, computed inline.
# NOTE(review): the get_text_distances(...) call a few cells below repeats this
# computation; `distances` / `indices` are not referenced in the part of the
# notebook visible here — confirm they are needed before removing this cell.
KNN=50
# Use 3 neighbours when test.csv has fewer than 4 rows and CV is off.
if len(pd.read_csv('../input/shopee-product-matching/test.csv'))<4 and GET_CV==False:
    KNN=3
model = NearestNeighbors(n_neighbors = KNN,metric='cosine')
model.fit(text_embeddings)
distances, indices = model.kneighbors(text_embeddings)
del model
gc.collect()

In [None]:
def get_text_distances(df, embeddings, KNN = 50):
    """Return cosine k-nearest-neighbour distances and indices for text features.

    Redefines the function of the same name from earlier in the notebook.

    Args:
        df: unused; kept so existing callers keep working.
        embeddings: 2-D (dense or sparse) matrix with one row per item.
        KNN: maximum number of neighbours to return per row.

    Returns:
        (distances, indices) as returned by ``NearestNeighbors.kneighbors``.
    """
    # Never request more neighbours than there are rows. This covers the tiny
    # public test set (previously detected by re-reading test.csv on every call)
    # and any other input with fewer than KNN rows.
    n_neighbors = min(KNN, embeddings.shape[0])
    model = NearestNeighbors(n_neighbors = n_neighbors,metric='cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)
    del model
    _=gc.collect()
    return distances, indices

In [None]:
# Cosine nearest neighbours of the TF-IDF title features.
text_distances, text_indices=get_text_distances(df, text_embeddings, KNN = 50)

In [None]:
# Sanity check: (number of titles, neighbours per title)
text_distances.shape

In [None]:
def _neighbour_ids(indices_row, mask):
    """Return, as a set of ints, the entries of `indices_row` where `mask` is true."""
    return set(indices_row[np.where(mask)[0]].tolist())


def _row_match_indices(img_d, img_i, img_d1, img_i1, txt_d, txt_i, txt_d1, txt_i1):
    """Combine one row's image/text neighbour lists into a sorted array of row indices.

    Each `*_d` argument is one row of a distance matrix and the matching `*_i`
    argument is the same row of the neighbour-index matrix.
    """
    # Accepted on the strength of a single model.
    image_set = _neighbour_ids(img_i, img_d < 3.0)
    image1_set = _neighbour_ids(img_i1, img_d1 < 0.27)
    text_set0 = _neighbour_ids(txt_i1, txt_d1 <= 0.13)
    text_set = _neighbour_ids(txt_i, txt_d <= 0.2)

    # Borderline bands: only accepted when other models agree (see below).
    image2_set = _neighbour_ids(img_i, (img_d < 4.0) & (img_d >= 3.4))
    image3_set = _neighbour_ids(img_i1, (img_d1 < 0.33) & (img_d1 >= 0.27))
    image4_set = _neighbour_ids(img_i, (img_d < 3.4) & (img_d >= 3.0))
    image5_set = _neighbour_ids(img_i1, img_d1 < 0.5)
    text_set1 = _neighbour_ids(txt_i, (txt_d > 0.2) & (txt_d < 0.38))
    text_set2 = _neighbour_ids(txt_i, (txt_d > 0.2) & (txt_d < 0.5))
    text_set3 = _neighbour_ids(txt_i1, txt_d1 <= 0.5)

    # Weak image evidence from either image model, confirmed by the second image
    # model (< 0.5) and by BOTH text models.
    image2_set = (image2_set | image3_set) & image5_set
    set1 = image2_set & text_set1 & text_set3
    # Medium image evidence from the first image model, confirmed by EITHER text model.
    set2 = image4_set & (text_set2 | text_set3)

    total_set = image_set | text_set | image1_set | set1 | set2 | text_set0
    # Explicit integer dtype: an empty selection would otherwise become a float64
    # array, which cannot be used for positional indexing.
    return np.array(sorted(total_set), dtype=np.int64)


def get_neighbors(df, image_distances, image_indices, image_distances1, image_indices1, text_distances, text_indices, text_distances1, text_indices1):
    """Predict, for every row, the posting ids judged to be the same product.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `posting_id`; when the global `GET_CV` is truthy it must
        also contain the ground-truth `matches` column.
    image_distances, image_indices
        KNN output of the first image model (thresholds 3.0 / 3.4 / 4.0).
    image_distances1, image_indices1
        KNN output of the second image model (thresholds 0.27 / 0.33 / 0.5).
    text_distances, text_indices
        KNN output of the first text model (thresholds 0.2 / 0.38 / 0.5). These
        are copied to host memory with `cupy.asnumpy`, so CuPy or NumPy arrays
        are accepted.
    text_distances1, text_indices1
        KNN output of the second text model (thresholds 0.13 / 0.5).

    All index matrices hold row positions into `df`, and row k of every matrix
    describes row k of `df`.

    Returns
    -------
    (df, predictions)
        `predictions[k]` is an array of posting ids for row k, ordered by row
        position. When `GET_CV` is truthy, `df` additionally gains the columns
        `pred_matches` (space-joined ids) and `f1`, and the mean F1 is printed.
    """
    # One device-to-host copy up front instead of several small GPU kernel
    # launches and transfers per row.
    text_distances = cupy.asnumpy(text_distances)
    text_indices = cupy.asnumpy(text_indices)
    # Positional lookup table; equivalent to df['posting_id'].iloc[idx].values per row.
    posting_id_values = df['posting_id'].values

    predictions = []
    for k in range(image_distances.shape[0]):
        tot_idx = _row_match_indices(
            image_distances[k], image_indices[k],
            image_distances1[k], image_indices1[k],
            text_distances[k], text_indices[k],
            text_distances1[k], text_indices1[k],
        )
        predictions.append(posting_id_values[tot_idx])

    if GET_CV:
        # Score the combined prediction against the ground truth.
        df['pred_matches'] = [' '.join(ids) for ids in predictions]
        df['f1'] = f1_score(df['matches'], df['pred_matches'])
        score = df['f1'].mean()
        print(f'Our f1 score for threshold 3.0 is {score}')

    gc.collect()
    return df, predictions

In [None]:
 # Fuse both image-model and both text-model neighbour lists into per-row predictions.
 # In CV mode the returned frame also carries the `pred_matches` and `f1` columns.
 df,image_text_predictions=get_neighbors(df,  image_distances, image_indices, image_distances1, image_indices1, text_distances, text_indices,text_distances1, text_indices1)

In [None]:
def combine_predictions(row):
    """Collapse a row's predicted posting ids into one submission string.

    Reads `row['image_text_predictions']`, removes duplicates with `np.unique`
    (which also sorts the ids) and joins the result with single spaces.
    """
    unique_ids = np.unique(row['image_text_predictions'])
    return ' '.join(unique_ids)



In [None]:
# Both modes start from the same per-row prediction arrays.
df["image_text_predictions"] = image_text_predictions

if not GET_CV:
    # Inference: the de-duplicated ids go straight into the `matches` column.
    df['matches'] = df.apply(combine_predictions, axis=1)
    df[['posting_id', 'matches']].to_csv('submission.csv', index=False)
else:
    # CV: `matches` already holds the ground truth, so score against it and
    # write the predictions out under `matches0` instead.
    df['pred_matches'] = df.apply(combine_predictions, axis=1)
    df['f1'] = f1_score(df['matches'], df['pred_matches'])
    score = df['f1'].mean()
    print(f'Our final f1 cv score is {score}')
    df['matches0'] = df['pred_matches']
    df[['posting_id', 'matches0']].to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the final frame, including the prediction columns added above.
df.head()