In [None]:
# True: load train.csv and print F1 scores; False: load test.csv for inference.
COMPUTE_CV = False

import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

import numpy as np 
import pandas as pd 

import math
import random 
import os 
import cv2
import timm

from tqdm import tqdm 

import albumentations as A 
from albumentations.pytorch.transforms import ToTensorV2

import torch 
from torch.utils.data import Dataset 
from torch import nn
import torch.nn.functional as F 
import transformers
from transformers import (BertTokenizer, BertModel,
                          DistilBertTokenizer, DistilBertModel)
from transformers import AutoTokenizer, AutoModel, BertTokenizer
torch.cuda.empty_cache()

import gc
import matplotlib.pyplot as plt
import cudf
import cuml
import cupy
from cuml import PCA
from cuml.feature_extraction.text import TfidfVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
# Fetch the WordNet corpus through nltk's downloader.
nltk.download('wordnet')
# NOTE(review): `stemmer` does not appear to be referenced in the visible notebook — confirm before removing.
stemmer = SnowballStemmer('english')

# 0. Config

In [None]:
class CFG:
    """Notebook-wide settings shared by the data loaders and the image models."""

    # Reproducibility and hardware
    seed = 2020
    device = 'cuda'

    # Image pipeline
    img_size = 512
    batch_size = 50  # 12

    # ArcFace head (used as ImgNetA defaults)
    classes = 11014
    scale = 30
    margin = 0.5

def seed_torch(seed=42):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic kernels.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may pick different kernels between runs; switch it off.
    torch.backends.cudnn.benchmark = False
    
seed_torch(CFG.seed)

## F1 score

In [None]:
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of space-separated id strings.

    Args:
        y_true: Series of ground-truth id strings.
        y_pred: Series of predicted id strings, aligned with ``y_true``.

    Returns:
        NumPy array with one F1 value per row: ``2 * |true & pred| / (|true| + |pred|)``.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))

    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    pred_sizes = pred_sets.apply(lambda s: len(s)).values
    true_sizes = true_sets.apply(lambda s: len(s)).values

    return 2 * overlap / (pred_sizes + true_sizes)

# 1. Dataset Loader

In [None]:
def read_dataset(check=False):
    """Load the postings table selected by ``COMPUTE_CV``.

    Args:
        check: Only used when ``COMPUTE_CV`` is true; duplicates the training
            rows (``pd.concat([df]*2)``) before targets are built.

    Returns:
        ``(df, df_cu, image_paths)``: the pandas frame, a cuDF copy of it and
        a Series of image file paths. In CV mode ``df`` also carries a
        ``target`` column listing the posting_ids of each row's label group.
    """
    if not COMPUTE_CV:
        df = pd.read_csv("../input/shopee-product-matching/test.csv")
        df_cu = cudf.DataFrame(df)
        image_paths = '../input/shopee-product-matching/test_images/' + df['image']
        return df, df_cu, image_paths

    df = pd.read_csv("../input/shopee-product-matching/train.csv")
    if check:
        df = pd.concat([df]*2).reset_index()

    # label_group -> unique posting_ids, joined into one space-separated string per row
    group_members = df.groupby('label_group').posting_id.agg('unique').to_dict()
    df['target'] = df.label_group.map(group_members)
    df['target'] = df['target'].apply(lambda ids: ' '.join(ids))

    df_cu = cudf.DataFrame(df)
    image_paths = '../input/shopee-product-matching/train_images/' + df['image']
    return df, df_cu, image_paths

## 1.1. Image Loader

In [None]:
class ImgDataset(Dataset):
    """Dataset that reads images from disk and applies an optional transform.

    Args:
        image_paths: Indexable collection of image file paths (NumPy array,
            list, ...).
        transforms: Optional callable invoked as ``transforms(image=img)`` that
            returns a mapping with an ``'image'`` entry.
    """

    def __init__(self, image_paths, transforms=None):
        self.image_paths = image_paths
        self.augmentations = transforms

    def __len__(self):
        # len() works for arrays and plain sequences alike.
        return len(self.image_paths)

    def __getitem__(self, index):
        """Return ``(image, dummy_label)`` for the path at ``index``.

        Raises:
            FileNotFoundError: If OpenCV cannot read the file.
        """
        image_path = self.image_paths[index]

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image: {image_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.augmentations:
            image = self.augmentations(image=image)['image']

        # Constant placeholder label; callers pass it through to the model.
        return image, torch.tensor(1)

## 1.2. Text Loader

In [None]:
class TextDataset(Dataset):
    """Tokenises one title per item with the module-level ``TOKENIZER``.

    Args:
        csv: DataFrame with a ``title`` column. Its index is reset on
            construction.
    """

    def __init__(self, csv):
        self.csv = csv.reset_index()

    def __len__(self):
        return len(self.csv)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask)`` padded/truncated to 128 tokens."""
        title = self.csv.iloc[index].title
        encoded = TOKENIZER(title, padding='max_length', truncation=True, max_length=128, return_tensors="pt")
        # return_tensors="pt" yields a batch of one; take its only row.
        return encoded['input_ids'][0], encoded['attention_mask'][0]

## 1.3. Text Loader for Indonesian Bert

In [None]:
TEXT_MODEL = "../input/distilbert-base-indonesian"
MAX_LEN = 32 # Maximum length of text
EMBED_DIM = 768

class TextDatasetB(Dataset):
    """Titles tokenised up front for the Indonesian DistilBERT encoder.

    Every title is tokenised once in ``__init__``; ``__getitem__`` only
    converts the cached encodings of one row to tensors.

    Args:
        df: DataFrame with a ``title`` column; values are coerced with ``str``.
        tokenizer: Tokenizer callable. When ``None`` (default) the DistilBERT
            tokenizer stored at ``TEXT_MODEL`` is loaded at construction time
            rather than when the class is defined.
        max_length: Truncation length passed to the tokenizer.
    """

    def __init__(self, df, tokenizer=None, max_length=MAX_LEN):
        if tokenizer is None:
            # Loading here keeps class definition free of disk I/O.
            tokenizer = DistilBertTokenizer.from_pretrained(TEXT_MODEL)

        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

        texts = [str(title) for title in df['title']]
        # padding=True pads every title to the longest one in this call.
        self.encodings = tokenizer(texts,
                                   padding=True,
                                   truncation=True,
                                   max_length=max_length)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return a dict of tensors (one per tokenizer output key) for row ``idx``."""
        return {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}

## 1.4. CLIP Loader

In [None]:
%%capture
# Offline CLIP setup: stage the CLIP sources on the import path and install ftfy from a bundled copy.

import sys
# Work on a copy under /tmp (presumably because ../input is read-only — confirm).
!cp -r ../input/openai-clip/CLIP/CLIP-main /tmp/

# Kaggle likes to unpack .gz files in datasets... so we have to pack it back
!gzip -c /tmp/CLIP-main/clip/bpe_simple_vocab_16e6.txt > /tmp/CLIP-main/clip/bpe_simple_vocab_16e6.txt.gz
sys.path.append('/tmp/CLIP-main')

# ftfy is installed from the dataset directory rather than from an index.
!pip install ../input/openai-clip/ftfy-5.9/ftfy-5.9
# %%capture swallows this cell's output, so the print below is not displayed.
print("ftfy")

In [None]:
import clip
from PIL import Image
from clip.simple_tokenizer import SimpleTokenizer
# Only the preprocessing transform is kept; the model returned by clip.load is discarded.
_, preprocess = clip.load("../input/openai-clip/ViT-B-32.pt", device=CFG.device, jit=False)
# Tokenizer whose `encoder` table and `encode` method are used by `tokenize`.
_tokenizer = SimpleTokenizer()

# Copied from https://github.com/openai/CLIP/blob/beba48f35392a73c6c47ae67ddffced81ad1916d/clip/clip.py#L164
# but over-long inputs are truncated instead of raising.
def tokenize(texts, context_length: int = 77) -> torch.LongTensor:
    """Encode one or more strings into a zero-padded token matrix.

    Args:
        texts: A single string or a list of strings.
        context_length: Number of columns in the returned tensor.

    Returns:
        ``LongTensor`` of shape ``(len(texts), context_length)``. Sequences
        longer than ``context_length`` are cut, and their last slot is
        overwritten with the sequence's final token (the end-of-text id).
    """
    batch = [texts] if isinstance(texts, str) else texts

    start_id = _tokenizer.encoder["<|startoftext|>"]
    end_id = _tokenizer.encoder["<|endoftext|>"]
    encoded = [[start_id] + _tokenizer.encode(item) + [end_id] for item in batch]

    out = torch.zeros(len(encoded), context_length, dtype=torch.long)
    for row, ids in enumerate(encoded):
        kept = min(len(ids), context_length)
        out[row, :kept] = torch.tensor(ids)[:kept]
        if len(ids) > context_length:
            # Keep the terminator even though the text was truncated.
            out[row, -1] = ids[-1]

    return out

# `re` is imported here explicitly: the notebook's own `import re` sits in a later
# cell, so on a fresh top-to-bottom run this cell otherwise relies on the name
# leaking in from an earlier star import.
import re

# Matches a literal backslash-x followed by a run of letters, digits, '.' or '/',
# i.e. escaped byte sequences that appear as plain text inside a title.
RE_EMOJI = re.compile(r"\\x[A-Za-z0-9./]+", flags=re.UNICODE)

def strip_emoji(text):
    """Remove literal ``\\x..`` escape sequences from ``text``.

    Args:
        text: Title to clean; non-string values are coerced with ``str``.

    Returns:
        The text with every match of ``RE_EMOJI`` deleted.
    """
    return RE_EMOJI.sub(r'', str(text))

class CLIPDataset(Dataset):
    """Pairs each posting's preprocessed image with its tokenised title.

    Args:
        df: DataFrame with ``image`` and ``title`` columns, optionally
            ``label_group``.
        images_path: Directory that the ``image`` file names are relative to.
    """

    def __init__(self, df, images_path):
        super().__init__()
        self.df = df
        self.images_path = images_path
        # Labels are only returned when the frame carries them.
        self.has_target = ('label_group' in df)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, tokens, label)``; ``label`` is 0 when no target column exists."""
        record = self.df.iloc[idx]
        image = preprocess(Image.open(self.images_path + '/' +  record['image']))
        text = tokenize([strip_emoji(record['title'])])[0]

        label = record['label_group'] if self.has_target else 0
        return image, text, label

## 1.5. Word2Vec Loader

In [None]:
import re
import gensim
from gensim.models import Word2Vec
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Runs of word characters.
TOKEN_RE = re.compile(r'[\w]+')
def tokenize_text_simple_regex(txt, min_token_size=2):
    """Lower-case ``txt``, split it into word tokens and lemmatise them as verbs.

    Args:
        txt: Any value; converted with ``str`` first.
        min_token_size: Tokens shorter than this are dropped before lemmatising.

    Returns:
        List of lemmatised tokens.
    """
    lowered = str(txt).lower()
    long_enough = (tok for tok in TOKEN_RE.findall(lowered) if len(tok) >= min_token_size)
    return [wordnet_lemmatizer.lemmatize(tok, pos="v") for tok in long_enough]

def tokenize_corpus(texts, tokenizer=tokenize_text_simple_regex, **tokenizer_kwargs):
    """Apply ``tokenizer(text, **tokenizer_kwargs)`` to every text and return the results as a list."""
    tokenized = []
    for text in texts:
        tokenized.append(tokenizer(text, **tokenizer_kwargs))
    return tokenized

# 2. Model

## 2.1. Bert_paraphrase-xlm-r-multilingual-v1/0_Transformer

In [None]:
# Config
# Local transformer directory; used both for the tokenizer and as TextNetA's backbone.
transformer_model = '../input/sentence-transformer-models/paraphrase-xlm-r-multilingual-v1/0_Transformer'
TOKENIZER = transformers.AutoTokenizer.from_pretrained(transformer_model)

# Checkpoint loaded into TextNetA by get_bert_embeddings.
TEXT_MODEL_PATH = '../input/best-multilingual-model/sentence_transfomer_xlm_best_loss_num_epochs_25_arcface.bin'

# Keyword arguments for TextNetA(**model_params).
model_params = {
    'n_classes':11014,
    'model_name':transformer_model,
    'use_fc':False,
    'fc_dim':512,
    'dropout':0.3,
}

class TextNetA(nn.Module):
    """Transformer text encoder that returns L2-normalised first-token features."""

    def __init__(self,
                 n_classes,
                 model_name='bert-base-uncased',
                 use_fc=False,
                 fc_dim=512,
                 dropout=0.0):
        """
        :param n_classes: accepted for checkpoint/config compatibility; not used by this class
        :param model_name: name or path handed to ``transformers.AutoModel.from_pretrained``
        :param use_fc: when true, add dropout -> linear -> batch-norm on top of the first-token feature
        :param fc_dim: output width of the optional linear layer
        :param dropout: dropout probability of the optional head
        """
        super().__init__()

        self.transformer = transformers.AutoModel.from_pretrained(model_name)
        hidden_size = self.transformer.config.hidden_size

        self.use_fc = use_fc
        if use_fc:
            self.dropout = nn.Dropout(p=dropout)
            self.fc = nn.Linear(hidden_size, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self._init_params()

    def _init_params(self):
        """Xavier-normal weights and zero bias for ``fc``; identity affine for ``bn``."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, input_ids,attention_mask):
        """Return ``extract_feat`` output normalised with ``F.normalize``."""
        return F.normalize(self.extract_feat(input_ids, attention_mask))

    def extract_feat(self, input_ids,attention_mask):
        """Return the first-token vector of the transformer's first output, optionally passed through the fc head."""
        outputs = self.transformer(input_ids=input_ids,attention_mask=attention_mask)

        # First output, first sequence position.
        features = outputs[0][:, 0, :]

        if not self.use_fc:
            return features

        features = self.dropout(features)
        features = self.fc(features)
        return self.bn(features)

## 2.2. Bert_Indonesian

In [None]:
class TextNetB(nn.Module):
    """DistilBERT encoder (loaded from ``TEXT_MODEL``) that returns the first-token hidden state."""

    def __init__(self):
        super().__init__()
        self.bert_model = DistilBertModel.from_pretrained(TEXT_MODEL)

    def forward(self, batch):
        """Encode a batch dict with ``input_ids`` and ``attention_mask``.

        Returns:
            Tensor of shape ``(batch_size, hidden_dim)``: the hidden state at
            sequence position 0 (the CLS token).
        """
        encoded = self.bert_model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
        )
        # last_hidden_state: (batch_size, seq_length, hidden_dim)
        return encoded.last_hidden_state[:, 0, :]

## 2.3. Image Model: efficientnet/eca_nfnet_l0

In [None]:
class ArcMarginProduct(nn.Module):
    """ArcFace head: cosine logits with an additive angular margin on the target class.

    Args:
        in_features: Size of the input embedding.
        out_features: Number of classes.
        scale: Factor the final logits are multiplied by.
        margin: Angular margin (radians) added to the target-class angle.
        easy_margin: Use the simplified ``cosine > 0`` switch instead of the
            ``cos(pi - margin)`` threshold.
        ls_eps: Label-smoothing factor applied to the one-hot targets.
    """

    def __init__(self, in_features, out_features, scale=30.0, margin=0.50, easy_margin=False, ls_eps=0.0):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        self.ls_eps = ls_eps  # label smoothing
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        self.th = math.cos(math.pi - margin)
        self.mm = math.sin(math.pi - margin) * margin

    def forward(self, input, label):
        """Return scaled logits of shape ``(batch, out_features)``.

        Args:
            input: Embeddings of shape ``(batch, in_features)``.
            label: Integer class ids, one per row, on the same device as ``input``.
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # Rounding can push cosine^2 slightly above 1; clamp so sqrt never yields NaN.
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(min=0.0))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Allocate on the logits' device so the head works on CPU as well as CUDA.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # Margin logit for the target class, plain cosine for all others.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale

        return output

class ImgNetA(nn.Module):
    """timm backbone followed by average pooling and an optional fc + batch-norm neck.

    The classifier of the backbone is replaced by ``nn.Identity`` so the
    backbone returns a feature map. An ``ArcMarginProduct`` head is built as
    ``self.final`` but ``forward`` returns the embedding only.

    Args:
        n_classes: Number of classes of the ArcFace head.
        model_name: timm model name. ``'resnext50_32x4d'`` and names containing
            ``'efficientnet'`` are handled explicitly; every other backbone
            must expose ``head.fc`` (as the nfnet models used here do).
        fc_dim: Width of the fc neck and of the returned embedding when
            ``use_fc`` is true.
        margin: ArcFace margin.
        scale: ArcFace scale.
        use_fc: Apply dropout -> fc -> batch-norm in ``extract_feat``.
        pretrained: Forwarded to ``timm.create_model``.
    """

    def __init__(
        self,
        n_classes = CFG.classes,
        model_name = None,
        fc_dim = 512,
        margin = CFG.margin,
        scale = CFG.scale,
        use_fc = True,
        pretrained = False):


        super(ImgNetA,self).__init__()
        print('Building Model Backbone for {} model'.format(model_name))

        self.backbone = timm.create_model(model_name, pretrained=pretrained)

        if model_name == 'resnext50_32x4d':
            final_in_features = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif 'efficientnet' in model_name:
            final_in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        else:
            # The previous test (`model_name == 'eca_nfnet_l0' or 'eca_nfnet_l1'`)
            # was always truthy, i.e. this has always been the fallback branch.
            # Make that explicit and fail clearly for backbones without `head.fc`.
            head = getattr(self.backbone, 'head', None)
            if head is None or not hasattr(head, 'fc'):
                raise ValueError(
                    'Unsupported backbone {!r}: expected a `head.fc` classifier'.format(model_name))
            final_in_features = head.fc.in_features
            head.fc = nn.Identity()
            head.global_pool = nn.Identity()

        self.pooling =  nn.AdaptiveAvgPool2d(1)

        self.use_fc = use_fc

        # The neck is always created (regardless of use_fc) so its parameters exist in the state dict.
        self.dropout = nn.Dropout(p=0.0)
        self.fc = nn.Linear(final_in_features, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        final_in_features = fc_dim

        self.final = ArcMarginProduct(
            final_in_features,
            n_classes,
            scale = scale,
            margin = margin,
            easy_margin = False,
            ls_eps = 0.0
        )

    def _init_params(self):
        """Xavier-normal weights and zero bias for ``fc``; identity affine for ``bn``."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, image, label):
        """Return the embedding of ``image``; ``label`` is accepted but unused."""
        feature = self.extract_feat(image)
        #logits = self.final(feature,label)
        return feature

    def extract_feat(self, x):
        """Backbone -> global average pool -> flatten -> optional neck."""
        batch_size = x.shape[0]
        x = self.backbone(x)
        x = self.pooling(x).view(batch_size, -1)

        if self.use_fc:
            x = self.dropout(x)
            x = self.fc(x)
            x = self.bn(x)
        return x

# Mish function
class Mish_func(torch.autograd.Function):

    """Mish activation, ``x * tanh(softplus(x))``, with a hand-written backward pass.

    Only the input is saved for backward; intermediates are recomputed.

    from: https://github.com/tyunist/memory_efficient_mish_swish/blob/master/mish.py
    """

    @staticmethod
    def forward(ctx, i):
        result = i * torch.tanh(F.softplus(i))
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        # `saved_tensors` is the supported accessor; `saved_variables` is a deprecated alias.
        i = ctx.saved_tensors[0]

        v = 1. + i.exp()
        h = v.log()
        # d tanh(h) / dh = 1 / cosh(h)^2
        grad_gh = 1./h.cosh().pow_(2)

        # d softplus(x) / dx = sigmoid(x)
        grad_hx = i.sigmoid()

        grad_gx = grad_gh *  grad_hx

        # Product rule: d/dx [x * tanh(softplus(x))]
        grad_f =  torch.tanh(F.softplus(i)) + i * grad_gx

        return grad_output * grad_f


class Mish(nn.Module):
    """Module wrapper around ``Mish_func``; keyword arguments are accepted and ignored."""

    def __init__(self, **kwargs):
        super().__init__()

    def forward(self, input_tensor):
        return Mish_func.apply(input_tensor)


def replace_activations(model, existing_layer, new_layer):

    """Recursively swap every sub-module of type ``existing_layer`` for ``new_layer``.

    Args:
        model: Module that is modified in place.
        existing_layer: Module class to look for (exact type match).
        new_layer: Module instance installed at every matching position; the
            same instance is shared by all of them.

    Returns:
        ``model`` itself.
    """

    # Iterate over a snapshot so entries can be reassigned safely.
    for name, module in list(model._modules.items()):
        if module is None:
            # Modules may register None placeholders; nothing to replace there.
            continue

        if len(list(module.children())) > 0:
            model._modules[name] = replace_activations(module, existing_layer, new_layer)

        if type(module) == existing_layer:
            model._modules[name] = new_layer
    return model

### 2.3.1. Img Ensemble

In [None]:
def get_model(model_name = None, model_path = None, n_classes = None):
    """Build an ``ImgNetA``, load its checkpoint and move it to ``CFG.device`` in eval mode.

    Args:
        model_name: timm backbone name passed to ``ImgNetA``.
        model_path: Path of the state dict to load.
        n_classes: Unused; kept for interface compatibility.

    Returns:
        The loaded model.
    """

    model = ImgNetA(model_name = model_name)
    # The earlier test (`model_name == 'eca_nfnet_l0' or 'eca_nfnet_l1'`) was always
    # truthy, so SiLU was replaced by Mish in every backbone, including the
    # EfficientNet one. Only the nfnet checkpoints are named "(mish)".
    # NOTE(review): confirm the EfficientNet checkpoint was trained with its
    # default activation; downstream thresholds may need re-tuning.
    if model_name in ('eca_nfnet_l0', 'eca_nfnet_l1'):
        model = replace_activations(model, torch.nn.SiLU, Mish())
    model.eval()
    # map_location lets the checkpoint load wherever it was originally saved from.
    model.load_state_dict(torch.load(model_path, map_location=CFG.device))
    model = model.to(CFG.device)
    return model

class EnsembleModel(nn.Module):
    """Three image encoders evaluated on the same batch.

    ``forward`` returns two embeddings: the mean of the nfnet-l0 and
    EfficientNet-b5 features, and the nfnet-l1 features on their own.
    """

    def __init__(self):
        super().__init__()

        self.m1 = get_model('eca_nfnet_l0','../input/shopee-pytorch-models/arcface_512x512_nfnet_l0 (mish).pt')
        self.m2 = get_model('tf_efficientnet_b5_ns','../input/shopee-pytorch-models/arcface_512x512_eff_b5_.pt')
        self.m3 = get_model('eca_nfnet_l1','../input/my-embeddings/arcface_512x512_nfnet_l1(mish)_15.pt')

    def forward(self,img,label):
        nfnet_l0 = self.m1(img,label)
        effnet_b5 = self.m2(img,label)
        nfnet_l1 = self.m3(img,label)

        averaged = (nfnet_l0 + effnet_b5) / 2
        return averaged, nfnet_l1
#         return feat1, feat2

# class EnsembleModelConcat(nn.Module):
    
#     def __init__(self):
#         super(EnsembleModelConcat,self).__init__()
        
#         self.m1 = get_model('eca_nfnet_l0','../input/shopee-pytorch-models/arcface_512x512_nfnet_l0 (mish).pt')
#         self.m2 = get_model('tf_efficientnet_b5_ns','../input/shopee-pytorch-models/arcface_512x512_eff_b5_.pt')
    
#     def l2_norm(self, input, axit=1):
#         norm = torch.norm(input,2,axit,True)
#         output = torch.div(input, norm)
#         return output
        
#     def forward(self,img,label):
        
#         feat1 = self.m1(img,label)
#         feat2 = self.m2(img,label)
        
#         feat1_l2 = self.l2_norm(feat1)
#         feat2_l2 = self.l2_norm(feat2)
        
#         feat_l2 = self.l2_norm(torch.cat([feat1_l2, feat2_l2], axis=1))
    
#         return feat_l2

## 2.4. Word2Vec Model

In [None]:
def Word2VecModel(VECTOR_SIZE=128):
    """Train a skip-gram Word2Vec model on the titles of test.csv.

    Args:
        VECTOR_SIZE: Dimensionality of the word vectors.

    Returns:
        The trained gensim ``Word2Vec`` model.
    """
    test = pd.read_csv("../input/shopee-product-matching/test.csv")
    sentences = tokenize_corpus(list(test['title']))

    hyperparams = dict(
        vector_size=VECTOR_SIZE,
        window=15,
        min_count=1,
        sg=1,  # skip-gram
        negative=10,
        epochs=25,
        seed=42,
        workers=10,
    )
    model = Word2Vec(sentences=sentences, **hyperparams)
    del sentences

    return model

# 3. Get Embedding

## 3.1. Image Embedding

In [None]:
def get_img_transforms():
    """Inference transform: resize to ``CFG.img_size`` square, normalise, convert to a tensor."""
    steps = [
        A.Resize(CFG.img_size,CFG.img_size,always_apply=True),
        A.Normalize(),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps)

def get_image_embeddings(image_paths, model_name = None, model_path = None):
    """Run the image ensemble over all images and collect both embedding sets.

    Args:
        image_paths: Image file paths handed to ``ImgDataset``.
        model_name: Unused.
        model_path: Unused.

    Returns:
        ``(image_embedding1, image_embedding2)`` as NumPy arrays: the two
        outputs of ``EnsembleModel.forward`` concatenated over all batches.
    """
    model = EnsembleModel()

    image_loader = torch.utils.data.DataLoader(
        ImgDataset(image_paths=image_paths,transforms=get_img_transforms()),
        batch_size=CFG.batch_size,
        pin_memory=True,
        drop_last=False,
        num_workers=4
    )

    first_batches, second_batches = [], []
    with torch.no_grad():
        for img,label in tqdm(image_loader):
            first, second = model(img.cuda(), label.cuda())
            first_batches.append(first.detach().cpu().numpy())
            second_batches.append(second.detach().cpu().numpy())

    del model
    image_embedding1 = np.concatenate(first_batches)
    image_embedding2 = np.concatenate(second_batches)
    print(f'Our image embeddings shape is {image_embedding1.shape}')
    del first_batches, second_batches
    gc.collect()
    return image_embedding1, image_embedding2

## 3.2. Bert Embedding

In [None]:
NUM_WORKERS = 4
SEED = 42

def get_bert_embeddings(df):
    """Embed every title in ``df`` with ``TextNetA``.

    The checkpoint at ``TEXT_MODEL_PATH`` is loaded with its last entry
    dropped before being passed to ``load_state_dict``.

    Returns:
        NumPy array with one embedding row per title.
    """
    model = TextNetA(**model_params)
    model.eval()

    checkpoint = torch.load(TEXT_MODEL_PATH)
    # Everything except the final key/value pair of the saved state.
    backbone_state = dict(list(checkpoint.items())[:-1])
    model.load_state_dict(backbone_state)
    model = model.to(CFG.device)

    text_loader = torch.utils.data.DataLoader(
        TextDataset(df),
        batch_size=CFG.batch_size,
        pin_memory=True,
        drop_last=False,
        num_workers=NUM_WORKERS
    )

    batches = []
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(text_loader):
            feat = model(input_ids.cuda(), attention_mask.cuda())
            batches.append(feat.detach().cpu().numpy())

    del model
    text_embeddings = np.concatenate(batches)
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    del batches
    gc.collect()
    return text_embeddings

## 3.3. Bert Embedding for Indonesian

In [None]:
model_path = '../input/shopee-pytorch-models/arcface_distilbert_model_512.pt'

def get_bertB_embeddings(df):
    """Embed every title in ``df`` with ``TextNetB``.

    The checkpoint at ``model_path`` is loaded non-strictly, so keys that do
    not match the encoder are ignored.

    Returns:
        NumPy array of half-precision embeddings, one row per title.
    """
    model = TextNetB()
    model.eval()
    state = torch.load(model_path, map_location=torch.device('cpu'))
    model.load_state_dict(state, strict=False)
    model = model.to(CFG.device)

    text_loader = torch.utils.data.DataLoader(
        TextDatasetB(df),
        batch_size=CFG.batch_size,
        pin_memory=True,
        drop_last=False,
        num_workers=4
    )

    batches = []
    with torch.no_grad():
        for data in tqdm(text_loader):
            on_device = {key: tensor.to(CFG.device) for key, tensor in data.items()}
            batches.append(model(on_device).half())

    text_embeddings = torch.cat(batches, dim=0).detach().cpu().numpy()
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    return text_embeddings

## 3.4. CLIP Embedding

In [None]:
def get_clip_embeddings(df, images_path):
    """Compute joint image+text CLIP embeddings for every posting.

    Args:
        df: Postings frame consumed by ``CLIPDataset``.
        images_path: Directory containing the images named in ``df``.

    Returns:
        ``float32`` array of shape ``(len(df), 1024)``: image features in the
        first 512 columns, text features in the last 512, each row
        L2-normalised as a whole.
    """
    embed_dim = 512
    ds = CLIPDataset(df, images_path)
    dl = torch.utils.data.DataLoader(ds, batch_size=2 * CFG.batch_size, shuffle=False, num_workers=4)

    # SECURITY: this unpickles a whole model object; only load files you trust.
    model = torch.load("../input/my-embeddings/model_31.pkl").to(CFG.device)
    # A pickled module keeps whatever train/eval mode it was saved in; force inference mode.
    model.eval()

    # Allocate memory for features
    features = np.empty((len(df), 2*embed_dim), dtype=np.float32)

    # Begin predict
    i = 0
    with torch.no_grad():
        for images, texts, _ in tqdm(dl):
            n = len(images)
            # Generate image and text features
            images_features = model.encode_image(images.cuda())
            texts_features = model.encode_text(texts.cuda())

            # Concat features (first images then texts)
            features[i:i+n, :embed_dim] = images_features.cpu().numpy()
            features[i:i+n, embed_dim:] = texts_features.cpu().numpy()

            i += n

    # Option to save these features (may be useful to tune the cut value)
    print(f'Our clip embeddings shape is {features.shape}')
#     np.save("clip_embeddings.npy", features)

    # l2-normalize
    features /= np.linalg.norm(features, 2, axis=1, keepdims=True)

    return features

## 3.5. Word2Vec Embedding

In [None]:
def get_w2v_embeddings(df):
    """Build one Word2Vec-based vector per title.

    Each title is tokenised, its in-vocabulary tokens are looked up in the
    model returned by ``Word2VecModel`` and the element-wise median of their
    vectors is taken. Titles without any known token get an all-zero vector.

    Returns:
        NumPy array with one 128-dimensional row per title.
    """
    corpus = tokenize_corpus(list(df['title']))

    VECTOR_SIZE=128
    model = Word2VecModel(VECTOR_SIZE)
#     model.save("word2vec_train.model")

    # key_to_index is a dict, so membership is O(1); index_to_key is a list
    # and made every token lookup scan the whole vocabulary.
    vocabulary = model.wv.key_to_index

    embeds = []
    for sentence in corpus:
        known_words = [w for w in sentence if w in vocabulary]
        if not known_words:
            embeds.append(np.zeros((VECTOR_SIZE), dtype='float32').tolist())
            continue
        words_vector = np.array([model.wv[w] for w in known_words])
        embeds.append(np.median(words_vector, axis=0).tolist())
    embeds = np.array(embeds)
    del corpus

    print(f'Our word2vec embeddings shape is {embeds.shape}')
    return embeds

# 4. Get Prediction

## 4.1. Image Prediction

In [None]:
def get_image_predictions(df, embeddings,threshold = 0.0):
    """Match postings whose embeddings lie within a cosine-distance threshold.

    A cosine nearest-neighbour search (up to 50 neighbours) is run over
    ``embeddings``; for each row, neighbours closer than ``threshold`` are
    kept.

    Side effects:
        Writes the matches to ``df['img_pred']`` (joined into strings when
        ``COMPUTE_CV`` is true) and prints summary statistics.

    Args:
        df: Postings frame with a ``posting_id`` column, row-aligned with
            ``embeddings``.
        embeddings: 2-D array, one row per posting.
        threshold: Maximum cosine distance for a neighbour to count as a match.

    Returns:
        List with one array of matched posting_ids per row.
    """

    # n_neighbors cannot exceed the number of fitted rows; the former
    # "50 if more than 3 rows" rule broke for 4-49 rows.
    KNN = min(50, embeddings.shape[0])

    model = NearestNeighbors(n_neighbors = KNN, metric = 'cosine') # euclidean
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)
    print(distances.mean(axis=0))

    predictions = []
    Sum = 0
    for k in tqdm(range(embeddings.shape[0])):
        idx = np.where(distances[k,] < threshold)[0]
        Sum += len(idx)
        ids = indices[k,idx]
        posting_ids = df['posting_id'].iloc[ids].values
        predictions.append(posting_ids)
    df['img_pred'] = predictions
    del model, distances, indices
    gc.collect()

    # Average number of matches per posting.
    print(Sum / embeddings.shape[0])
    if COMPUTE_CV:
        df['img_pred'] = df.img_pred.apply(lambda x: ' '.join(x))
        score = f1_score(df['target'], df['img_pred']).mean()
        print(f'Our f1 score for threshold {threshold} is {score}')
    return predictions

## 4.2. Bert Prediction

In [None]:
def get_bert_predictions(df,embeddings, threshold=0.7, chunk=1024*4):
    '''
    Match titles whose embedding dot product exceeds ``threshold``.

    When using cos_sim use normalized features else use normal features.
    Similarities are computed on the GPU in blocks of ``chunk`` rows. The
    matches are also written to ``df['bert_pred']`` (joined into strings when
    ``COMPUTE_CV`` is true).

    Returns a list with one array of matched posting_ids per row.
    '''
    embeddings = cupy.array(embeddings)
    CHUNK = chunk

    print('Finding similar titles...for threshold :',threshold)
    n_chunks, remainder = divmod(len(embeddings), CHUNK)
    if remainder != 0:
        n_chunks += 1

    predictions = []
    df['bert_pred'] = ''
    total_matches = 0
    for block in range(n_chunks):
        start = block*CHUNK
        stop = min((block+1)*CHUNK, len(embeddings))
        print('chunk',start,'to',stop)

        # Rows: postings of this block; columns: all postings.
        sims = cupy.matmul(embeddings,embeddings[start:stop].T).T

        for offset in range(stop-start):
            hits = cupy.where(sims[offset,]>threshold)[0]
            total_matches += len(hits)
            predictions.append(df.iloc[cupy.asnumpy(hits)].posting_id.values)

    print(total_matches / embeddings.shape[0])
    df['bert_pred'] = predictions
    del embeddings
    gc.collect()

    if COMPUTE_CV:
        df['bert_pred'] = df.bert_pred.apply(lambda x: ' '.join(x))
        score = f1_score(df['target'], df['bert_pred']).mean()
        print(f'Our f1 score for threshold {threshold} is {score}')
    return predictions

## 4.3. Tfidf Prediction

In [None]:
def get_tfidf_predictions(df, max_features = 25_000, threshold=0.75):
    """Match titles whose binary TF-IDF vectors have a dot product above ``threshold``.

    Side effects:
        Writes the matches to ``df['tfidf_pred']`` (joined into strings when
        ``COMPUTE_CV`` is true) and prints progress.

    Args:
        df: pandas postings frame with ``title`` and ``posting_id`` columns.
        max_features: Vocabulary size limit of the vectoriser.
        threshold: Minimum similarity for two titles to match.

    Returns:
        List with one array of matched posting_ids per row.
    """

    model = TfidfVectorizer(stop_words = 'english', binary = True, max_features = max_features)
    # Vectorise the titles of the frame that was passed in instead of reading
    # the notebook-level `df_cu` global.
    titles = cudf.from_pandas(df['title'])
    text_embeddings = model.fit_transform(titles).toarray()
    preds = []
    CHUNK = 1024 * 4

    print('Finding similar titles...')
    CTS = len(df)//CHUNK
    if len(df)%CHUNK!=0: CTS += 1
    Sum = 0
    for j in range( CTS ):

        a = j*CHUNK
        b = (j+1)*CHUNK
        b = min(b,len(df))
        print('chunk',a,'to',b)

        # COSINE SIMILARITY DISTANCE
        cts = cupy.matmul( text_embeddings, text_embeddings[a:b].T).T

        for k in range(b-a):
            IDX = cupy.where(cts[k,]>threshold)[0]
            Sum += len(IDX)
            o = df.iloc[cupy.asnumpy(IDX)].posting_id.values
            preds.append(o)

    df['tfidf_pred'] = preds
    # Average number of matches per posting.
    print(Sum / text_embeddings.shape[0])
    del model,text_embeddings

    if COMPUTE_CV:
        df['tfidf_pred'] = df.tfidf_pred.apply(lambda x: ' '.join(x))
        score = f1_score(df['target'], df['tfidf_pred']).mean()
        print(f'Our f1 score for threshold {threshold} is {score}')

    gc.collect()
    return preds

# 5. Beginning

In [None]:
# Load the postings once: pandas frame, cuDF copy and per-row image paths.
df,df_cu,image_paths = read_dataset(check=False)
print(df.shape)

## 5.1. pHash Prediction

In [None]:
print("Hash Prediction...")
# image_phash -> array of the unique posting_ids that share that hash.
phash = df.groupby('image_phash').posting_id.agg('unique').to_dict()
print("Done.")

## 5.2. Image Prediction

In [None]:
print("Image Embedding...")
# The CV and inference runs compute the embeddings in exactly the same way.
image_embedding1, image_embedding2 = get_image_embeddings(image_paths.values)
print("Done.")

In [None]:
print("Image Prediction...")
# Each embedding set is matched with its own cosine-distance threshold.
img_prediction1 = get_image_predictions(df,image_embedding1,threshold=0.36)
img_prediction2 = get_image_predictions(df,image_embedding2,threshold=0.32)
# img_prediction = get_image_predictions(df,image_embeddings,threshold=0.36)

In [None]:
# Drop the image embeddings and release PyTorch's cached GPU memory.
del image_embedding1, image_embedding2
torch.cuda.empty_cache()
print("Done.")

## 5.3. Tfidf Prediction

In [None]:
print("Tfidf Prediction...")
# TF-IDF matches with a similarity cut-off of 0.75.
text_predictions = get_tfidf_predictions(df, max_features = 25_000, threshold=0.75)
print("Done")

## 5.4. Bert Prediction

In [None]:
print("Bert Embedding...")
# The CV and inference runs compute the embeddings in exactly the same way.
bert_embeddings = get_bert_embeddings(df)
print("Done")

In [None]:
print("Bert Prediction...")
# Titles match when the dot product of their embeddings exceeds 0.85.
bert_predictions = get_bert_predictions(df, bert_embeddings, threshold=0.85)
# bert_predictions = get_image_predictions(df, bert_embeddings, threshold=0.15)

In [None]:
# Drop the embeddings and release PyTorch's cached GPU memory.
del bert_embeddings
torch.cuda.empty_cache()
print("Done")

## 5.4. Bert (Indonesian) Prediction

In [None]:
print("Bert Embedding...")
# Title embeddings from the Indonesian DistilBERT encoder.
bert_embeddingsB = get_bertB_embeddings(df)
print("Done")

In [None]:
torch.cuda.empty_cache()
print("Bert Prediction...")
# The cosine nearest-neighbour matcher is reused for these text embeddings.
bert_predictionsB = get_image_predictions(df, bert_embeddingsB, threshold=0.15)

In [None]:
# Drop the embeddings and release PyTorch's cached GPU memory.
del bert_embeddingsB
torch.cuda.empty_cache()
print("Done")

## 5.5. CLIP Prediction

In [None]:
print("CLIP Embedding...")
# Image directory follows the same train/test switch as the postings table.
clip_images_dir = (
    '../input/shopee-product-matching/train_images'
    if COMPUTE_CV
    else '../input/shopee-product-matching/test_images'
)
clip_embeddings = get_clip_embeddings(df, clip_images_dir)
print("Done")

In [None]:
print("Clip Prediction...")
# The cosine nearest-neighbour matcher is reused for the CLIP embeddings.
clip_predictions = get_image_predictions(df,clip_embeddings,threshold=0.13)

In [None]:
# Drop the embeddings and release PyTorch's cached GPU memory.
del clip_embeddings
torch.cuda.empty_cache()
print("Done")

## 5.6. Word2Vec Prediction

In [None]:
# Disabled alternative: explicit L2 normalisation with scikit-learn.
# from sklearn import preprocessing
# def L2_normalize(data, norms = 'l2'):
#     return preprocessing.normalize(data, norm = norms)
print("W2V Embedding...")
# One Word2Vec-based vector per title.
w2v_embeddings = get_w2v_embeddings(df)
print("Done")

In [None]:
print("W2V Prediction...")
# The CV and inference runs use the same threshold.
w2v_predictions = get_image_predictions(df, w2v_embeddings, threshold=0.03)
print("Done.")

# 6. Merging Result

In [None]:
# Attach every matcher's per-row results to the frame so they can be combined row by row.
df['phash'] = df.image_phash.map(phash)
df['image_prediction1'] = img_prediction1
df['image_prediction2'] = img_prediction2
# df['image_predictions'] = img_prediction
df['text_predictions'] = text_predictions
df['bert_predictions'] = bert_predictions
df['bert_predictionsB'] = bert_predictionsB
df['clip_predictions'] = clip_predictions
df['w2v_predictions'] = w2v_predictions

In [None]:
# def higher(f,*args):
#     res = {}
#     keys = np.unique(np.concatenate(args))
#     for k in keys: 
#         res[k] = np.count_nonzero(np.concatenate(args) == k)
#     output_dict = dict(filter(lambda item: item[1] >= f, res.items()))
#     return np.array(list(output_dict.keys()))

In [None]:
# from collections import Counter
# Voting = True
# if Voting:
#     def combine_cv(row):
#         T = higher(2, row['image_prediction1'], row['image_prediction2'], row['clip_predictions'],
#                    row['text_predictions'], row['bert_predictions'], row['bert_predictionsB'])
#         All = np.unique(np.concatenate([row['image_predictions'], T, row['phash']]))
#         return All

#     def combine_predictions(row):
#         T = higher(2, row['text_predictions'], row['bert_predictions'], row['bert_predictionsB'])
#         All = np.unique(np.concatenate([row['image_predictions'], T, row['phash']]))
#         return ' '.join(All)
# else:
#     def combine_cv(row):
#         x = np.concatenate([row['image_predictions'], row['text_predictions'], row['phash'], 
#                             row['bert_predictions'], row['bert_predictionsB'], row['clip_predictions']])
#         return np.unique(x)

#     def combine_predictions(row):
#         x = np.concatenate([row['image_predictions'], row['text_predictions'], row['phash'], 
#                             row['bert_predictions'], row['bert_predictionsB'], row['clip_predictions']])
#         return ' '.join( np.unique(x))

# df['matches'] = df.apply(combine_predictions, axis = 1)
# if COMPUTE_CV:
#     df['matches_cv'] = df.apply(combine_cv, axis = 1)

# df[['posting_id', 'matches']].to_csv('submission.csv', index = False)

In [None]:
from collections import Counter

# True  -> keep only ids that occur at least twice across all model outputs.
# False -> keep the plain union of the model outputs.
Voting = True

# Row columns that are merged in the union (non-voting) mode.
_UNION_COLUMNS = ['image_prediction1', 'image_prediction2', 'clip_predictions', 'phash',
                  'text_predictions', 'bert_predictions', 'bert_predictionsB']
# The voting mode additionally counts the word2vec column.
# NOTE(review): the union mode never included 'w2v_predictions'; confirm that
# this asymmetry is intentional before unifying the two lists.
_VOTING_COLUMNS = _UNION_COLUMNS + ['w2v_predictions']


def _gather_candidates(row, columns):
    """Concatenate the arrays stored in `row` under each name in `columns`."""
    return np.concatenate([row[name] for name in columns])


def _vote(row, min_votes=2, max_matches=50):
    """Return the ids that win the vote for one row.

    Args:
        row: mapping (e.g. a DataFrame row) from column name to an array of ids.
        min_votes: minimum number of occurrences, counted over the
            concatenation of all voting columns, for an id to be kept.
        max_matches: upper bound on the number of ids returned.

    Returns:
        A list of ids ordered by descending occurrence count; ids with equal
        counts keep the order in which they were first seen.
    """
    tally = Counter(_gather_candidates(row, _VOTING_COLUMNS))
    # most_common() sorts by count (descending) with a stable sort, which is
    # exactly the ordering the previous hand-written sorted(...) produced.
    winners = [candidate for candidate, votes in tally.most_common() if votes >= min_votes]
    return winners[:max_matches]


if Voting:
    def combine_cv(row):
        """Return the voted ids of `row` as a list (at most 50 entries)."""
        return _vote(row)

    def combine_predictions(row):
        """Return the voted ids of `row` as one space-separated string.

        The 50-id cap is now actually applied here as well; previously the
        truncation expression was evaluated and thrown away, so this string
        could contain more ids than the list returned by `combine_cv`.
        """
        return ' '.join(_vote(row))
else:
    def combine_cv(row):
        """Return the sorted unique ids found in any union column of `row`."""
        return np.unique(_gather_candidates(row, _UNION_COLUMNS))

    def combine_predictions(row):
        """Return the sorted unique ids of `row` as one space-separated string."""
        return ' '.join(np.unique(_gather_candidates(row, _UNION_COLUMNS)))

# Row by row, merge the per-model columns into the final match string.
df['matches'] = df.apply(combine_predictions, axis=1)

# The list-valued variant is only produced when COMPUTE_CV is set.
if COMPUTE_CV:
    df['matches_cv'] = df.apply(combine_cv, axis=1)

# Write only the posting id and its matches, without the index column.
df.loc[:, ['posting_id', 'matches']].to_csv(path_or_buf='submission.csv', index=False)

## 6.1. CV Score & Counting

In [None]:
if COMPUTE_CV:
    # Per-row F1 of the submitted match strings against df['target'] (passed as
    # y_true), followed by the mean over all rows.
    df['f1'] = f1_score(df['target'], df['matches'])
    score = df['f1'].mean()
    print(f'Final f1 score is {score}')

    # Histogram of match-list lengths: {number of matches: number of rows}.
    # The length is kept in `n_matched` rather than `max`, so the builtin max()
    # is no longer overwritten for every cell that runs after this one.
    count = {}
    for matched_ids in df['matches_cv']:
        n_matched = len(matched_ids)
        count[n_matched] = count.get(n_matched, 0) + 1
    # Keys are unique, so sorting the items orders them by list length.
    print(sorted(count.items()))