# Comments

Thanks to Chris for this great notebook: https://www.kaggle.com/cdeotte/part-2-rapids-tfidfvectorizer-cv-0-700

Here is the script for the EfficientNetb3 ArcFace Model https://www.kaggle.com/ragnar123/shopee-efficientnetb3-arcmarginproduct

Here is the script for the Bert Model https://www.kaggle.com/ragnar123/bert-baseline

In [None]:
!pip install ../input/external-libraries-shopee-product-matching/Keras_Applications-1.0.8-py3-none-any.whl
!pip install ../input/external-libraries-shopee-product-matching/efficientnet-1.1.1-py3-none-any.whl

# !pip install ../input/d/mhilmiasyrofi/external-libraries-shopee-product-matching/Keras_Applications-1.0.8-py3-none-any.whl
# !pip install ../input/d/mhilmiasyrofi/external-libraries-shopee-product-matching/efficientnet-1.1.1-py3-none-any.whl

In [None]:
# Unpack the RAPIDS environment archive shipped as a dataset and make it importable.
import sys
!cp ../input/rapids/rapids.0.19.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Prepend the extracted environment to sys.path so its packages take priority
# over the ones already installed in the kernel.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the bundled xgboost shared library into the default conda lib directory.
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [None]:
import os
import random
import math
import numpy as np
import pandas as pd
import gc
import matplotlib.pyplot as plt
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml import PCA
from cuml.neighbors import NearestNeighbors
import tensorflow as tf
import efficientnet.tfkeras as efn
from tqdm.notebook import tqdm
from shutil import copyfile
import tensorflow_hub as hub
import sklearn
import torch
from shutil import copyfile

from tqdm.autonotebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import LabelEncoder

import transformers

In [None]:
# For tf.dataset: lets tf.data pick parallelism / prefetch depth automatically
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
# Batch size of the tf.data image pipeline (see get_dataset)
BATCH_SIZE = 8
# Size every image is resized to before being fed to the Keras image model
IMAGE_SIZE = [512, 512]
# Seed
SEED = 42
# Verbosity
# NOTE(review): not referenced anywhere in the visible part of the notebook
VERBOSE = 1
# Number of classes
N_CLASSES = 11014

# Flag to get cv score (True: work on train.csv, False: predict on test.csv)
GET_CV = True
# Flag to check ram allocations (debug); read_dataset duplicates the train frame when set
CHECK_SUB = False

# When False (and GET_CV is True) the cached .npy embeddings are loaded instead
# of running the corresponding model again.
RECOMPUTE_IMAGE_EMBEDDING = False
RECOMPUTE_NFNET_EMBEDDING = False
RECOMPUTE_TEXT_EMBEDDING = False
# NOTE(review): the two flags below are not read in the visible part of the notebook
RECOMPUTE_TFIDF_EMBEDDING = False
RECOMPUTE_INDOBERT_EMBEDDING = False


# Selects where recomputed embeddings are saved: the current directory when
# True, the external-embeddings path under INPUT_DIR otherwise.
KAGGLE_ENV = True

# Base directories for the attached datasets and for files copied at runtime
INPUT_DIR = "../input/"
WORK_DIR = "../working/"

In [None]:
copyfile(src = INPUT_DIR + 'external-modules-shopee-product-matching/utils.py', dst = WORK_DIR + 'utils.py')

from utils import clean_text

In [None]:
copyfile(src = INPUT_DIR + 'external-modules-shopee-product-matching/tokenization.py', dst = WORK_DIR + 'tokenization.py')

import tokenization

In [None]:
# RESTRICT TENSORFLOW TO 2GB OF GPU RAM
# SO THAT WE HAVE 14GB RAM FOR RAPIDS
# LIMIT is expressed in GB; it is converted to MB for memory_limit below.
LIMIT = 2.0
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Only the first physical GPU is capped.
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        #print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # The error is only printed, so execution continues without the cap.
        print(e)
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
# NOTE(review): the 16 presumably is the total GPU memory in GB - confirm for the target GPU
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

In [None]:
# Peek at the test set only to find out whether this is a submission run.
df = pd.read_csv(INPUT_DIR + 'shopee-product-matching/test.csv')
# If we are committing (test.csv holds more than 3 rows), predict on the test
# set instead of the train set and don't compute the CV score; cached
# embeddings cannot be reused, so force them to be recomputed.
if len(df) > 3:
    GET_CV = False
    RECOMPUTE_IMAGE_EMBEDDING = True
    RECOMPUTE_TEXT_EMBEDDING = True
del df

# Function to get our f1 score
def f1_score(y_true, y_pred):
    """Compute the row-wise F1 score between two columns of id strings.

    Each element of ``y_true`` / ``y_pred`` is a whitespace-separated string
    of ids; both are compared as sets.

    Returns:
        A numpy array with one score per row:
        ``2 * |true & pred| / (|true| + |pred|)``.
    """
    true_sets = [set(ids.split()) for ids in y_true]
    pred_sets = [set(ids.split()) for ids in y_pred]
    pairs = list(zip(true_sets, pred_sets))
    overlap = np.array([len(truth & guess) for truth, guess in pairs])
    total = np.array([len(guess) + len(truth) for truth, guess in pairs])
    return 2 * overlap / total

# Function to combine predictions
def combine_predictions(row1, row2):
    """Return the sorted, de-duplicated union of two prediction arrays."""
    return np.unique(np.concatenate((row1, row2)))

# Function to combine predictions
def aggregate_predictions(preds1, preds2):
    """Merge two per-row prediction lists and propagate them across linked rows.

    Two rows are linked when they share at least one non-empty predicted
    value, directly or through a chain of other rows. Every row ends up with
    the union of the predictions of all rows in its linked group.

    Args:
        preds1: sequence with one iterable of hashable predictions per row.
        preds2: second sequence of per-row predictions, indexed like
            ``preds1``.

    Returns:
        A list with one list of predictions per row (element order within a
        row is unspecified). A row without any non-empty prediction keeps
        exactly its own predictions.
    """
    n_rows = max(len(preds1), len(preds2))
    combineds = [set() for _ in range(n_rows)]

    # value -> indices of the rows that predicted it ("" never links rows)
    memory = {}
    for preds in (preds1, preds2):
        for i, pred in enumerate(preds):
            for v in pred:
                combineds[i].add(v)
                if v != "":
                    memory.setdefault(v, set()).add(i)

    # row index -> set of all rows in the same linked group; every member of
    # a group references the same set object.
    connections = {}
    for ids in memory.values():
        current_connection = set(ids)
        for uid in ids:
            if uid in connections:
                current_connection.update(connections[uid])
        for uid in current_connection:
            connections[uid] = current_connection

    del memory

    res = []
    for i in range(n_rows):
        combined = set()
        # Look the group up by row index; rows that never entered
        # `connections` (no non-empty prediction) form a group of their own.
        for idx in connections.get(i, (i,)):
            combined.update(combineds[idx])
        res.append(list(combined))

    del connections

    gc.collect()

    return res

def reformat_labels(arr):
    """
    Join the distinct strings of an array, in sorted order, into one
    space-separated string.
    """
    distinct = np.unique(arr)
    return ' '.join(distinct)

In [None]:
# Function to read out dataset
def read_dataset():
    """Load the competition csv selected by GET_CV and derive helper columns.

    The train csv is read when GET_CV is set, the test csv otherwise. Titles
    are passed through ``clean_text``. In CV mode a ``matches`` column (the
    space-joined posting ids sharing the row's ``label_group``) is added, and
    the frame is duplicated when CHECK_SUB is set.

    Returns:
        (df, df_cu, image_paths): the pandas frame, a cudf copy of it and a
        pandas Series with the path of every row's image.
    """
    data_type = "train" if GET_CV else "test"
    competition_dir = INPUT_DIR + 'shopee-product-matching/'

    df = pd.read_csv(competition_dir + data_type + '.csv')
    df["title"] = df["title"].apply(clean_text)

    if GET_CV:
        ids_by_group = df.groupby(['label_group'])['posting_id'].unique().to_dict()
        df['matches'] = df['label_group'].map(ids_by_group)
        df['matches'] = df['matches'].apply(lambda ids: ' '.join(ids))
        if CHECK_SUB:
            df = pd.concat([df, df], axis = 0)
            df.reset_index(drop = True, inplace = True)

    df_cu = cudf.DataFrame(df)
    image_paths = competition_dir + data_type + '_images/' + df['image']
    return df, df_cu, image_paths

In [None]:
# Function to decode our images
def decode_image(image_data):
    """Decode JPEG bytes into a 3-channel float32 tensor.

    The image is resized to IMAGE_SIZE and its values are divided by 255.
    """
    pixels = tf.image.decode_jpeg(image_data, channels = 3)
    resized = tf.image.resize(pixels, IMAGE_SIZE)
    return tf.cast(resized, tf.float32) / 255.0

# Function to read our test image and return image
def read_image(image):
    """Read the file at path ``image`` and return it decoded by decode_image."""
    raw_bytes = tf.io.read_file(image)
    return decode_image(raw_bytes)

# Function to get our dataset that read images
def get_dataset(image):
    """Build a batched, prefetching tf.data pipeline that loads the given image paths."""
    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(read_image, num_parallel_calls = AUTO)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

# Arcmarginproduct class keras layer
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    The layer is called on ``[embeddings, labels]`` and returns the cosine
    similarities between the L2-normalised embeddings and the L2-normalised
    class weights, scaled by ``s``, with the additive angular margin ``m``
    applied to the entry of each sample's labelled class.

    Args:
        n_classes: number of classes (columns of the weight matrix ``W``).
        s: factor the final logits are multiplied by.
        m: additive angular margin, in radians.
        easy_margin: if True the margin is applied only where cosine > 0;
            otherwise it is applied where cosine > cos(pi - m) and
            ``cosine - sin(pi - m) * m`` is used elsewhere.
        ls_eps: label-smoothing factor applied to the one-hot labels
            (0 disables smoothing).

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Constants of the margin formula, computed once.
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        """Return the layer config including the constructor arguments."""
        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        """Create the class weight matrix of shape (embedding_dim, n_classes).

        ``input_shape`` is the pair of shapes of ``[embeddings, labels]``.
        """
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        """Return the scaled margin logits for ``inputs = [X, y]``."""
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        # Clamp at zero before the square root: rounding can push cosine**2
        # marginally above 1, and sqrt of a negative number yields NaN.
        sine = tf.math.sqrt(
            tf.math.maximum(1.0 - tf.math.pow(cosine, 2), 0.0)
        )
        # cos(theta + m) = cos(theta) * cos(m) - sin(theta) * sin(m)
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Margin logit for the labelled class, plain cosine for all others.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

In [None]:
def call_function_by_model_name(model_name):
    """Return the EfficientNet constructor (B3 to B7) registered under ``model_name``.

    Raises:
        KeyError: if ``model_name`` is not one of the supported names.
    """
    supported = (
        "EfficientNetB3",
        "EfficientNetB4",
        "EfficientNetB5",
        "EfficientNetB6",
        "EfficientNetB7",
    )
    constructors = {name: getattr(efn, name) for name in supported}
    return constructors[model_name]

# Function to get the embeddings of our images with the fine-tuned model
def get_image_embeddings(image_paths, model_name="EfficientNetB3"):
    """Compute image embeddings with the fine-tuned EfficientNet ArcFace model.

    The training-time architecture (backbone, global average pooling,
    ArcMarginProduct head, softmax) is rebuilt so the saved weights can be
    loaded, then a second model is cut from it that maps an image to the
    output of ``model.layers[-4]``.

    Args:
        image_paths: sliceable collection of image file paths (the notebook
            passes a pandas Series).
        model_name: key understood by ``call_function_by_model_name``; also
            selects the weight file ``<model_name>_512_42.h5``.

    Returns:
        A numpy array with one embedding row per image path.
    """
    embeds = []

    # NOTE: the layers are created in the same order as in the original
    # notebook; keep it that way when editing, the weight file is loaded
    # into this exact architecture.
    margin = ArcMarginProduct(
            n_classes = N_CLASSES,
            s = 30,
            m = 0.7,
            name='head/arc_margin',
            dtype='float32'
            )

    inp = tf.keras.layers.Input(shape = (*IMAGE_SIZE, 3), name = 'inp1')
    label = tf.keras.layers.Input(shape = (), name = 'inp2')
    x = call_function_by_model_name(model_name)(weights = None, include_top = False)(inp)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = margin([x, label])

    output = tf.keras.layers.Softmax(dtype='float32')(x)

    model = tf.keras.models.Model(inputs = [inp, label], outputs = [output])
    model.load_weights(INPUT_DIR + 'external-models-shopee-product-matching/' + model_name + '_512_42.h5')
    # Keep only the image input and an intermediate output as the embedding.
    model = tf.keras.models.Model(inputs = model.input[0], outputs = model.layers[-4].output)

    # Predict in chunks. The chunk boundaries are derived from the
    # `image_paths` argument itself rather than from a global data frame.
    chunk = 5000
    for start in range(0, len(image_paths), chunk):
        image_dataset = get_dataset(image_paths[start:start + chunk])
        image_embeddings = model.predict(image_dataset)
        embeds.append(image_embeddings)
    del model
    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings

In [None]:
# Return tokens, masks and segments from a text array or series
def bert_encode(texts, tokenizer, max_len=512):
    """Turn texts into fixed-length BERT inputs.

    Every text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and padded with zeros up to ``max_len``.

    Args:
        texts: iterable of strings.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        max_len: length of every encoded row.

    Returns:
        Three numpy arrays (token ids, padding masks, segment ids), each with
        one row per text. Masks are 1 on real tokens and 0 on padding;
        segment ids are all zero.
    """
    token_rows, mask_rows, segment_rows = [], [], []
    body_limit = max_len - 2

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)[:body_limit]
        sequence = ["[CLS]", *pieces, "[SEP]"]
        used = len(sequence)
        padding = max_len - used

        token_ids = tokenizer.convert_tokens_to_ids(sequence)
        token_ids.extend([0] * padding)

        token_rows.append(token_ids)
        mask_rows.append([1] * used + [0] * padding)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

# Function to get our text title embeddings using a pre-trained bert model
def get_text_embeddings(df, batch_size=32, max_len = 128):
    """Compute title embeddings with the fine-tuned BERT ArcFace model.

    The training-time architecture (BERT hub layer, ArcMarginProduct head,
    softmax) is rebuilt so ``Bert_42.h5`` can be loaded, then a second model
    is cut from it that maps the three BERT inputs to the output of
    ``model.layers[-4]``.

    Args:
        df: frame with a ``title`` column.
        batch_size: batch size handed to ``model.predict``.
        max_len: token length every title is truncated / padded to.

    Returns:
        A numpy array with one embedding row per row of ``df``.
    """
    # NOTE: the layers below are created in the same order as before; the
    # weight file is loaded into this exact architecture.
    module_url = INPUT_DIR + 'external-models-shopee-product-matching/bert_en_uncased_L-24_H-1024_A-16_1'
    bert_layer = hub.KerasLayer(module_url, trainable = True)
    resolved = bert_layer.resolved_object
    vocab_file = resolved.vocab_file.asset_path.numpy()
    do_lower_case = resolved.do_lower_case.numpy()
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
    word_ids, masks, segments = bert_encode(df['title'].values, tokenizer, max_len = max_len)

    margin = ArcMarginProduct(
            n_classes = 11014,
            s = 30,
            m = 0.5,
            name='head/arc_margin',
            dtype='float32'
            )

    input_word_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
    label = tf.keras.layers.Input(shape = (), name = 'label')

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Representation at the first ([CLS]) position of every sequence.
    clf_output = sequence_output[:, 0, :]
    x = margin([clf_output, label])
    output = tf.keras.layers.Softmax(dtype='float32')(x)
    model = tf.keras.models.Model(inputs = [input_word_ids, input_mask, segment_ids, label], outputs = [output])

    model.load_weights(INPUT_DIR + 'external-models-shopee-product-matching/Bert_42.h5')
    model = tf.keras.models.Model(inputs = model.input[0:3], outputs = model.layers[-4].output)

    # Predict in chunks of 5000 rows.
    chunk = 5000
    embeds = []
    for start in range(0, len(df), chunk):
        stop = start + chunk
        text_chunk = (word_ids[start:stop], masks[start:stop], segments[start:stop])
        embeds.append(model.predict(text_chunk, batch_size = batch_size))
    del model
    text_embeddings = np.concatenate(embeds)
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    del embeds
    gc.collect()
    return text_embeddings

In [None]:
def set_seed(seed=SEED):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy, TensorFlow and PyTorch (CPU and all
    CUDA devices) and asks cuDNN for deterministic behaviour.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # processes); the hash seed of the running interpreter is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False

In [None]:
df, df_cu, image_paths = read_dataset()

## Image Phash

In [None]:
tmp = df.groupby('image_phash').posting_id.agg('unique').to_dict()
df['phash_predictions'] = df.image_phash.map(tmp)

In [None]:
if GET_CV:
    df['f1'] = f1_score(df['matches'], df['phash_predictions'].apply(lambda x: ' '.join( np.unique(x) )))
    print('CV score for baseline =', df.f1.mean())

## Image Embeddings

In [None]:
# Obtain the EfficientNet image embeddings.
set_seed(42)
image_model_name = "EfficientNetB3"

if GET_CV :
    # CV run: reuse the cached embeddings unless a recompute was requested.
    image_embedding_path = INPUT_DIR + 'external-embeddings-shopee-product-matching/image_embeddings_' + image_model_name
    if RECOMPUTE_IMAGE_EMBEDDING :
        image_embeddings = get_image_embeddings(image_paths, image_model_name)
        # np.save appends ".npy" to a file name that lacks the extension.
        if KAGGLE_ENV : 
            np.save('image_embeddings', image_embeddings)
        else :
            np.save(image_embedding_path, image_embeddings)
    else :
        image_embeddings = np.load(image_embedding_path + ".npy")
else :
    # Submission run: the test images are new, so always compute.
    image_embeddings = get_image_embeddings(image_paths, image_model_name)

## Text Embeddings

In [None]:
# Obtain the BERT title embeddings.
set_seed(42)

if GET_CV :
    # CV run: reuse the cached embeddings unless a recompute was requested.
    text_embedding_path = INPUT_DIR + 'external-embeddings-shopee-product-matching/text_embeddings_Bert42'
    if RECOMPUTE_TEXT_EMBEDDING :
        text_embeddings = get_text_embeddings(df)
        # np.save appends ".npy" to a file name that lacks the extension.
        if KAGGLE_ENV : 
            np.save('text_embeddings', text_embeddings)
        else :
            np.save(text_embedding_path, text_embeddings)
    else :
        text_embeddings = np.load(text_embedding_path + '.npy')
else :
    # Submission run: the test titles are new, so always compute.
    text_embeddings = get_text_embeddings(df)

In [None]:
gc.collect()

In [None]:
## Function to get the nearest neighbors (50 by default) of every embedding;
## thresholding happens later in get_neighbors_from_distance_indices
def get_distances_indices(embeddings, KNN=50, normalize=False, metric='cosine'):
    """Return the distances and indices of each embedding's nearest neighbours.

    The index is fitted on ``embeddings`` and queried with the same
    ``embeddings``.

    Args:
        embeddings: 2-D array with one embedding per row.
        KNN: number of neighbours requested per row; capped at the number of
            rows so small inputs do not ask for more neighbours than exist.
        normalize: if True, the distance matrix is passed through
            ``sklearn.preprocessing.normalize`` (row-wise L2 norm by default),
            so thresholds must be chosen on that rescaled range.
        metric: metric handed to ``NearestNeighbors``; a falsy value uses the
            estimator's default metric.

    Returns:
        (distances, indices) as returned by ``kneighbors``.
    """
    n_neighbors = min(KNN, len(embeddings))
    if metric:
        model = NearestNeighbors(n_neighbors = n_neighbors, metric=metric)
    else :
        model = NearestNeighbors(n_neighbors = n_neighbors)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)
    if normalize:
        distances = sklearn.preprocessing.normalize(distances)
    del model
    gc.collect()
    return distances, indices
    
def get_neighbors_from_distance_indices(df, distances, indices, thresholds, choosen_threshold) :
    """Collect, for every row, the posting ids of its close-enough neighbours.

    A neighbour is kept when its distance is strictly below
    ``choosen_threshold``. Because the test set is much larger than the train
    set and has different label groups, the threshold should be chosen
    conservatively.

    Args:
        df: frame with a ``posting_id`` column, positionally aligned with the
            rows of ``distances`` / ``indices``.
        distances: 2-D array of neighbour distances, one row per posting.
        indices: 2-D array of neighbour row positions, same shape.
        thresholds: candidate thresholds. Currently unused: it fed a grid
            search (score every candidate with ``f1_score`` against
            ``df['matches']`` and report the best one) that was run
            interactively in CV mode to pick ``choosen_threshold``.
        choosen_threshold: distance cut-off that is actually applied.

    Returns:
        (df, predictions): ``df`` unchanged and a list with one array of
        posting ids per row.
    """
    row_numbers = range(indices.shape[0])
    if not GET_CV:
        # Progress bar only for the submission run.
        row_numbers = tqdm(row_numbers)

    posting_column = df['posting_id']
    predictions = []
    for row in row_numbers:
        close_enough = np.where(distances[row,] < choosen_threshold)[0]
        neighbour_rows = indices[row, close_enough]
        predictions.append(posting_column.iloc[neighbour_rows].values)

    del distances, indices
    gc.collect()
    return df, predictions

In [None]:
NORMALIZE_IMAGE_DISTANCE = True
NORMALIZE_TEXT_DISTANCE = True

KNN=100

In [None]:
"""
Get neighbors for image_embeddings
"""

## calculate image distances and get indices
image_distances, image_indices = get_distances_indices(image_embeddings, KNN=KNN, normalize=NORMALIZE_IMAGE_DISTANCE, metric='cosine')
    
# NOTE(review): the thresholds are only defined when NORMALIZE_IMAGE_DISTANCE is
# True; with the flag off the call below raises NameError.
if NORMALIZE_IMAGE_DISTANCE :
    ## threshold for normalized image distances
    image_thresholds = list(np.arange(0.03, 0.08, 0.005))
    choosen_image_threshold=0.035

# Keep, for every posting, the neighbours closer than the chosen threshold.
df, image_predictions = get_neighbors_from_distance_indices(df, image_distances, image_indices, thresholds=image_thresholds, choosen_threshold=choosen_image_threshold)


In [None]:
"""
Get neighbors for text_embeddings
"""

## calculate text distances and get indices (metric defaults to 'cosine')
text_distances, text_indices = get_distances_indices(text_embeddings, KNN=KNN, normalize=NORMALIZE_TEXT_DISTANCE)

# NOTE(review): the thresholds are only defined when NORMALIZE_TEXT_DISTANCE is
# True; with the flag off the call below raises NameError.
if NORMALIZE_TEXT_DISTANCE :
    ## threshold for normalized embeddings
    text_thresholds = list(np.arange(0.03, 0.08, 0.005))
    choosen_text_threshold = 0.035
    
# Keep, for every posting, the neighbours closer than the chosen threshold.
df, text_predictions  = get_neighbors_from_distance_indices(df, text_distances, text_indices, thresholds=text_thresholds, choosen_threshold=choosen_text_threshold)

In [None]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
from cuml import PCA
from cuml.neighbors import NearestNeighbors
import tensorflow as tf
import efficientnet.tfkeras as efn

import tensorflow_hub as hub

import os
import cv2
import random
from tqdm import tqdm

import albumentations
from albumentations.pytorch.transforms import ToTensorV2

import torch
import timm
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader
import fasttext as ft

In [None]:
import albumentations as A 
from albumentations.pytorch.transforms import ToTensorV2

In [None]:
class CFG:
    # Settings of the PyTorch (timm) image model used for the NFNet embeddings.
    
    img_size = 512  # side length images are resized to (see get_test_transforms)
    batch_size = 12  # DataLoader batch size used in get_image_embeddings1
    seed = 2020  # seed passed to set_seed before the PyTorch inference
    
    device = 'cuda'  # device the model is moved to
    classes = 11014  # default n_classes of ShopeeModel
    
    model_name = 'eca_nfnet_l0'  # timm architecture name
    model_path = '../input/shopee-pytorch-models/arcface_512x512_nfnet_l0 (mish).pt'  # state dict loaded into ShopeeModel
    
    # ArcFace head hyper-parameters (defaults of ShopeeModel)
    scale = 30 
    margin = 0.5

In [None]:
set_seed(CFG.seed)

In [None]:
def get_test_transforms():
    """Return the inference-time albumentations pipeline.

    Steps: resize to CFG.img_size x CFG.img_size, normalize with the
    library defaults, convert to a torch tensor.
    """
    steps = [
        A.Resize(CFG.img_size, CFG.img_size, always_apply=True),
        A.Normalize(),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps)

In [None]:
class ShopeeDataset(Dataset):
    """Dataset that loads images from disk for inference.

    Args:
        image_paths: indexable collection of image file paths (the notebook
            passes a numpy array).
        transforms: optional albumentations-style callable that is invoked as
            ``transforms(image=...)`` and returns a dict with an ``'image'``
            entry.
    """

    def __init__(self, image_paths, transforms=None):
        self.image_paths = image_paths
        self.augmentations = transforms

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, index):
        """Return ``(image, dummy_label)`` for the path at ``index``.

        Raises:
            OSError: if the image cannot be read.
        """
        image_path = self.image_paths[index]

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals a missing or undecodable file by returning
            # None; fail with the offending path instead of an opaque
            # cvtColor error.
            raise OSError(f"Could not read image: {image_path}")
        # OpenCV loads BGR; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.augmentations:
            augmented = self.augmentations(image=image)
            image = augmented['image']

        # The label is a placeholder; only the image is used at inference.
        return image, torch.tensor(1)

In [None]:
class ArcMarginProduct_Image(nn.Module):
    """ArcFace head: scaled cosine logits with an additive angular margin.

    Args:
        in_features: size of the input embeddings.
        out_features: number of classes.
        scale: factor the final logits are multiplied by.
        margin: additive angular margin, in radians.
        easy_margin: if True the margin is applied only where cosine > 0;
            otherwise it is applied where cosine > cos(pi - margin) and
            ``cosine - sin(pi - margin) * margin`` is used elsewhere.
        ls_eps: label-smoothing factor (0 disables smoothing).
    """

    def __init__(self, in_features, out_features, scale=30.0, margin=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct_Image, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        self.ls_eps = ls_eps  # label smoothing
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        # Constants of the margin formula, computed once.
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        self.th = math.cos(math.pi - margin)
        self.mm = math.sin(math.pi - margin) * margin

    def forward(self, input, label):
        """Return logits of shape (batch, out_features) for the given labels."""
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # Clamp at zero before the square root: rounding can push cosine**2
        # marginally above 1, and sqrt of a negative number yields NaN.
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(min=0.0))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Allocate on the device of the logits instead of a hard-coded
        # 'cuda', so the head also works on CPU or a non-default GPU.
        one_hot = torch.zeros(cosine.size(), device=cosine.device)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # Margin logit for the labelled class, plain cosine for all others.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale

        return output

In [None]:
class ShopeeModel(nn.Module):
    """timm backbone followed by an optional FC + BatchNorm embedding neck.

    An ArcFace head (``self.final``) is created so that the trained state
    dict loads, but ``forward`` returns the embedding and does not use it.

    Args:
        n_classes: number of classes of the ArcFace head.
        model_name: timm architecture; one of ``'resnext50_32x4d'``,
            ``'efficientnet_b3'``, ``'tf_efficientnet_b5_ns'``,
            ``'eca_nfnet_l0'``.
        fc_dim: output size of the FC layer (the embedding size when
            ``use_fc`` is True).
        margin: angular margin of the ArcFace head.
        scale: logit scale of the ArcFace head.
        use_fc: whether ``extract_feat`` applies dropout, FC and BatchNorm.
        pretrained: forwarded to ``timm.create_model``.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """

    def __init__(
        self,
        n_classes = CFG.classes,
        model_name = CFG.model_name,
        fc_dim = 512,
        margin = CFG.margin,
        scale = CFG.scale,
        use_fc = True,
        pretrained = False):


        super(ShopeeModel,self).__init__()
        print('Building Model Backbone for {} model'.format(model_name))

        self.backbone = timm.create_model(model_name, pretrained=pretrained)

        # Strip the classifier and the built-in pooling of the backbone so it
        # returns a feature map; remember the classifier's input width.
        if model_name == 'resnext50_32x4d':
            final_in_features = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif model_name == 'efficientnet_b3':
            final_in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif model_name == 'tf_efficientnet_b5_ns':
            final_in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif model_name == 'eca_nfnet_l0':
            final_in_features = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
            self.backbone.head.global_pool = nn.Identity()

        else:
            # Without this branch an unknown name only failed further down
            # with an unbound `final_in_features`.
            raise ValueError('Unsupported model_name: {}'.format(model_name))

        self.pooling =  nn.AdaptiveAvgPool2d(1)

        self.use_fc = use_fc

        # The neck is always created (even when use_fc is False) so the
        # parameter set stays the same as before.
        self.dropout = nn.Dropout(p=0.0)
        self.fc = nn.Linear(final_in_features, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        final_in_features = fc_dim

        self.final = ArcMarginProduct_Image(
            final_in_features,
            n_classes,
            scale = scale,
            margin = margin,
            easy_margin = False,
            ls_eps = 0.0
        )

    def _init_params(self):
        """Initialise the FC layer (Xavier normal, zero bias) and BatchNorm (1 / 0)."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, image, label):
        """Return the embedding of ``image``; ``label`` is accepted but unused."""
        feature = self.extract_feat(image)
        #logits = self.final(feature,label)
        return feature

    def extract_feat(self, x):
        """Run backbone and pooling, then the FC neck when ``use_fc`` is set."""
        batch_size = x.shape[0]
        x = self.backbone(x)
        x = self.pooling(x).view(batch_size, -1)

        if self.use_fc:
            x = self.dropout(x)
            x = self.fc(x)
            x = self.bn(x)
        return x

In [None]:
class Mish_func(torch.autograd.Function):

    """Mish activation, x * tanh(softplus(x)), with a hand-written backward.

    Only the input is saved for the backward pass; the intermediate values
    are recomputed there.

    from: https://github.com/tyunist/memory_efficient_mish_swish/blob/master/mish.py
    """

    @staticmethod
    def forward(ctx, i):
        result = i * torch.tanh(F.softplus(i))
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        # `saved_tensors` replaces the deprecated `saved_variables` accessor.
        i, = ctx.saved_tensors

        # With h = softplus(x) and g = tanh(h):
        #   d/dx [x * g] = g + x * (1 - g^2) * sigmoid(x)
        # since dh/dx = sigmoid(x) and dg/dh = sech^2(h) = 1 - tanh^2(h).
        # Using softplus / tanh directly avoids the overflow of exp(x) and
        # cosh(h) for large inputs.
        tanh_sp = torch.tanh(F.softplus(i))
        grad_gh = 1. - tanh_sp.pow(2)
        grad_hx = i.sigmoid()

        grad_f = tanh_sp + i * grad_gh * grad_hx

        return grad_output * grad_f
    
class Mish(nn.Module):
    """Module wrapper that applies ``Mish_func`` to its input."""

    def __init__(self, **kwargs):
        # Keyword arguments are accepted and ignored.
        super().__init__()

    def forward(self, input_tensor):
        activated = Mish_func.apply(input_tensor)
        return activated
    
    
def replace_activations(model, existing_layer, new_layer):

    """Recursively swap every sub-module of exactly type ``existing_layer``.

    The swap is done in place and the same ``new_layer`` instance is put at
    every matching position. Returns ``model``.
    """

    registry = model._modules
    for key, child in reversed(registry.items()):
        has_children = next(child.children(), None) is not None
        if has_children:
            # Descend first; the recursive call hands the same child back.
            registry[key] = replace_activations(child, existing_layer, new_layer)

        if type(child) == existing_layer:
            registry[key] = new_layer
    return model

In [None]:
def get_image_embeddings1(image_paths, model_name = CFG.model_name):
    """Compute image embeddings with the PyTorch ``ShopeeModel``.

    Args:
        image_paths: array of image file paths, handed to ``ShopeeDataset``.
        model_name: timm architecture name handed to ``ShopeeModel``; for
            ``'eca_nfnet_l0'`` the SiLU activations are replaced by Mish
            before the weights are loaded.

    Returns:
        A numpy array with one embedding row per image path.
    """
    embeds = []

    model = ShopeeModel(model_name = model_name)

    if model_name == 'eca_nfnet_l0':
        model = replace_activations(model, torch.nn.SiLU, Mish())

    model.load_state_dict(torch.load(CFG.model_path))
    model = model.to(CFG.device)
    # Switch to eval mode once the final module tree is in place, so the
    # swapped-in activation modules are covered as well.
    model.eval()

    image_dataset = ShopeeDataset(image_paths=image_paths,transforms=get_test_transforms())
    image_loader = torch.utils.data.DataLoader(
        image_dataset,
        batch_size=CFG.batch_size,
        pin_memory=True,
        drop_last=False,
        num_workers=4
    )

    with torch.no_grad():
        for img,label in tqdm(image_loader):
            # Move the batch to the configured device (same one as the model)
            # instead of a hard-coded .cuda().
            img = img.to(CFG.device)
            label = label.to(CFG.device)
            feat = model(img,label)
            image_embeddings = feat.detach().cpu().numpy()
            embeds.append(image_embeddings)

    del model

    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings

In [None]:
def get_image_predictions(df, embeddings,threshold = 0.0):
    """Return, for every row, the posting ids of its neighbours within ``threshold``.

    A cosine nearest-neighbour index is fitted on ``embeddings`` and queried
    with the same ``embeddings``; neighbours whose distance is strictly below
    ``threshold`` are kept.

    Args:
        df: frame with a ``posting_id`` column, positionally aligned with
            ``embeddings``.
        embeddings: 2-D array with one embedding per row.
        threshold: distance cut-off. NOTE(review): with the default of 0.0
            and a strict ``<`` comparison, presumably no neighbour is ever
            selected - callers are expected to pass a real threshold.

    Returns:
        A list with one array of posting ids per row.
    """
    # At most 100 neighbours, but never more than there are rows (the former
    # fixed choice of 100 or 3 failed for inputs with 4-99 or fewer than 3 rows).
    KNN = min(100, len(df))

    model = NearestNeighbors(n_neighbors = KNN, metric = 'cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    predictions = []
    for k in tqdm(range(embeddings.shape[0])):
        idx = np.where(distances[k,] < threshold)[0]
        ids = indices[k,idx]
        posting_ids = df['posting_id'].iloc[ids].values
        predictions.append(posting_ids)

    del model, distances, indices
    gc.collect()
    return predictions

In [None]:
df_image,df_image_cu,image_paths = read_dataset()
df_image.head()

In [None]:
# Obtain the NFNet image embeddings.
if GET_CV :
    # CV run: reuse the cached embeddings unless a recompute was requested.
    nfnet_embedding_path = INPUT_DIR + 'external-embeddings-shopee-product-matching/image_embeddings_nfnet'
    if RECOMPUTE_NFNET_EMBEDDING :
        # .values: ShopeeDataset expects an array, not a pandas Series
        nfnet_embeddings = get_image_embeddings1(image_paths.values)
        # np.save appends ".npy" to a file name that lacks the extension.
        if KAGGLE_ENV : 
            np.save('image_embeddings_nfnet', nfnet_embeddings)
        else :
            np.save(nfnet_embedding_path, nfnet_embeddings)
    else :
        nfnet_embeddings = np.load(nfnet_embedding_path + '.npy')
else :
    # Submission run: always compute.
    nfnet_embeddings = get_image_embeddings1(image_paths.values)

In [None]:
## calculate image distances and get indices (distances are normalized here)
# nfnet_distances, nfnet_indices = get_distances_indices(nfnet_embeddings, KNN=KNN, normalize=False, metric='cosine')
nfnet_distances, nfnet_indices = get_distances_indices(nfnet_embeddings, KNN=KNN, normalize=True, metric='cosine')

## threshold for normalized image distances
# NOTE(review): the chosen threshold (0.045) is not among the candidates that
# np.arange(0.03, 0.05, 0.005) nominally generates; the candidate list is not
# used by get_neighbors_from_distance_indices anyway.
nfnet_thresholds = list(np.arange(0.03, 0.05, 0.005))
choosen_nfnet_threshold = 0.045

# Keep, for every posting, the neighbours closer than the chosen threshold.
df, nfnet_predictions = get_neighbors_from_distance_indices(df, nfnet_distances, nfnet_indices, thresholds=nfnet_thresholds, choosen_threshold=choosen_nfnet_threshold)

In [None]:
import cuml
from cuml.feature_extraction.text import TfidfVectorizer

def get_tfidf_embeddings(df_cu):
    """Fit a binary TF-IDF vectorizer on ``df_cu.title`` and return it dense.

    The vocabulary is capped at 25000 features and no stop words are removed.
    The shape of the resulting matrix is printed before it is returned.
    """
    vectorizer = TfidfVectorizer(
        stop_words=None,
        binary=True,
        use_idf=True,
        max_features=25000,
    )
    # Variant with bigrams, kept for reference:
    #     TfidfVectorizer(stop_words=None, binary=True, ngram_range=(1,2), use_idf=True, max_features=25000)
    sparse_matrix = vectorizer.fit_transform(df_cu.title)
    tfidf_embeddings = sparse_matrix.toarray()
    print('text embeddings shape',tfidf_embeddings.shape)
    # Drop the fitted model and the sparse intermediate before collecting.
    del vectorizer, sparse_matrix
    gc.collect()
    return tfidf_embeddings

# Dense TF-IDF matrix for the titles in df_cu (df_cu is defined outside this cell).
tfidf_embeddings = get_tfidf_embeddings(df_cu)

In [None]:
# Build two per-row TF-IDF match lists:
#   tfidf_preds     - matches above an adaptive similarity cut-off
#   tfidf_low_preds - matches above a fixed, loose cut-off of 0.1
# With CHECK_SUB hard-coded to True the cached branch below is never taken.
CHECK_SUB = True
if GET_CV and not CHECK_SUB:
    # NOTE(review): this branch only defines tfidf_preds; tfidf_low_preds is
    # assigned solely in the else branch, so anything that needs it would
    # fail if this path were ever enabled.
    # NOTE(review): allow_pickle=True unpickles the file's contents; only
    # load files from a trusted source.
    tfidf_preds_path = INPUT_DIR + 'external-embeddings-shopee-product-matching/tfidf_preds.npy'
    tfidf_preds = np.load(tfidf_preds_path, allow_pickle=True)
else :
    tfidf_preds = []
    tfidf_low_preds = []
    # Number of query rows compared against the full matrix per step.
    CHUNK = 1024*4

    print('Finding similar titles...')
    # Number of chunks, rounded up so a partial last chunk is included.
    CTS = len(df_cu)//CHUNK
    if len(df_cu)%CHUNK!=0: CTS += 1
    for j in range( CTS ):

        # Half-open row range [a, b) of this chunk, clipped to the frame length.
        a = j*CHUNK
        b = (j+1)*CHUNK
        b = min(b,len(df_cu))
        print('chunk',a,'to',b)

        # COSINE SIMILARITY DISTANCE
        # Dot products of every chunk row with every row; after the transpose
        # cts[k] holds the scores of chunk row k against all rows.
        # (This is a cosine similarity presuming the TF-IDF rows are
        # L2-normalised - TODO confirm the vectorizer's norm setting.)
        cts = cupy.matmul(tfidf_embeddings, tfidf_embeddings[a:b].T).T

        for k in range(b-a):
            llll = cts[k,]
            o = np.array([])
            # Lower the cut-off from 0.775 in steps of 0.02 (staying above
            # 0.5) and re-query only while at most one match has been found;
            # once two or more rows match, the later iterations do nothing.
            for ii in np.arange(0.775,0.50, -0.02):
                if ii>0.5 and o.shape[0] <= 1:
                    IDX = cupy.where(llll>ii)[0]
                    # NOTE(review): posting ids are read from `df` here but
                    # from `df_cu` in the loop below - assumes both frames
                    # share the same row order; confirm.
                    o = df.iloc[cupy.asnumpy(IDX)].posting_id.values
            tfidf_preds.append(o)
            
        # Second pass over the same chunk with the fixed loose cut-off (> 0.1).
        for k in range(b-a):
            IDX = cupy.where(cts[k,]>0.1)[0]
            o = df_cu.iloc[cupy.asnumpy(IDX)].posting_id.to_pandas().values
            tfidf_low_preds.append(cupy.asnumpy(o))

    # The matrix is not used again in this cell; drop it before collecting.
    del tfidf_embeddings

    gc.collect()

In [None]:
# Store the per-row TF-IDF matches on df_cu as column 'oof_text'.
df_cu['oof_text'] = tfidf_preds

## Submission

In [None]:
# Switches read by the submission cells below.
# USE_PHASH_PREDICTION: seed the result from the phash predictions instead of
# the image predictions.
USE_PHASH_PREDICTION = False

# AGGREGATE_*: merge a source with aggregate_predictions() instead of the
# row-wise combine_predictions().
AGGREGATE_IMAGE_PREDICTION = False
AGGREGATE_TEXT_PREDICTION = False

# Attach every prediction source to df as its own column.
# NOTE(review): the next line assigns the column to itself and therefore does
# not change the data; it only requires the column to exist already.
df['phash_predictions'] = df['phash_predictions']
# image_predictions / text_predictions are produced outside this section.
df['image_predictions'] = image_predictions
df['text_predictions'] = text_predictions
# df['tfidf_predictions'] = df_cu['oof_text'].to_pandas().values
df['tfidf_predictions'] = tfidf_preds
df['nfnet_predictions'] = nfnet_predictions

### TFIDF 

In [None]:
# TF-IDF-only result. In CV mode it is written to "pred_matches" so the
# ground-truth "matches" column stays available for scoring.
submission_column = "pred_matches" if GET_CV else "matches"

# Optional extra source, kept for reference:
# df[submission_column] = df.apply(lambda x: combine_predictions(x[submission_column],x['text_predictions']), axis=1)

df[submission_column] = df['tfidf_predictions'].apply(reformat_labels)

if GET_CV:
    # Row-wise F1 against the ground truth, then its mean.
    df['f1'] = f1_score(df['matches'], df['pred_matches'])
    score = df['f1'].mean()
    print(f'Our final f1 cv score is {score}')


**TFIDF Baseline**

Our final f1 cv score is 0.6470817084617556


### Aggregate Phash, Then Combine with Image + Text

Note: I tried using image-only or text-only features, but each performed worse than combining them.

In [None]:
# Merge every prediction source into one column. In CV mode the result goes
# to "pred_matches" so the ground-truth "matches" column is left untouched.
submission_column = "pred_matches" if GET_CV else "matches"

# Seed column: aggregated phash predictions, or the plain image predictions.
if USE_PHASH_PREDICTION:
    df[submission_column] = aggregate_predictions(df['phash_predictions'], df['phash_predictions'])
else:
    df[submission_column] = df['image_predictions']

# Sources that are either aggregated or combined row by row, per flag.
for _aggregate, _column in (
    (AGGREGATE_IMAGE_PREDICTION, 'image_predictions'),
    (AGGREGATE_TEXT_PREDICTION, 'text_predictions'),
):
    if _aggregate:
        df[submission_column] = aggregate_predictions(df[submission_column], df[_column])
    else:
        df[submission_column] = df.apply(lambda x: combine_predictions(x[submission_column], x[_column]), axis=1)

# Sources that are always combined row by row.
for _column in ('tfidf_predictions', 'nfnet_predictions'):
    df[submission_column] = df.apply(lambda x: combine_predictions(x[submission_column], x[_column]), axis=1)

df[submission_column] = df[submission_column].apply(reformat_labels)

if GET_CV:
    df['f1'] = f1_score(df['matches'], df['pred_matches'])
    score = df['f1'].mean()
    print(f'Our final f1 cv score is {score}')

# df[['posting_id', 'matches']].to_csv('submission.csv', index = False)

## F1 Score Analysis

In [None]:
# df[df["f1"] < 1][["matches", "phash_predictions", "image_predictions", "text_predictions", "pred_matches", "f1"]]

In [None]:
def len_token(tokens) :
    """Count the pieces obtained by splitting ``tokens`` on single spaces.

    An empty string counts as one piece, and consecutive spaces produce
    empty pieces that are counted too.
    """
    # Splitting on " " always yields one more piece than there are spaces.
    space_count = tokens.count(" ")
    return space_count + 1

def f1_analysis(df) :
    """Break down the imperfect rows by predicted versus true match count.

    Only rows with ``f1 < 1`` are considered. For each of them the number of
    space-separated ids in ``pred_matches`` is compared with the number in
    ``matches`` (same counting rule as ``len_token``: spaces + 1), and the
    three counts (equal / fewer / more predicted ids) are printed.

    Args:
        df: frame with the columns ``f1``, ``pred_matches`` and ``matches``;
            the latter two hold space-separated strings. It is not modified.

    Returns:
        ``(less_than, greater_than)``: two boolean Series indexed like the
        imperfect rows, marking rows with too few / too many predicted ids.
    """
    errors = df[df["f1"] < 1]

    # Compute the lengths as standalone Series instead of adding columns to
    # the filtered slice, which would trigger pandas' chained-assignment
    # (SettingWithCopy) warning.
    len_pred = errors["pred_matches"].str.count(" ") + 1
    len_label = errors["matches"].str.count(" ") + 1

    equal = len_pred == len_label

    ## prediction is shorter than the label
    ## example pred="train_1" ; label="train_1 train_2"
    less_than = len_pred < len_label

    ## prediction is longer than the label
    ## example pred="train_1 train_2 train_3" ; label="train_1 train_2"
    greater_than = len_pred > len_label

    print(f"Error which length pred equal to length match: {equal.sum()}")
    print(f"Error which length pred less than length match: {less_than.sum()}")
    print(f"Error which length pred greater than length match: {greater_than.sum()}")

    return less_than, greater_than

In [None]:
# Print the error breakdown for the current predictions and keep the two
# boolean masks (too few / too many predicted ids) for inspection.
less_than, greater_than = f1_analysis(df)

I think it is better to find a threshold at which the number of errors from these two cases (too few and too many predicted matches) is balanced.

In [None]:
# df[["label_group", "matches", "pred_matches", "f1"]][less_than].sort_values(by=["f1"])

In [None]:
# df[["label_group", "matches", "pred_matches", "f1"]][greater_than].sort_values(by=["f1"])

In [None]:
# Neighbour distances/indices for each of the three embedding sets.
image_distances, image_indices = get_distances_indices(
    image_embeddings,
    KNN=KNN,
    normalize=NORMALIZE_IMAGE_DISTANCE,
)
text_distances, text_indices = get_distances_indices(
    text_embeddings,
    KNN=KNN,
    normalize=NORMALIZE_TEXT_DISTANCE,
)
# NOTE(review): the earlier NFNet search passed normalize=True and
# metric='cosine'; this call uses NORMALIZE_IMAGE_DISTANCE and the default
# metric instead - confirm that the difference is intended.
nfnet_distances, nfnet_indices = get_distances_indices(
    nfnet_embeddings,
    KNN=KNN,
    normalize=NORMALIZE_IMAGE_DISTANCE,
)

In [None]:
# GROUP = 2008989859
# # N = 24839
# N = df.groupby("label_group").get_group(GROUP).index.values[0]
# df.groupby("label_group").get_group(GROUP)[[ "f1", "matches", "pred_matches", "image_predictions", "text_predictions", "title"]]

In [None]:
# print(f"Image Threshold: {choosen_image_threshold}")
# print(f"Text Threshold: {choosen_text_threshold}")

In [None]:
def print_pair(pair, threshold):
    """Print the ``(index, distance)`` pairs below and above ``threshold``.

    ``pair`` is iterated once per section, and each item must unpack into
    exactly two values. Pairs whose distance equals ``threshold`` appear in
    neither section.
    """
    sections = (
        ("Predicted Less Than Threshold", lambda distance: distance < threshold),
        ("Predicted Greater Than Threshold", lambda distance: distance > threshold),
    )
    for title, belongs in sections:
        print()
        print(title)
        print([(index, distance) for (index, distance) in pair if belongs(distance)])

# ## True label
# print("True Label")
# print(df.groupby("label_group").get_group(GROUP).index.values)

# ## Label predicted from images      
# for indexs, distances in zip(image_indices[N:N+1], image_distances[N:N+1]) :
#     pair = sorted(zip(indexs, distances), key = lambda x: x[1])
#     print_pair(pair, choosen_image_threshold)

In [None]:
# ## True label
# print("True Label")
# print(df.groupby("label_group").get_group(GROUP).index.values)


# ## Label predicted from texts
# for i, d in zip(text_indices[N:N+1], text_distances[N:N+1]) :
#     pair = sorted(zip(i, d), key = lambda x: x[1])
#     print_pair(pair, choosen_text_threshold)

## Combine Image and Text Distance

In [None]:
def list_intersection(a, b) :
    """Return the distinct elements present in both ``a`` and ``b`` as a list.

    Both inputs are converted to sets, so duplicates are dropped and the
    order of the result is unspecified.
    """
    left, right = set(a), set(b)
    shared = left & right
    return list(shared)

def _shared_neighbor_posting_ids(df, image_distances, image_indices, image_cutoff, text_distances, text_indices, text_cutoff):
    """Per row, posting ids of the neighbours that clear both cut-offs."""
    per_row = []
    for row in range(image_indices.shape[0]):
        image_hits = image_indices[row, np.where(image_distances[row,] < image_cutoff)[0]]
        text_hits = text_indices[row, np.where(text_distances[row,] < text_cutoff)[0]]
        common = list_intersection(image_hits, text_hits)
        per_row.append(df['posting_id'].iloc[common].values)
    return per_row


def get_neighbors_from_combined_image_text_distance_indices(df, image_distances, image_indices, choosen_image_threshold, image_multipliers, choosen_image_multiplier, text_distances, text_indices, choosen_text_threshold, text_multipliers, choosen_text_multiplier, PARAMETER_SEARCH) :
    """Predict matches that are neighbours in BOTH of two embedding spaces.

    A candidate is kept for a row only if it lies below the (threshold *
    multiplier) cut-off in the first distance matrix and also below the
    corresponding cut-off in the second one.

    When the global ``GET_CV`` and ``PARAMETER_SEARCH`` are both truthy,
    every combination of ``image_multipliers`` x ``text_multipliers`` is
    scored first (mean row-wise F1 against ``df['matches']``) and the best
    one is printed. The returned predictions always use the ``choosen_*``
    multipliers, whatever the search printed.

    Returns:
        ``(df, predictions)``: the frame (re-created without the temporary
        ``pred_matches``/``f1`` columns if the search ran) and a list with
        one array of posting ids per row.
    """
    ## Interactive helper: try several multipliers to maximise the CV score,
    ## then hard-code the preferred ones through the choosen_* arguments.
    if GET_CV and PARAMETER_SEARCH:
        keys = []
        scores = []
        for im in image_multipliers:
            for tm in text_multipliers:
                trial = _shared_neighbor_posting_ids(
                    df,
                    image_distances, image_indices, choosen_image_threshold * im,
                    text_distances, text_indices, choosen_text_threshold * tm,
                )
                df['pred_matches'] = trial
                df['pred_matches'] = df['pred_matches'].apply(reformat_labels)
                df['f1'] = f1_score(df['matches'], df['pred_matches'])
                score = df['f1'].mean()
                print("Our f1 score for im-{:.2f} and tm-{:.2f} is {:.3f}".format(im, tm, score))
                keys.append(f'{im}-{tm}')
                scores.append(score)
                # Remove the scratch columns before the next combination.
                df = df.drop(columns=['pred_matches','f1'])
                del trial
                gc.collect()

        summary = pd.DataFrame({'keys': keys, 'scores': scores})
        winners = summary[summary['scores'] == summary['scores'].max()]
        best_key = winners['keys'].values[0]
        best_score = winners['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_key}')

    # Final pass with the chosen multipliers (the original note says the
    # cut-offs are picked with the ~70K-image test set in mind).
    predictions = _shared_neighbor_posting_ids(
        df,
        image_distances, image_indices, choosen_image_threshold * choosen_image_multiplier,
        text_distances, text_indices, choosen_text_threshold * choosen_text_multiplier,
    )

    gc.collect()
    return df, predictions

In [None]:
# Recompute the neighbour distances/indices for the three embedding sets so
# the combination cells below can be run on their own.
image_distances, image_indices = get_distances_indices(
    image_embeddings,
    KNN=KNN,
    normalize=NORMALIZE_IMAGE_DISTANCE,
)
text_distances, text_indices = get_distances_indices(
    text_embeddings,
    KNN=KNN,
    normalize=NORMALIZE_TEXT_DISTANCE,
)
nfnet_distances, nfnet_indices = get_distances_indices(
    nfnet_embeddings,
    KNN=KNN,
    normalize=NORMALIZE_IMAGE_DISTANCE,
)

In [None]:
# Whether the combination functions should grid-search the multipliers
# (only honoured where this flag is passed explicitly).
PARAMETER_SEARCH = False

# For each embedding source: the multipliers to try during a search, and the
# multiplier applied to that source's chosen threshold for the final result.
image_multipliers = list(np.arange(1.3, 1.7, 0.1))
choosen_image_multiplier = 1.5

text_multipliers = list(np.arange(1.3, 1.7, 0.1))
choosen_text_multiplier = 1.5

nfnet_multipliers = list(np.arange(1.3, 1.7, 0.1))
choosen_nfnet_multiplier = 1.5

In [None]:
# Matches that are neighbours in both the image and the text embedding space.
df, image_text_predictions = get_neighbors_from_combined_image_text_distance_indices(
    df,
    image_distances=image_distances,
    image_indices=image_indices,
    choosen_image_threshold=choosen_image_threshold,
    image_multipliers=image_multipliers,
    choosen_image_multiplier=choosen_image_multiplier,
    text_distances=text_distances,
    text_indices=text_indices,
    choosen_text_threshold=choosen_text_threshold,
    text_multipliers=text_multipliers,
    choosen_text_multiplier=choosen_text_multiplier,
    PARAMETER_SEARCH=PARAMETER_SEARCH,
)

In [None]:
# Matches that are neighbours in both the NFNet and the text embedding space.
# The NFNet arrays are passed through the function's image_* parameters.
df, nfnet_text_predictions = get_neighbors_from_combined_image_text_distance_indices(
    df,
    image_distances=nfnet_distances,
    image_indices=nfnet_indices,
    choosen_image_threshold=choosen_nfnet_threshold,
    image_multipliers=nfnet_multipliers,
    choosen_image_multiplier=choosen_nfnet_multiplier,
    text_distances=text_distances,
    text_indices=text_indices,
    choosen_text_threshold=choosen_text_threshold,
    text_multipliers=text_multipliers,
    choosen_text_multiplier=choosen_text_multiplier,
    PARAMETER_SEARCH=PARAMETER_SEARCH,
)

In [None]:
# Matches that are neighbours in both image embedding spaces.
# The NFNet arrays are passed through the function's text_* parameters.
df, image_nfnet_predictions = get_neighbors_from_combined_image_text_distance_indices(
    df,
    image_distances=image_distances,
    image_indices=image_indices,
    choosen_image_threshold=choosen_image_threshold,
    image_multipliers=image_multipliers,
    choosen_image_multiplier=choosen_image_multiplier,
    text_distances=nfnet_distances,
    text_indices=nfnet_indices,
    choosen_text_threshold=choosen_nfnet_threshold,
    text_multipliers=nfnet_multipliers,
    choosen_text_multiplier=choosen_nfnet_multiplier,
    PARAMETER_SEARCH=PARAMETER_SEARCH,
)

In [None]:
def list_intersection(a, b) :
    """Return the distinct elements common to ``a`` and ``b`` as a list.

    The inputs are turned into sets first, so duplicates disappear and the
    order of the result is unspecified.
    """
    first, second = set(a), set(b)
    return list(first & second)

def _tfidf_neighbor_matches(row_distances, row_indices, cutoff, posting_ids, tfidf_ids):
    """Posting ids of one row's neighbours below ``cutoff`` that are also in ``tfidf_ids``."""
    neighbor_positions = row_indices[np.where(row_distances < cutoff)[0]]
    # The TF-IDF candidates are posting ids, so the neighbour row positions
    # must be translated to posting ids before the two sets are intersected.
    neighbor_posting_ids = posting_ids[neighbor_positions]
    return list(set(tfidf_ids) & set(neighbor_posting_ids))


def get_predictions_from_combined_with_tfidf(df, tfidf_low_preds, text_distances, text_indices, choosen_text_threshold, text_multipliers, choosen_text_multiplier, PARAMETER_SEARCH=True) :
    """Predict matches that are both loose TF-IDF matches and embedding neighbours.

    For every row, the neighbours whose distance is below
    ``choosen_text_threshold * multiplier`` are intersected with that row's
    loose TF-IDF candidates.

    Args:
        df: frame with a ``posting_id`` column (plus ``matches`` when the
            search runs), in the same row order as the distance matrices.
        tfidf_low_preds: per row, a collection of candidate posting ids.
        text_distances, text_indices: neighbour distances and row positions
            of any embedding source (text, image or NFNet).
        choosen_text_threshold: base distance cut-off of that source.
        text_multipliers: multipliers scored during the search.
        choosen_text_multiplier: multiplier used for the returned result.
        PARAMETER_SEARCH: if truthy and the global ``GET_CV`` is truthy,
            score every multiplier (mean row-wise F1) and print the best.
            During the search the first iteration writes the scratch columns
            ``pred_matches``/``f1`` onto the caller's frame.

    Returns:
        A list with one object array of posting ids per row.
    """
    posting_ids = df['posting_id'].to_numpy()
    # Size the loops from the arrays that were passed in, not from the
    # notebook-level image_indices.
    n_rows = text_indices.shape[0]

    ## Interactive helper: try several multipliers to maximise the CV score,
    ## then hard-code the preferred one through choosen_text_multiplier.
    if PARAMETER_SEARCH and GET_CV:
        scores = []
        keys = []
        for tm in text_multipliers:
            cutoff = choosen_text_threshold * tm
            predictions = [
                _tfidf_neighbor_matches(text_distances[k,], text_indices[k,], cutoff, posting_ids, tfidf_low_preds[k])
                for k in range(n_rows)
            ]
            df['pred_matches'] = predictions
            df['pred_matches'] = df['pred_matches'].apply(reformat_labels)
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print("Our f1 score for tm-{:.2f} is {:.3f}".format(tm, score))
            scores.append(score)
            keys.append(f'{tm}')
            df = df.drop(columns=['pred_matches','f1'])
            del predictions
            gc.collect()

        keys_scores = pd.DataFrame({'keys': keys, 'scores': scores})
        max_score = keys_scores[keys_scores['scores'] == keys_scores['scores'].max()]
        best_key = max_score['keys'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_key}')

    # Final pass with the chosen multiplier. It intersects posting ids with
    # posting ids, exactly like the search above; comparing the TF-IDF
    # posting ids with raw row positions can never match and left every
    # prediction empty.
    cutoff = choosen_text_threshold * choosen_text_multiplier
    predictions = [
        np.array(
            _tfidf_neighbor_matches(text_distances[k,], text_indices[k,], cutoff, posting_ids, tfidf_low_preds[k]),
            dtype=object,
        )
        for k in range(n_rows)
    ]

    gc.collect()
    return predictions

In [None]:
# Intersect the loose TF-IDF candidates with the neighbours of each embedding
# source. PARAMETER_SEARCH is not passed, so the function's default applies.
tfidf_text_predictions = get_predictions_from_combined_with_tfidf(
    df,
    tfidf_low_preds,
    text_distances=text_distances,
    text_indices=text_indices,
    choosen_text_threshold=choosen_text_threshold,
    text_multipliers=text_multipliers,
    choosen_text_multiplier=choosen_text_multiplier,
)
tfidf_nfnet_predictions = get_predictions_from_combined_with_tfidf(
    df,
    tfidf_low_preds,
    text_distances=nfnet_distances,
    text_indices=nfnet_indices,
    choosen_text_threshold=choosen_nfnet_threshold,
    text_multipliers=nfnet_multipliers,
    choosen_text_multiplier=choosen_nfnet_multiplier,
)
tfidf_image_predictions = get_predictions_from_combined_with_tfidf(
    df,
    tfidf_low_preds,
    text_distances=image_distances,
    text_indices=image_indices,
    choosen_text_threshold=choosen_image_threshold,
    text_multipliers=image_multipliers,
    choosen_text_multiplier=choosen_image_multiplier,
)

In [None]:
# Merge image_text_predictions with aggregate_predictions() instead of the
# row-wise combine_predictions() in the submission cell below.
AGGREGATE_IMAGE_TEXT_PREDICTION = False

# Pairwise intersections of two embedding sources.
df["image_text_predictions"] = image_text_predictions
df["nfnet_text_predictions"] = nfnet_text_predictions
df["image_nfnet_predictions"] = image_nfnet_predictions

# Intersections of the loose TF-IDF candidates with one embedding source.
df["tfidf_image_predictions"] = tfidf_image_predictions
df["tfidf_text_predictions"] = tfidf_text_predictions
df["tfidf_nfnet_predictions"] = tfidf_nfnet_predictions

In [None]:
# Final merge of every prediction source, followed by the submission file.
# In CV mode the result goes to "pred_matches" so the ground-truth "matches"
# column is left untouched.
submission_column = "pred_matches" if GET_CV else "matches"

# Seed column: aggregated phash predictions, or the plain image predictions.
if USE_PHASH_PREDICTION:
    df[submission_column] = aggregate_predictions(df['phash_predictions'], df['phash_predictions'])
else:
    df[submission_column] = df['image_predictions']

# Sources that are either aggregated or combined row by row, per flag.
for _aggregate, _column in (
    (AGGREGATE_IMAGE_PREDICTION, 'image_predictions'),
    (AGGREGATE_TEXT_PREDICTION, 'text_predictions'),
    (AGGREGATE_IMAGE_TEXT_PREDICTION, 'image_text_predictions'),
):
    if _aggregate:
        df[submission_column] = aggregate_predictions(df[submission_column], df[_column])
    else:
        df[submission_column] = df.apply(lambda x: combine_predictions(x[submission_column], x[_column]), axis=1)

# Sources that are always combined row by row, in this order.
for _column in (
    'nfnet_text_predictions',
    'image_nfnet_predictions',
    'tfidf_predictions',
    'nfnet_predictions',
    'tfidf_text_predictions',
    'tfidf_nfnet_predictions',
    'tfidf_image_predictions',
):
    df[submission_column] = df.apply(lambda x: combine_predictions(x[submission_column], x[_column]), axis=1)

df[submission_column] = df[submission_column].apply(reformat_labels)

if GET_CV:
    df['f1'] = f1_score(df['matches'], df['pred_matches'])
    score = df['f1'].mean()
    print(f'Our final f1 cv score is {score}')

# Always written from the "matches" column, in CV mode as well.
df[['posting_id', 'matches']].to_csv('submission.csv', index = False)

In [None]:
# Error breakdown for the final combined predictions (requires the 'f1' and
# 'pred_matches' columns that the previous cell writes in CV mode).
less_than, greater_than = f1_analysis(df)

**Baseline 0.740**

Our final f1 cv score is 0.9144889569967541

Error which length pred equal to length match: 993

Error which length pred less than length match: 2650

Error which length pred greater than length match: 8816


**Baseline 0.740**

Our final f1 cv score is 0.9373126574577614

Error which length pred equal to length match: 295

Error which length pred less than length match: 2374

Error which length pred greater than length match: 6575


**Baseline 0.740**

Our final f1 cv score is 0.9249745345112435

Error which length pred equal to length match: 410

Error which length pred less than length match: 3844

Error which length pred greater than length match: 6281


**Baseline 0.737**

Our final f1 cv score is 0.9282814981347284

Error which length pred equal to length match: 401

Error which length pred less than length match: 3825

Error which length pred greater than length match: 6163

## TODO
- combine the distances for the lower parameter, e.g. by multiplying the text distance by the image distance
- use image embedding to compare with embedding from the training data
- use IndoBERT instead of English