# Abstract

Hi all, I've added some functions to this notebook (including `threshold_searching`). To check the LB or CV score, simply **Copy and Edit** this notebook and then **Commit** it. Enjoy exploring!<br>
Also, don't forget to use `threshold_searching` to find the best neighbor thresholds.

To avoid spending time regenerating the image embeddings, use the pre-saved embeddings instead. You can use this dataset directly: [Shopee - Price Match Guarantee| Embeddings](https://www.kaggle.com/chienhsianghung/shopee-price-match-guarantee-embeddings).<br>
I've also done some experiments on KNN and Cosine Similarity in the previous notebook. Click here to see the result: [Shopee| text, img Embedding (Colab enabled)](https://www.kaggle.com/chienhsianghung/shopee-text-img-embedding-colab-enabled).

## Notes

* By turning on `NEIGHBORS_SEARCHING` and using the `threshold_searching` function, you may get a higher F1 score.
* Use [this dataset](https://www.kaggle.com/chienhsianghung/shopee-price-match-guarantee-embeddings) to save your time.
* Change `THRES_METH` (`BOOM`, `BOOM_OPTIMIZED`, `THRES` or `THRES_OPTIMIZED`) to choose which threshold preset is used when testing on the LB.
* Special thanks to @vatsalmavani
  * This [notebook](https://www.kaggle.com/vatsalmavani/eff-b4-tfidf-0-728) works with any EfficientNet(B0 - B7) Model.
  * Training Notebook for EfficientNet-B4 can be found [here](https://www.kaggle.com/vatsalmavani/shopee-training-eff-b4)

In [None]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

In [None]:
import os
import cv2
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import albumentations
from albumentations.pytorch.transforms import ToTensorV2

import torch
import timm
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader

import gc
import matplotlib.pyplot as plt
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml import PCA
from cuml.neighbors import NearestNeighbors

# Config

In [None]:
# ---- Run-mode switches -----------------------------------------------------
COMPUTE_CV = True            # score on train.csv when only the tiny public test.csv is present
NEIGHBORS_SEARCHING = False  # run threshold_searching instead of using a preset
SAVE_IMGEMBEDDING = False    # dump freshly computed image embeddings to CSV
BASELINE_CHECKING = False    # also build the baseline (pre-search) submission
THRES_METH = 'BOOM' # BOOM, BOOM_OPTIMIZED, THRES, THRES_OPTIMIZED
FIXING = True                # selects the TF-IDF (stop_words, max_features) pair below
PHASH = True                 # add image_phash groups to the final matches

# More than 3 rows in test.csv means the full test set is mounted,
# so CV scoring is switched off.
df = pd.read_csv('../input/shopee-product-matching/test.csv')
COMPUTE_CV = COMPUTE_CV and not len(df) > 3
print(
    'this submission notebook will compute CV score but commit notebook will not'
    if COMPUTE_CV
    else 'this submission notebook will only be used to submit result'
)

# [stop_words, max_features] handed to the TF-IDF vectorizer.
NONE25000 = [None, 25000] if FIXING else ['english', 25_000]

class CFG:
    """Static settings for the image-embedding model and its data loader."""
    seed = 54  # handed to seed_torch() right after its definition
    classes = 11014  # default n_classes of ShopeeModel (size of the ArcFace head)
    scale = 30  # default ArcFace logit scale of ShopeeModel
    margin = 0.5  # default ArcFace angular margin of ShopeeModel
    model_name = 'tf_efficientnet_b4'  # timm architecture name for the backbone
    fc_dim = 512  # width of ShopeeModel's optional FC layer
    img_size = 512  # images are resized to img_size x img_size in get_test_transforms()
    batch_size = 20  # DataLoader batch size in get_image_embeddings()
    num_workers = 4  # DataLoader worker count in get_image_embeddings()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'  # device the image model is moved to
    model_path = '../input/utils-shopee/arcface_512x512_tf_efficientnet_b4_LR.pt'  # checkpoint loaded by get_image_embeddings()

# Utils

In [None]:
def read_dataset(COMPUTE_CV):
    """Load the competition table for the selected mode.

    Reads train.csv when COMPUTE_CV is truthy, test.csv otherwise.

    Returns:
        (df, df_cu, image_paths): the pandas frame, a cuDF copy of it, and a
        pandas Series with the image directory prefixed to the `image` column.
    """
    if COMPUTE_CV:
        csv_file = '../input/shopee-product-matching/train.csv'
        image_dir = '../input/shopee-product-matching/train_images/'
    else:
        csv_file = '../input/shopee-product-matching/test.csv'
        image_dir = '../input/shopee-product-matching/test_images/'

    df = pd.read_csv(csv_file)
    return df, cudf.DataFrame(df), image_dir + df['image']

In [None]:
def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: integer seed applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not only the current one (ignored without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes kernels per run and can pick different
    # algorithms, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False
    
# Seed every RNG once, using the seed configured in CFG.
seed_torch(CFG.seed)

In [None]:
def combine_predictions(row):
    """Return the sorted, de-duplicated image+text matches of a row as one space-separated string."""
    sources = ('image_predictions', 'text_predictions')
    merged = np.concatenate([row[name] for name in sources])
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)

def combine_predictions_phash(row):
    """Return the sorted, de-duplicated union of image, TF-IDF, phash and BERT matches as one string."""
    sources = (
        'image_predictions',
        'text_predictions',
        'phash_predictions',
        'text_predictions_BERT',
    )
    merged = np.concatenate([row[name] for name in sources])
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)

def combine_for_cv(row):
    """Return the sorted, de-duplicated image+text matches of a row as an array (for CV scoring)."""
    sources = ('image_predictions', 'text_predictions')
    merged = np.concatenate([row[name] for name in sources])
    return np.unique(merged)

def combine_for_cv_phash(row):
    """Return the sorted, de-duplicated union of image, TF-IDF, phash and BERT matches as an array."""
    sources = (
        'image_predictions',
        'text_predictions',
        'phash_predictions',
        'text_predictions_BERT',
    )
    merged = np.concatenate([row[name] for name in sources])
    return np.unique(merged)

# Image Predictions

In [None]:
# Create Model

class ArcMarginProduct(nn.Module):
    """ArcFace-style margin head: cosine logits with an additive angular margin on the target class.

    Args:
        in_features: dimensionality of the input embeddings.
        out_features: number of classes.
        scale: factor the final logits are multiplied by.
        margin: angular margin (radians) added to the target-class angle.
        easy_margin: if True, apply the margin only where cosine > 0.
        ls_eps: label-smoothing epsilon applied to the one-hot targets.
    """

    def __init__(self, in_features, out_features, scale=30.0, margin=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        self.ls_eps = ls_eps
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        self.th = math.cos(math.pi - margin)
        self.mm = math.sin(math.pi - margin) * margin

    def forward(self, input, label):
        """Return `(scaled_logits, cross_entropy_loss)` for a batch.

        Args:
            input: (batch, in_features) embeddings.
            label: (batch,) integer class indices.
        """
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # Rounding can push cosine**2 marginally above 1; clamp so sqrt never yields NaN.
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(min=0.0))
        # cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)

        # Build the one-hot mask on the same device/dtype as the logits
        # instead of assuming CUDA, so the head also works on CPU.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features

        # Margin-adjusted logit for the target class, plain cosine elsewhere.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output = output * self.scale

        return output, nn.CrossEntropyLoss()(output, label)


class ShopeeModel(nn.Module):
    """timm backbone with average pooling, an optional FC/BN neck and an ArcFace head.

    In training mode `forward` returns the ArcFace head output; in eval mode
    it returns the feature vector produced by `extract_features`.
    """

    def __init__(
        self,
        n_classes = CFG.classes,
        model_name = CFG.model_name,
        fc_dim = CFG.fc_dim,
        margin = CFG.margin,
        scale = CFG.scale,
        use_fc = True,
        pretrained = True):

        super().__init__()
        print('Building Model Backbone for {} model'.format(model_name))

        # Strip the backbone's own classifier and pooling so that it emits
        # the raw feature map; pooling is applied explicitly afterwards.
        backbone = timm.create_model(model_name, pretrained=pretrained)
        feature_dim = backbone.classifier.in_features
        backbone.classifier = nn.Identity()
        backbone.global_pool = nn.Identity()

        self.backbone = backbone
        self.pooling = nn.AdaptiveAvgPool2d(1)
        self.use_fc = use_fc

        if use_fc:
            self.dropout = nn.Dropout(p=0.1)
            self.classifier = nn.Linear(feature_dim, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self._init_params()
            feature_dim = fc_dim

        self.final = ArcMarginProduct(
            feature_dim,
            n_classes,
            scale=scale,
            margin=margin,
            easy_margin=False,
            ls_eps=0.0,
        )

    def _init_params(self):
        """Xavier-initialise the FC layer and reset the batch-norm affine parameters."""
        nn.init.xavier_normal_(self.classifier.weight)
        for tensor, value in (
            (self.classifier.bias, 0),
            (self.bn.weight, 1),
            (self.bn.bias, 0),
        ):
            nn.init.constant_(tensor, value)

    def forward(self, image, label):
        """Return the ArcFace head output when training, the feature vector otherwise."""
        features = self.extract_features(image)
        if not self.training:
            return features
        return self.final(features, label)

    def extract_features(self, x):
        """Pool the backbone feature map into one vector per image."""
        n_images = x.shape[0]
        feature_map = self.backbone(x)
        pooled = self.pooling(feature_map).view(n_images, -1)

        # NOTE(review): the dropout/FC/BN neck runs only in training mode, so
        # eval-mode embeddings are the pooled backbone features. Confirm this
        # is intended before changing it; distance thresholds depend on it.
        if not (self.use_fc and self.training):
            return pooled
        return self.bn(self.classifier(self.dropout(pooled)))

In [None]:
def get_image_neighbors(df, embeddings, KNN=100, threshold=4.5):
    """Collect, for each row, the posting_ids of neighbours closer than `threshold`.

    Args:
        df: frame with a `posting_id` column, row-aligned with `embeddings`.
        embeddings: 2-D array with one embedding per row of `df`.
        KNN: maximum number of neighbours to query per row; clamped to the
            number of rows so small inputs do not request more neighbours
            than exist.
        threshold: strict upper bound on neighbour distance.

    Returns:
        (df, predictions) where predictions[k] is an array of posting_ids.
    """
    n_samples = embeddings.shape[0]
    if n_samples == 0:
        # Nothing to fit or query.
        return df, []

    model = NearestNeighbors(n_neighbors=min(KNN, n_samples))
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Hoisted out of the loop: one array lookup per row instead of a pandas
    # .iloc call per row.
    posting_ids = df['posting_id'].values

    predictions = []
    for k in tqdm(range(n_samples)):
        close = np.where(distances[k,] < threshold)[0]
        predictions.append(posting_ids[indices[k, close]])

    return df, predictions

In [None]:
def get_test_transforms():
    """Build the inference-time pipeline: resize to CFG.img_size, normalise, convert to tensor."""
    side = CFG.img_size
    steps = [
        albumentations.Resize(side, side, always_apply=True),
        albumentations.Normalize(),
        ToTensorV2(p=1.0),
    ]
    return albumentations.Compose(steps)

In [None]:
class ShopeeDataset(Dataset):
    """Image dataset yielding `(image, dummy_label)` pairs for embedding extraction.

    Args:
        image_paths: indexable sequence of image file paths.
        transforms: optional callable invoked as `transforms(image=...)` that
            returns a mapping with an `'image'` entry.
    """

    def __init__(self, image_paths, transforms=None):
        self.image_paths = image_paths
        self.augmentations = transforms

    def __len__(self):
        # len() works for arrays, lists and Series alike (not only objects with .shape).
        return len(self.image_paths)

    def __getitem__(self, index):
        image_path = self.image_paths[index]
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None; fail with the
            # offending path rather than an opaque cvtColor assertion.
            raise FileNotFoundError(f'Could not read image: {image_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.augmentations:
            augmented = self.augmentations(image=image)
            image = augmented['image']

        # The label is a placeholder; the model ignores it in eval mode.
        return image, torch.tensor(1)

In [None]:
def get_image_embeddings(image_paths):
    """Run the trained ShopeeModel over all images and return their embeddings.

    Args:
        image_paths: array of image file paths.

    Returns:
        NumPy array with one embedding row per image, in input order.
    """
    # Use a single device everywhere so the function also runs when
    # CFG.device falls back to 'cpu'.
    device = torch.device(CFG.device)

    model = ShopeeModel(pretrained=False).to(device)
    model.load_state_dict(torch.load(CFG.model_path, map_location=device))
    model.eval()

    image_dataset = ShopeeDataset(image_paths=image_paths, transforms=get_test_transforms())
    image_loader = torch.utils.data.DataLoader(
        image_dataset,
        batch_size=CFG.batch_size,
        num_workers=CFG.num_workers
    )

    embeds = []
    with torch.no_grad():
        for img, label in tqdm(image_loader):
            features = model(img.to(device), label.to(device))
            embeds.append(features.detach().cpu().numpy())

    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    return image_embeddings

# Text Predictions

In [None]:
def get_text_embeddings(df_cu, max_features=NONE25000[1]):
    """Fit a binary TF-IDF vectorizer on the `title` column and return the dense matrix.

    NOTE: a later cell in this notebook defines another function with this
    same name (BERT embeddings), which shadows this one from that point on.
    """
    vectorizer = TfidfVectorizer(
        stop_words=NONE25000[0],
        binary=True,
        max_features=max_features,
    )
    tfidf = vectorizer.fit_transform(df_cu['title'])
    return tfidf.toarray()
    
    
def get_text_predictions(df, embeddings, max_features=NONE25000[1], threshold=0.75):
    """Return, per row, the posting_ids whose embedding dot product exceeds `threshold`.

    The similarity matrix is computed in chunks of 4096 rows.
    `max_features` is accepted but not used by this function.
    """
    print('Finding similar titles...')
    CHUNK = 1024 * 4
    total = len(df)

    preds = []
    for start in range(0, total, CHUNK):
        stop = min(start + CHUNK, total)
        print('chunk', start, 'to', stop)

        # COSINE SIMILARITY DISTANCE
        # Row k of `sims` holds the products of row (start + k) with every row.
        sims = cupy.matmul(embeddings, embeddings[start:stop].T).T
        for k in range(stop - start):
            hits = cupy.where(sims[k,] > threshold)[0]
            matched = df.iloc[cupy.asnumpy(hits)].posting_id.values
            preds.append(matched)

    return preds

# Data Load-in and Preparation

In [None]:
# Load the train or test table (chosen by COMPUTE_CV): pandas frame, cuDF copy and image paths.
df,df_cu,image_paths = read_dataset(COMPUTE_CV)
df.head()

In [None]:
# Image embeddings: computed with the model in submission mode, loaded from
# the pre-saved CSV in CV mode.
if not COMPUTE_CV:
    image_embeddings = get_image_embeddings(image_paths.values)
    # NOTE(review): saving only happens on this branch, i.e. for the test images.
    if SAVE_IMGEMBEDDING: np.savetxt('image_embeddings_tf_efficientnet_b4.csv', image_embeddings, delimiter=',')
else:
    image_embeddings = np.loadtxt('../input/shopee-price-match-guarantee-embeddings/image_embeddings_tf_efficientnet_b4.csv', delimiter=',')

# TF-IDF title embeddings are always recomputed from the cuDF frame.
text_embeddings = get_text_embeddings(df_cu)

# Optional baseline predictions using the functions' default thresholds.
if BASELINE_CHECKING:
    text_predictions = get_text_predictions(df, text_embeddings)
    # KNN drops to 3 when the table has at most 3 rows.
    df, image_predictions = get_image_neighbors(df, image_embeddings, KNN=100 if len(df)>3 else 3)
    df.head()

# Preparing Submission (Pre-searching)

In [None]:
# Baseline submission: union of image and TF-IDF matches per posting.
if BASELINE_CHECKING:
    df['image_predictions'] = image_predictions
    df['text_predictions'] = text_predictions
    df['matches'] = df.apply(combine_predictions, axis=1)
    df[['posting_id', 'matches']].to_csv('submission.csv', index=False)

## CV Score (BASELINE_CHECKING)

In [None]:
def getMetric(col):
    """Return a row-wise scorer computing F1 between `row.target` and `row[col]`.

    The score is 2 * |target ∩ prediction| / (|target| + |prediction|).
    """
    def row_f1(row):
        truth = row.target
        predicted = row[col]
        overlap = np.intersect1d(truth, predicted)
        return 2 * len(overlap) / (len(truth) + len(predicted))

    return row_f1

In [None]:
# In CV mode, attach to every row the ground-truth set of posting_ids sharing
# its label_group; when the baseline is enabled, also score it.
if COMPUTE_CV and BASELINE_CHECKING:
    df['matches_CV'] = df.apply(combine_for_cv, axis=1)
    tmp = df.groupby('label_group').posting_id.agg('unique').to_dict()
    df['target'] = df.label_group.map(tmp)
    MyCVScore = df.apply(getMetric('matches_CV'), axis=1)
    print('CV score =', MyCVScore.mean())
elif COMPUTE_CV:
    # No baseline to score; only the target column is needed later.
    tmp = df.groupby('label_group').posting_id.agg('unique').to_dict()
    df['target'] = df.label_group.map(tmp)

# Good Neighbors Searching

In [None]:
# To find the finest neighbors

def threshold_searching(df, imgtxt, embeddings,
                       img_LB=4.0, img_UB=6.0, txt_LB=0.7, txt_UB=0.9,
                       KNN=100, max_features=25_000):
    """Sweep a threshold range and return the threshold with the best mean F1.

    Args:
        df: frame with `posting_id` and `target` columns.
        imgtxt: 'img' to tune the image-distance threshold (step 0.1 over
            [img_LB, img_UB)), 'txt' to tune the text-similarity threshold
            (step 0.01 over [txt_LB, txt_UB)).
        embeddings: embeddings matching the chosen modality.
        KNN: neighbour count forwarded to get_image_neighbors ('img' only).
        max_features: forwarded to get_text_predictions ('txt' only).

    Returns:
        The first threshold that reaches the maximum mean F1.

    Raises:
        ValueError: if `imgtxt` is neither 'img' nor 'txt', or the selected
            range contains no threshold.
    """
    # Both modalities share the same sweep; only the range and predictor differ.
    if imgtxt == 'img':
        thresholds = list(np.arange(img_LB, img_UB, 0.1))

        def predict(threshold):
            return get_image_neighbors(df, embeddings, KNN, threshold=threshold)[1]
    elif imgtxt == 'txt':
        thresholds = list(np.arange(txt_LB, txt_UB, 0.01))

        def predict(threshold):
            return get_text_predictions(df, embeddings, max_features=max_features, threshold=threshold)
    else:
        raise ValueError(f"imgtxt must be 'img' or 'txt', got {imgtxt!r}")

    if not thresholds:
        raise ValueError('threshold range is empty: lower bound must be below upper bound')

    df1 = pd.DataFrame(columns = ['target', 'pred_matches'])
    df1.target = df.target

    scores = []
    for threshold in thresholds:
        df1.pred_matches = predict(threshold)
        score = df1.apply(getMetric('pred_matches'), axis=1).mean()
        print(f'CV score for threshold {threshold} = {score}')
        scores.append(score)

    thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
    max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
    best_threshold = max_score['thresholds'].values[0]
    best_score = max_score['scores'].values[0]
    print(f'Our best score is {best_score} and has a threshold {best_threshold}')

    return best_threshold

In [None]:
# Tune the image-distance threshold over [5.5, 6.0) (CV mode with searching enabled only).
if COMPUTE_CV and NEIGHBORS_SEARCHING: 
    best_threshold_img = threshold_searching(df, 'img', image_embeddings, img_LB=5.5, img_UB=6.0)

In [None]:
# CV score = 0.7934218709044448 (only tune img)
# CV score = 0.7949996561875796 (combining txt 0.7)
# CV score = 0.7687872470155758 (combining txt 0.53)

if COMPUTE_CV: 
    import random
    # Spot-check that a random TF-IDF entry can be read. Bounds come from the
    # matrix itself: the fitted vocabulary may be smaller than max_features,
    # so fixed 34250 x 25000 limits can index out of range.
    _n_rows, _n_cols = text_embeddings.shape
    if _n_rows and _n_cols:
        text_embeddings[random.randint(0, _n_rows - 1)][random.randint(0, _n_cols - 1)]

In [None]:
# Tune the TF-IDF similarity threshold over [0.7, 0.76) (CV mode with searching enabled only).
if COMPUTE_CV and NEIGHBORS_SEARCHING: 
    best_threshold_txt = threshold_searching(df, 'txt', text_embeddings, txt_LB=0.7, txt_UB=0.76)

# Preparing Submission (Post-searching)

According to the model in [this notebook](https://www.kaggle.com/anlgrbz/how-optimum-threshold-changes-with-embed-test-size), the optimum threshold decreases by $4.96362 \times 10^{-6}$ for each increase of 1 in test set size (the slope of the regression line is $-4.96362 \times 10^{-6}$ for embedding size 5000). Given that the hidden test set size is 70000:

> $(70000 - 10000) \times 4.96362 \times 10^{-6} = 0.2978$

Hence, decrease your `threshold` by `0.2978` when using it in your final inference kernel.

In [None]:
# When the search cells ran, best_threshold_img / best_threshold_txt already
# hold the tuned values; otherwise take them from the preset named by THRES_METH.
if not (COMPUTE_CV and NEIGHBORS_SEARCHING):
    # Preset name -> (image distance threshold, TF-IDF similarity threshold).
    # The *_OPTIMIZED variants lower the image threshold by 0.2978 and raise
    # the text threshold by the same relative amount.
    _THRESHOLD_PRESETS = {
        'THRES_OPTIMIZED': (5.6 - 0.2978, 0.53 * (1 + (1 - (5.6-0.2978)/5.6))),
        'THRES': (5.6, 0.7),
        'BOOM': (4.5, 0.75),
        'BOOM_OPTIMIZED': (4.5 - 0.2978, 0.75 * (1 + (1 - (4.5-0.2978)/4.5))),
    }
    if THRES_METH not in _THRESHOLD_PRESETS:
        # Fail here with a clear message instead of a NameError further down.
        raise ValueError(
            f'Unknown THRES_METH {THRES_METH!r}; expected one of {sorted(_THRESHOLD_PRESETS)}'
        )
    best_threshold_img, best_threshold_txt = _THRESHOLD_PRESETS[THRES_METH]

text_predictions = get_text_predictions(df, text_embeddings, threshold=best_threshold_txt)
df, image_predictions = get_image_neighbors(df, image_embeddings, KNN=100 if len(df)>3 else 3, threshold=best_threshold_img)

df.head()

In [None]:
import transformers

# Settings for the transformer-based title embeddings.
NUM_WORKERS = 4  # DataLoader workers for the text dataset
BATCH_SIZE = 16  # titles per batch
SEED = 42

# NOTE: hard-wired to CUDA; the text-embedding code below moves its model here.
device = torch.device('cuda')

################################################# MODEL ####################################################################

# Local directory with the pretrained transformer; also used to load the tokenizer.
transformer_model = '../input/sentence-transformer-models/paraphrase-xlm-r-multilingual-v1/0_Transformer'
TOKENIZER = transformers.AutoTokenizer.from_pretrained(transformer_model)

################################################ MODEL PATH ###############################################################

# Fine-tuned checkpoint loaded into ShopeeNet.
TEXT_MODEL_PATH = '../input/best-multilingual-model/sentence_transfomer_xlm_best_loss_num_epochs_25_arcface.bin'

# Keyword arguments passed to ShopeeNet(**model_params).
model_params = {
    'n_classes':11014,
    'model_name':transformer_model,
    'use_fc':False,
    'fc_dim':512,
    'dropout':0.3,
}

class ShopeeDataset(Dataset):
    """Title dataset yielding `(input_ids, attention_mask)` tensors per row.

    NOTE: this reuses the name of the image dataset class defined earlier in
    the notebook and replaces it from this cell onwards.
    """

    def __init__(self, csv):
        # reset_index() gives a 0..n-1 index (old index kept as a column).
        self.csv = csv.reset_index()

    def __len__(self):
        return self.csv.shape[0]

    def __getitem__(self, index):
        title = self.csv.iloc[index]['title']

        # Pad/truncate every title to 128 tokens so batches stack cleanly.
        encoded = TOKENIZER(title, padding='max_length', truncation=True, max_length=128, return_tensors="pt")

        # The tokenizer returns a batch of one; take its only element.
        return encoded['input_ids'][0], encoded['attention_mask'][0]


class ShopeeNet(nn.Module):
    """Transformer text encoder returning L2-normalised first-token embeddings."""

    def __init__(self,
                 n_classes,
                 model_name='bert-base-uncased',
                 use_fc=False,
                 fc_dim=512,
                 dropout=0.0):
        """
        :param n_classes: accepted for checkpoint/config compatibility; not used here
        :param model_name: name or path given to transformers.AutoModel.from_pretrained
        :param use_fc: if True, add a dropout -> linear -> batch-norm neck
        :param fc_dim: output width of the linear layer when use_fc is True
        :param dropout: dropout probability of the neck
        """
        super().__init__()

        self.transformer = transformers.AutoModel.from_pretrained(model_name)
        hidden_size = self.transformer.config.hidden_size

        self.use_fc = use_fc

        if not use_fc:
            return
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(hidden_size, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()

    def _init_params(self):
        """Xavier-initialise the linear layer and reset the batch-norm affine parameters."""
        nn.init.xavier_normal_(self.fc.weight)
        for tensor, value in ((self.fc.bias, 0), (self.bn.weight, 1), (self.bn.bias, 0)):
            nn.init.constant_(tensor, value)

    def forward(self, input_ids,attention_mask):
        """Return the normalised embedding for each sequence in the batch."""
        return F.normalize(self.extract_feat(input_ids, attention_mask))

    def extract_feat(self, input_ids,attention_mask):
        """Return the first-token vector of the transformer's first output, optionally through the neck."""
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)

        # First output, first token position of every sequence.
        first_token = outputs[0][:, 0, :]

        if not self.use_fc:
            return first_token
        return self.bn(self.fc(self.dropout(first_token)))
    
    
def get_text_embeddings(df):
    """Embed every title in `df` with the fine-tuned ShopeeNet model.

    NOTE: this redefines the TF-IDF function of the same name from an
    earlier cell.

    Args:
        df: frame with a `title` column.

    Returns:
        NumPy array with one normalised embedding row per title.
    """
    model = ShopeeNet(**model_params)
    model.eval()

    # map_location keeps loading independent of the device the checkpoint
    # was saved from.
    state = torch.load(TEXT_MODEL_PATH, map_location=device)
    # The checkpoint's last entry is dropped before loading.
    # NOTE(review): presumably the ArcFace head weight, which ShopeeNet does
    # not define; verify against the training code.
    model.load_state_dict(dict(list(state.items())[:-1]))
    model = model.to(device)

    text_dataset = ShopeeDataset(df)
    text_loader = torch.utils.data.DataLoader(
        text_dataset,
        batch_size=BATCH_SIZE,
        pin_memory=True,
        drop_last=False,
        num_workers=NUM_WORKERS
    )

    embeds = []
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(text_loader):
            # Move inputs to the same device as the model instead of assuming .cuda().
            feat = model(input_ids.to(device), attention_mask.to(device))
            embeds.append(feat.detach().cpu().numpy())

    # Release the model before building the full embedding matrix.
    del model
    text_embeddings = np.concatenate(embeds)
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    del embeds
    gc.collect()
    return text_embeddings


def get_neighbours_cos_sim(df,embeddings, threshold=0.8):
    '''
    Return, per row, the posting_ids whose similarity to that row exceeds `threshold`.

    When using cos_sim use normalized features else use normal features.
    The similarity is a plain dot product, computed on the GPU in chunks of
    4096 rows.

    Args:
        df: frame with a `posting_id` column, row-aligned with `embeddings`.
        embeddings: 2-D array of (normalised) embeddings.
        threshold: strict lower bound on similarity; defaults to 0.8.

    Returns:
        (df, preds) where preds[k] is an array of posting_ids.
    '''
    # The disabled `if False:` threshold-sweep branch was removed: it could
    # never run and called an undefined `f1_score`.
    embeddings = cupy.array(embeddings)
    posting_ids = df.posting_id.values  # hoisted: one lookup table for all rows

    preds = []
    CHUNK = 1024*4
    n_rows = len(embeddings)

    print('Finding similar texts...for threshold :',threshold)
    for a in range(0, n_rows, CHUNK):
        b = min(a + CHUNK, n_rows)
        print('chunk',a,'to',b)

        # Row k of `cts` holds the products of row (a + k) with every row.
        cts = cupy.matmul(embeddings,embeddings[a:b].T).T

        for k in range(b-a):
            IDX = cupy.where(cts[k,]>threshold)[0]
            preds.append(posting_ids[cupy.asnumpy(IDX)])

    return df, preds

In [None]:
# Uses the transformer-based get_text_embeddings defined just above (it shadows the TF-IDF one).
text_embeddings_BERT = get_text_embeddings(df)

In [None]:
# Per-row matches from the BERT embeddings; the returned frame is discarded.
_, text_predictions_BERT = get_neighbours_cos_sim(df, text_embeddings_BERT)

In [None]:
# Final submission: store every prediction source on the frame, merge them per row, write the CSV.
df['image_predictions'] = image_predictions
df['text_predictions'] = text_predictions
df['text_predictions_BERT'] = text_predictions_BERT
if PHASH:
    # Postings with an identical image_phash are added as matches of each other.
    tmp = df.groupby('image_phash').posting_id.agg('unique').to_dict()
    df['phash_predictions'] = df.image_phash.map(tmp)
    df['matches'] = df.apply(combine_predictions_phash, axis=1)
else: 
    # NOTE: this branch merges image and TF-IDF matches only; the BERT matches are not included.
    df['matches'] = df.apply(combine_predictions, axis=1)    
df[['posting_id', 'matches']].to_csv('submission.csv', index=False)

## CV Score (FINAL)

In [None]:
# Score the final merged matches (CV mode only), then echo the run flags.
if COMPUTE_CV:
    cv_combiner = combine_for_cv_phash if PHASH else combine_for_cv
    df['matches_CV'] = df.apply(cv_combiner, axis=1)
    MyCVScore = df.apply(getMetric('matches_CV'), axis=1)
    print('CV score =', MyCVScore.mean())

print(f'COMPUTE_CV = {COMPUTE_CV}')
print(f'NEIGHBORS_SEARCHING = {NEIGHBORS_SEARCHING}')
print(f'BASELINE_CHECKING = {BASELINE_CHECKING}')
print(f'SAVE_IMGEMBEDDING = {SAVE_IMGEMBEDDING}')
print(f'THRES_METH = {THRES_METH}')
print(f'FIXING = {FIXING}')
print(f'PHASH = {PHASH}')

|   | CV | LB |
| - | -- | -- |
| 5.6 0.7 | 0.795 |   |
| (fixing) 5.6 0.7 | 0.795 | 0.688 |
| (fixing) BOOM | 0.774 | 0.728 |
| (fixing) BOOM PHASH | 0.774 | 0.728 |
| (fixing) BOOM OPTIMIZED | 0.757 | 0.722 |
| (fixing) 5.6 0.53 | 0.769 |   | 
| (fixing) 5.6 0.53 OPTIMIZED | 0.782 | 0.675 |
| BOOM | 0.774 | 0.728 |
| BOOM OPTIMIZED | 0.757 |   |
| BERT w/o TFIDF (fixing) BOOM PHASH | 0.975 | 0.716 |
| BERT w/ TFIDF (fixing) BOOM PHASH | 0.958 | 0.719 |
| BERT80 w/ TFIDF (fixing) BOOM PHASH |   |   |

# References

* [Shopee| text, img Embedding (Colab enabled)](https://www.kaggle.com/chienhsianghung/shopee-text-img-embedding-colab-enabled)
* [Shopee - Price Match Guarantee| Embeddings](https://www.kaggle.com/chienhsianghung/shopee-price-match-guarantee-embeddings)
* [Eff-B4 + TFIDF >= 0.728](https://www.kaggle.com/vatsalmavani/eff-b4-tfidf-0-728)
* [Reaching 0.612 with Text Only : Shopee](https://www.kaggle.com/tanulsingh077/reaching-0-612-with-text-only-shopee)