## Training pipeline to fine-tune pre-trained models from [SentenceTransformers](https://www.sbert.net/docs/pretrained_models.html) with Contrastive Loss and Hard Negative Samples

* Reference: [Contrastive Learning with Hard Negative Samples](https://arxiv.org/pdf/2010.04592.pdf)

In [None]:
!pip install faiss-cpu sentence_transformers

In [None]:
import itertools
import os
import random as rn
import shutil
from scipy.special import comb

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import (
    SentencesDataset,
    SentenceTransformer,
    evaluation,
    losses,
)
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader
from tqdm import tqdm, notebook

In [None]:
# Name of the pre-trained SentenceTransformers checkpoint used as the starting point.
MODEL_NAME = "stsb-roberta-base"
# Instantiate the model from that name; MODEL is rebound to a saved copy further down.
MODEL = SentenceTransformer(MODEL_NAME)


In [None]:
# Relative directory for the fine-tuned model; passed to MODEL.fit as output_path.
MODEL_SAVE_PATH = f"finetuned-model/{MODEL_NAME}"
# Batch size shared by the training DataLoader and the evaluator.
BATCH_SIZE = 32
# Enters the formula for the number of negative pairs per label group
# (reassigned to 10 and back to 50 in later cells).
CAP_SIZE = 50
# Similarity cutoff applied to the FAISS scores to decide whether two items match.
GROUP_CUT = 0.71  # Use option `RUN_ON_TRAIN` to find this number

In [None]:
# Directory of a previously saved model, located under '../input/sbert-models-for-shopee/'.
# NOTE(review): this name differs from MODEL_SAVE_PATH only by letter case — easy to mix up.
Model_Save_Path='../input/sbert-models-for-shopee/' + MODEL_SAVE_PATH

In [None]:
!ls $Model_Save_Path

In [None]:
# Rebind MODEL to the model stored at Model_Save_Path.
MODEL = SentenceTransformer(Model_Save_Path)

In [None]:
# Second model instance loaded from the same Model_Save_Path as MODEL.
model_2 = SentenceTransformer(Model_Save_Path)

In [None]:
# Print the textual representation of the loaded model.
print(MODEL)

In [None]:
# Root directory of the competition data. File names are appended to it by plain
# string concatenation, so it has to end with a slash.
# DATA_PATH = '../input/'
DATA_PATH = '../input/shopee-product-matching/'

In [None]:
# f1 score metric
def getMetric(col):
    """Create a row-wise F1 scorer for the prediction column ``col``.

    The returned callable takes a row exposing ``row.target`` (the true ids)
    and ``row[col]`` (the predicted ids) and returns
    ``2 * |target ∩ prediction| / (len(target) + len(prediction))``.
    It is meant to be used with ``DataFrame.apply(..., axis=1)``.
    """
    def f1score(row):
        truth = row.target
        predicted = row[col]
        # intersect1d returns the sorted unique values present in both inputs.
        overlap = np.intersect1d(truth, predicted)
        return 2 * len(overlap) / (len(truth) + len(predicted))
    return f1score

In [None]:
# Load the training table and turn the image file names into full paths.
train = pd.read_csv(DATA_PATH + 'train.csv')
train['image'] = DATA_PATH + 'train_images/' + train['image']
# Ground truth: map each label_group to the array of unique posting_ids it contains,
# then attach that array to every row as `target`.
tmp = train.groupby('label_group').posting_id.agg('unique').to_dict()
train['target'] = train.label_group.map(tmp)

In [None]:
# Order rows by label_group. The original index labels are kept, so after this
# step index labels no longer coincide with row positions.
train = train.sort_values(by='label_group')
# Lower-case the titles before they are encoded.
train['title'] = train['title'].str.lower()
train.head()

In [None]:
# Encode all training titles with model_2 and report the shape of the result.
# NOTE(review): the following cell reassigns text_embeddings using MODEL, so this
# result is discarded when the notebook is run top to bottom.
text_embeddings = model_2.encode(train.title)
print('text embeddings shape',text_embeddings.shape)

In [None]:
# Encode all training titles with MODEL and report the shape of the result.
text_embeddings = MODEL.encode(train.title)
print('text embeddings shape',text_embeddings.shape)

In [None]:
from sklearn.preprocessing import normalize

# L2-normalise every row so that the dot product of two rows equals their cosine
# similarity (which lies in [-1, 1], not [0, 1]).
text_embeddings = np.vstack(text_embeddings)
text_embeddings = normalize(text_embeddings)

import torch
text_embeddings = torch.from_numpy(text_embeddings)
# Move to the GPU when one is present; on a CPU-only host keep the tensor where it
# is instead of failing on an unconditional .cuda() call.
if torch.cuda.is_available():
    text_embeddings = text_embeddings.cuda()

In [None]:
# Display the shape of the normalised embedding tensor.
text_embeddings.shape

In [None]:
# Threshold-based matching: for every title, collect the posting_ids of all titles
# whose dot product with it exceeds 0.93. Rows are processed CHUNK at a time so
# that only a (chunk x N) similarity matrix is materialised at once.
preds = []
CHUNK = 1024*2

print('Finding similar titles...')
# Number of chunks = ceil(len(train) / CHUNK).
CTS = len(train)//CHUNK
if len(train)%CHUNK!=0: CTS += 1
# NOTE(review): text_ids is not read anywhere in the code shown.
text_ids = None

for j in range( CTS ):

    # Half-open row range [a, b) of this chunk, clipped to the table length.
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(train))
    print('chunk',a,'to',b)

    # After the transpose, row k holds the similarities of chunk row a+k to all rows.
    cts = torch.matmul(text_embeddings, text_embeddings[a:b].T).T
    cts = cts.data.cpu().numpy()
    for k in range(b-a):
        # IDX are row positions, hence the positional .iloc lookup.
        IDX = np.where(cts[k,]>0.93)[0]
        o = train.iloc[IDX].posting_id.values
        preds.append(o)

    # Drop the chunk's similarity matrix before the next iteration allocates a new one.
    del cts
    torch.cuda.empty_cache()

In [None]:
# Store the thresholded matches, one array of posting_ids per row.
train['oof_bert'] = preds

COMPUTE_CV = True
if COMPUTE_CV:
    # Row-wise F1 between `target` and `oof_bert`, then the mean over all rows.
    train['f1'] = train.apply(getMetric('oof_bert'),axis=1)
    print('CV score for baseline =',train.f1.mean())
    

In [None]:
import faiss

def find_similarities_and_indexes(feature_embeddings,embed_dim, top_n=100, features_file=None):
    """Index the embeddings with FAISS and query every embedding against the index.

    Args:
        feature_embeddings: 2-D array of vectors; used both to fill the index
            and as the queries (presumably float32, as FAISS expects — confirm
            at the call site).
        embed_dim: Vector dimensionality handed to ``faiss.IndexFlatIP``.
        top_n: Number of neighbours returned per query.
        features_file: Optional path. When given, ``feature_embeddings`` is
            written there with ``np.save`` before the index is built.

    Returns:
        The pair returned by ``index.search``: inner-product scores and the
        row positions of the neighbours, each with ``top_n`` columns.
    """
    if features_file is not None:
        # The original referenced an undefined name (`features`) here, which
        # raised NameError whenever a file path was supplied.
        np.save(features_file, feature_embeddings)

    # Exact (brute-force) inner-product index.
    index = faiss.IndexFlatIP(embed_dim)
    index.add(feature_embeddings)
    return index.search(feature_embeddings, top_n)
    
    

In [None]:
# Copy the normalised embeddings back to host memory as a NumPy array.
text_embedding_2 = text_embeddings.cpu().numpy()
type(text_embedding_2)

In [None]:
# Read the dimensionality from the embeddings instead of hard-coding 768, so the
# index always matches whichever model produced the vectors.
embed_dim = text_embedding_2.shape[1]
# Top-20 inner-product neighbours (scores and row positions) for every title.
similarities, indexes = find_similarities_and_indexes(text_embedding_2, embed_dim,top_n=20)

In [None]:
# Scores of the 20 neighbours for the first five rows.
similarities[:5]

In [None]:
# Row positions of the 20 neighbours for the first five rows.
indexes[:5]

In [None]:
GROUP_CUT = 0.71  # Use option `RUN_ON_TRAIN` to find this number
# Apply the similarity cutoff: boolean mask with the same shape as `similarities`,
# True where a neighbour's score exceeds GROUP_CUT.
train_are_same_groups = (similarities > GROUP_CUT)

In [None]:
# Cutoff mask for the first five rows.
train_are_same_groups[:5]

In [None]:
# Row positions of row 0's neighbours that pass the cutoff.
indexes[0][train_are_same_groups[0]]

In [None]:
# Index labels found at three row positions.
# NOTE(review): the positions look hand-copied from an earlier output — confirm.
train.index[[0, 33161, 10925]]

In [None]:
# label_group of the first five rows (train is sorted by label_group).
train['label_group'][:5]

In [None]:
# label_group values at row positions 0 and 33161 (positional lookup on the raw array).
found_groups = train['label_group'].values[[0, 33161]]
found_groups

In [None]:
# Pairwise dot products among the first ten embeddings (a 10 x 10 matrix).
similarity = np.dot(text_embedding_2[:10], text_embedding_2[:10].T)


In [None]:
# Display the 10 x 10 similarity matrix.
similarity

In [None]:
# For each of the first ten rows, print which of those ten rows exceed 0.93 similarity.
for k in range(10):
    IDX = np.where(similarity[k,]>0.93)[0]
    print(IDX)

In [None]:
# Build submission
# For every row, keep the FAISS neighbours that passed the cutoff and translate
# their row positions into posting_ids.
results = []
preds = []

for i, (test_is_same_group, index_result) in enumerate(zip(train_are_same_groups, indexes)):
    # `index_result` holds row positions, so the posting_ids are looked up
    # positionally. The original converted positions to index labels, wrapped
    # them in a set and passed that to .loc — pandas >= 2.0 rejects set indexers.
    row_posting_id_results = train.posting_id.values[index_result[test_is_same_group]]
    preds.append(row_posting_id_results)
    results.append({
        # The id of the query row itself (the original stored the DataFrame
        # index label here, which is not a posting_id).
        'posting_id': train.posting_id.values[i],
        'matches': ' '.join(row_posting_id_results),
    })

In [None]:
# First ten rows of the training table.
train.head(10)

In [None]:
# First ten submission records.
print(results[:10])

In [None]:
# Store the FAISS-based matches and score them the same way as `oof_bert`.
train['oof_bert_faiss'] = preds
COMPUTE_CV = True
if COMPUTE_CV:
    # Overwrites the `f1` column computed for `oof_bert`.
    # NOTE(review): the printed label still says "baseline" although this is the FAISS variant.
    train['f1'] = train.apply(getMetric('oof_bert_faiss'),axis=1)
    print('CV score for baseline =',train.f1.mean())

In [None]:
# Distinct label groups, in order of first appearance in `train`.
label_groups = train["label_group"].unique()

In [None]:
# Split by label group, not by row: the first 80% of the groups go to training and
# the remainder to evaluation, so no group is shared between the two sets.
train_df = train.loc[train.label_group.isin(label_groups[: int(0.8 * len(label_groups))])]
eval_df = train.loc[train.label_group.isin(label_groups[int(0.8 * len(label_groups)) :])]

In [None]:
print(train_df.shape)
# Accumulates the InputExample objects appended by the pair-mining loop below.
train_examples = list()
# One list of titles per label group, in order of first appearance.
train_groups = [
        train_df.loc[train_df["label_group"] == lg]["title"].values.tolist()
        for lg in train_df["label_group"].unique()
    ]

In [None]:
# Titles of the first eight label groups.
train_groups[:8]

In [None]:
# Flatten the per-group lists into one list of titles, preserving group order.
# chain.from_iterable runs in linear time, whereas sum(lists, []) re-copies the
# growing result on every addition.
train_titles = list(itertools.chain.from_iterable(train_groups))


In [None]:
# Total number of training titles after flattening.
len(train_titles)

In [None]:
# First twenty flattened titles.
train_titles[:20]

In [None]:
# Embed every training title; row i corresponds to train_titles[i].
train_embeddings = MODEL.encode(train_titles)

In [None]:
#train_embeddings /= np.linalg.norm(train_embeddings, 2, axis=1, keepdims=True)
#train_index = faiss.IndexFlatIP(train_embeddings.shape[1])

In [None]:

# Exact L2-distance index over the training title embeddings, used as returned by
# MODEL.encode (the normalisation / inner-product variant above is commented out).
train_index = faiss.IndexFlatL2(train_embeddings.shape[1])
train_index.add(train_embeddings)

In [None]:
# 20 nearest neighbours (distances and row positions) for every training title.
train_similarities, train_indexes = train_index.search(train_embeddings, 20)

In [None]:
# Lower CAP_SIZE for the exploratory cells that follow (set back to 50 later).
CAP_SIZE = 10

In [None]:
 # Number of unordered title pairs in group_1.
 # NOTE(review): group_1 is first assigned in the following cell, so running the
 # notebook top to bottom raises NameError here.
 comb(len(group_1), 2)

In [None]:
# Walk through the negative-mining steps on the first label group.
group_1 = train_groups[0]
# max(CAP_SIZE - pairs, pairs), where pairs = C(len(group_1), 2) is the number of
# positive pairs the group yields.
negative_pairs_no = int(max(CAP_SIZE - comb(len(group_1), 2), comb(len(group_1), 2)))
print(negative_pairs_no)
# Query vector: mean of the group's title embeddings, as a contiguous float32 row
# of shape (1, dim).
group_embedding = np.ascontiguousarray(
            np.mean(MODEL.encode(group_1), axis=0).reshape(1, -1), dtype=np.float32
        )

In [None]:
# Titles of the first label group.
group_1

In [None]:
# Retrieve twice as many neighbours as negatives are wanted — presumably to leave
# room for discarding the group's own titles afterwards.
_, similar_idx = train_index.search(group_embedding, negative_pairs_no * 2)
# Translate the returned row positions back into title strings.
negative_titles = [train_titles[idx] for idx in similar_idx[0]]
len(negative_titles)

In [None]:
# A notebook cell only displays its last expression, so this len() result is not shown.
len(group_1)
# Retrieved titles from position 4 onwards.
negative_titles[4:]

In [None]:
# Discard every retrieved title that also occurs in the group itself, then keep at
# most negative_pairs_no of the remainder. A set-based filter removes all
# occurrences; list.remove dropped only one occurrence per iteration (letting
# duplicate titles from other groups slip through as negatives) and sat behind a
# bare `except:` that would hide unrelated errors.
own_titles = set(group_1)
negative_titles = [title for title in negative_titles if title not in own_titles]
negative_titles = negative_titles[:negative_pairs_no]

In [None]:
# Number of negatives left after filtering and truncation.
len(negative_titles)

In [None]:
# All unordered pairs of titles within the group, keeping only pairs in which both
# entries are strings (a missing title would not be a str).
positive_pairs = [
            list(pair)
            for pair in list(itertools.combinations(group_1, 2))
            if (isinstance(pair[0], str) and isinstance(pair[1], str))
        ]

In [None]:
# NOTE(review): every iteration overwrites negative_pairs_no and group_embedding
# without storing or using them, so this loop encodes each group and only leaves
# the values of the last group behind.
for _, group in enumerate(train_groups):
    negative_pairs_no = int(max(CAP_SIZE - comb(len(group), 2), comb(len(group), 2)))
    group_embedding = np.ascontiguousarray(
            np.mean(MODEL.encode(group), axis=0).reshape(1, -1), dtype=np.float32
        )
    
    

In [None]:
# Mine contrastive training pairs for every label group and append them to
# train_examples: label 1 for title pairs inside a group, label 0 for a group
# title paired with a nearby title from outside the group.
for group in train_groups:
    # Positive pairs: every unordered pair of string titles inside the group.
    positive_pairs = [
        list(pair)
        for pair in itertools.combinations(group, 2)
        if isinstance(pair[0], str) and isinstance(pair[1], str)
    ]
    if not positive_pairs:
        # Nothing to anchor negatives to (single-title group or non-string
        # titles); rn.choice would raise IndexError on the empty list.
        continue

    negative_pairs_no = int(max(CAP_SIZE - comb(len(group), 2), comb(len(group), 2)))

    # Query vector: mean embedding of the group as a contiguous float32 row.
    group_embedding = np.ascontiguousarray(
        np.mean(MODEL.encode(group), axis=0).reshape(1, -1), dtype=np.float32
    )
    # Ask for twice as many neighbours as needed because the group's own titles
    # are discarded below, but never for more than the index holds: FAISS pads
    # missing results with -1, which would silently select the last title.
    k = min(negative_pairs_no * 2, train_index.ntotal)
    _, similar_idx = train_index.search(group_embedding, k)

    # Drop every hit whose title also occurs in the group. list.remove only
    # removed one occurrence per iteration, so duplicate titles from other
    # groups could survive and be labelled as negatives.
    own_titles = set(group)
    negative_titles = [
        train_titles[idx]
        for idx in similar_idx[0]
        if idx >= 0 and train_titles[idx] not in own_titles
    ]
    negative_titles = negative_titles[:negative_pairs_no]

    for pair in positive_pairs:
        train_examples.append(InputExample(texts=pair, label=1))
    # Each negative is paired with a random title drawn from a random positive pair.
    negative_pairs = [
        [rn.choice(rn.choice(positive_pairs)), negative_title]
        for negative_title in negative_titles
    ]
    for pair in negative_pairs:
        train_examples.append(InputExample(texts=pair, label=0))

In [None]:
# Total number of labelled pairs collected.
len(train_examples)

In [None]:
# Restore the cap to 50 before data_prep() reads it.
CAP_SIZE = 50

In [None]:
# Load MODEL afresh from Model_Save_Path before the training data is built.
MODEL = SentenceTransformer(Model_Save_Path)

In [None]:
def data_prep(csv_path="../input/shopee-product-matching/train.csv", train_fraction=0.8):
    """Build the training DataLoader and the evaluator for contrastive fine-tuning.

    The CSV is split by ``label_group``: the first ``train_fraction`` of the
    distinct groups (in order of first appearance) are used for training, the
    rest for evaluation. For each split, labelled title pairs are mined with
    ``_mine_labelled_pairs``.

    Reads the module-level ``MODEL``, ``CAP_SIZE`` and ``BATCH_SIZE``.

    Args:
        csv_path: CSV file with at least ``label_group`` and ``title`` columns.
        train_fraction: Share of label groups assigned to the training split.

    Returns:
        ``(train_dataloader, evaluator)`` — a shuffled ``DataLoader`` over a
        ``SentencesDataset`` of ``InputExample`` objects, and a
        ``BinaryClassificationEvaluator`` over the evaluation pairs.
    """
    df = pd.read_csv(csv_path)

    label_groups = df["label_group"].unique()
    # rn.shuffle(label_groups)
    cut = int(train_fraction * len(label_groups))
    train_df = df.loc[df.label_group.isin(label_groups[:cut])]
    eval_df = df.loc[df.label_group.isin(label_groups[cut:])]

    # Prepare train_data according to ContrastiveLoss
    train_examples = [
        InputExample(texts=[first, second], label=label)
        for first, second, label in _mine_labelled_pairs(_titles_by_group(train_df))
    ]
    print("train_examples len:",len(train_examples))
    train_dataset = SentencesDataset(train_examples, MODEL)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)

    # Prepare eval_data according to BinaryClassificationEvaluator
    eval_examples = _mine_labelled_pairs(_titles_by_group(eval_df))
    print("eval_examples len:",len(eval_examples))

    # Transpose the triples once into (sentences1, sentences2, labels).
    columns = list(zip(*eval_examples))
    evaluator = evaluation.BinaryClassificationEvaluator(
        sentences1=list(columns[0]),
        sentences2=list(columns[1]),
        labels=list(columns[2]),
        batch_size=BATCH_SIZE,
    )

    return train_dataloader, evaluator


def _titles_by_group(frame):
    """Return one list of titles per label group, in order of first appearance."""
    return [
        titles.tolist()
        for _, titles in frame.groupby("label_group", sort=False)["title"]
    ]


def _mine_labelled_pairs(groups):
    """Return ``(text_a, text_b, label)`` triples mined from the title groups.

    Label 1 marks two titles of the same group; label 0 marks a group title
    paired with a title retrieved from a FAISS L2 index built over all titles
    in ``groups``. Triples are emitted group by group, positives first.
    """
    titles = list(itertools.chain.from_iterable(groups))
    if not titles:
        return []

    # Build FAISS index to query hard negative samples
    embeddings = MODEL.encode(titles)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    triples = []
    for group in groups:
        # Positive pairs: every unordered pair of string titles inside the group.
        positive_pairs = [
            list(pair)
            for pair in itertools.combinations(group, 2)
            if isinstance(pair[0], str) and isinstance(pair[1], str)
        ]
        if not positive_pairs:
            # No anchor for negatives; rn.choice would raise on the empty list.
            continue

        pair_count = comb(len(group), 2)
        negative_pairs_no = int(max(CAP_SIZE - pair_count, pair_count))

        # Query vector: mean embedding of the group as a contiguous float32 row.
        group_embedding = np.ascontiguousarray(
            np.mean(MODEL.encode(group), axis=0).reshape(1, -1), dtype=np.float32
        )
        # Request twice the needed amount because the group's own titles are
        # discarded below, capped at the index size: FAISS pads missing results
        # with -1, which would silently select the last title.
        k = min(negative_pairs_no * 2, index.ntotal)
        _, similar_idx = index.search(group_embedding, k)

        # Drop every hit whose title also occurs in the group. list.remove only
        # removed one occurrence per iteration, so duplicates from other groups
        # could survive and be labelled as negatives.
        own_titles = set(group)
        negative_titles = [
            titles[idx]
            for idx in similar_idx[0]
            if idx >= 0 and titles[idx] not in own_titles
        ]
        # For larger quotas the closest (negative_pairs_no - 10) candidates are
        # skipped and everything after them is kept; otherwise the closest
        # negative_pairs_no candidates are used.
        if negative_pairs_no > 10:
            negative_titles = negative_titles[negative_pairs_no - 10:]
        else:
            negative_titles = negative_titles[:negative_pairs_no]

        triples.extend((first, second, 1) for first, second in positive_pairs)
        # Each negative is paired with a random title from a random positive pair.
        triples.extend(
            (rn.choice(rn.choice(positive_pairs)), negative_title, 0)
            for negative_title in negative_titles
        )
    return triples

In [None]:
# Build the shuffled training DataLoader and the binary-classification evaluator.
train_dataloader, evaluator = data_prep()

In [None]:
# Print the textual representation of the model about to be fine-tuned.
print(MODEL)

In [None]:


# Contrastive loss over the labelled pairs served by train_dataloader.
train_loss = losses.ContrastiveLoss(model=MODEL)

# Start from an empty output directory: create it if missing, otherwise delete it
# (including any earlier output) and recreate it.
if not os.path.exists(MODEL_SAVE_PATH):
    os.makedirs(MODEL_SAVE_PATH)
else:
    shutil.rmtree(MODEL_SAVE_PATH)
    os.makedirs(MODEL_SAVE_PATH)

# Fine-tune for 3 epochs with 100 warm-up steps, evaluating with `evaluator` every
# 500 steps and using MODEL_SAVE_PATH as the output location.
MODEL.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    warmup_steps=100,
    evaluation_steps=500,
    output_path=MODEL_SAVE_PATH,
    evaluator=evaluator,
)

In [None]:
# Re-declare the data root for the post-training evaluation cells; it must end with
# a slash because file names are appended by string concatenation.
# DATA_PATH = '../input/'
DATA_PATH = '../input/shopee-product-matching/'
print(DATA_PATH)

In [None]:
# f1 score metric
def getMetric(col):
    """Return a function that scores one row's predictions in ``col`` with F1.

    The scorer reads the true ids from ``row.target`` and the predicted ids
    from ``row[col]`` and computes twice the number of shared ids divided by
    the combined length of both collections.
    """
    def f1score(row):
        # Count of distinct ids present in both collections.
        shared = np.intersect1d(row.target, row[col]).size
        combined = len(row.target) + len(row[col])
        return 2 * shared / combined
    return f1score

In [None]:
# Reload the training table from disk and rebuild the image paths.
train = pd.read_csv(DATA_PATH + 'train.csv')
train['image'] = DATA_PATH + 'train_images/' + train['image']
# Ground truth: each row's `target` is the array of unique posting_ids sharing its label_group.
tmp = train.groupby('label_group').posting_id.agg('unique').to_dict()
train['target'] = train.label_group.map(tmp)

In [None]:
# Same preparation as before training: sort by label_group (index labels are kept)
# and lower-case the titles.
train = train.sort_values(by='label_group')
train['title'] = train['title'].str.lower()
train.head()

In [None]:
# Encode the titles again with MODEL — when the cells are run in order, this is the
# model after the MODEL.fit call above.
text_embeddings = MODEL.encode(train.title)
print('text embeddings shape',text_embeddings.shape)