### Finetuning sentence embeddings with SBERT

Instead of a cross-encoder, Sentence-BERT (SBERT) uses a siamese network: two encoders that yield independent (but semantically comparable) embeddings, one for each of the input sentences.

Paper: https://arxiv.org/abs/1908.10084  
A number of pre-trained SBERT models are available: https://www.sbert.net/docs/pretrained_models.html

To fine-tune the pre-trained SBERT model on this competition's data, I replicate the siamese network structure with two instances of the model and train them on pairs of anchor and target strings. The scores guide what the cosine similarity should be for each pair (the loss is the negative Pearson correlation between the predicted cosine similarities and the scores), moving the embeddings towards the patent domain.

In [None]:
# Build an IPython Image for cosim.png (600x250); as the last expression of a
# notebook cell it is presumably rendered inline as the illustration.
from IPython.display import Image
Image('../input/patent-img/cosim.png', width = 600, height = 250)

In [None]:
import os

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl
from pytorch_lightning import LightningModule
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from transformers import AutoConfig, AutoTokenizer, AutoModel


# CSV files of the competition: labelled training pairs and unlabelled test pairs.
DATA_PATH_TRAIN = '../input/us-patent-phrase-to-phrase-matching/train.csv'
DATA_PATH_TEST  = '../input/us-patent-phrase-to-phrase-matching/test.csv'

# Tokenizer locations: *_LOAD is a directory a previously saved tokenizer is
# read from; *_SAVE is where the tokenizer built during training is written.
TOKENIZER_DIR_LOAD = '../input/patent-tokenizer/'
TOKENIZER_DIR_SAVE = './tokenizer'

# Model locations: *_LOAD holds the model config and the 'sbert_<fold>.ckpt'
# checkpoints used for scoring; *_SAVE receives the config and checkpoints
# produced by training.
MODEL_DIR_LOAD = '../input/patent-models/'
MODEL_DIR_SAVE = './models'

# Hugging Face model identifier of the pre-trained encoder, and the local
# cache directory passed to from_pretrained() when downloading it.
MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'
HUGGING_FACE_CACHE = './hugging_face_cache'


# True: load saved artefacts and score the test set.
# False: download the pre-trained model and train it.
IS_INFERENCE_TIME = True

In [None]:
class PatentDataModule(pl.LightningDataModule):
    """Lightning data module that prepares anchor/target phrase pairs.

    The CSV at ``data_path`` is read, every anchor and target phrase is
    extended with a CPC context token (``'cpc' + context.lower()``) and the
    lower-cased title of the CPC section, and the rows are tokenized into
    ``PatentDataSet`` objects.

    Attributes set by ``__init__`` (each only when it has rows):
        train_ds: training dataset.
        val_ds: validation dataset (or the full data when ``fold < 0``).
    """

    def __init__(self, data_path, fold):
        """Read, augment, split and tokenize the data.

        Args:
            data_path: path of the CSV file to read. It must provide the
                ``anchor``, ``target`` and ``context`` columns; a missing
                ``score`` column is filled with NaN.
            fold: when ``>= 0``, anchor groups whose running index modulo 5
                equals ``fold`` form the validation set and the remaining
                groups form the training set. When negative, all rows go to
                the validation set and no training set is created.
        """
        super().__init__()

        print('reading data...')
        frame = pd.read_csv(data_path)

        # Titles of the CPC sections, keyed by the first letter of the context code.
        section_titles = {
            'A': 'Human Necessities',
            'B': 'Operations and Transport',
            'C': 'Chemistry and Metallurgy',
            'D': 'Textiles',
            'E': 'Fixed Constructions',
            'F': 'Mechanical Engineering',
            'G': 'Physics',
            'H': 'Electricity',
            'Y': 'Emerging Cross-Sectional Technologies',
        }

        frame['cpc_code'] = 'cpc' + frame.context.str.lower()
        frame['cpc_desc'] = frame.context.apply(lambda ctx: section_titles[ctx[0]].lower())

        # Append "<cpc token> <section title>" to both phrases of every pair.
        context_suffix = ' ' + frame.cpc_code + ' ' + frame.cpc_desc
        frame['anchor'] = frame.anchor + context_suffix
        frame['target'] = frame.target + context_suffix

        # Longest phrase measured in space-separated words, plus one.
        # NOTE(review): this counts words, not tokenizer tokens; sequences that
        # tokenize to more than max_length tokens are truncated by PatentDataSet.
        all_phrases = pd.concat((frame.anchor, frame.target))
        max_length = max(len(phrase.split(' ')) for phrase in all_phrases) + 1
        print('max_length:', max_length)

        if IS_INFERENCE_TIME:
            tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR_LOAD)
        else:
            tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=HUGGING_FACE_CACHE)
            # Register one extra token per distinct CPC context of the training
            # file, then persist the extended tokenizer.
            train_contexts = pd.read_csv(DATA_PATH_TRAIN).context.str.lower()
            tokenizer.add_tokens(list(pd.unique('cpc' + train_contexts)))
            tokenizer.save_pretrained(TOKENIZER_DIR_SAVE)

        if 'score' not in frame.columns:
            # for test data
            frame['score'] = np.nan

        train_parts = []
        val_parts = []

        if fold >= 0:
            # Rows sharing the same (augmented) anchor always land in the same split.
            for group_idx, (_, group) in enumerate(frame.groupby('anchor')):
                destination = val_parts if group_idx % 5 == fold else train_parts
                destination.append(group)
        else:
            val_parts.append(frame)

        if train_parts:
            train_df = pd.concat(train_parts)
            self.train_ds = PatentDataSet(train_df.anchor, train_df.target, train_df.score,
                                          tokenizer, max_length)

        if val_parts:
            val_df = pd.concat(val_parts)
            self.val_ds = PatentDataSet(val_df.anchor, val_df.target, val_df.score,
                                        tokenizer, max_length)

    def train_dataloader(self):
        """Return a shuffled loader over ``train_ds`` with batches of 128."""
        return DataLoader(self.train_ds, batch_size=128, shuffle=True)

    def val_dataloader(self):
        """Return an ordered loader over ``val_ds`` with batches of 512."""
        return DataLoader(self.val_ds, batch_size=512, shuffle=False)


class PatentDataSet(Dataset):
    """Dataset of pre-tokenized (anchor, target, score) triples.

    All phrases are tokenized once in the constructor; items are returned by
    indexing the stored tensors.
    """

    def __init__(self, anchor, target, score, tokenizer, max_length):
        """Tokenize both phrase columns and store the scores.

        Args:
            anchor: iterable of anchor strings.
            target: iterable of target strings, aligned with ``anchor``.
            score: object exposing ``to_numpy(dtype=...)`` (e.g. a pandas
                Series) with one score per pair.
            tokenizer: callable invoked as
                ``tokenizer(list_of_str, padding=..., max_length=...,
                truncation=..., return_tensors=...)`` and returning a mapping
                with ``'input_ids'`` and ``'attention_mask'``.
            max_length: length every sequence is padded / truncated to.
        """
        super().__init__()

        self.score = score.to_numpy(dtype=np.float32)

        print('tokenization of data...')

        def encode(phrases):
            # Same tokenizer settings for both sides of the pair.
            encoded = tokenizer(list(phrases),
                                padding="max_length",
                                max_length=max_length,
                                truncation=True,
                                return_tensors="pt")
            return encoded['input_ids'], encoded['attention_mask']

        self.anchor, self.anchor_mask = encode(anchor)
        self.target, self.target_mask = encode(target)

    def __len__(self):
        """Return the number of pairs (length of the score array)."""
        return len(self.score)

    def __getitem__(self, idx):
        """Return ``(anchor_ids, anchor_mask, target_ids, target_mask, score)`` for ``idx``."""
        return (
            self.anchor[idx],
            self.anchor_mask[idx],
            self.target[idx],
            self.target_mask[idx],
            self.score[idx],
        )


class PatentTransfomer(LightningModule):
    """Two-encoder model scoring anchor/target pairs by cosine similarity.

    Two separate transformer instances are held, one for the anchor phrase
    and one for the target phrase. Each phrase embedding is the
    attention-mask-weighted mean of its token embeddings, and the prediction
    is the cosine similarity of the two phrase embeddings. Training minimises
    the negative Pearson correlation between predictions and scores.
    """

    def __init__(self):
        """Build both encoders.

        At inference time the encoders are created from the saved config only
        (no pre-trained weights); the weights are expected to come from the
        Lightning checkpoint. At training time the pre-trained weights are
        loaded, the embedding matrices are resized to the extended tokenizer,
        and the model config is written to ``MODEL_DIR_SAVE``.
        """
        super().__init__()

        if IS_INFERENCE_TIME:

            config = AutoConfig.from_pretrained(MODEL_DIR_LOAD)

            self.transformer_anchor = AutoModel.from_config(config)
            self.transformer_target = AutoModel.from_config(config)

        else:
            self.transformer_anchor = AutoModel.from_pretrained(MODEL_NAME, cache_dir=HUGGING_FACE_CACHE)
            self.transformer_target = AutoModel.from_pretrained(MODEL_NAME, cache_dir=HUGGING_FACE_CACHE)

            # Size the embeddings from the tokenizer written during this
            # training run: PatentDataModule saves the extended tokenizer to
            # TOKENIZER_DIR_SAVE, and the training script constructs the data
            # module before the model. TOKENIZER_DIR_LOAD only holds artefacts
            # of an earlier run and may be absent or out of date.
            tokenizer_length = len(AutoTokenizer.from_pretrained(TOKENIZER_DIR_SAVE))
            self.transformer_anchor.resize_token_embeddings(tokenizer_length)
            self.transformer_target.resize_token_embeddings(tokenizer_length)

            # Weights are not saved, only the config file to load the layers during inference.
            # Weights will be saved / loaded with pytorch lightning instead.
            self.transformer_target.save_pretrained(MODEL_DIR_SAVE, state_dict={})

        # freezing first layers parameters?
        # for i in range(6):
        #     self.transformer_anchor.encoder.layer[i].requires_grad_(False)
        #     self.transformer_target.encoder.layer[i].requires_grad_(False)

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings of each sequence, ignoring padding.

        Args:
            model_output: encoder output whose first element holds the token
                embeddings (assumed (batch, seq, hidden) -- TODO confirm).
            attention_mask: mask with one entry per token; it is broadcast to
                the shape of the token embeddings.

        Returns:
            The mask-weighted sum over dimension 1 divided by the mask sum,
            with the divisor clamped to at least 1e-9 to avoid division by zero.
        """
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def forward(self, anchor, anchor_mask, target, target_mask):
        """Return the cosine similarity between pooled anchor and target embeddings.

        Args:
            anchor: token ids of the anchor phrases.
            anchor_mask: attention mask of the anchor phrases.
            target: token ids of the target phrases.
            target_mask: attention mask of the target phrases.

        Returns:
            One similarity value per pair in the batch.
        """
        # Each side always goes through its own encoder.
        embeddings_anchor = self.transformer_anchor(anchor, anchor_mask)
        embeddings_target = self.transformer_target(target, target_mask)

        return F.cosine_similarity(self.mean_pooling(embeddings_anchor, anchor_mask),
                                   self.mean_pooling(embeddings_target, target_mask))

    def training_step(self, train_batch, batch_idx):
        """Compute and log the training loss for one batch; return it."""
        return self.process_batch(*train_batch, 'train')

    def validation_step(self, val_batch, batch_idx):
        """Compute and log the validation loss for one batch (no gradients)."""
        with torch.no_grad():
            self.process_batch(*val_batch, 'val')

    def process_batch(self, anchor, anchor_mask, target, target_mask, target_scores, prefix):
        """Run the model on a batch and log ``<prefix>_loss``.

        Args:
            anchor, anchor_mask, target, target_mask: inputs of ``forward``.
            target_scores: ground-truth scores of the pairs.
            prefix: metric name prefix, ``'train'`` or ``'val'``.

        Returns:
            The loss tensor: the negative Pearson correlation between the
            predicted similarities and ``target_scores`` over the batch.
        """
        cos_similarity = self.forward(anchor, anchor_mask, target, target_mask)

        loss = - self.pearson_corr(cos_similarity, target_scores)

        self.log(prefix + '_loss', loss.item())

        return loss

    def pearson_corr(self, x, y):
        """Return the Pearson correlation of ``x`` and ``y`` along dimension 0.

        Computed as the cosine similarity of the mean-centred inputs
        (with ``eps=1e-6`` guarding against zero norms).
        """
        cos = nn.CosineSimilarity(dim=0, eps=1e-6)
        return cos(x - x.mean(dim=0, keepdim=True), y - y.mean(dim=0, keepdim=True))

    def configure_optimizers(self):
        """Return Adam (lr=1e-5) with a ReduceLROnPlateau schedule on ``val_loss``.

        The learning rate is multiplied by 1/5 after more than one epoch
        without improvement, down to a minimum of 1e-7.
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-5)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=1, factor=1/5, verbose=True, min_lr=1e-7)
        return [optimizer], {'scheduler': scheduler, 'monitor': 'val_loss'}

In [None]:
# Driver cell: either scores the test set with the saved checkpoints and
# writes submission.csv, or trains one model per fold.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Fixed seeds for the torch and numpy random generators.
torch.manual_seed(0)
np.random.seed(0)

# num_folds = 5
num_folds = 1

if IS_INFERENCE_TIME:

    # fold=-1 puts every test row into data.val_ds.
    data = PatentDataModule(DATA_PATH_TEST, -1)

    test_scores = np.zeros(len(data.val_ds), dtype=np.float32)

    for fold in range(num_folds):

        print('loading model for fold {} and scoring...'.format(fold))
        # os.path.join gives the same path as before but no longer depends on
        # MODEL_DIR_LOAD ending with a slash.
        checkpoint_path = os.path.join(MODEL_DIR_LOAD, 'sbert_{}.ckpt'.format(fold))
        model = PatentTransfomer.load_from_checkpoint(checkpoint_path)

        model.eval()
        model.to(device)

        fold_scores = []

        # Scoring needs no gradients; disabling autograd avoids building a
        # graph for every batch and lowers memory use.
        with torch.no_grad():
            for batch in DataLoader(data.val_ds, shuffle=False, batch_size=128):
                # The fifth element of the batch is the (NaN) score and is not needed.
                anchor, anchor_mask, target, target_mask = (t.to(device) for t in batch[:4])
                similarities = model(anchor, anchor_mask, target, target_mask)
                fold_scores.append(similarities.cpu().numpy())

        test_scores = test_scores + np.concatenate(fold_scores)

    # Average the predictions of all folds.
    test_scores = test_scores / num_folds

    print('submitting...')
    test_df = pd.read_csv(DATA_PATH_TEST)
    sub_df = pd.DataFrame(data={
        "id": test_df["id"],
        "score": test_scores
    })

    sub_df.to_csv("submission.csv", index=False)

else:

    # Make sure every output directory exists before anything is written.
    for directory in (TOKENIZER_DIR_SAVE, MODEL_DIR_SAVE, HUGGING_FACE_CACHE):
        os.makedirs(directory, exist_ok=True)

    for fold in range(num_folds):

        print('training model for fold {}...'.format(fold))

        # Keep only the checkpoint with the lowest validation loss.
        checkpoint_callback = ModelCheckpoint(
            dirpath=MODEL_DIR_SAVE,
            filename='sbert_{}'.format(fold),
            save_top_k=1,
            monitor='val_loss',
            mode='min',
            verbose=True
        )

        # Stop once val_loss has not improved for 5 validation checks.
        early_stop_callback = EarlyStopping(
            patience=5,
            monitor='val_loss',
            mode='min',
            verbose=True
        )

        trainer = pl.Trainer(logger=pl_loggers.TensorBoardLogger('./logs/'),
                             gpus=1,
                             max_epochs=10,
                             callbacks=[early_stop_callback, checkpoint_callback])

        # The data module is built before the model: it creates and saves the
        # extended tokenizer.
        data = PatentDataModule(DATA_PATH_TRAIN, fold)
        model = PatentTransfomer()
        trainer.fit(model, data)