### Fine-tune Sentence Transformer

In this notebook, we will go through the process of fine-tuning a sentence transformer. I wrote the `FineTuneSentenceTransfomer` class defined below to make that process easier.

In [111]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sklearn.model_selection import train_test_split
import pandas as pd


### Cell 2 - FineTuneSentenceTransformer class

The class takes a dataframe of startups and a dataframe of industries and fine-tunes the sentence transformer on the descriptions of the startups and the keywords of the industries. It is a copy of the `FineTuneSentenceTransformer` class in `src/models/training.py`. This code has a bug that I discovered too late. I let the model train anyway, and you can see the resulting output in the CSV file `models/fine_tuned_sentence_transformer_1/eval/similarity_evaluation_results.csv`. I'm not sure what went wrong, but I can fix it if need be. It's currently not a priority, since this step was more or less a bonus.

The basic steps are as follows:

- Merge the startups and industries dataframes
- Filter out industries that have fewer than 3 startups
- Split the data into train, validation, and test sets, and stratify the split by the keywords
- Prepare the examples for the dataloader in the format of sentence1, sentence2
- Prepare the dataloader
- Prepare the evaluator for the validation set (sentence1, sentence2, score) where score is 1 for all examples
- Prepare the loss
- Train the model

In [103]:
class FineTuneSentenceTransfomer:
    """Fine-tune a sentence transformer on (startup description, industry keywords) pairs.

    Typical usage::

        tuner = FineTuneSentenceTransfomer(startups, industries)
        tuner.split_data()
        examples = tuner.prepare_examples(tuner.descriptions_train, tuner.keywords_train)
        dataloader = tuner.prepare_dataloader(examples)
        tuner.prepare_evaluator()
        tuner.train(dataloader, tuner.prepare_loss())

    Attributes set by the methods:
        model: the loaded ``SentenceTransformer``.
        data / merged_df: the merged, filtered training frame.
        loss: the training loss module.
        evaluator: the validation evaluator, or ``None`` until ``prepare_evaluator`` runs.
        descriptions_* / keywords_*: the train / val / test splits (after ``split_data``).
    """

    def __init__(self, startups, industries, label_count_threshold=2, sentence_transformer='sentence-transformers/all-MiniLM-L6-v2'):
        """Load the base model and build the merged training frame.

        Args:
            startups: DataFrame with at least ``id``, ``cb_description`` and ``industry1``.
            industries: DataFrame with at least ``id``, ``industry`` and ``keywords``.
            label_count_threshold: industries with this many rows or fewer are dropped.
            sentence_transformer: model name or path passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(sentence_transformer)
        self.data = self.merge_features(startups, industries, label_count_threshold)
        self.loss = losses.MultipleNegativesRankingLoss(self.model)
        # Stays None until prepare_evaluator() is called; train() tolerates that.
        self.evaluator = None

    def merge_features(self, startups, industries, label_count_threshold=3):
        """Join startups with their industry keywords and drop rare industries.

        The input frames are NOT modified (the previous implementation dropped
        rows from the caller's frames in place).

        Args:
            startups: DataFrame with ``id``, ``cb_description`` and ``industry1`` columns.
            industries: DataFrame with ``id``, ``industry`` and ``keywords`` columns.
            label_count_threshold: keep only industries with strictly more rows
                than this. Note the default here (3) differs from the value
                ``__init__`` passes by default (2).

        Returns:
            DataFrame with columns ``description``, ``industry``, ``keywords``
            and ``industry_id``; also stored on ``self.merged_df``.
        """
        startups_clean = startups.dropna()
        industries_clean = industries.dropna()

        merged = pd.merge(startups_clean, industries_clean, left_on='industry1', right_on='industry', how='left')
        # Rows whose industry has no match carry NaN keywords / id and are dropped here.
        merged = merged[['cb_description', 'industry1', 'keywords', 'id_y']].dropna()
        merged = merged.groupby('industry1').filter(lambda group: len(group) > label_count_threshold)
        # The left merge can upcast the id to float; assign() avoids writing into a slice.
        merged = merged.assign(id_y=merged['id_y'].astype(int))

        self.merged_df = merged.rename(columns={'industry1': 'industry', 'cb_description': 'description', 'id_y': 'industry_id'})
        return self.merged_df

    def split_data(self):
        """Split ``self.merged_df`` into train / validation / test sets, stratified by keywords.

        15% of the rows go to the test set; 17.65% of the remainder (about 15%
        of the total) goes to the validation set. All six series are stored on
        ``self``.

        Returns:
            Tuple ``(keywords_train, keywords_val, keywords_test,
            descriptions_train, descriptions_val, descriptions_test)``.
        """
        descriptions = self.merged_df['description']
        keywords = self.merged_df['keywords']

        descriptions_train, self.descriptions_test, keywords_train, self.keywords_test = train_test_split(
            descriptions, keywords, test_size=0.15, random_state=42, stratify=keywords)
        self.descriptions_train, self.descriptions_val, self.keywords_train, self.keywords_val = train_test_split(
            descriptions_train, keywords_train, test_size=0.1765, random_state=42, stratify=keywords_train)

        return self.keywords_train, self.keywords_val, self.keywords_test, self.descriptions_train, self.descriptions_val, self.descriptions_test

    def prepare_examples(self, descriptions, keywords):
        """Build one ``InputExample`` per (description, keywords) pair.

        The two inputs are paired by position with ``zip``. The previous
        ``descriptions[i]`` lookup was label-based on a pandas Series, so it
        raised ``KeyError`` (or paired the wrong rows) once ``train_test_split``
        had shuffled the index. Both texts go into ``texts`` because the loss
        consumes (anchor, positive) pairs; the keywords are not a label.

        Args:
            descriptions: iterable of description strings.
            keywords: iterable of keyword strings, aligned with ``descriptions``.

        Returns:
            List of ``InputExample`` objects with ``texts=[description, keywords]``.
        """
        return [InputExample(texts=[description, keyword]) for description, keyword in zip(descriptions, keywords)]

    def prepare_dataloader(self, examples, batch_size=16):
        """Wrap the examples in a shuffling ``DataLoader``."""
        return DataLoader(examples, shuffle=True, batch_size=batch_size)

    def prepare_evaluator(self):
        """Build an ``EmbeddingSimilarityEvaluator`` for the validation split.

        Requires ``split_data`` to have been called. Every validation
        description is paired with its own keywords (gold score 1.0) and with
        the keywords of a different industry (gold score 0.0). The negatives
        are needed because the evaluator reports a correlation between gold
        scores and cosine similarities, which is undefined (NaN) when all gold
        scores are equal; a NaN score never beats the best score, so no "best
        model" would be saved. If every validation row shares the same
        keywords, no negative can be built and the score is still undefined.

        Returns:
            The evaluator, also stored on ``self.evaluator``.
        """
        val_descriptions = list(self.descriptions_val)
        val_keywords = list(self.keywords_val)

        val_sentences1 = list(val_descriptions)
        val_sentences2 = list(val_keywords)
        val_scores = [1.0] * len(val_sentences1)

        # Deterministic negatives: the next row (cyclically) whose keywords differ.
        n_val = len(val_keywords)
        for i, description in enumerate(val_descriptions):
            for step in range(1, n_val):
                other_keywords = val_keywords[(i + step) % n_val]
                if other_keywords != val_keywords[i]:
                    val_sentences1.append(description)
                    val_sentences2.append(other_keywords)
                    val_scores.append(0.0)
                    break

        self.evaluator = EmbeddingSimilarityEvaluator(val_sentences1, val_sentences2, val_scores)

        return self.evaluator

    def prepare_loss(self):
        """Create the training loss and store it on ``self.loss``.

        Uses ``MultipleNegativesRankingLoss``, the same loss ``__init__`` sets
        up. The data consists of positive (description, keywords) pairs without
        similarity labels, which is the input this loss expects. The previous
        ``CosineSimilarityLoss`` needs a float target per pair, and the
        unlabelled examples would all have been trained towards the default
        label of 0.

        Returns:
            The loss module.
        """
        self.loss = losses.MultipleNegativesRankingLoss(self.model)
        return self.loss

    def train(self,
              train_dataloader,
              train_loss,
              epochs=4,
              output_path=r'C:\Users\imran\DataspellProjects\WalidCase\models',
              warmup_steps=100,
              evaluation_steps=100,
              weight_decay=0.01,
              max_grad_norm=1.0,
              save_best_model=True,
              checkpoint_save_steps=100,
              checkpoint_path=r'C:\Users\imran\DataspellProjects\WalidCase\models\checkpoint'
              ):
        """Run ``SentenceTransformer.fit`` with the given dataloader and loss.

        All keyword arguments are forwarded unchanged to ``fit``. The default
        ``output_path`` and ``checkpoint_path`` are machine-specific absolute
        paths; pass your own when running elsewhere.

        Returns:
            The fine-tuned model (``self.model``).
        """
        # Fall back to "no evaluator" instead of raising AttributeError when
        # prepare_evaluator() has not been called.
        evaluator = getattr(self, 'evaluator', None)

        self.model.fit(train_objectives=[(train_dataloader, train_loss)],
                       evaluator=evaluator,
                       epochs=epochs,
                       evaluation_steps=evaluation_steps,
                       warmup_steps=warmup_steps,
                       output_path=output_path,
                       weight_decay=weight_decay,
                       checkpoint_path=checkpoint_path,
                       checkpoint_save_steps=checkpoint_save_steps,
                       max_grad_norm=max_grad_norm,
                       save_best_model=save_best_model,
                       )
        return self.model


# Correctly spelled alias; the original (misspelled) name is kept for existing callers.
FineTuneSentenceTransformer = FineTuneSentenceTransfomer





In [93]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Read both inputs and discard incomplete rows right away.
startups = pd.read_csv(r'C:\Users\imran\DataspellProjects\WalidCase\data\tagged\tagged_with_sentence_transformer.csv')
industries = pd.read_csv(r'C:\Users\imran\DataspellProjects\WalidCase\data\processed\industries_clean.csv', sep='\t')
industries.dropna(inplace=True)
startups.dropna(inplace=True)

# Attach the industry keywords to every startup, keep industries with more
# than two rows, and reduce the frame to the four columns used for training.
merged_df = (
    pd.merge(startups, industries, left_on='industry1', right_on='industry', how='left')
    .groupby('industry1')
    .filter(lambda group: len(group) > 2)
    .loc[:, ['cb_description', 'industry1', 'keywords', 'id_y']]
    .dropna()
    .astype({'id_y': int})
    .rename(columns={'industry1': 'industry', 'cb_description': 'description', 'id_y': 'industry_id'})
)
merged_df

Unnamed: 0,description,industry,keywords,industry_id
0,modular knowledge system identity credential m...,cybersecurity,access malware encryption authentication firew...,13
1,create revolutionary dna sequence technology h...,biotech,vaccine drug cell pharmaceutical genetic engin...,23
2,commit positive luxury skincare push boundary ...,beauty,cosmetic makeup fragrance haircare tech skinca...,84
3,internet company offer marketplace rare desira...,fashion,clothing apparel retail style trend commerce,64
4,application allow user play match favorite vid...,esports,streaming competition game virtual tournament ...,4
...,...,...,...,...
2706,believe great interior design accessible avail...,food & beverage,restaurant beverage catering foodtech,120
2707,give ability manage advance payment request au...,professional services,support strategy account human finance consult...,7
2708,mission improve protein production disruptive ...,longevity,age lifespan genomic anti healthspan extension...,15
2709,solve key step document base process receive d...,logistics,ship fulfillment warehousing tracking,45


# Split the data into train, validation, and test sets


In [101]:
descriptions, keywords = merged_df['description'], merged_df['keywords']

# First cut: hold out 15% of all rows as the test set, stratified by keywords.
(descriptions_train, descriptions_test,
 keywords_train, keywords_test) = train_test_split(
    descriptions,
    keywords,
    test_size=0.15,
    random_state=42,
    stratify=keywords,
)

# Second cut: 17.65% of the remaining 85% (about 15% of the total) becomes the
# validation set; the train names are overwritten with the reduced split.
(descriptions_train, descriptions_val,
 keywords_train, keywords_val) = train_test_split(
    descriptions_train,
    keywords_train,
    test_size=0.1765,
    random_state=42,
    stratify=keywords_train,
)


ValueError: too many values to unpack (expected 2)

Save the data for reproducibility

In [99]:
# Write every split to its own CSV (no index column) so the exact
# train / validation / test partition can be reloaded later.
train_sets_dir = r'C:\Users\imran\DataspellProjects\WalidCase\data\train_sets/'
splits_by_name = {
    'descriptions_train': descriptions_train,
    'descriptions_val': descriptions_val,
    'descriptions_test': descriptions_test,
    'keywords_train': keywords_train,
    'keywords_val': keywords_val,
    'keywords_test': keywords_test,
}
for split_name, split_series in splits_by_name.items():
    split_series.to_csv(train_sets_dir + split_name + '.csv', index=False)

In [112]:
# Turn each split into InputExample objects whose two texts are the startup
# description and the keywords of its industry (paired by position).

train_examples = [
    InputExample(texts=list(pair))
    for pair in zip(descriptions_train, keywords_train)
]
test_examples = [
    InputExample(texts=list(pair))
    for pair in zip(descriptions_test, keywords_test)
]
val_examples = [
    InputExample(texts=list(pair))
    for pair in zip(descriptions_val, keywords_val)
]

# Sanity check: sizes of the three splits.
len(train_examples), len(test_examples), len(val_examples)


(1603, 344, 344)

In [113]:
# Training batches are reshuffled every epoch; validation order stays fixed.
train_dataloader = DataLoader(
    train_examples,
    batch_size=16,
    shuffle=True,
)
val_dataloader = DataLoader(
    val_examples,
    batch_size=16,
    shuffle=False,
)

In [114]:
# Positive pairs: every validation description with its own industry keywords.
val_descriptions = list(descriptions_val)
val_keywords = list(keywords_val)

val_sentences1 = list(val_descriptions)
val_sentences2 = list(val_keywords)
val_scores = [1.0] * len(val_sentences1)

# Negative pairs: the same description with the keywords of a different
# industry (the next validation row, cyclically, whose keywords differ).
# EmbeddingSimilarityEvaluator reports a correlation between the gold scores
# and the cosine similarities; with only 1s as gold scores that correlation is
# undefined (NaN), and a NaN score never beats the best score, so no best
# model would be saved.
n_val = len(val_keywords)
for i, description in enumerate(val_descriptions):
    for step in range(1, n_val):
        other_keywords = val_keywords[(i + step) % n_val]
        if other_keywords != val_keywords[i]:
            val_sentences1.append(description)
            val_sentences2.append(other_keywords)
            val_scores.append(0.0)
            break

evaluator = EmbeddingSimilarityEvaluator(val_sentences1, val_sentences2, val_scores)

In [116]:
# Start from the pretrained MiniLM checkpoint and train on the
# (description, keywords) pairs with MultipleNegativesRankingLoss.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
train_loss = losses.MultipleNegativesRankingLoss(model=model)

# NOTE(review): the progress bar shows 101 batches per epoch, so
# evaluation_steps=500 is presumably never reached inside an epoch - confirm
# whether evaluating only at epoch end is intended.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=4,
    warmup_steps=2,
    evaluator=evaluator,
    evaluation_steps=500,
    output_path=r'C:\Users\imran\DataspellProjects\WalidCase\models\finetuned_sentence_transformer_1',
)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/101 [00:00<?, ?it/s]


KeyboardInterrupt

