In [1]:
from google.colab import drive
drive.mount('/content/drive')

path = '/content/drive/MyDrive/semantic_search/'

Mounted at /content/drive


In [2]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `FLP` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `FLP`


# Finetune alea-institute/kl3m-doc-small-uncased-001 with chunked_opinion:relevant:irrelevant query triples

1. The opinions were pre-chunked to at most 512 tokens
2. The queries were generated using gpt-4o
3. The finetuning is run in Google Colab with a T4 GPU

# Import Libraries

In [3]:
import pandas as pd

%pip install datasets -q
from datasets import Dataset

from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/485.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/194.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25h

# Load the data

In [4]:
train = pd.read_csv(f"{path}outputs/5.finetune_train_512.csv")
val = pd.read_csv(f"{path}outputs/5.finetune_val_512.csv")

# Take a look at the token count distribution

In [5]:
train["chunk_size"].describe()

Unnamed: 0,chunk_size
count,2828.0
mean,432.561174
std,75.62818
min,30.0
25%,435.0
50%,458.0
75%,471.0
max,480.0


In [6]:
val["chunk_size"].describe()

Unnamed: 0,chunk_size
count,489.0
mean,416.613497
std,91.049189
min,22.0
25%,418.0
50%,451.0
75%,468.0
max,480.0


# Generate dataset objects

In [7]:
cols = ["chunked_opinion", "relevant", "irrelevant"]
train = train[cols]
val = val[cols]

In [8]:
renames = {"chunked_opinion":"anchor", "relevant":"positive", "irrelevant":"negative"}
train = train.rename(columns=renames)
val = val.rename(columns=renames)

In [9]:
train_dataset = Dataset.from_pandas(train, split="train")
eval_dataset = Dataset.from_pandas(val, split="test")

In [10]:
train_dataset

Dataset({
    features: ['anchor', 'positive', 'negative'],
    num_rows: 2828
})

In [11]:
eval_dataset

Dataset({
    features: ['anchor', 'positive', 'negative'],
    num_rows: 489
})

# Train the model

In [12]:
BASE_MODEL = "alea-institute/kl3m-doc-small-uncased-001"
BASE_MODEL_NAME = BASE_MODEL.split("/")[-1]

In [13]:
# Load the model
model = SentenceTransformer(
    BASE_MODEL,
    model_card_data=SentenceTransformerModelCardData(
        language="en",
        license="apache-2.0",
        model_name=f"{BASE_MODEL_NAME} trained on triplets",
    )
)

# Define loss function
loss = MultipleNegativesRankingLoss(model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/949M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at alea-institute/kl3m-doc-small-uncased-001 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/12.0M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/987 [00:00<?, ?B/s]

In [14]:
# Specify training arguments
args = SentenceTransformerTrainingArguments(
    output_dir=f"{path}models/{BASE_MODEL_NAME}",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=True,
    bf16=False,
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    report_to="none"

)

In [15]:
# Evaluate the base model before finetuning
dev_evaluator = TripletEvaluator(
    anchors=eval_dataset["anchor"],
    positives=eval_dataset["positive"],
    negatives=eval_dataset["negative"],
    name="dev",
)
dev_evaluator(model)

{'dev_cosine_accuracy': 0.9284253716468811}

In [16]:
# Create a trainer & train
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
    evaluator=dev_evaluator,

)
trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Dev Cosine Accuracy
100,No log,0.435292,0.99182
200,No log,0.375966,0.993865
300,No log,0.336145,0.99182


TrainOutput(global_step=354, training_loss=0.3751106046687412, metrics={'train_runtime': 277.5364, 'train_samples_per_second': 10.19, 'train_steps_per_second': 1.276, 'total_flos': 0.0, 'train_loss': 0.3751106046687412, 'epoch': 1.0})

In [17]:
# Evaluate the finetuned model
dev_evaluator = TripletEvaluator(
    anchors=eval_dataset["anchor"],
    positives=eval_dataset["positive"],
    negatives=eval_dataset["negative"],
    name="dev",
)
dev_evaluator(model)

{'dev_cosine_accuracy': 0.9938650131225586}

In [18]:
# Save the trained model
model.save_pretrained(f"{path}models/{BASE_MODEL_NAME}/final")

In [19]:
# Push the trained model to hub
model.push_to_hub(f"{BASE_MODEL_NAME}_triples", private=True)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/12.0M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

'https://huggingface.co/rachelFLP/kl3m-doc-small-uncased-001_triples/commit/c92ddc606a4a61042b2303b1000f524d5bf9631e'