In [None]:
!pip install accelerate -U
!pip install -U sentence-transformers

In [None]:
"""
This file fine-tunes a model with the Masked Language Modeling (MLM) objective. You provide a training file. Each line is interpreted as a sentence / paragraph.
Optionally, you can also provide a dev file.

The fine-tuned model is stored in the output/<model_name>-<timestamp> folder.

Usage:
python train_mlm.py model_name data/train_sentences.txt [data/dev_sentences.txt]
"""

from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
from transformers import Trainer, TrainingArguments
import sys
import gzip
from datetime import datetime
import torch

# NOTE: the command-line guard of the original script (`len(sys.argv) < 3`) was
# dropped. The model name and data paths are hardcoded in this notebook and
# sys.argv is never read for them, so the guard only inspected the arguments the
# process happened to be launched with and could call exit() for no reason.

##### Configuration

model_name = "sentence-transformers/all-mpnet-base-v2" #sys.argv[1]
per_device_train_batch_size = 16

save_steps = 1000  # Save model every 1k steps
num_train_epochs = 20  # Number of epochs
use_fp16 = False  # Set to True, if your GPU supports FP16 operations
max_length = 512  # Max length for a text input
do_whole_word_mask = True  # If set to true, whole words are masked
mlm_prob = 0.15  # Probability that a word is replaced by a [MASK] token

# Load the model together with its matching tokenizer
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Every run writes to its own timestamped folder; "/" in the model name is
# replaced so the name stays a single path component.
output_dir = "output/{}-{}".format(model_name.replace("/", "_"), datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
print("Save checkpoints to:", output_dir)


##### Load our training datasets

def load_sentences(path, min_length=10):
    """Read one sentence / paragraph per line from a text file.

    Args:
        path: Path to a UTF-8 text file. A path ending in ".gz" is opened
            with gzip, anything else as a plain file.
        min_length: Lines whose stripped length is below this many
            characters are skipped.

    Returns:
        A list with the stripped lines that passed the length filter, in
        file order.
    """
    opener = gzip.open if path.endswith(".gz") else open
    with opener(path, "rt", encoding="utf8") as fIn:
        stripped_lines = (line.strip() for line in fIn)
        return [line for line in stripped_lines if len(line) >= min_length]


train_path = "text.txt" #sys.argv[2]
train_sentences = load_sentences(train_path)

print("Train sentences:", len(train_sentences))

# NOTE(review): dev_path is the same file as train_path, so the reported eval
# loss is computed on the training sentences. Point this at a held-out file
# to get a meaningful dev score.
dev_path = "text.txt" #sys.argv[3]
dev_sentences = load_sentences(dev_path)

print("Dev sentences:", len(dev_sentences))


# A dataset wrapper, that tokenizes our data on-the-fly
class TokenizedSentencesDataset:
    """Sequence-style dataset that tokenizes sentences when they are accessed.

    Args:
        sentences: Indexable collection of texts. Entries that are not `str`
            are returned unchanged when caching is enabled (they are treated
            as already tokenized).
        tokenizer: Callable invoked as
            `tokenizer(text, add_special_tokens=True, truncation=True,
            max_length=..., return_special_tokens_mask=True)`.
        max_length: Value forwarded to the tokenizer as `max_length`.
        cache_tokenization: If True, the tokenizer output for each index is
            kept and reused on later accesses.

    The cache lives in a private dict, so the `sentences` collection passed in
    by the caller is never modified.
    """

    def __init__(self, sentences, tokenizer, max_length, cache_tokenization=False):
        self.tokenizer = tokenizer
        self.sentences = sentences
        self.max_length = max_length
        self.cache_tokenization = cache_tokenization
        # index -> tokenizer output; only filled when cache_tokenization is True
        self._cache = {}

    def _tokenize(self, text):
        """Run the tokenizer on one text with the settings used for every item."""
        return self.tokenizer(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            return_special_tokens_mask=True,
        )

    def __getitem__(self, item):
        """Return the tokenizer output for the entry at index `item`."""
        entry = self.sentences[item]

        if not self.cache_tokenization:
            return self._tokenize(entry)

        # Non-string entries are passed through untouched, as before.
        if not isinstance(entry, str):
            return entry

        if item not in self._cache:
            self._cache[item] = self._tokenize(entry)
        return self._cache[item]

    def __len__(self):
        """Number of entries in the underlying collection."""
        return len(self.sentences)


# The training set is created with the default cache_tokenization=False.
train_dataset = TokenizedSentencesDataset(train_sentences, tokenizer, max_length)

# The dev set is only created when there are dev sentences, and it is created
# with cache_tokenization=True.
if len(dev_sentences) > 0:
    dev_dataset = TokenizedSentencesDataset(dev_sentences, tokenizer, max_length, cache_tokenization=True)
else:
    dev_dataset = None


##### Training arguments

# Choose the collator that applies the masking: whole-word masking or
# per-token masking, both with the configured masking probability.
if do_whole_word_mask:
    data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm=True, mlm_probability=mlm_prob)
else:
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=mlm_prob)

# Evaluation, and therefore "best model" selection, is only possible when a
# dev set exists.
has_dev_set = dev_dataset is not None

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    evaluation_strategy="steps" if has_dev_set else "no",
    per_device_train_batch_size=per_device_train_batch_size,
    eval_steps=save_steps,
    save_steps=save_steps,
    logging_steps=save_steps,
    save_total_limit=3,
    prediction_loss_only=True,
    fp16=use_fp16,
    # load_best_model_at_end requires an evaluation strategy that matches the
    # save strategy; with evaluation switched off TrainingArguments raises a
    # ValueError, so only request it when a dev set is available.
    load_best_model_at_end=has_dev_set,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

# The tokenizer is written to the output folder before training starts.
print("Save tokenizer to:", output_dir)
tokenizer.save_pretrained(output_dir)

trainer.train()

# After training, the model weights are written to the same folder.
print("Save model to:", output_dir)
model.save_pretrained(output_dir)

print("Training done")

In [1]:
from sentence_transformers import SentenceTransformer, models, util


# Folder of a finished MLM fine-tuning run; the timestamp is specific to that
# run and has to be updated when the training cell is executed again.
finetuned_checkpoint_dir = "output/sentence-transformers_all-mpnet-base-v2-2024-02-13_17-18-11"

# Token-level encoder loaded from the fine-tuned checkpoint.
word_embedding_model = models.Transformer(finetuned_checkpoint_dir, max_seq_length=512)

# Pooling layer sized to the encoder's embedding dimension, otherwise using
# the library's default settings.
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of MPNetModel were not initialized from the model checkpoint at output/sentence-transformers_all-mpnet-base-v2-2024-02-13_17-18-11 and are newly initialized: ['mpnet.pooler.dense.bias', 'mpnet.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# pwd

'/data/Shrey-Jain/embed'

In [None]:
# model = SentenceTransformer("output/sentence-transformers_all-mpnet-base-v2-2024-02-13_12-54-59")

In [2]:
# Chain the transformer encoder and the pooling layer into one sentence encoder.
# NOTE(review): this rebinds `model`, the name the training cell uses for the
# masked-LM model.
pipeline_modules = [word_embedding_model, pooling_model]
model = SentenceTransformer(modules=pipeline_modules)


# Show the limit before and after setting it explicitly to 512.
print(f'model max lenght is :{model.max_seq_length}')
model.max_seq_length = 512
print(f'model max lenght is :{model.max_seq_length}')

model max lenght is :512
model max lenght is :512


In [3]:
sentences1 = 'davenport hits out at wimbledon world number one lindsay davenport has criticised wimbledon over the issue of equal prize money for women.  reacting to a disputed comment by all england club chairman tim phillips  the american said:  i think it is highly insulting if prize money is taken away.  somebody  i think it was mr phillips  said they won t have money for flowers at wimbledon. that s insulting.  an all england club spokesperson denied phillips made the remark  insisting:  he definitely didn t say it.  the statement added:  it was said by someone else and was a humorous aside at the end of a radio interview when the conversation had moved to talking about the wimbledon grounds.   davenport was speaking following the announcement that this week s dubai duty free event will join the us and australian opens in offering equal prize money for women.  you hear about women playing only three sets while men play five   said daveport.  and the best women are never going to beat the best men.  but it s a different game you go to watch with the women - it doesn t make it better or worse.  hopefully we will be able to change people s minds.   serena williams  who is also in dubai  added:  i m obviously for equal prize money.  women s tennis is exciting. men s tennis is exciting as well  but the women have it right now.  if you are bringing in the spectators you should be able to reap what everyone else is able to reap.'
sentences2 = 'safety alert as gm recalls cars the world s biggest carmaker general motors (gm) is recalling nearly 200 000 vehicles in the us on safety grounds  according to federal regulators.  the national highway traffic safety administration (nhtsa) said the largest recall involves 155 465 pickups  vans and sports utility vehicles (suvs). this is because of possible malfunctions with the braking systems. the affected vehicles in the product recall are from the 2004 and 2005 model years  gm said. those vehicles with potential faults are the chevrolet avalanche  express  kodiak  silverade and suburban; the gmc savana  sierra and yukon.  the nhtsa said a pressure accumulator in the braking system could crack during normal driving and fragments could injure people if the hood was open. this could allow hydraulic fluid to leak  which could make it harder to brake or steer and could cause a crash  it warned. gm is also recalling 19 924 cadillac xlr coupes  srx suvs and pontiac grand prix sedans from the 2004 model year. this is because the accelerator pedal may not work properly in extremely cold temperatures  requiring more braking. in addition  the car giant is calling back 17 815 buick raniers  chevrolet trailblazers  gmc envoys and isuzu ascenders from the 2005 model years because the windshield is not properly fitted and could fall out in a crash. however  gm stressed that it did not know of any injuries related to the problems. news of the recall follows an announcement last month that gm expects earnings this year be lower than in 2004. the world s biggest car maker is grappling with losses in its european business  weak us sales and now a product recall. in january  gm said higher healthcare costs in north america  and lower profits at its financial services subsidiary would hurt its performance in 2005.'

# Encode the two documents one after the other, each as a tensor.
embeddings1, embeddings2 = [
    model.encode(document, convert_to_tensor=True) for document in (sentences1, sentences2)
]

In [4]:
# Cosine similarity between the two document embeddings.
cosine_scores = util.cos_sim(embeddings1, embeddings2)

# Pull the single score out of the result as a Python number and print it.
pair_score = cosine_scores[0][0].item()
print(f'{pair_score}')

0.5434198379516602
