In [None]:
!pip install transformers langchain torch 
!pip install sentence-transformers
!pip install langchain-huggingface
!pip install -r ../requirements.txt
!pip freeze > ../requirements.txt


In [119]:
import torch
import langchain # No reason why not to use langchain
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW
# AdamW is an optimization algorithm used for better performance,  correct weight decay
# better generalization (avoiding overfitting)
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
# from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
# I have used cohere for embeddings before, now using HuggingFaceEmbeddings

In [120]:
# Sample Dataset
texts = ["I am a cool guy", 
         "AI will change this world", 
         "I hate phone scams",
         "I love cheerful people",
         "I love friendly people",
         "I hate rude people",
        ]
labels = [1, 1, 0, 1, 1, 0]  # 1: positive, 0: negative

In [175]:
# Tokenizer and Model
# using this model https://huggingface.co/google-bert/bert-base-uncased 
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertModel.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2,)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [153]:
# Tokenizing the inputs
max_len = 128
encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_len, return_tensors='pt')
output = model(**encodings)

In [154]:
# Train and test split
train_texts, validation_texts, train_labels, validation_labels = train_test_split(texts, labels, test_size=0.2)

In [155]:
# Tokenizing the inputs for the training set
train_encodings = tokenizer(train_texts, padding=True, truncation=True, max_length=max_len, return_tensors='pt')
validation_encodings = tokenizer(validation_texts, padding=True, truncation=True, max_length=max_len, return_tensors='pt')

# Create tensors for input ids, attention masks, and labels
train_input_ids = train_encodings['input_ids']
train_attention_masks = train_encodings['attention_mask']
train_labels_tensor = torch.tensor(train_labels)

validation_input_ids = validation_encodings['input_ids']
validation_attention_masks = validation_encodings['attention_mask']
validation_labels_tensor = torch.tensor(validation_labels)

In [156]:
# Check the tensors
print("Traning")
print(f"Input IDs: {train_input_ids}")
print(f"Attention Masks: {train_attention_masks}")
print(f"Labels: {train_labels_tensor}")

Traning
Input IDs: tensor([[  101,  1045,  2293, 18350,  2111,   102,     0],
        [  101,  1045,  2293,  5379,  2111,   102,     0],
        [  101,  1045,  5223, 12726,  2111,   102,     0],
        [  101,  1045,  5223,  3042,  8040, 13596,   102]])
Attention Masks: tensor([[1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1]])
Labels: tensor([1, 1, 0, 0])


In [157]:
# Check the tensors
print("Validation")
print(f"Input IDs: {validation_input_ids}")
print(f"Attention Masks: {validation_attention_masks}")
print(f"Labels: {validation_labels_tensor}")

Validation
Input IDs: tensor([[ 101, 1045, 2572, 1037, 4658, 3124,  102],
        [ 101, 9932, 2097, 2689, 2023, 2088,  102]])
Attention Masks: tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1]])
Labels: tensor([1, 1])


In [158]:
# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [176]:
# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
input_ids = input_ids.to(device)
attention_masks = attention_masks.to(device)
#labels_tensor = labels_tensor.to(device) # Not needed for BertModel

In [177]:
# Training loop
model.train()  # Set the model to training mode
epochs = 13  # Define the number of epochs
loss_fn = torch.nn.CrossEntropyLoss()  # Define loss function

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    
    # Forward pass
    optimizer.zero_grad()  # Clear previous gradients
    logits = model(input_ids=train_input_ids.to(device), attention_mask=train_attention_masks.to(device), labels=train_labels_tensor.to(device)).logits
    
    # Calculate loss
    loss = loss_fn(logits, train_labels_tensor.to(device))  # Actual loss calculation

    # Backward pass
    loss.backward()  # Calculate gradients
    optimizer.step()  # Update weights

    print(f"Loss: {loss.item()}")


Epoch 1/13
Loss: 0.656816840171814
Epoch 2/13
Loss: 0.7356657981872559
Epoch 3/13
Loss: 0.737143337726593
Epoch 4/13
Loss: 0.7125605940818787
Epoch 5/13
Loss: 0.6921197772026062
Epoch 6/13
Loss: 0.6855766773223877
Epoch 7/13
Loss: 0.6678639650344849
Epoch 8/13
Loss: 0.69581139087677
Epoch 9/13
Loss: 0.6950128674507141
Epoch 10/13
Loss: 0.7002564072608948
Epoch 11/13
Loss: 0.709318220615387
Epoch 12/13
Loss: 0.7273451685905457
Epoch 13/13
Loss: 0.762508749961853


In [181]:
# Save the model if needed
model.save_pretrained("./fine_tuned_bert")
tokenizer.save_pretrained("./fine_tuned_bert")

('./fine_tuned_bert/tokenizer_config.json',
 './fine_tuned_bert/special_tokens_map.json',
 './fine_tuned_bert/vocab.txt',
 './fine_tuned_bert/added_tokens.json')

In [182]:

# Load the fine-tuned model with LangChain
hf_embedding = HuggingFaceEmbeddings(model_name="./fine_tuned_bert")


In [183]:
# Example sentence embeddings with LangChain
sentence = "I love using BERT for transfer learning."
embedding = hf_embedding.embed_query(sentence)
print("Sentence Embedding:", embedding)

Sentence Embedding: [0.34589746594429016, 0.36306294798851013, 0.0735308825969696, 0.17232845723628998, 0.25641950964927673, -0.6112189292907715, 0.18540552258491516, 0.6184269785881042, 0.028799623250961304, -0.3970279097557068, 0.06879080832004547, -0.38036468625068665, 0.2705947756767273, 0.4087296426296234, -0.11404649913311005, -0.05419629067182541, 0.06095239520072937, 0.056249797344207764, 0.13045738637447357, 0.2099624127149582, -0.13728414475917816, -0.0386010967195034, 0.0044868155382573605, 0.5139421820640564, 0.3674030900001526, -0.05744873732328415, -0.09730406105518341, 0.03610280528664589, -0.13072578608989716, -0.33737850189208984, 0.24164095520973206, 0.19513057172298431, 0.016824020072817802, 0.0882086306810379, -0.4711914658546448, -0.26859337091445923, -0.3821861445903778, -0.2459501475095749, -0.3571707606315613, -0.1687784492969513, -0.18893380463123322, -0.10749228298664093, -0.030550247058272362, -0.28504541516304016, 0.22028732299804688, -0.2861773669719696, -0