In [None]:
!pip install transformers

In [8]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

# Load and preprocess your dataset
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(
            texts, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt'
        )
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Load the pretrained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)  # Change num_labels as needed

your_actual_text_data = [
    "This is a positive example.",
    "Another positive text.",
    "A negative example with a negative sentiment.",
    "This is a neutral sentence.",
    "More positive text.",
    "Yet another example with negative sentiment.",
    "I am doing good today",
    "Why dont you get out of my house?",
    "I went to get a haircut",
    "This cat is 1 year old",
    "Dont call me anymore",
]

# Example labels (0 for negative, 1 for neutral, 2 for positive)

your_actual_label_data = [2, 2, 0, 1, 2, 0, 2, 0, 1, 1,0]
train_texts, val_texts, train_labels, val_labels = train_test_split(
    your_actual_text_data, your_actual_label_data, test_size=0.2, random_state=42
)

# Create DataLoaders for training and validation
train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Define the optimizer and training loop
optimizer = AdamW(model.parameters(), lr=2e-5)

num_epochs = 3  # Adjust the number of epochs as needed

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        inputs = {key: batch[key].to(model.device) for key in batch}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            inputs = {key: batch[key].to(model.device) for key in batch}
            outputs = model(**inputs)
            loss = outputs.loss
            val_loss += loss.item()
            total += inputs['labels'].size(0)
            _, predicted = torch.max(outputs.logits, 1)
            correct += (predicted == inputs['labels']).sum().item()

    val_loss /= len(val_loader)
    accuracy = correct / total

    print(f"Epoch {epoch + 1}/{num_epochs}: Validation Loss: {val_loss}, Accuracy: {100 * accuracy:.2f}%")

# Save the fine-tuned model
model.save_pretrained("fine-tuned-bert-model")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3: Validation Loss: 1.0844193696975708, Accuracy: 33.33%
Epoch 2/3: Validation Loss: 1.0836212635040283, Accuracy: 33.33%
Epoch 3/3: Validation Loss: 1.0797704458236694, Accuracy: 33.33%
