In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load data

In [2]:
# Load the saved CSV files
# data_csv_file = '/Users/rohit24/Projects/meta/full_data.csv'

# df = pd.read_csv(data_csv_file)

# label_2_id = {"__label__aggregate": 0, "__label__model_fields": 1, "__label__non_aggregate_or_model_data": 2}
# id_2_label = ["__label__aggregate", "__label__model_fields", "__label__non_aggregate_or_model_data"]
# df['label'] = df['target_label'].map(label_2_id)
# df.drop(columns=['target_label'], inplace=True)
# df.head(5)

# texts = df['combined_field_name_and_description']
# labels = df['label']

In [5]:
# IMDB sample CSV; the cell reads its 'review' and 'sentiment' columns.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
data_csv_file = '/Users/rohit24/Projects/rohit/pgpython/src/fast_text/data/IMDB_dataset_100.csv'

df = pd.read_csv(data_csv_file)

# Raw review strings, in file order.
texts = df['review'].to_list()

# Binary targets: 1 for "positive", 0 for every other sentiment value.
labels = [int(value == "positive") for value in df['sentiment']]

# BERT
Reference: https://medium.com/@khang.pham.exxact/text-classification-with-bert-7afaacc5e49b

In [7]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup # AdamW deprecated
from torch import optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Iterable of raw input strings (list, pandas Series, ...).
        labels: Iterable of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt",
            max_length=..., padding="max_length", truncation=True)`` whose
            result exposes ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Materialise both inputs as plain lists so __getitem__ always indexes
        # by position. A pandas Series (e.g. what train_test_split returns for
        # Series inputs) keeps its original index, so ``series[idx]`` would
        # look rows up by label and raise KeyError or return the wrong row.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with 1-D ``"input_ids"`` and ``"attention_mask"`` tensors and
            a scalar int64 ``"label"`` tensor.

        Raises:
            IndexError: If ``idx`` is negative or not smaller than ``len(self)``.
        """
        if idx < 0 or idx >= len(self.texts):
            raise IndexError(f"Index {idx} out of range (dataset size: {len(self.texts)})")
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        return {
            # The tokenizer returns a leading batch dimension of 1; drop it so
            # the DataLoader can stack items into (batch, max_length).
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            # Class-index targets for nn.CrossEntropyLoss must be int64, so pin
            # the dtype instead of inferring it from the label's Python type.
            "label": torch.tensor(label, dtype=torch.long),
        }

In [9]:
class BERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classifier.

    Args:
        bert_model_name: Identifier passed to ``BertModel.from_pretrained``.
        num_classes: Number of output logits produced per example.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        encoder = BertModel.from_pretrained(bert_model_name)
        self.bert = encoder
        self.dropout = nn.Dropout(p=0.1)
        # The head maps the encoder's hidden size to one logit per class.
        self.fc = nn.Linear(
            in_features=encoder.config.hidden_size,
            out_features=num_classes,
        )

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits for a batch of token ids."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Classify from the encoder's pooled output after dropout.
        return self.fc(self.dropout(encoded.pooler_output))

In [10]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch and return the mean per-sample loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        data_loader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, right after
            the optimizer.
        device: Device the batch tensors are moved to.

    Returns:
        float: Cross-entropy loss averaged over all samples seen this epoch,
        or ``nan`` if ``data_loader`` yielded no batches. Callers that ignore
        the return value are unaffected.
    """
    model.train()
    # Loop-invariant: build the loss module once rather than once per batch.
    criterion = nn.CrossEntropyLoss()
    loss_sum = 0.0
    sample_count = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        # The criterion returns the batch mean; weight it by the batch size so
        # a smaller final batch does not skew the epoch average.
        batch_size = labels.size(0)
        loss_sum += loss.item() * batch_size
        sample_count += batch_size
    return loss_sum / sample_count if sample_count else float("nan")

In [11]:
def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader`` without gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(accuracy, report)`` as produced by ``accuracy_score`` and
        ``classification_report`` over all collected labels and predictions.
    """
    model.eval()
    predicted, expected = [], []
    batch_keys = ('input_ids', 'attention_mask', 'label')
    with torch.no_grad():
        for batch in data_loader:
            token_ids, mask, targets = (batch[key].to(device) for key in batch_keys)
            logits = model(input_ids=token_ids, attention_mask=mask)
            # The predicted class is the index of the largest logit per row.
            predicted += logits.argmax(dim=1).cpu().tolist()
            expected += targets.cpu().tolist()
    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

In [12]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Classify a single text as ``"positive"`` or ``"negative"``.

    Args:
        text: Input string to classify.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        tokenizer: Callable producing ``'input_ids'`` and ``'attention_mask'``
            tensors for ``text``.
        device: Device the encoded tensors are moved to.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        str: ``"positive"`` when the predicted class index is 1, otherwise
        ``"negative"``.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    token_ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=token_ids, attention_mask=mask)
        predicted_class = logits.argmax(dim=1).item()

    # For a multi-class label set, look the label up by index instead
    # (e.g. id_2_label[predicted_class]).
    if predicted_class == 1:
        return "positive"
    return "negative"

In [20]:
# Set up parameters
# These names are read by the cells below; re-run those cells after editing a
# value here, otherwise objects built from the old value stay in the kernel.

# Checkpoint name handed to BertTokenizer/BertModel.from_pretrained.
bert_model_name = 'bert-base-uncased'
# Output size of the classifier head (labels are 0 = other, 1 = "positive").
num_classes = 2
# Tokenizer max_length used by TextClassificationDataset.
max_length = 128
# Examples per batch for both the training and the validation DataLoader.
batch_size = 16
# Passes over the training set; also sizes the LR schedule (total_steps).
num_epochs = 4
# Initial AdamW learning rate.
learning_rate = 2e-5

In [14]:
# Hold out 20% of the examples for validation; the fixed random_state keeps
# the split the same from run to run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [15]:
# One tokenizer instance is shared by both datasets and by later inference.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

train_dataset = TextClassificationDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)
val_dataset = TextClassificationDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)

# Only the training batches are shuffled; validation keeps dataset order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size)

In [16]:
# Seed PyTorch's global RNG before anything random happens. The nn.Linear head
# in BERTClassifier is randomly initialised, dropout is random, and the
# shuffled training DataLoader derives its order from the same RNG, so
# unseeded runs are not repeatable.
torch.manual_seed(42)  # same value as the train/validation split's random_state

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)

In [17]:
# One scheduler step is taken per batch, so the schedule spans every batch of
# every epoch.
total_steps = num_epochs * len(train_dataloader)

optimizer = optim.AdamW(params=model.parameters(), lr=learning_rate)

# Linear decay with no warm-up. Re-run this cell before training again so the
# optimizer state and the schedule both start fresh.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [21]:
# Train the model
# Guard against hidden notebook state: the linear schedule built in the
# optimizer cell decays the learning rate to 0 once its configured number of
# steps has been taken. Re-running this cell without rebuilding the optimizer
# and scheduler would then make every update a no-op.
# NOTE(review): identical validation metrics in every epoch are the typical
# symptom of this - confirm against the recorded run.
if all(group["lr"] == 0 for group in optimizer.param_groups):
    raise RuntimeError(
        "Learning rate is 0: the LR schedule is already exhausted. "
        "Re-run the model and optimizer/scheduler cells before training again."
    )

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    # Validate after every epoch so progress (or its absence) is visible.
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/4
Validation Accuracy: 0.3000
              precision    recall  f1-score   support

           0       0.33      0.56      0.42         9
           1       0.20      0.09      0.12        11

    accuracy                           0.30        20
   macro avg       0.27      0.32      0.27        20
weighted avg       0.26      0.30      0.26        20

Epoch 2/4
Validation Accuracy: 0.3000
              precision    recall  f1-score   support

           0       0.33      0.56      0.42         9
           1       0.20      0.09      0.12        11

    accuracy                           0.30        20
   macro avg       0.27      0.32      0.27        20
weighted avg       0.26      0.30      0.26        20

Epoch 3/4
Validation Accuracy: 0.3000
              precision    recall  f1-score   support

           0       0.33      0.56      0.42         9
           1       0.20      0.09      0.12        11

    accuracy                           0.30        20
   macro avg  

In [24]:
# Test sentiment prediction
test_text = "The movie was great and I really enjoyed the performances of the actors."
# Pass the configured max_length explicitly: predict_sentiment otherwise falls
# back to its own default, which would silently diverge from the length used
# for training if the configuration cell is changed.
sentiment = predict_sentiment(test_text, model, tokenizer, device, max_length=max_length)
print(f"Query text: {test_text}")
print(f"Predicted sentiment: {sentiment}")

Query text: The movie was great and I really enjoyed the performances of the actors.
Predicted sentiment: negative
