In [4]:
# Install necessary libraries
%pip install contractions
%pip install transformers tokenizers torch scikit-learn

# Import required libraries
import re  # Used for removing whitespaces, punctuation, capitalization
import contractions  # Used for expanding contractions
import nltk  # Used to remove stop words
import pandas as pd
import torch
import math
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from nltk.corpus import stopwords
from tokenizers import BertWordPieceTokenizer
#from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Download NLTK stopwords 
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))

###### STEP 1: LOAD DATASET ######
data = pd.read_csv("/Users/sachin/Desktop/text.csv")  

print(data.head())

print("\nChecking for missing values:")
print(data.isnull().sum())

# Drop unnecessary columns (e.g., index/serial number)
if 'label' in data.columns and 'text' in data.columns:
    print("\nDropping unnecessary index column...")
    data = data[['label', 'text']]  # Retain only 'label' and 'text'


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sachin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sachin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


   Unnamed: 0                                               text  label
0           0      i just feel really helpless and heavy hearted      4
1           1  ive enjoyed being able to slouch about relax a...      0
2           2  i gave up my internship with the dmrg and am f...      4
3           3                         i dont know i feel so lost      0
4           4  i am a kindergarten teacher and i am thoroughl...      4

Checking for missing values:
Unnamed: 0    0
text          0
label         0
dtype: int64

Dropping unnecessary index column...


In [6]:
print(data.head())

   label                                               text
0      4      i just feel really helpless and heavy hearted
1      0  ive enjoyed being able to slouch about relax a...
2      4  i gave up my internship with the dmrg and am f...
3      0                         i dont know i feel so lost
4      4  i am a kindergarten teacher and i am thoroughl...


In [8]:
###PREPROCESSING###

# Initialize tokenizer 
tokenizer = BertWordPieceTokenizer()

# Train tokenizer on your dataset (if required)
tokenizer.train(
    files=["/Users/sachin/Desktop/text.csv"],  # Path to your dataset file
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
)
tokenizer.save_model("/Users/sachin/Documents")
tokenizer = BertWordPieceTokenizer("/Users/sachin/Documents/vocab.txt")

# Define maximum sequence length
MAX_SEQ_LEN = 20

# Text cleaning function
def textCleaner(s):
    if not isinstance(s, str) or not s.strip():  # Handle empty or invalid input
        return [tokenizer.token_to_id("[PAD]")] * MAX_SEQ_LEN  # Return padded sequence
    
    # Clean the text
    s = re.sub(r'[^A-Za-z0-9\s]', '', s).lower()  # Remove non-alphanumeric characters and convert to lowercase
    s = contractions.fix(s)  # Expand contractions (e.g., "don't" -> "do not")
    s = re.sub(r'\s+', ' ', s).strip()  # Remove extra whitespaces and strip leading/trailing spaces
    s = re.sub(r'\d+', '', s)  # Remove numbers from text

    # Tokenize and convert to token IDs
    tokens = tokenizer.encode(s).ids  # Convert tokens to token IDs

    # Pad or truncate to MAX_SEQ_LEN
    if len(tokens) < MAX_SEQ_LEN:
        tokens += [tokenizer.token_to_id("[PAD]")] * (MAX_SEQ_LEN - len(tokens))  # Pad with [PAD]
    else:
        tokens = tokens[:MAX_SEQ_LEN]  # Truncate to MAX_SEQ_LEN

    return tokens


data['text'] = data['text'].apply(textCleaner)


print("\nPreprocessed text examples:")
print(data['text'].head())





Preprocessed text examples:
0    [2, 24, 187, 82, 230, 1029, 91, 3662, 6298, 3,...
1    [2, 24, 146, 2957, 294, 751, 89, 13210, 324, 1...
2    [2, 24, 2361, 217, 109, 12825, 148, 88, 19149,...
3    [2, 24, 173, 158, 220, 24, 82, 131, 939, 3, 0,...
4    [2, 24, 137, 16, 13908, 3305, 91, 24, 137, 567...
Name: text, dtype: object


In [9]:

###### STEP 3: DATASET CLASS ######

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.texts[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        # Create attention mask: 1 for non-padding tokens, 0 for padding tokens
        attention_mask = (input_ids != tokenizer.token_to_id("[PAD]")).long()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "label": label
        }


###### STEP 4: SPLIT DATA INTO TRAINING AND VALIDATION SETS ######

texts = list(data['text'])
labels = list(data['label'])

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42)

train_dataset = TextDataset(train_texts, train_labels)
val_dataset = TextDataset(val_texts, val_labels)

###### STEP 5: CUSTOM COLLATE FUNCTION FOR PADDING ######

def collate_fn(batch):
    # Extract input_ids, attention_masks, and labels from the batch
    input_ids = [item["input_ids"] for item in batch]
    attention_masks = [item["attention_mask"] for item in batch]
    labels = [item["label"] for item in batch]

    # Pad sequences
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.token_to_id("[PAD]"))
    attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0)  # Padding value for mask is 0
    labels = torch.tensor(labels, dtype=torch.long)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "label": labels
    }

###### STEP 6: DATALOADERS ######

BATCH_SIZE = 16

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
for batch in train_loader:
    print(batch["input_ids"].shape)       # Should be [batch_size, seq_len]
    print(batch["attention_mask"].shape) # Should be [batch_size, seq_len]
    print(batch["label"].shape)          # Should be [batch_size]
    break



torch.Size([16, 20])
torch.Size([16, 20])
torch.Size([16])


In [10]:
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, num_classes, ff_dim):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(embed_dim, num_heads, ff_dim)
            for _ in range(num_layers)
        ])
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, x, attention_mask=None):
        """
        Args:
            x (torch.Tensor): Input token IDs of shape [batch_size, seq_len].
            attention_mask (torch.Tensor): Attention mask of shape [batch_size, seq_len].
                                             1 for valid tokens, 0 for padding tokens.
        
        Returns:
            torch.Tensor: Output logits of shape [batch_size, num_classes].
        """
        # Embed the input token IDs
        x = self.embedding(x)  # Shape: [batch_size, seq_len, embed_dim]

        # Permute dimensions for TransformerEncoderLayer (expects [seq_len, batch_size, embed_dim])
        x = x.permute(1, 0, 2)  # Shape: [seq_len, batch_size, embed_dim]

        # Apply each Transformer layer
        for layer in self.layers:
            x = layer(x, src_key_padding_mask=attention_mask == 0)  # Directly pass binary mask

        # Permute back to [batch_size, seq_len, embed_dim]
        x = x.permute(1, 0, 2)  # Shape: [batch_size, seq_len, embed_dim]

        # Pooling: Take mean over sequence length
        x = x.mean(dim=1)  # Shape: [batch_size, embed_dim]

        # Classification head
        return self.classifier(x)  # Shape: [batch_size, num_classes]
    


In [11]:
# Define constants for the Transformer model
VOCAB_SIZE = 30000          # Vocabulary size (number of unique tokens in your tokenizer)
EMBEDDING_SIZE = 512        # Size of the embedding vector for each token
NUM_CLASSES = 6             # Number of output classes (e.g., emotions: sadness, joy, etc.)
NUM_HEADS = 8               # Number of attention heads in each Transformer layer
NUM_LAYERS = 6              # Number of Transformer encoder layers
FF_DIM = 2048               # Size of the feedforward network inside each Transformer layer

# Initialize the model
model = TransformerModel(
    vocab_size=VOCAB_SIZE,
    embed_dim=EMBEDDING_SIZE,
    num_heads=NUM_HEADS,
    num_layers=NUM_LAYERS,
    num_classes=NUM_CLASSES,
    ff_dim=FF_DIM,
)

# Move the model to the appropriate device (e.g., MPS for Mac GPU)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

print("Model initialized and moved to device:", device)


Model initialized and moved to device: mps


In [12]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=5):
    model.to(device)  # Ensure the model is on the correct device
    train_loss_history = []
    val_loss_history = []

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        total_train_loss = 0

        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)         # Move input tensors to device
            attention_mask = batch["attention_mask"].to(device)  # Move attention masks to device
            labels = batch["label"].to(device)               # Move labels to device

            optimizer.zero_grad()                            # Clear previous gradients
            outputs = model(input_ids, attention_mask=attention_mask)  # Forward pass
            loss = criterion(outputs, labels)               # Compute loss
            loss.backward()                                  # Backpropagation
            optimizer.step()                                 # Update model parameters

            total_train_loss += loss.item()                  # Accumulate training loss

        avg_train_loss = total_train_loss / len(train_loader)
        train_loss_history.append(avg_train_loss)
        print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}")

        # Validation phase
        model.eval()  # Set model to evaluation mode
        total_val_loss = 0

        with torch.no_grad():  # Disable gradient computation during validation
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["label"].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)  # Forward pass
                loss = criterion(outputs, labels)                         # Compute loss

                total_val_loss += loss.item()                             # Accumulate validation loss

        avg_val_loss = total_val_loss / len(val_loader)
        val_loss_history.append(avg_val_loss)
        print(f"Epoch {epoch + 1}/{num_epochs}, Validation Loss: {avg_val_loss:.4f}")

    return train_loss_history, val_loss_history



In [13]:
num_epochs = 5  # Number of epochs for training
train_loss_history, val_loss_history = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=num_epochs,
)

# Save the trained model weights
torch.save(model.state_dict(), "transformer_model_weights.pth")
print("Model weights saved successfully!")


Epoch 1/5, Training Loss: 1.5787
Epoch 1/5, Validation Loss: 1.5792
Epoch 2/5, Training Loss: 1.5739
Epoch 2/5, Validation Loss: 1.5765
Epoch 3/5, Training Loss: 1.5738
Epoch 3/5, Validation Loss: 1.5747
Epoch 4/5, Training Loss: 1.5735
Epoch 4/5, Validation Loss: 1.5761
Epoch 5/5, Training Loss: 1.5733
Epoch 5/5, Validation Loss: 1.5750
Model weights saved successfully!


In [16]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

def evaluate_model_on_validation(model, val_loader, device):
    """
    Evaluates the trained model on the validation set and computes accuracy and F1 score.

    Args:
        model: Trained PyTorch model.
        val_loader: DataLoader for the validation set.
        device: Device (CPU/GPU) where the model is loaded.

    Returns:
        accuracy: Validation accuracy as a float.
        f1_score: Validation F1-score as a float.
        Prints a classification report with precision, recall, and F1-score for each class.
    """
    model.eval()  # Set the model to evaluation mode
    all_preds = []  # To store all predictions
    all_labels = []  # To store all ground truth labels

    with torch.no_grad():  # Disable gradient computation
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)         # Move input IDs to device
            attention_mask = batch["attention_mask"].to(device)  # Move attention mask to device
            labels = batch["label"].to(device)               # Move labels to device

            # Forward pass through the model (fixed)
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs, dim=1)  # Get predicted labels (highest logit)

            # Collect predictions and true labels
            all_preds.extend(preds.cpu().numpy())  # Move predictions to CPU and store
            all_labels.extend(labels.cpu().numpy())  # Move labels to CPU and store

    # Compute metrics
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="weighted")  # Weighted F1 for multiclass classification

    # Print results
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation F1 Score: {f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(all_labels, all_preds))

    return accuracy, f1


In [17]:
# Evaluate on validation set
accuracy, f1 = evaluate_model_on_validation(model, val_loader, device)


Validation Accuracy: 0.3379
Validation F1 Score: 0.1706

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     24201
           1       0.34      1.00      0.51     28164
           2       0.00      0.00      0.00      6929
           3       0.00      0.00      0.00     11441
           4       0.00      0.00      0.00      9594
           5       0.00      0.00      0.00      3033

    accuracy                           0.34     83362
   macro avg       0.06      0.17      0.08     83362
weighted avg       0.11      0.34      0.17     83362



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
