# ***Recognizing Emotions in Poems using BERT***
**Varun Sundaram - 3750611**

# Due to computing constraints on my local machine, I used Google Colab for this project. I have installed the necessary packages, although some of them may already be built into Colab. If you face any issues with packages or package versions, please install the packages suggested in the output.

# Installing all the necessary packages for the project. Packages used in this process include pandas, numpy, transformers (to use pre-trained models), datasets (to format data), torch (used in transformers), scikit-learn (for classification tasks).

In [None]:
!pip install pandas
!pip install numpy
!pip install transformers
!pip install datasets
!pip install torch
!pip install scikit-learn

Collecting sympy==1.13.1 (from torch)
  Using cached sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Using cached sympy-1.13.1-py3-none-any.whl (6.2 MB)
Installing collected packages: sympy
  Attempting uninstall: sympy
    Found existing installation: sympy 1.13.3
    Uninstalling sympy-1.13.3:
      Successfully uninstalled sympy-1.13.3
Successfully installed sympy-1.13.1


# Loading and Displaying the Dataset

*   This code reads the final_df_emotions(remove-bias).csv file into a Pandas DataFrame for further analysis.
*   It displays the first few rows of the dataset to confirm the structure and ensure the data has been loaded correctly.
*   You can either download the file from the provided Kaggle link: https://www.kaggle.com/datasets/mexwell/poem-dataset or use the csv file provided and upload it.





In [None]:
import pandas as pd

# Name of the poem-emotion CSV, resolved relative to the working directory
file_path = 'final_df_emotions(remove-bias).csv'

# Parse the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top rows to sanity-check the load
df.head()


Unnamed: 0,poem content,pred,label,score,anger,disgust,fear,joy,neutral,sadness,surprise,age,type
0,Let the bird of loudest lay\r\nOn the sole Ara...,5,sadness,0.650738,0.133402,0.076618,0.094706,0.006762,0.03296,0.650738,0.004814,Renaissance,Mythology & Folklore
1,"Sir Charles into my chamber coming in,\r\nWhen...",0,anger,0.264296,0.264296,0.145707,0.057977,0.262715,0.019884,0.244457,0.004963,Renaissance,Mythology & Folklore
2,"Our vice runs beyond all that old men saw,\r\n...",0,anger,0.758054,0.758054,0.210444,0.00369,0.001755,0.013997,0.011483,0.000578,Renaissance,Mythology & Folklore
3,"Lo I the man, whose Muse whilome did maske,\r\...",0,anger,0.89628,0.89628,0.009259,0.004949,0.00232,0.007623,0.078088,0.001481,Renaissance,Mythology & Folklore
4,"Long have I longd to see my love againe,\r\nSt...",5,sadness,0.600543,0.027077,0.036344,0.075487,0.017212,0.187106,0.600543,0.05623,Renaissance,Mythology & Folklore


# Data Preprocessing:
*   Converts all text to lowercase for consistency.
*   Removes punctuation and numeric digits using regular expressions.
*   Replaces multiple spaces with a single space to ensure uniform formatting.
*   Applies the cleaning function to the poem content column in the DataFrame.
*   Displays the first few rows of the cleaned text for verification.



In [None]:
import re  # Import the regular expression library for text pattern matching
import string  # Import the string library to access predefined sets like punctuation

# Define a function to preprocess the input text
def preprocess_text(text):
    """Normalize a poem before it is vectorized or tokenized.

    The text is lowercased, every character listed in ``string.punctuation``
    and every digit is removed, runs of whitespace (including line breaks)
    are collapsed to a single space, and surrounding whitespace is trimmed.

    Args:
        text: The raw poem. Non-string values (for example the float NaN that
            pandas uses for a missing cell) are treated as an empty poem.

    Returns:
        The cleaned text as a ``str``; ``""`` for non-string input.
    """
    # Missing cells arrive as float NaN, which has no .lower()
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # re.escape is required: unescaped, the "\]" inside string.punctuation is
    # read as an escaped bracket, so backslashes were never removed
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)

    # Drop digits BEFORE collapsing whitespace, otherwise "a 12 b" becomes
    # "a  b" with a doubled space
    text = re.sub(r"\d+", "", text)

    return re.sub(r"\s+", " ", text).strip()

# Clean every poem by running the 'poem content' column through preprocess_text
df['poem content'] = df['poem content'].map(preprocess_text)

# Report that the cleaning step has finished
print("Special characters removed from 'poem content'.")

# Show the head of the cleaned column as a quick sanity check
print(df['poem content'].head())

Special characters removed from 'poem content'.
0    let the bird of loudest lay on the sole arabia...
1    sir charles into my chamber coming in when i w...
2    our vice runs beyond all that old men saw and ...
3    lo i the man whose muse whilome did maske as t...
4    long have i longd to see my love againe still ...
Name: poem content, dtype: object


# Using the Most Frequent Baseline model to predict emotions in poems:

*   Encodes categorical labels into numeric format using LabelEncoder.
*   Splits the dataset into training and testing sets (80% training, 20% testing).
*   Trains a baseline model (DummyClassifier) that predicts the most frequent class.
*   Evaluates the model's performance using accuracy and a classification report.





In [None]:
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Features are the cleaned poems; targets are the emotion names
X = df['poem content']
y = df['label']

# Encode the emotion names as integer class ids
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the poems for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, random_state=42, test_size=0.2
)

# Baseline that always answers with the most frequent training class
dummy_model = DummyClassifier(strategy='most_frequent')
dummy_model.fit(X_train, y_train)

# Predict the held-out poems
y_pred = dummy_model.predict(X_test)

# Overall accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

# Per-class precision / recall / F1, labelled with the original emotion names
print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=label_encoder.classes_))


Accuracy: 0.2777777777777778

Classification Report:
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        14
     disgust       0.00      0.00      0.00        12
        fear       0.00      0.00      0.00        23
         joy       0.00      0.00      0.00         8
     neutral       0.00      0.00      0.00         5
     sadness       0.28      1.00      0.43        25
    surprise       0.00      0.00      0.00         3

    accuracy                           0.28        90
   macro avg       0.04      0.14      0.06        90
weighted avg       0.08      0.28      0.12        90



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Logistic Regression with Bag of Words:

*   Splits the dataset into training and testing sets with 70% for training and 30% for testing.
*   Converts textual data into numerical features using the Bag of Words (BoW) representation with CountVectorizer and eliminates stop words for better contextual understanding.
*   Trains a logistic regression model to classify the text and evaluates its performance using accuracy and detailed classification metrics.






In [None]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Features are the cleaned poems; targets are the emotion names (used as strings)
X = df['poem content']
y = df['label']

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Bag of Words: fit the vocabulary on the training poems only, then reuse it
# for the test poems; English stop words are dropped
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Fit the logistic regression classifier on the BoW counts
modelLogistic = LogisticRegression(max_iter=200).fit(X_train_vec, y_train)

# Predict the held-out poems
y_pred = modelLogistic.predict(X_test_vec)

# Overall accuracy as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1
print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 37.04%
Classification Report:
              precision    recall  f1-score   support

       anger       0.40      0.08      0.14        24
     disgust       1.00      0.14      0.25        14
        fear       0.37      0.53      0.44        36
         joy       0.00      0.00      0.00        10
     neutral       0.00      0.00      0.00         8
     sadness       0.36      0.68      0.47        40
    surprise       0.00      0.00      0.00         3

    accuracy                           0.37       135
   macro avg       0.30      0.20      0.18       135
weighted avg       0.38      0.37      0.31       135



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Multinomial Naive Bayes with Bag of Words:



*   Trains a Multinomial Naive Bayes classifier on Bag of Words features to perform text classification.
*   Predicts labels for the test data and evaluates performance using accuracy and classification metrics.







In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit Multinomial Naive Bayes on the same BoW features used for logistic regression
modelNB = MultinomialNB().fit(X_train_vec, y_train)

# Predict the held-out poems
y_pred = modelNB.predict(X_test_vec)

# Overall accuracy as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1
print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 41.48%
Classification Report:
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        24
     disgust       0.00      0.00      0.00        14
        fear       0.41      0.69      0.52        36
         joy       0.00      0.00      0.00        10
     neutral       0.00      0.00      0.00         8
     sadness       0.42      0.78      0.54        40
    surprise       0.00      0.00      0.00         3

    accuracy                           0.41       135
   macro avg       0.12      0.21      0.15       135
weighted avg       0.23      0.41      0.30       135



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Now we test it on two BERT models - BERT and DistilBERT.
# We use pretrained models.

In [None]:
# Install the `datasets` library from Hugging Face for accessing prebuilt datasets
!pip install datasets

# Install the `string` library for text processing and handling string constants
!pip install string

[31mERROR: Could not find a version that satisfies the requirement string (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for string[0m[31m
[0m

In [None]:
# List the DataFrame's column names to see which fields are available
df.columns

Index(['poem content', 'pred', 'label', 'score', 'anger', 'disgust', 'fear',
       'joy', 'neutral', 'sadness', 'surprise', 'age', 'type'],
      dtype='object')

# Assigning labels to emotions.


*   Excludes specific columns (poem content, pred, label, score) to identify valid class labels for prediction.
*   Creates mappings between label indices and their corresponding label names for easy reference during model training and evaluation.
*   Outputs the list of labels to verify the classes being processed.





In [None]:
# Columns of `df` that are NOT emotion classes: the poem text, the predicted
# id / name / score, and the 'age' and 'type' metadata columns. Leaving 'age'
# and 'type' in produced 9 classes instead of the 7 emotions. 'text' is listed
# too because a later cell adds it as a copy of 'poem content', which would
# otherwise become a class if this cell is re-run.
non_label_columns = {'poem content', 'pred', 'label', 'score', 'age', 'type', 'text'}

# Remaining columns, in DataFrame order, are the emotion class names
labels = [column for column in df.columns if column not in non_label_columns]

# Two-way lookup between class index and emotion name
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}
labels

['anger',
 'disgust',
 'fear',
 'joy',
 'neutral',
 'sadness',
 'surprise',
 'age',
 'type']

In [None]:
# Display label2id: the dictionary mapping each label name to its integer index.
# The tokenization step uses it to turn the string in the 'label' column into a class id.
label2id

{'anger': 0,
 'disgust': 1,
 'fear': 2,
 'joy': 3,
 'neutral': 4,
 'sadness': 5,
 'surprise': 6,
 'age': 7,
 'type': 8}

# Perform a three way split of dataset into train, validation and test.

*   Splits the dataset into training (80%), validation (10%), and test (10%) sets using stratified sampling to maintain label distribution.
*   Converts the DataFrames into Hugging Face Dataset format for integration with transformers.
*   Combines the train, validation, and test datasets into a DatasetDict.





In [None]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Expose the cleaned poems under a 'text' column, which the tokenizer step reads
df['text'] = df['poem content']
df1 = df  # all columns are kept for now

# 80% train / 20% hold-out, stratified on the emotion label
train_df, temp_df = train_test_split(
    df1, random_state=42, test_size=0.2, stratify=df['label']
)
# Split the hold-out in half: 10% validation, 10% test (also stratified)
val_df, test_df = train_test_split(
    temp_df, random_state=42, test_size=0.5, stratify=temp_df['label']
)

# Wrap each pandas split as a Hugging Face Dataset
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

# Bundle the three splits under their conventional names
emotions = DatasetDict(
    train=train_dataset,
    validation=val_dataset,
    test=test_dataset,
)

# Show the columns and row counts of every split
print(emotions)

DatasetDict({
    train: Dataset({
        features: ['poem content', 'pred', 'label', 'score', 'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise', 'age', 'type', 'text', '__index_level_0__'],
        num_rows: 360
    })
    validation: Dataset({
        features: ['poem content', 'pred', 'label', 'score', 'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise', 'age', 'type', 'text', '__index_level_0__'],
        num_rows: 45
    })
    test: Dataset({
        features: ['poem content', 'pred', 'label', 'score', 'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise', 'age', 'type', 'text', '__index_level_0__'],
        num_rows: 45
    })
})


**Version upgraded due to errors.**

In [None]:
# Install or upgrade the `sympy` library, which is used for symbolic mathematics
# NOTE(review): on the recorded run this replaced sympy 1.13.1 with 1.13.3, and pip
# reported a conflict because the installed torch 2.5.1+cu121 requires sympy==1.13.1.
# Confirm the upgrade is still needed before re-running this cell.
!pip install sympy --upgrade

Collecting sympy
  Using cached sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Using cached sympy-1.13.3-py3-none-any.whl (6.2 MB)
Installing collected packages: sympy
  Attempting uninstall: sympy
    Found existing installation: sympy 1.13.1
    Uninstalling sympy-1.13.1:
      Successfully uninstalled sympy-1.13.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.5.1+cu121 requires sympy==1.13.1; python_version >= "3.9", but you have sympy 1.13.3 which is incompatible.[0m[31m
[0mSuccessfully installed sympy-1.13.3


# Import BERT pre-trained model along with BERT tokenizer. Tokenize the text in all the split data.


# Tokenization and Data Preparation for BERT

*   Applies BERT tokenization to convert textual data into numerical representations, including padding and truncation for uniform input length (128 tokens).
*   Maps textual labels to numerical IDs for classification using a predefined dictionary (label2id).
*   Processes the training, validation, and test datasets by tokenizing the text and removing unnecessary columns to streamline input for the BERT model.


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
import torch
import numpy as np

# Load the pretrained tokenizer published with the 'bert-base-uncased' checkpoint;
# tokenize_function below calls it on every batch of poems
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the Dataset
def tokenize_function(examples):
  """Tokenize a batch of poems with the BERT tokenizer and attach class ids.

  Args:
    examples: Batch passed in by ``Dataset.map(..., batched=True)``; its
      ``"text"`` and ``"label"`` columns are read.

  Returns:
    The tokenizer's encoding (padded / truncated to 128 tokens) with an
    added ``"labels"`` entry holding the ``label2id`` index of each
    example's emotion name.

  Raises:
    KeyError: If a label name is not a key of ``label2id``.
  """
  encoding = tokenizer_bert(
      examples["text"], padding="max_length", truncation=True, max_length=128
  )

  # Build a new list instead of overwriting examples["label"] element by
  # element, so the incoming batch is left untouched
  encoding["labels"] = [label2id[name] for name in examples["label"]]

  return encoding

# Tokenize the train, validation and test splits (in that order), dropping the
# raw columns that are no longer needed once token ids and "labels" exist
train_dataset, val_dataset, test_dataset = (
    split.map(
        tokenize_function,
        batched=True,
        remove_columns=['poem content', 'pred', 'label', 'score', 'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise', '__index_level_0__'],
    )
    for split in (train_dataset, val_dataset, test_dataset)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/360 [00:00<?, ? examples/s]

360


Map:   0%|          | 0/45 [00:00<?, ? examples/s]

45


Map:   0%|          | 0/45 [00:00<?, ? examples/s]

45


# BERT Model Fine-Tuning and Evaluation

*   Fine-tunes a pre-trained BERT model for sequence classification using custom training arguments, including 10 epochs, batch size, and weight decay for regularization.
*   Tracks model performance during training using an accuracy metric, saving the best model based on evaluation results.
*   Evaluates the trained model on the test dataset, providing final accuracy results and insights into its classification performance.





In [None]:
# Pretrained BERT encoder with a classification head sized to the label set;
# the id<->name maps are stored in the model config
modelBert = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Hyper-parameters and bookkeeping for the fine-tuning run
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    report_to=["none"],
    # optimisation settings
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    # evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the best validation accuracy
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Define compute_metrics function
def compute_metrics(p):
    """Return the accuracy of an evaluation pass.

    Args:
        p: Pair of (model outputs, true labels); the outputs are reduced to
            a class id per example by taking the argmax along axis 1.

    Returns:
        A dict with a single ``"accuracy"`` entry.
    """
    logits, gold = p
    predicted_ids = torch.tensor(logits).argmax(dim=1)
    return {"accuracy": accuracy_score(gold, predicted_ids)}

# Wire the model, data splits, tokenizer and metric into a Trainer
trainerBert = Trainer(
    model=modelBert,
    args=training_args,
    tokenizer=tokenizer_bert,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune; validation runs at the end of every epoch
trainerBert.train()

# Score the final model on the held-out test split
results_bert = trainerBert.evaluate(eval_dataset=test_dataset)
print("Test Results:", results_bert)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainerBert = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.7547,1.636588,0.377778
2,1.4898,1.555267,0.422222
3,1.2733,1.557851,0.422222
4,1.0137,1.696484,0.333333
5,0.5953,1.837032,0.444444
6,0.2278,1.899837,0.466667
7,0.1729,2.023918,0.444444
8,0.0577,2.412968,0.444444
9,0.0256,2.426956,0.466667
10,0.0218,2.451829,0.466667


Test Results: {'eval_loss': 1.8035495281219482, 'eval_accuracy': 0.4444444444444444, 'eval_runtime': 0.4001, 'eval_samples_per_second': 112.459, 'eval_steps_per_second': 14.995, 'epoch': 10.0}


# Saving the BERT model.

In [None]:
# Write the fine-tuned BERT model to the "BERTModelForPoems" directory; the
# prediction cell later reloads both the model and the tokenizer from this path
trainerBert.save_model("BERTModelForPoems")

# Dataset Splitting and Conversion for DistilBERT

*   Divides the dataset into training (80%), validation (10%), and testing (10%) sets, ensuring stratified sampling to maintain label distribution.
*   Converts the split datasets into Hugging Face Dataset format for  compatibility with transformer-based models.



# Note
Now we test the DistilBERT model for our poems dataset using a pre-trained DistilBERT model and AutoTokenizer.





In [None]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Expose the cleaned poems under a 'text' column, which the tokenizer step reads
df['text'] = df['poem content']
df1 = df  # all columns are kept for now

# 80% train / 20% hold-out, stratified on the emotion label
train_df, temp_df = train_test_split(
    df1, random_state=42, test_size=0.2, stratify=df['label']
)
# Split the hold-out in half: 10% validation, 10% test (also stratified)
val_df, test_df = train_test_split(
    temp_df, random_state=42, test_size=0.5, stratify=temp_df['label']
)

# Rebuild untokenized Hugging Face Datasets for the DistilBERT pipeline
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

# Bundle the three splits under their conventional names
emotions = DatasetDict(
    train=train_dataset,
    validation=val_dataset,
    test=test_dataset,
)

# Show the columns and row counts of every split
print(emotions)

DatasetDict({
    train: Dataset({
        features: ['poem content', 'pred', 'label', 'score', 'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise', 'age', 'type', 'text', '__index_level_0__'],
        num_rows: 360
    })
    validation: Dataset({
        features: ['poem content', 'pred', 'label', 'score', 'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise', 'age', 'type', 'text', '__index_level_0__'],
        num_rows: 45
    })
    test: Dataset({
        features: ['poem content', 'pred', 'label', 'score', 'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise', 'age', 'type', 'text', '__index_level_0__'],
        num_rows: 45
    })
})


# DistilBERT Tokenization and Dataset Preparation

*   Uses the AutoTokenizer for tokenizing text data, ensuring compatibility with the pre-trained DistilBERT model.
*   Encodes text into numerical format with padding and truncation for a fixed input size of 128 tokens.
*   Processes the training, validation, and test datasets by mapping textual labels to numeric IDs and removing unnecessary columns to prepare the data for model fine-tuning.





In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer
from sklearn.metrics import accuracy_score
import torch
import numpy as np

# Load the tokenizer for the 'distilbert-base-uncased' checkpoint via AutoTokenizer;
# tokenize_function below calls it on every batch of poems
tokenizer_auto=AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the Dataset
def tokenize_function(examples):
  """Tokenize a batch of poems with the DistilBERT tokenizer and attach class ids.

  Args:
    examples: Batch passed in by ``Dataset.map(..., batched=True)``; its
      ``"text"`` and ``"label"`` columns are read.

  Returns:
    The tokenizer's encoding (padded / truncated to 128 tokens) with an
    added ``"labels"`` entry holding the ``label2id`` index of each
    example's emotion name.

  Raises:
    KeyError: If a label name is not a key of ``label2id``.
  """
  encoding = tokenizer_auto(
      examples["text"], padding="max_length", truncation=True, max_length=128
  )

  # Build a new list instead of overwriting examples["label"] element by
  # element, so the incoming batch is left untouched
  encoding["labels"] = [label2id[name] for name in examples["label"]]

  return encoding


# Tokenize the train, validation and test splits (in that order), dropping the
# raw columns that are no longer needed once token ids and "labels" exist
train_dataset, val_dataset, test_dataset = (
    split.map(
        tokenize_function,
        batched=True,
        remove_columns=['poem content', 'pred', 'label', 'score', 'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise', '__index_level_0__'],
    )
    for split in (train_dataset, val_dataset, test_dataset)
)

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

360


Map:   0%|          | 0/45 [00:00<?, ? examples/s]

45


Map:   0%|          | 0/45 [00:00<?, ? examples/s]

45


# Train the DistilBERT model on the data and evaluate

*   Fine-tunes a pre-trained DistilBERT model for sequence classification using training arguments such as batch size, epochs, and evaluation strategy.
*   Tracks accuracy during training, saves the best-performing model, and evaluates it on the test dataset.
*   Provides the final test results, including accuracy, to assess the model's classification performance.





In [None]:
# Pretrained DistilBERT encoder with a classification head sized to the label
# set; the id<->name maps are stored in the model config
modelDistil = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Hyper-parameters and bookkeeping for the fine-tuning run
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    report_to=["none"],
    # optimisation settings
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    # evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the best validation accuracy
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Step 4: Define compute_metrics function
def compute_metrics(p):
    """Return the accuracy of an evaluation pass.

    Args:
        p: Pair of (model outputs, true labels); the outputs are reduced to
            a class id per example by taking the argmax along axis 1.

    Returns:
        A dict with a single ``"accuracy"`` entry.
    """
    logits, gold = p
    predicted_ids = torch.tensor(logits).argmax(dim=1)
    return {"accuracy": accuracy_score(gold, predicted_ids)}

# Wire the DistilBERT model, data splits, tokenizer and metric into a Trainer
trainer_distil = Trainer(
    model=modelDistil,
    args=training_args,
    tokenizer=tokenizer_auto,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune; validation runs at the end of every epoch
trainer_distil.train()

# Score the final model on the held-out test split
results_distil = trainer_distil.evaluate(eval_dataset=test_dataset)
print("Test Results:", results_distil)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_distil = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.7287,1.651863,0.377778
2,1.5202,1.586125,0.377778
3,1.2238,1.579388,0.377778
4,0.9077,1.664261,0.377778
5,0.5942,1.776924,0.466667
6,0.2557,1.92486,0.444444
7,0.1891,2.059615,0.422222
8,0.0835,2.247864,0.4
9,0.0389,2.391015,0.4
10,0.0324,2.357251,0.444444


Test Results: {'eval_loss': 1.9355568885803223, 'eval_accuracy': 0.3333333333333333, 'eval_runtime': 0.2252, 'eval_samples_per_second': 199.789, 'eval_steps_per_second': 26.639, 'epoch': 10.0}


# Saving the DistilBERT model.

In [None]:
# Save the fine-tuned DistilBERT model through its own trainer. This line used
# trainerBert before, which wrote the BERT weights into this directory instead.
# NOTE: whatever reloads "DistilBERTModelForPoems" must use the DistilBERT or
# Auto* classes, not the Bert* classes.
trainer_distil.save_model("DistilBERTModelForPoems")

# Now, we predict emotions in poems with our model.

*   First poem is from https://discoverpoetry.com/poems/poems-about-anger/ and is the 4th one. This poem depicts **anger** as stated by the source.
*   The second poem is from https://discoverpoetry.com/poems/poems-about-joy/ and is the 7th one. This poem depicts **joy** according to the source.
*   The third poem is from https://discoverpoetry.com/poems/poems-about-fear/ and is the 3rd poem. It depicts **fear** according to the source.






Our BERT model predicts **anger** for the first poem, which matches the source.
The second poem is predicted as **sadness**, which is a misclassification relative to the source (**joy**).
The third poem is correctly classified as **fear**.

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the fine-tuned model and its tokenizer from the "BERTModelForPoems"
# directory written by trainerBert.save_model; predict_poem below reads both globals
modelBert = BertForSequenceClassification.from_pretrained("BERTModelForPoems")
tokenizerBert = BertTokenizer.from_pretrained("BERTModelForPoems")

# Define the function for prediction
def predict_poem(poem):
    """Predict the emotion of a single poem with the loaded model.

    Args:
        poem: Raw poem text.

    Returns:
        The emotion name of the highest-scoring class, looked up in the
        loaded model's ``config.id2label``.
    """
    # Clean the poem the same way the training data was cleaned (lowercase,
    # punctuation and digits removed) so the input matches what the model
    # was fine-tuned on
    cleaned_poem = preprocess_text(poem)

    inputs = tokenizerBert(cleaned_poem, padding=True, truncation=True, return_tensors="pt", max_length=512)

    # Inference only: no gradients needed
    with torch.no_grad():
        logits = modelBert(**inputs).logits

    predicted_class_id = torch.argmax(logits, dim=-1).item()

    # Decode with the mapping stored in the checkpoint rather than the
    # notebook-global id2label, so the names always match the loaded weights
    return modelBert.config.id2label[predicted_class_id]

# Three unseen poems from discoverpoetry.com (anger, joy and fear pages)
poem1 = "Stop me, good people! Don't you see My temper is running away with me? Help, Master Commonsense! Are you afraid? Good Mistress Prudence, come to my aid! Stop me, Conscience! Stop me, I pray! My temper, my temper is running away! Dear Brother Kindness, snatch after the reins! Help, or my temper will dash out my brains! Help, or I'll get a terrible fall! Help, Shame, Caution, Love, Wisdom, and all!"
poem2 = "Away, sad voices, telling Of old, forgotten pain! My heart, at grief rebelling, To joy returns again. My life, at tears protesting, To long delight returns, Where, close of all my questing, Her dear eyes love discerns."
poem3 = "Presentiment is that long shadow on the lawn Indicative that suns go down; The notice to the startled grass That darkness is about to pass."

# Classify the three poems in order
predicted_label1, predicted_label2, predicted_label3 = map(
    predict_poem, (poem1, poem2, poem3)
)

# Report one prediction per poem
print(f"Predicted Label for Poem 1: {predicted_label1}")
print(f"Predicted Label for Poem 2: {predicted_label2}")
print(f"Predicted Label for Poem 3: {predicted_label3}")


Predicted Label for Poem 1: anger
Predicted Label for Poem 2: sadness
Predicted Label for Poem 3: fear


# The DistilBERT model predicts the same labels as the BERT model.






In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AutoTokenizer
import torch

# Load the fine-tuned DistilBERT checkpoint and its tokenizer.
# The Auto* classes read the architecture from the saved config; the Bert*
# classes that were hard-coded here before assume a BERT checkpoint.
# The names modelBert / tokenizerBert are kept because predict_poem below
# reads these two globals.
from transformers import AutoModelForSequenceClassification

modelBert = AutoModelForSequenceClassification.from_pretrained("DistilBERTModelForPoems")
tokenizerBert = AutoTokenizer.from_pretrained("DistilBERTModelForPoems")

# Define the function for prediction
def predict_poem(poem):
    """Predict the emotion of a single poem with the loaded model.

    Args:
        poem: Raw poem text.

    Returns:
        The emotion name of the highest-scoring class, looked up in the
        loaded model's ``config.id2label``.
    """
    # Clean the poem the same way the training data was cleaned (lowercase,
    # punctuation and digits removed) so the input matches what the model
    # was fine-tuned on
    cleaned_poem = preprocess_text(poem)

    inputs = tokenizerBert(cleaned_poem, padding=True, truncation=True, return_tensors="pt", max_length=512)

    # Inference only: no gradients needed
    with torch.no_grad():
        logits = modelBert(**inputs).logits

    predicted_class_id = torch.argmax(logits, dim=-1).item()

    # Decode with the mapping stored in the checkpoint rather than the
    # notebook-global id2label, so the names always match the loaded weights
    return modelBert.config.id2label[predicted_class_id]

# Three unseen poems from discoverpoetry.com (anger, joy and fear pages)
poem1 = "Stop me, good people! Don't you see My temper is running away with me? Help, Master Commonsense! Are you afraid? Good Mistress Prudence, come to my aid! Stop me, Conscience! Stop me, I pray! My temper, my temper is running away! Dear Brother Kindness, snatch after the reins! Help, or my temper will dash out my brains! Help, or I'll get a terrible fall! Help, Shame, Caution, Love, Wisdom, and all!"
poem2 = "Away, sad voices, telling Of old, forgotten pain! My heart, at grief rebelling, To joy returns again. My life, at tears protesting, To long delight returns, Where, close of all my questing, Her dear eyes love discerns."
poem3 = "Presentiment is that long shadow on the lawn Indicative that suns go down; The notice to the startled grass That darkness is about to pass."

# Classify the three poems in order
predicted_label1, predicted_label2, predicted_label3 = map(
    predict_poem, (poem1, poem2, poem3)
)

# Report one prediction per poem
print(f"Predicted Label for Poem 1: {predicted_label1}")
print(f"Predicted Label for Poem 2: {predicted_label2}")
print(f"Predicted Label for Poem 3: {predicted_label3}")


Predicted Label for Poem 1: anger
Predicted Label for Poem 2: sadness
Predicted Label for Poem 3: fear
