**Authors**: Céline Hirsch, Sandra Frey, Sina Röllin

**Deep Learning Project**: Inclusiveness in Sarcasm Detection

# Final Model Text

In [1]:
from functions_text_model import *
import os
import json
import random
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
import json
from transformers import BertForSequenceClassification, BertTokenizer
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.utils.data import random_split
from sklearn.model_selection import ParameterGrid

  from .autonotebook import tqdm as notebook_tqdm


First we have to load the datasets, which have already been prepared in `text-data-preparation.ipynb`. The different datasets are the mixed, the female and the male datasets. All of these datasets were split into training, validation and testing subsets.

In [2]:
# Load the data from the JSON files
with open('data/train_mixed.json') as file:
    mixed_train = json.load(file)

with open('data/val_mixed.json') as file:
    mixed_val = json.load(file)

with open('data/test_mixed.json') as file:
    mixed_test = json.load(file)

# Convert the data to a list of dictionaries
mixed_train_data = list(mixed_train.values())
mixed_val_data = list(mixed_val.values())
mixed_test_data = list(mixed_test.values())

# Shuffle mixed data (to avoid having all female speakers first and then all male speakers)
random.shuffle(mixed_train_data)
random.shuffle(mixed_val_data)
random.shuffle(mixed_test_data)

In [7]:
print(mixed_train_data)

[{'utterance': "Okay, listen, I'm really worried about this whole Raj thing.", 'speaker': 'PENNY', 'context': ['(video game sound effects) (knocking)', "I'm pumping!", "It's Penny.", 'Oh. Come in.', 'I thought you were pumping.', 'I thought you were my boss.'], 'show': 'BBT', 'sarcasm': True, 'gender': 'F'}, {'utterance': "Yeah well that's because uh .. I stayed in my room. Yeah, you don't want to look in my hamper.", 'speaker': 'JOEY', 'context': ['Yeah, I mean I was up sick all night.', 'Yeah me too, all night. Really?!', "How come we didn't cross paths?"], 'show': 'FRIENDS', 'sarcasm': True, 'gender': 'M'}, {'utterance': "See Joe that's why your parents told you not to jump on your bed!", 'speaker': 'CHANDLER', 'context': ['Whoa!'], 'show': 'FRIENDS', 'sarcasm': True, 'gender': 'M'}, {'utterance': 'I thought his work wife was standing in my kitchen.', 'speaker': 'BERNADETTE', 'context': ['Hey, Bernadette.', 'Oh, hey, Raj.', "Howard's not here.", "Oh, I know, he's been in the lab eve

In [3]:
utterances = [item['utterance'] for item in mixed_train_data]
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
utterances_tok = [tokenizer.encode(utt) for utt in utterances]
len(utterances), len(utterances_tok)
lengths = [len(utt) for utt in utterances_tok]

Let's define some classes and functions needed in the training pipeline.

In [4]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


# Function to encode the text
def encode_text(text):
    encoded_dict = tokenizer.encode_plus(
                        text,                      # Input text
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 64,           # Pad & truncate all sentences
                        truncation = True,
                        #padding = 'max_length',
                        return_attention_mask = True,   # Construct attention masks
                        return_tensors = None,     # Return pytorch tensors
                   )
    return encoded_dict['input_ids'], encoded_dict['attention_mask']


# Function to pad the input_ids and attention_mask
def collate_fn(batch):
    # Unpack raw inputs (assuming each item is a dict with 'input_ids')
    input_ids = [item[0] for item in batch]
    
    # Find the max length in the batch
    max_len = max(len(ids) for ids in input_ids)
    
    # Pad sequences and generate attention masks
    padded_input_ids = [ids + [0] * (max_len - len(ids)) for ids in input_ids]
    attention_masks = [[1] * len(ids) + [0] * (max_len - len(ids)) for ids in input_ids]
    
    return {
        'input_ids': torch.tensor(padded_input_ids),
        'attention_mask': torch.tensor(attention_masks),
        'sarcasm': torch.tensor([item[2] for item in batch])
    } 


# PyTorch Dataset
class SarcasmDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        utterance = item['utterance']
        context = item['context']
        utterance_and_context = ' '.join([sentence for sentence in context] + [utterance]) # Combining the utterance and its context into one string.
        sarcasm = int(item['sarcasm'])
        input_ids, attention_mask = encode_text(utterance_and_context)
        return input_ids, attention_mask, sarcasm
    

# Set seeds
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

We will now do the hyperparameter tuning for the 3 different models: 
- one model will be trained on the mixed data 
- one model will be trained on utterances from female speakers only
- one model will be trained on utterances from male speakers only

The performance of each of these models is then evaluated on the validation dataset. 

In [5]:
# HYPERPARAMETER TUNING MIXED MODEL

# Set seed
set_seed(42)
    
# Set device
device = torch.device("cpu")

dropout_prob = 0

# Define the hyperparameters to tune
param_grid = {
    'lr': [1e-3], # 1e-3, 1e-4, 1e-5
    'num_epochs': [20],
    'batch_size': [64], #8, 16, 32, 64
    'weight_decay': [0.05, 0.1],
    'dropout_prob': [0] #0, 0.1
}

# Create a parameter grid
grid = ParameterGrid(param_grid)

# Initialize a list to store the results
results = []

# Define the datasets
train_dataset = SarcasmDataset(mixed_train_data)
val_dataset = SarcasmDataset(mixed_val_data)

# For each combination of hyperparameters
for params in grid:

    # Create the DataLoaders
    train_dataloader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True, collate_fn=collate_fn)
    val_dataloader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=True, collate_fn=collate_fn)

    # Create a new model
    model = BertForSequenceClassification.from_pretrained(
        "prajjwal1/bert-tiny",
        num_labels = 2,
        output_attentions = False,
        output_hidden_states = False,
    )

    model.classifier = nn.Sequential(
        nn.Dropout(params['dropout_prob']),
        nn.Linear(in_features=128, out_features=64, bias=True),
        nn.Tanh(),
        nn.Linear(in_features=64, out_features=16, bias=True),
        nn.Tanh(),
        nn.Linear(in_features=16, out_features=2, bias=True)
    )

    model.to(device)

    # Create a new optimizer with the current learning rate
    optimizer = AdamW(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])

    # Create the optimizer  
    criterion = nn.CrossEntropyLoss()

    # Define metrics
    metrics = {'ACC': acc, 'F1-weighted': f1}

    # Initialize lists to store losses and metrics
    train_loss_log, test_loss_log = [], []
    metrics_names = list(metrics.keys())
    train_metrics_log = [[] for _ in range(len(metrics))]
    test_metrics_log = [[] for _ in range(len(metrics))]

    # Train and evaluate the model for the current number of epochs
    for epoch in range(params['num_epochs']):
        print(f"Epoch {epoch + 1}")
        print('learning rate:', params['lr'], 'batch size:', params['batch_size'], 'num_epochs:', params['num_epochs'])
        train_loss, train_metrics = train_epoch(model, optimizer, criterion, metrics, train_dataloader, device)
        val_loss, val_metrics = evaluate(model, criterion, metrics, val_dataloader, device)

        # Log the losses and metrics
        train_loss_log.append(train_loss)
        test_loss_log.append(val_loss)
        train_metrics_log = update_metrics_log(metrics_names, train_metrics_log, train_metrics)
        test_metrics_log = update_metrics_log(metrics_names, test_metrics_log, val_metrics)

    # Store the results
    results.append({
        'lr': params['lr'],
        'batch_size': params['batch_size'],
        'num_epochs': params['num_epochs'],
        'weight_decay': params['weight_decay'],
        'dropout_prob': params['dropout_prob'],
        'eval_loss': val_loss,
        'eval_metrics': val_metrics
    })

    # Plot and save the training and testing metrics
    plot_filename = f'hyperparameter_tuning/plot_lr_{params["lr"]}_bs_{params["batch_size"]}_wd_{params["weight_decay"]}_dp_{params["dropout_prob"]}.png'
    plot_training_hyperparameters(train_loss_log, test_loss_log, metrics_names, train_metrics_log, test_metrics_log, plot_filename)

# Save the results to a JSON file
results_path = 'hyperparameter_tuning/text_hyperparameter_mixed.json'
os.makedirs(os.path.dirname(results_path), exist_ok=True)

with open(results_path, 'w') as f:
    json.dump(results, f, indent=4)

print(results)

train Loss: 0.0537,  ACC: 0.9922, F1-weighted: 0.9922


100%|██████████| 1/1 [00:00<00:00, 10.05it/s]


eval Loss: 1.8068,  ACC: 0.4821, F1-weighted: 0.4739
[{'lr': 0.001, 'batch_size': 64, 'num_epochs': 20, 'weight_decay': 0.05, 'dropout_prob': 0, 'eval_loss': 2.080536127090454, 'eval_metrics': {'ACC': 0.4642857142857143, 'F1-weighted': 0.44664031620553357}}, {'lr': 0.001, 'batch_size': 64, 'num_epochs': 20, 'weight_decay': 0.1, 'dropout_prob': 0, 'eval_loss': 1.806823492050171, 'eval_metrics': {'ACC': 0.48214285714285715, 'F1-weighted': 0.47392290249433106}}]


In [12]:
# Find the item with the highest ACC
item_with_highest_acc = max(results, key=lambda x: x['eval_metrics']['ACC'])

# Find the item with the lowest ACC
item_with_lowest_acc = min(results, key=lambda x: x['eval_metrics']['ACC'])

# Display results
print("Item with highest ACC:", item_with_highest_acc)
print("Item with lowest ACC:", item_with_lowest_acc)

Item with highest ACC: {'lr': 0.0001, 'batch_size': 8, 'num_epochs': 20, 'weight_decay': 0.1, 'dropout_prob': 0, 'eval_loss': 1.0349595800042153, 'eval_metrics': {'ACC': 0.6416666666666667, 'F1-weighted': 0.46316646316646326}}
Item with lowest ACC: {'lr': 1e-05, 'batch_size': 16, 'num_epochs': 20, 'weight_decay': 0.05, 'dropout_prob': 0, 'eval_loss': 0.7157171070575714, 'eval_metrics': {'ACC': 0.4375, 'F1-weighted': 0.31118968689094917}}


In [6]:
print(results)

[{'lr': 0.001, 'batch_size': 8, 'num_epochs': 20, 'weight_decay': 0.05, 'dropout_prob': 0, 'eval_loss': 0.8169898509979248, 'eval_metrics': {'ACC': 0.525, 'F1-weighted': 0.5096296296296295}}, {'lr': 0.001, 'batch_size': 8, 'num_epochs': 20, 'weight_decay': 0.1, 'dropout_prob': 0, 'eval_loss': 0.7019194960594177, 'eval_metrics': {'ACC': 0.525, 'F1-weighted': 0.5096296296296295}}, {'lr': 0.0001, 'batch_size': 8, 'num_epochs': 20, 'weight_decay': 0.05, 'dropout_prob': 0, 'eval_loss': 1.3557068169116975, 'eval_metrics': {'ACC': 0.5666666666666667, 'F1-weighted': 0.4182384282384282}}, {'lr': 0.0001, 'batch_size': 8, 'num_epochs': 20, 'weight_decay': 0.1, 'dropout_prob': 0, 'eval_loss': 1.0349595800042153, 'eval_metrics': {'ACC': 0.6416666666666667, 'F1-weighted': 0.46316646316646326}}, {'lr': 1e-05, 'batch_size': 8, 'num_epochs': 20, 'weight_decay': 0.05, 'dropout_prob': 0, 'eval_loss': 0.6665356079737346, 'eval_metrics': {'ACC': 0.5333333333333333, 'F1-weighted': 0.3840885040885041}}, {'lr

Now that we trained the different models with different hyperparameters and evaluated them on the validation sets, we can choose which hyperparameters give us the best model.

We then train those models again with the number of epochs we chose from the graphs. That way we can save the models to access them again later.

In [6]:
# TRAIN MIXED MODEL WITH CHOSEN HYPERPARAMETERS

set_seed(42)
    
# Set device
device = torch.device("cpu")

# Set hyperparameters
lr = 0.001
num_epochs = 10
batch_size = 16
weight_decay = 0.1
dropout_prob = 0.1

# Define the datasets
train_dataset = SarcasmDataset(mixed_train_data)
val_dataset = SarcasmDataset(mixed_val_data)

# Create the DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Create a new model
model = BertForSequenceClassification.from_pretrained(
    "prajjwal1/bert-tiny",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

model.classifier = nn.Sequential(
    nn.Dropout(dropout_prob),
    nn.Linear(in_features=128, out_features=64, bias=True),
    nn.Tanh(),
    nn.Linear(in_features=64, out_features=16, bias=True),
    nn.Tanh(),
    nn.Linear(in_features=16, out_features=2, bias=True)
)

model.to(device)

# Create a new optimizer with the current learning rate
optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

# Create the optimizer  
criterion = nn.CrossEntropyLoss()

# Define metrics
metrics = {'ACC': acc, 'F1-weighted': f1}

# Initialize lists to store losses and metrics
train_loss_log, test_loss_log = [], []
metrics_names = list(metrics.keys())
train_metrics_log = [[] for _ in range(len(metrics))]
test_metrics_log = [[] for _ in range(len(metrics))]

# Train and evaluate the model for the current number of epochs
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}")
    train_loss, train_metrics = train_epoch(model, optimizer, criterion, metrics, train_dataloader, device)
    val_loss, val_metrics = evaluate(model, criterion, metrics, val_dataloader, device)

    # Log the losses and metrics
    train_loss_log.append(train_loss)
    test_loss_log.append(val_loss)
    train_metrics_log = update_metrics_log(metrics_names, train_metrics_log, train_metrics)
    test_metrics_log = update_metrics_log(metrics_names, test_metrics_log, val_metrics)

# Save model
#torch.save(model.state_dict(), 'models/mixed_model_text.pth')

plot_filename = f'hyperparameter_tuning/test_plot.png'
plot_training_hyperparameters(train_loss_log, test_loss_log, metrics_names, train_metrics_log, test_metrics_log, plot_filename)

train Loss: 0.7942,  ACC: 0.5000, F1-weighted: 0.5000


100%|██████████| 4/4 [00:00<00:00, 30.89it/s]


eval Loss: 0.8404,  ACC: 0.4375, F1-weighted: 0.2628


Now that we have chosen the best hyperparameters for each model and saved the model weights, we will evaluate the model performances on the untouched test sets.

In [29]:
# Test mixed model performance on test set

set_seed(42)
    
# Set device
device = torch.device("cpu")

# Set hyperparameters
batch_size = 16
dropout_prob = 0

# Define the dataset
test_dataset = SarcasmDataset(mixed_test_data)

# Create the DataLoader
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Load the model
mixed_model = BertForSequenceClassification.from_pretrained(
    "prajjwal1/bert-tiny",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

mixed_model.classifier = nn.Sequential(
    nn.Dropout(dropout_prob),
    nn.Linear(in_features=128, out_features=64, bias=True),
    nn.Tanh(),
    nn.Linear(in_features=64, out_features=16, bias=True),
    nn.Tanh(),
    nn.Linear(in_features=16, out_features=2, bias=True)
    )

# Load the weights
state_dict = torch.load("models/mixed_model_text.pth")
mixed_model.load_state_dict(state_dict)

mixed_model.eval()
mixed_model.to(device)

# Define the loss criterion
criterion = nn.CrossEntropyLoss()

# Define the metrics
metrics = {'ACC': acc, 'F1-weighted': f1}

# Evaluate the model
test_loss, test_metrics = evaluate(mixed_model, criterion, metrics, test_dataloader, device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 6/6 [00:00<00:00, 30.01it/s]

eval Loss: 0.5574,  ACC: 0.8507, F1-weighted: 0.7190



