Now that we have created, trained and saved the different models, we will evaluate their performance on the other datasets. Here are the different evaluations we will perform:
- performance of mixed_model on the female dataset
- performance of mixed_model on the male dataset
- performance of F_model on the male dataset
- performance of F_model on the mixed dataset
- performance of M_model on the female dataset
- performance of M_model on the mixed dataset

In [1]:
import json
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, BertTokenizer

import functions_text_model as functions

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# PyTorch Dataset wrapping the sarcasm-detection records
class SarcasmDataset(Dataset):
    """Map-style dataset over a sequence of utterance records.

    Each record is indexed with the keys 'utterance' and 'sarcasm'. The
    utterance is tokenized on access by the module-level ``encode_text``
    helper; nothing is tokenized when the dataset is constructed.
    """

    def __init__(self, data):
        # Only a reference to the records is stored.
        self.data = data

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for record ``idx``.

        Both tensors returned by ``encode_text`` are flattened to 1-D and the
        'sarcasm' field is converted to an ``int``.
        """
        record = self.data[idx]
        text, label = record['utterance'], int(record['sarcasm'])
        token_ids, mask = encode_text(text)
        return token_ids.flatten(), mask.flatten(), label


# Load the BERT tokenizer: the pretrained 'bert-base-uncased' vocabulary with
# lower-casing enabled. encode_text() reads this module-level object.
# NOTE(review): the models evaluated later are built from 'prajjwal1/bert-tiny';
# presumably they share this vocabulary — confirm against the training notebook.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


# Function to encode the text
def encode_text(text, max_length=64):
    """Tokenize one utterance into fixed-length BERT inputs.

    Args:
        text: The utterance to encode.
        max_length: Length every sequence is padded or truncated to. Defaults
            to 64, the value that was previously hard-coded here.

    Returns:
        A tuple ``(input_ids, attention_mask)`` taken from the tokenizer
        output, which is requested as PyTorch tensors (``return_tensors='pt'``).
    """
    # Calling the tokenizer directly is the documented replacement for the
    # deprecated `encode_plus` method; the keyword arguments are unchanged.
    encoded = tokenizer(
        text,                         # Input text
        add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
        max_length=max_length,        # Pad & truncate all sentences
        truncation=True,
        padding='max_length',
        return_attention_mask=True,   # Construct attention masks
        return_tensors='pt',          # Return PyTorch tensors
    )
    return encoded['input_ids'], encoded['attention_mask']

# Store the results: maps a "<model> on <dataset>" label to a dict holding the
# 'Test loss' and 'Test metrics' of that evaluation. The evaluation cells fill
# it in and the last cell writes it to a JSON file.
results = {}

In [23]:
# Set hyperparameters
batch_size = 64
dropout_prob = 0

# Everything runs on the CPU. The device is defined before the checkpoints are
# loaded so the saved weights can be mapped onto it.
device = torch.device("cpu")


def _load_records(path):
    """Read the JSON object stored at ``path`` and return its values as a list."""
    # Explicit encoding so reading does not depend on the platform default.
    with open(path, encoding='utf-8') as file:
        return list(json.load(file).values())


def _load_text_model(weights_path):
    """Build the bert-tiny classifier and load the weights saved at ``weights_path``.

    The same architecture is used for the mixed, female and male models; only
    the checkpoint file differs. The returned model is in eval mode and has
    been moved to ``device``.
    """
    # from_pretrained warns that 'classifier.weight'/'classifier.bias' are newly
    # initialized; the head is replaced just below and then overwritten by
    # load_state_dict.
    model = BertForSequenceClassification.from_pretrained(
        "prajjwal1/bert-tiny",
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False,
    )

    # Custom classification head: 128 -> 64 -> 16 -> 2 with Tanh activations.
    model.classifier = nn.Sequential(
        nn.Dropout(dropout_prob),
        nn.Linear(in_features=128, out_features=64, bias=True),
        nn.Tanh(),
        nn.Linear(in_features=64, out_features=16, bias=True),
        nn.Tanh(),
        nn.Linear(in_features=16, out_features=2, bias=True),
    )

    # map_location remaps tensors saved from another device (e.g. a GPU) onto
    # `device`; without it, loading such a checkpoint on a CPU-only machine fails.
    state_dict = torch.load(weights_path, map_location=device)
    model.load_state_dict(state_dict)

    model.eval()
    model.to(device)
    return model


# Load the data from the JSON files as lists of dictionaries
F_data = _load_records('data/F_data.json')
M_data = _load_records('data/M_data.json')
mixed_data = _load_records('data/mixed_data_enriched.json')

# Create Pytorch datasets
F_dataset = SarcasmDataset(F_data)
M_dataset = SarcasmDataset(M_data)
mixed_dataset = SarcasmDataset(mixed_data)

# Create dataloaders
# NOTE(review): shuffling is not needed for evaluation. It is kept so that,
# together with the torch.manual_seed(42) call in each evaluation cell, the
# batch order stays the same as in earlier runs of this notebook.
F_dataloader = DataLoader(F_dataset, batch_size=batch_size, shuffle=True)
M_dataloader = DataLoader(M_dataset, batch_size=batch_size, shuffle=True)
mixed_dataloader = DataLoader(mixed_dataset, batch_size=batch_size, shuffle=True)

# Initialize the models from their saved weights
mixed_model = _load_text_model("models/mixed_model_enriched_text.pth")
F_model = _load_text_model("models/F_model_text.pth")
M_model = _load_text_model("models/M_model_text.pth")

# Define the loss criterion
criterion = nn.CrossEntropyLoss()

# Define the metrics
metrics = {'ACC': functions.acc, 'F1-weighted': functions.f1}

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# MIXED MODEL ON FEMALE DATASET

# Re-seed before evaluating: the dataloaders are built with shuffle=True, and
# the fixed seed is intended to keep the batch order repeatable between runs.
torch.manual_seed(42)

# Run the evaluation (returns the loss and the computed metrics)
test_loss, test_metrics = functions.evaluate(
    mixed_model, criterion, metrics, F_dataloader, device
)

# Record the outcome under a descriptive label
results['Mixed model on Female dataset'] = {
    'Test loss': test_loss,
    'Test metrics': test_metrics,
}

100%|██████████| 4/4 [00:00<00:00, 22.21it/s]

eval Loss: 0.6555,  ACC: 0.6733, F1-weighted: 0.6524





In [25]:
# MIXED MODEL ON MALE DATASET

# Re-seed before evaluating: the dataloaders are built with shuffle=True, and
# the fixed seed is intended to keep the batch order repeatable between runs.
torch.manual_seed(42)

# Run the evaluation (returns the loss and the computed metrics)
test_loss, test_metrics = functions.evaluate(
    mixed_model, criterion, metrics, M_dataloader, device
)

# Record the outcome under a descriptive label
results['Mixed model on Male dataset'] = {
    'Test loss': test_loss,
    'Test metrics': test_metrics,
}

100%|██████████| 7/7 [00:00<00:00, 17.64it/s]

eval Loss: 0.4879,  ACC: 0.8158, F1-weighted: 0.8133





In [26]:
# FEMALE MODEL ON MALE DATASET

# Re-seed before evaluating: the dataloaders are built with shuffle=True, and
# the fixed seed is intended to keep the batch order repeatable between runs.
torch.manual_seed(42)

# Run the evaluation (returns the loss and the computed metrics)
test_loss, test_metrics = functions.evaluate(
    F_model, criterion, metrics, M_dataloader, device
)

# Record the outcome under a descriptive label
results['Female model on Male dataset'] = {
    'Test loss': test_loss,
    'Test metrics': test_metrics,
}

100%|██████████| 7/7 [00:00<00:00, 16.69it/s]

eval Loss: 0.6879,  ACC: 0.5487, F1-weighted: 0.5296





In [27]:
# FEMALE MODEL ON MIXED DATASET

# Re-seed before evaluating: the dataloaders are built with shuffle=True, and
# the fixed seed is intended to keep the batch order repeatable between runs.
torch.manual_seed(42)

# Run the evaluation (returns the loss and the computed metrics)
test_loss, test_metrics = functions.evaluate(
    F_model, criterion, metrics, mixed_dataloader, device
)

# Record the outcome under a descriptive label
results['Female model on Mixed dataset'] = {
    'Test loss': test_loss,
    'Test metrics': test_metrics,
}

100%|██████████| 7/7 [00:00<00:00, 15.92it/s]

eval Loss: 0.6884,  ACC: 0.5446, F1-weighted: 0.5322





In [28]:
# MALE MODEL ON FEMALE DATASET

# NOTE(review): the recorded output of this cell (loss 0.6555, ACC 0.6733,
# F1-weighted 0.6524) is identical to that of the 'Mixed model on Female
# dataset' cell. Worth confirming that 'models/M_model_text.pth' and
# 'models/mixed_model_enriched_text.pth' really hold different weights.

# Re-seed before evaluating: the dataloaders are built with shuffle=True, and
# the fixed seed is intended to keep the batch order repeatable between runs.
torch.manual_seed(42)

# Run the evaluation (returns the loss and the computed metrics)
test_loss, test_metrics = functions.evaluate(
    M_model, criterion, metrics, F_dataloader, device
)

# Record the outcome under a descriptive label
results['Male model on Female dataset'] = {
    'Test loss': test_loss,
    'Test metrics': test_metrics,
}

100%|██████████| 4/4 [00:00<00:00, 17.55it/s]

eval Loss: 0.6555,  ACC: 0.6733, F1-weighted: 0.6524





In [29]:
# MALE MODEL ON MIXED DATASET

# Re-seed before evaluating: the dataloaders are built with shuffle=True, and
# the fixed seed is intended to keep the batch order repeatable between runs.
torch.manual_seed(42)

# Run the evaluation (returns the loss and the computed metrics)
test_loss, test_metrics = functions.evaluate(
    M_model, criterion, metrics, mixed_dataloader, device
)

# Record the outcome under a descriptive label
results['Male model on Mixed dataset'] = {
    'Test loss': test_loss,
    'Test metrics': test_metrics,
}

100%|██████████| 7/7 [00:00<00:00, 17.67it/s]

eval Loss: 0.4842,  ACC: 0.8192, F1-weighted: 0.8172





In [30]:
# Save the results
# Create the output directory first: opening the file for writing raises
# FileNotFoundError when 'results/' does not exist (e.g. on a fresh checkout).
results_path = Path('results/text_model_evaluation.json')
results_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit encoding so the written file does not depend on the platform default.
with results_path.open('w', encoding='utf-8') as f:
    json.dump(results, f, indent=4)