In [31]:
import os
import json
import torch
import functions_video_model as functions
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.models as models
import cv2


First we have to load the video files and select the ones we are interested in. We create different sets of videos:
- cleaned_videos: all videos for which the speaker gender has been identified
- F_videos: all videos in which the speaker is a woman
- M_videos: all videos in which the speaker is a man

In [32]:
# Directory containing the videos
video_dir = 'data/mmsd_raw_data/utterances_final/'

# Get a list of all video files in the directory
video_files = [os.path.join(video_dir, f) for f in os.listdir(video_dir) if f.endswith('.mp4')]

In [39]:
# Read the JSON files from data labeling
with open('data/sarcasm_data.json') as file:
    all_data = json.load(file)

with open('data/F_data.json') as file:
    F_data = json.load(file)

with open('data/M_data.json') as file:
    M_data = json.load(file)

# Extract the keys from the JSON data
F_keys = list(F_data.keys())
M_keys = list(M_data.keys())

# Get a list of video files with utterances by gender
F_videos = [video for video in video_files if os.path.splitext(os.path.basename(video))[0] in F_keys]
M_videos = [video for video in video_files if os.path.splitext(os.path.basename(video))[0] in M_keys]

# Extract sarcasm labels
sarcasm_labels = {os.path.splitext(os.path.basename(key))[0]: int(value['sarcasm']) for key, value in all_data.items()}
F_sarcasm_labels = {os.path.splitext(os.path.basename(key))[0]: int(value['sarcasm']) for key, value in F_data.items()}
M_sarcasm_labels = {os.path.splitext(os.path.basename(key))[0]: int(value['sarcasm']) for key, value in M_data.items()}

# Prepare datasets for training
mixed_data = [{'video_path': path, 'sarcasm': sarcasm_labels[os.path.splitext(os.path.basename(path))[0]]} for path in video_files]
female_data = [{'video_path': path, 'sarcasm': F_sarcasm_labels[os.path.splitext(os.path.basename(path))[0]]} for path in F_videos]
male_data = [{'video_path': path, 'sarcasm': M_sarcasm_labels[os.path.splitext(os.path.basename(path))[0]]} for path in M_videos]

Let's define some classes and functions needed in the training pipeline.

In [40]:
# Class for the PyTorch sarcasm detection Dataset
class VideoDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        video_path = item['video_path']
        sarcasm = int(item['sarcasm'])
        video_features = extract_video_features(video_path)
        return video_features, sarcasm
    

# Class for the model
class SarcasmDetectionModel(nn.Module):
    def __init__(self, num_classes):
        super(SarcasmDetectionModel, self).__init__()
        self.i3d = models.video.r3d_18(pretrained=True)
        self.i3d.fc = nn.Identity()  # Remove final fully connected layer
        self.classifier = nn.Sequential(
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        features = self.i3d(x)
        logits = self.classifier(features)
        return logits
    


def extract_video_features(video_path, sample_rate=5):
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_count = 0
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        # Only add frame to list every 'sample_rate' frames
        if frame_count % sample_rate == 0:
            frame = cv2.resize(frame, (128, 128))  # Resize frame
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
            frames.append(frame)
        frame_count += 1
    
    cap.release()
    
    video_tensor = torch.tensor(frames, dtype=torch.float32)  # Convert frames to tensor
    video_tensor = video_tensor.permute(3, 0, 1, 2)  # Should be [C, T, H, W]
    
    return video_tensor

We will now train 3 different models: 
- one model will be trained on all of the videos (video_files)
- one model will be trained on videos from female speakers only (F_videos)
- one model will be trained on videos from male speakers only (M_videos)

In [41]:
# MIXED MODEL

# Set seed for reproducibility
torch.manual_seed(42)

# Set device
device = torch.device("cpu")

# Set hyperparameters
learning_rate = 0.0001
num_epochs = 5
batch_size = 8
weight_decay = 0.05
dropout_prob = 0

# Split dataset into training and testing
dataset = VideoDataset(mixed_data)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Initialize dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=functions.custom_collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=functions.custom_collate_fn)

# Initialize the model
mixed_model = SarcasmDetectionModel(num_classes=2).to(device)

# Define optimizer and criterion
optimizer = optim.AdamW(mixed_model.parameters(), lr=learning_rate, weight_decay=weight_decay)
criterion = nn.CrossEntropyLoss()

# Training loop
best_val_acc = 0.0
for epoch in range(num_epochs):
    train_loss, train_acc = functions.train_epoch(mixed_model, train_dataloader, criterion, optimizer, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")

    test_loss, test_acc = functions.evaluate_model(mixed_model, test_dataloader, criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")

# Save model
torch.save(mixed_model.state_dict(), 'models/mixed_model_video.pth')




train Loss: 0.6096 , train Acc: 0.6812
Epoch 1/5, Train Loss: 0.6096, Train Acc: 0.6812
test Loss: 0.6208 , test Acc: 0.6449
Epoch 1/5, Test Loss: 0.6208, Test Acc: 0.6449
train Loss: 0.4785 , train Acc: 0.7935
Epoch 2/5, Train Loss: 0.4785, Train Acc: 0.7935
test Loss: 0.9638 , test Acc: 0.6232
Epoch 2/5, Test Loss: 0.9638, Test Acc: 0.6232
train Loss: 0.3273 , train Acc: 0.8768
Epoch 3/5, Train Loss: 0.3273, Train Acc: 0.8768
test Loss: 0.6269 , test Acc: 0.7391
Epoch 3/5, Test Loss: 0.6269, Test Acc: 0.7391
train Loss: 0.2641 , train Acc: 0.8967
Epoch 4/5, Train Loss: 0.2641, Train Acc: 0.8967
test Loss: 1.5408 , test Acc: 0.5217
Epoch 4/5, Test Loss: 1.5408, Test Acc: 0.5217
train Loss: 0.2316 , train Acc: 0.9130
Epoch 5/5, Train Loss: 0.2316, Train Acc: 0.9130
test Loss: 0.7365 , test Acc: 0.7101
Epoch 5/5, Test Loss: 0.7365, Test Acc: 0.7101


In [42]:
# FEMALE MODEL

# Set seed for reproducibility
torch.manual_seed(42)

# Set device
device = torch.device("cpu")

# Set hyperparameters
learning_rate = 0.0001
num_epochs = 5
batch_size = 8
weight_decay = 0.05
dropout_prob = 0

# Split dataset into training and testing
dataset = VideoDataset(female_data)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Initialize dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=functions.custom_collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=functions.custom_collate_fn)

# Initialize the model
F_model = SarcasmDetectionModel(num_classes=2).to(device)

# Define optimizer and criterion
optimizer = optim.AdamW(F_model.parameters(), lr=learning_rate, weight_decay=weight_decay)
criterion = nn.CrossEntropyLoss()

# Training loop
best_val_acc = 0.0
for epoch in range(num_epochs):
    train_loss, train_acc = functions.train_epoch(F_model, train_dataloader, criterion, optimizer, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")

    test_loss, test_acc = functions.evaluate_model(F_model, test_dataloader, criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")
    
# Save model
torch.save(F_model.state_dict(), 'models/F_model_video.pth')

train Loss: 0.6506 , train Acc: 0.6481
Epoch 1/5, Train Loss: 0.6506, Train Acc: 0.6481
test Loss: 0.4810 , test Acc: 0.8537
Epoch 1/5, Test Loss: 0.4810, Test Acc: 0.8537
train Loss: 0.3936 , train Acc: 0.8333
Epoch 2/5, Train Loss: 0.3936, Train Acc: 0.8333
test Loss: 0.3998 , test Acc: 0.8293
Epoch 2/5, Test Loss: 0.3998, Test Acc: 0.8293
train Loss: 0.2877 , train Acc: 0.9074
Epoch 3/5, Train Loss: 0.2877, Train Acc: 0.9074
test Loss: 0.3236 , test Acc: 0.8049
Epoch 3/5, Test Loss: 0.3236, Test Acc: 0.8049
train Loss: 0.2171 , train Acc: 0.9383
Epoch 4/5, Train Loss: 0.2171, Train Acc: 0.9383
test Loss: 0.5551 , test Acc: 0.7317
Epoch 4/5, Test Loss: 0.5551, Test Acc: 0.7317
train Loss: 0.2077 , train Acc: 0.9259
Epoch 5/5, Train Loss: 0.2077, Train Acc: 0.9259
test Loss: 0.4882 , test Acc: 0.7561
Epoch 5/5, Test Loss: 0.4882, Test Acc: 0.7561


In [43]:
# MALE MODEL

# Set seed for reproducibility
torch.manual_seed(42)

# Set device
device = torch.device("cpu")

# Set hyperparameters
learning_rate = 0.0001
num_epochs = 5
batch_size = 5
weight_decay = 0.05
dropout_prob = 0

# Split dataset into training and testing
dataset = VideoDataset(male_data)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Initialize dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=functions.custom_collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=functions.custom_collate_fn)

# Initialize the model
M_model = SarcasmDetectionModel(num_classes=2).to(device)

# Define optimizer and criterion
optimizer = optim.AdamW(M_model.parameters(), lr=learning_rate, weight_decay=weight_decay)
criterion = nn.CrossEntropyLoss()

# Training loop
best_val_acc = 0.0
for epoch in range(num_epochs):
    train_loss, train_acc = functions.train_epoch(M_model, train_dataloader, criterion, optimizer, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")

    test_loss, test_acc = functions.evaluate_model(M_model, test_dataloader, criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}") 

# Save model
torch.save(M_model.state_dict(), 'models/M_model_video.pth')

train Loss: 0.6415 , train Acc: 0.6395
Epoch 1/5, Train Loss: 0.6415, Train Acc: 0.6395
test Loss: 0.6296 , test Acc: 0.6977
Epoch 1/5, Test Loss: 0.6296, Test Acc: 0.6977
train Loss: 0.4621 , train Acc: 0.7965
Epoch 2/5, Train Loss: 0.4621, Train Acc: 0.7965
test Loss: 1.0514 , test Acc: 0.6163
Epoch 2/5, Test Loss: 1.0514, Test Acc: 0.6163
train Loss: 0.3692 , train Acc: 0.8430
Epoch 3/5, Train Loss: 0.3692, Train Acc: 0.8430
test Loss: 0.6644 , test Acc: 0.6977
Epoch 3/5, Test Loss: 0.6644, Test Acc: 0.6977
train Loss: 0.3037 , train Acc: 0.8547
Epoch 4/5, Train Loss: 0.3037, Train Acc: 0.8547
test Loss: 1.2365 , test Acc: 0.6279
Epoch 4/5, Test Loss: 1.2365, Test Acc: 0.6279
train Loss: 0.2234 , train Acc: 0.9273
Epoch 5/5, Train Loss: 0.2234, Train Acc: 0.9273
test Loss: 0.8160 , test Acc: 0.6512
Epoch 5/5, Test Loss: 0.8160, Test Acc: 0.6512


Now that we have created, trained and saved the different models, we will evaluate their performance on the other datasets. Here are the different evaluations we will perform:
- performance of mixed_model on the female dataset
- performance of mixed_model on the male dataset
- performance of F_model on the male dataset
- performance of M_model on the female dataset