In [5]:
import os
import torch
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import cv2
import json

In [6]:
# Directory containing the videos
video_dir = 'data/mmsd_raw_data/utterances_final/'

# Get a list of all video files in the directory
video_files = [os.path.join(video_dir, f) for f in os.listdir(video_dir) if f.endswith('.mp4')]

# Function to extract visual features using I3D
def extract_video_features(video_path, sample_rate=5):
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_count = 0
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        # Only add frame to list every 'sample_rate' frames
        if frame_count % sample_rate == 0:
            frame = cv2.resize(frame, (128, 128))  # Resize frame
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
            frames.append(frame)
        frame_count += 1
    
    cap.release()
    
    video_tensor = torch.tensor(frames, dtype=torch.float32)  # Convert frames to tensor
    video_tensor = video_tensor.permute(3, 0, 1, 2)  # Should be [C, T, H, W]

    print("Shape after permutation:", video_tensor.shape)
    
    return video_tensor


# PyTorch Dataset for sarcasm detection
class VideoDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        video_path = item['video_path']
        sarcasm = int(item['sarcasm'])
        video_features = extract_video_features(video_path)
        return video_features, sarcasm
    

# Function to add padding to the videos
def custom_collate_fn(batch):
    videos, labels = zip(*batch)
    max_frames = max(video.size(1) for video in videos)  # Find the max number of frames

    padded_videos = []
    for video in videos:
        padding_needed = max_frames - video.size(1)
        if padding_needed > 0:
            pad = torch.zeros((video.shape[0], padding_needed, video.shape[2], video.shape[3]), dtype=video.dtype, device=video.device)
            padded_video = torch.cat([video, pad], dim=1)
        else:
            padded_video = video
        padded_videos.append(padded_video)

    videos_tensor = torch.stack(padded_videos)  # Stack along a new batch dimension
    labels_tensor = torch.tensor(labels, dtype=torch.long)

    print('padding done')       # remove this line

    return videos_tensor, labels_tensor

# Load the data from the JSON file
with open('sarcasm_data.json') as f:
    text_data = json.load(f)
    # Extract sarcasm labels
    sarcasm_labels = {os.path.splitext(os.path.basename(k))[0]: int(v['sarcasm']) for k, v in text_data.items()}

# Create the dataset
data = [{'video_path': path, 'sarcasm': sarcasm_labels[os.path.splitext(os.path.basename(path))[0]]} for path in video_files]

dataset = VideoDataset(data)

# Create DataLoader
data_loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=custom_collate_fn)


In [7]:
# Set hyperparameters
learning_rate = 0.0001
num_epochs = 10
batch_size = 8
weight_decay = 0.05  # L2 Regularization
dropout_prob = 0

# Set seed for reproducibility
torch.manual_seed(42)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create a new model
class SarcasmDetectionModel(nn.Module):
    def __init__(self, num_classes):
        super(SarcasmDetectionModel, self).__init__()
        self.i3d = models.video.r3d_18(pretrained=True)
        self.i3d.fc = nn.Identity()  # Remove final fully connected layer
        self.classifier = nn.Sequential(
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        print("Input shape to model:", x.shape)   # Remove this line
        features = self.i3d(x)
        logits = self.classifier(features)
        return logits

# Initialize the model
model = SarcasmDetectionModel(num_classes=2).to(device)

# Create optimizer and criterion
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
criterion = nn.CrossEntropyLoss()

# Define dataset and dataloaders (assuming SarcasmDataset is already defined)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=custom_collate_fn)

# Define training function
def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct_preds = 0
    total_preds = 0

    for inputs, labels in dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        _, preds = torch.max(outputs, 1)
        correct_preds += torch.sum(preds == labels).item()
        total_preds += labels.size(0)

    epoch_loss = running_loss / len(dataloader)
    epoch_acc = correct_preds / total_preds

    return epoch_loss, epoch_acc

# Define evaluation function
def evaluate_model(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct_preds = 0
    total_preds = 0

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            running_loss += loss.item()

            _, preds = torch.max(outputs, 1)
            correct_preds += torch.sum(preds == labels).item()
            total_preds += labels.size(0)

    epoch_loss = running_loss / len(dataloader)
    epoch_acc = correct_preds / total_preds

    return epoch_loss, epoch_acc




In [8]:
# Training loop
best_val_acc = 0.0
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_dataloader, criterion, optimizer, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")

    test_loss, test_acc = evaluate_model(model, test_dataloader, criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")
    
    # Save the model if validation accuracy improves
    if test_acc > best_val_acc:
        best_val_acc = test_acc
        torch

  video_tensor = torch.tensor(frames, dtype=torch.float32)  # Convert frames to tensor


Shape after permutation: torch.Size([3, 29, 128, 128])
Shape after permutation: torch.Size([3, 24, 128, 128])
Shape after permutation: torch.Size([3, 58, 128, 128])
Shape after permutation: torch.Size([3, 34, 128, 128])
Shape after permutation: torch.Size([3, 34, 128, 128])
Shape after permutation: torch.Size([3, 48, 128, 128])
Shape after permutation: torch.Size([3, 23, 128, 128])
Shape after permutation: torch.Size([3, 33, 128, 128])
padding done
Input shape to model: torch.Size([8, 3, 58, 128, 128])
Shape after permutation: torch.Size([3, 9, 128, 128])
Shape after permutation: torch.Size([3, 38, 128, 128])
Shape after permutation: torch.Size([3, 34, 128, 128])
Shape after permutation: torch.Size([3, 63, 128, 128])
Shape after permutation: torch.Size([3, 20, 128, 128])
Shape after permutation: torch.Size([3, 34, 128, 128])
Shape after permutation: torch.Size([3, 30, 128, 128])
Shape after permutation: torch.Size([3, 21, 128, 128])
padding done
Input shape to model: torch.Size([8, 3, 

In [41]:
# Test batch
# Get one batch of data
test_inputs, test_labels = next(iter(data_loader))

# Print shape of the batch
print("Batch shape:", test_inputs.shape)

# Optionally, pass this test batch through the model to see the output
model.to('cpu')  # Make sure model is on the right device
model.eval()
with torch.no_grad():
    outputs = model(test_inputs)
    print("Output shape:", outputs.shape)


Shape after permutation: torch.Size([3, 118, 224, 224])
Shape after permutation: torch.Size([3, 144, 224, 224])
Shape after permutation: torch.Size([3, 156, 224, 224])
Shape after permutation: torch.Size([3, 24, 224, 224])
Shape after permutation: torch.Size([3, 91, 224, 224])
Shape after permutation: torch.Size([3, 29, 224, 224])
Shape after permutation: torch.Size([3, 200, 224, 224])
Shape after permutation: torch.Size([3, 221, 224, 224])
padding done
Batch shape: torch.Size([8, 3, 221, 224, 224])
Input shape to model: torch.Size([8, 3, 221, 224, 224])
Output shape: torch.Size([8, 2])
