In [None]:
# Multimodal Movie Revenue Prediction System

This notebook implements a comprehensive deep learning system that combines:
- **Text Analysis**: Movie plot/synopsis using BERT
- **Video Analysis**: YouTube trailer processing using ResNet50 CNN
- **Audio Analysis**: Trailer audio processing using 1D CNNs
- **Multimodal Fusion**: Concatenation of the text, video and audio feature vectors, followed by a fully connected classifier

## Revenue Categories
- Disaster (0), Flop (1), Successful (2), Average (3)
- Hit (4), Outstanding (5), Superhit (6), Blockbuster (7)

## Architecture Overview
```
Movie Plot → [BERT] → 768-dim features
YouTube URL → [Video Processor] → [ResNet50] → 2048-dim features
            → [Audio Extractor] → [1D CNN] → 1024-dim features
                                            ↓
                              [Fusion Network] → 8 Revenue Classes
```


In [None]:
## 1. Installation and Imports

First, let's install the required packages and import all necessary libraries.


In [None]:
# Install required packages
!pip install torch torchvision torchaudio transformers
!pip install opencv-python librosa soundfile pytube
!pip install scikit-learn pandas numpy matplotlib seaborn tqdm
!pip install accelerate datasets


In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import cv2
import librosa
from pytube import YouTube
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score, 
                           confusion_matrix, classification_report)
from sklearn.utils.class_weight import compute_class_weight
import warnings
warnings.filterwarnings('ignore')

# Deep Learning Models
import torchvision
from torchvision import transforms, models
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Audio processing
import torchaudio
from torchaudio import transforms as audio_transforms

# Additional utilities
from tqdm import tqdm
import pickle
import json
from urllib.parse import urlparse, parse_qs
import re
import time

# Confirm the import cell ran and report the PyTorch version and CUDA status.
print("All libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
# The device name is only queried when CUDA is reported as available.
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")  # device index 0


In [None]:
## 2. Configuration and Parameters

Setting up all the hyperparameters and configuration for our multimodal system.


In [None]:
class Config:
    """Central hyperparameters for the multimodal pipeline.

    Every value is a class attribute and is read as ``Config.NAME``.
    """

    # Split presets, each a (train, validation, test) tuple of fractions.
    SPLIT_OPTIONS = dict(
        option1=(0.7, 0.2, 0.1),    # 70-20-10
        option2=(0.75, 0.15, 0.1),  # 75-15-10
        option3=(0.8, 0.1, 0.1),    # 80-10-10
    )

    # Input sizes for each modality.
    MAX_TEXT_LENGTH = 512        # tokens per description
    VIDEO_FRAME_SIZE = 224       # frames are resized to a square of this side
    AUDIO_SAMPLE_RATE = 16000    # Hz
    AUDIO_DURATION = 30          # seconds of audio kept per trailer

    # Optimisation settings.
    BATCH_SIZE = 16
    LEARNING_RATE = 1e-4
    NUM_EPOCHS = 50
    PATIENCE = 10                # epochs without improvement before early stopping

    # Embedding widths.
    TEXT_EMBEDDING_DIM = 768     # BERT-base
    VIDEO_EMBEDDING_DIM = 2048   # ResNet50
    AUDIO_EMBEDDING_DIM = 1024
    FUSION_DIM = 512
    NUM_CLASSES = 8

    # Trailer sampling.
    FRAMES_PER_VIDEO = 30
    VIDEO_DURATION = 60          # seconds to analyze

# Verdict name -> integer class id used as the training target.
LABEL_MAPPING = dict(
    Disaster=0, Flop=1, Successful=2, Average=3,
    Hit=4, Outstanding=5, Superhit=6, Blockbuster=7,
)

# Echo the key settings so the cell output documents the run.
print("Configuration loaded successfully!")
print(f"Number of revenue classes: {Config.NUM_CLASSES}")
print(f"Data split options: {list(Config.SPLIT_OPTIONS.keys())}")
print(f"Model will process {Config.FRAMES_PER_VIDEO} frames per video")


In [None]:
## 3. YouTube Video Processing

This class handles downloading YouTube trailers and extracting frames and audio for analysis.


In [None]:
class YouTubeVideoProcessor:
    """Download YouTube trailers and pull frames and audio out of them.

    Downloads are cached in ``output_dir`` as ``<video_id>.mp4`` and reused on
    later calls.
    """

    def __init__(self, output_dir='temp_videos'):
        """Create ``output_dir`` if needed and remember it as the download cache."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        print(f"Video processor initialized. Output directory: {output_dir}")

    def extract_video_id(self, url):
        """Return the YouTube video ID found in ``url``, or None.

        Recognised forms: ``youtube.com/watch?v=<id>``, ``youtu.be/<id>`` and a
        bare 11-character ID. Non-string input (for example a NaN cell from a
        DataFrame) yields None instead of raising.
        """
        if not isinstance(url, str):
            return None
        url = url.strip()
        if 'youtube.com/watch' in url:
            return parse_qs(urlparse(url).query).get('v', [None])[0]
        elif 'youtu.be' in url:
            # Drop the query string and any trailing slash before taking the last path part.
            candidate = url.split('?')[0].rstrip('/').split('/')[-1]
            return candidate if candidate and candidate != 'youtu.be' else None
        elif re.match(r'^[A-Za-z0-9_-]{11}$', url):
            return url
        return None

    def download_video(self, video_url, video_id):
        """Download the trailer to ``<output_dir>/<video_id>.mp4`` and return that path.

        A file already present in the cache is returned without any network
        access. Returns None when the download fails or no MP4 stream exists.
        """
        video_path = os.path.join(self.output_dir, f"{video_id}.mp4")
        # Check the cache first so a cached trailer still works offline.
        if os.path.exists(video_path):
            return video_path

        try:
            # extract_video_id accepts bare IDs, so accept them here as well.
            if isinstance(video_url, str) and re.match(r'^[A-Za-z0-9_-]{11}$', video_url.strip()):
                video_url = f"https://www.youtube.com/watch?v={video_url.strip()}"

            yt = YouTube(video_url)
            # Progressive streams carry video AND audio in one file; adaptive
            # streams are video-only, which would leave extract_audio with nothing.
            progressive = yt.streams.filter(progressive=True, file_extension='mp4')
            stream = progressive.filter(res='720p').first() or progressive.first()
            if stream is None:
                # Last resort: any MP4 stream (may lack an audio track).
                stream = yt.streams.filter(file_extension='mp4').first()
            if stream is None:
                print(f"Error downloading video {video_id}: no MP4 stream available")
                return None

            print(f"Downloading video: {video_id}")
            stream.download(output_path=self.output_dir, filename=f"{video_id}.mp4")
            return video_path
        except Exception as e:
            print(f"Error downloading video {video_id}: {e}")
            return None

    def extract_frames(self, video_path, num_frames=30):
        """Sample up to ``num_frames`` RGB frames spread evenly over the video.

        Returns an array of the frames that could be decoded (possibly fewer
        than ``num_frames``), or None when the video cannot be opened, reports
        no frames, or no frame could be read.
        """
        cap = None
        try:
            cap = cv2.VideoCapture(video_path)
            if not cap.isOpened():
                print(f"Error extracting frames: could not open {video_path}")
                return None

            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total_frames <= 0 or num_frames <= 0:
                print(f"Error extracting frames: no frames available in {video_path}")
                return None

            # Extract frames evenly distributed across the video
            frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
            frames = []
            for frame_idx in frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
                ret, frame = cap.read()
                if ret:
                    # OpenCV decodes to BGR; convert to RGB for the image transforms.
                    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            print(f"Extracted {len(frames)} frames from video")
            # An empty result is reported as None so callers fall back cleanly.
            return np.array(frames) if frames else None
        except Exception as e:
            print(f"Error extracting frames: {e}")
            return None
        finally:
            # Release the capture handle on every path, including errors.
            if cap is not None:
                cap.release()

    def extract_audio(self, video_path):
        """Load the first ``Config.AUDIO_DURATION`` seconds of audio from the file.

        Audio is loaded with ``sr=Config.AUDIO_SAMPLE_RATE``. Returns
        ``(audio, sr)``, or ``(None, None)`` when loading fails or yields no
        samples.
        """
        try:
            # Extract audio using librosa
            audio, sr = librosa.load(video_path, sr=Config.AUDIO_SAMPLE_RATE,
                                     duration=Config.AUDIO_DURATION)
            if audio is None or len(audio) == 0:
                print("Error extracting audio: no samples decoded")
                return None, None
            print(f"Extracted audio: {len(audio)} samples at {sr} Hz")
            return audio, sr
        except Exception as e:
            print(f"Error extracting audio: {e}")
            return None, None

# Test the video processor
# Instantiating it creates the default 'temp_videos' output directory if it is missing.
video_processor = YouTubeVideoProcessor()
print("YouTube Video Processor created successfully!")


In [None]:
## 4. Deep Learning Model Architectures

Now let's define our deep learning models for each modality (text, video, audio).


In [None]:
### 4.1 Text Encoder (BERT-based)


In [None]:
class TextEncoder(nn.Module):
    """Encode a plot/synopsis with BERT followed by a linear projection."""

    def __init__(self, model_name='bert-base-uncased', freeze_bert=False):
        """Load the pretrained BERT weights; optionally freeze them.

        Args:
            model_name: identifier passed to ``BertModel.from_pretrained``.
            freeze_bert: when True, BERT parameters get ``requires_grad=False``.
        """
        super().__init__()
        print(f"Loading BERT model: {model_name}")
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(Config.TEXT_EMBEDDING_DIM, Config.TEXT_EMBEDDING_DIM)

        if freeze_bert:
            # Module-level switch; equivalent to looping over the parameters.
            self.bert.requires_grad_(False)
            print("BERT parameters frozen")

        param_count = sum(p.numel() for p in self.parameters())
        print(f"Text encoder initialized with {param_count:,} parameters")

    def forward(self, input_ids, attention_mask):
        """Return ReLU(fc(dropout(pooler_output))) for the tokenized batch."""
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        projected = self.fc(self.dropout(bert_out.pooler_output))
        return F.relu(projected)

# Test the text encoder
# Uses the defaults: model_name='bert-base-uncased', freeze_bert=False.
text_encoder = TextEncoder()
print("✅ Text Encoder created successfully!")


In [None]:
### 4.2 Video Encoder (CNN-based with ResNet50)


In [None]:
class VideoEncoder(nn.Module):
    """CNN-based video encoder for trailer frames.

    Each frame goes through a ResNet50 backbone (classification head removed);
    the per-frame features are mixed along time by a 1D convolution, averaged
    over frames, and projected by a linear layer.
    """

    def __init__(self):
        super(VideoEncoder, self).__init__()
        print("Loading ResNet50 backbone...")

        # Use pre-trained ResNet50 as backbone.
        # Newer torchvision replaces the deprecated `pretrained=True` with a
        # weights enum; IMAGENET1K_V1 is the checkpoint `pretrained=True` loaded.
        if hasattr(models, 'ResNet50_Weights'):
            self.backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        else:
            self.backbone = models.resnet50(pretrained=True)
        self.backbone.fc = nn.Identity()  # Remove final FC layer

        # Temporal pooling and processing
        self.temporal_conv = nn.Conv1d(Config.VIDEO_EMBEDDING_DIM,
                                       Config.VIDEO_EMBEDDING_DIM,
                                       kernel_size=3, padding=1)
        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(Config.VIDEO_EMBEDDING_DIM, Config.VIDEO_EMBEDDING_DIM)
        self.dropout = nn.Dropout(0.3)

        print(f"Video encoder initialized with {sum(p.numel() for p in self.parameters()):,} parameters")

    def forward(self, frames):
        """Encode a batch of frame stacks.

        Args:
            frames: tensor of shape (batch_size, num_frames, channels, height, width).

        Returns:
            Tensor of shape (batch_size, feature_dim), non-negative (final ReLU).

        Raises:
            ValueError: if ``frames`` is not 5-dimensional.
        """
        if frames.dim() != 5:
            raise ValueError(
                f"frames must be (batch, num_frames, C, H, W); got shape {tuple(frames.shape)}"
            )
        batch_size, num_frames = frames.shape[:2]

        # Fold time into the batch axis. reshape (unlike view) also accepts
        # non-contiguous input, and the spatial size comes from the tensor itself.
        flat_frames = frames.reshape(batch_size * num_frames, *frames.shape[2:])

        # Extract features for each frame
        frame_features = self.backbone(flat_frames)  # (batch*num_frames, feature_dim)
        frame_features = frame_features.reshape(batch_size, num_frames, -1)

        # Temporal processing
        frame_features = frame_features.transpose(1, 2)  # (batch, features, frames)
        temporal_features = self.temporal_conv(frame_features)
        pooled_features = self.global_pool(temporal_features).squeeze(-1)

        # Final encoding
        encoded = self.fc(pooled_features)
        encoded = self.dropout(encoded)
        return F.relu(encoded)

# Test the video encoder
# Constructing VideoEncoder loads the pretrained ResNet50 backbone.
video_encoder = VideoEncoder()
print("✅ Video Encoder created successfully!")

# Test with dummy data
# Random input shaped (batch=2, frames, 3, height, width); no_grad because this is only a shape check.
dummy_frames = torch.randn(2, Config.FRAMES_PER_VIDEO, 3, Config.VIDEO_FRAME_SIZE, Config.VIDEO_FRAME_SIZE)
with torch.no_grad():
    video_features = video_encoder(dummy_frames)
print(f"Video encoder output shape: {video_features.shape}")


In [None]:
### 4.3 Audio Encoder (1D CNN)


In [None]:
class AudioEncoder(nn.Module):
    """Encode a raw trailer waveform with a stack of strided 1D convolutions."""

    def __init__(self):
        super().__init__()
        print("Initializing Audio Encoder with 1D CNNs...")

        # Strided convolutions shrink the time axis step by step. For the
        # default 480000-sample input (16000 Hz * 30 s) the lengths are:
        self.conv1 = nn.Conv1d(1, 64, kernel_size=1024, stride=512)   # -> 936
        self.conv2 = nn.Conv1d(64, 128, kernel_size=16, stride=8)     # -> 116
        self.conv3 = nn.Conv1d(128, 256, kernel_size=8, stride=4)     # -> 28
        self.conv4 = nn.Conv1d(256, 512, kernel_size=4, stride=2)     # -> 13

        self.pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(512, Config.AUDIO_EMBEDDING_DIM)
        self.dropout = nn.Dropout(0.3)

        param_count = sum(p.numel() for p in self.parameters())
        print(f"Audio encoder initialized with {param_count:,} parameters")

    def forward(self, audio):
        """Map a (batch_size, audio_length) waveform to a (batch_size, embedding) tensor."""
        # Add the channel axis expected by Conv1d: (batch_size, 1, audio_length).
        feats = audio.unsqueeze(1)

        # Four conv + ReLU stages, applied in order.
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            feats = F.relu(conv(feats))

        # Average over time, then project, apply dropout and the final ReLU.
        pooled = self.pool(feats).squeeze(-1)
        return F.relu(self.dropout(self.fc(pooled)))

# Test the audio encoder
audio_encoder = AudioEncoder()
print("✅ Audio Encoder created successfully!")

# Test with dummy data
# Random waveform batch of 2 clips, each AUDIO_SAMPLE_RATE * AUDIO_DURATION samples long.
dummy_audio = torch.randn(2, Config.AUDIO_SAMPLE_RATE * Config.AUDIO_DURATION)
with torch.no_grad():
    audio_features = audio_encoder(dummy_audio)
print(f"Audio encoder output shape: {audio_features.shape}")

# Debug: Show the progression of feature map sizes
# Replays the encoder's conv stages one at a time on a single clip and prints each shape.
print("\n🔍 Audio feature map size progression:")
test_audio = torch.randn(1, Config.AUDIO_SAMPLE_RATE * Config.AUDIO_DURATION)
test_audio = test_audio.unsqueeze(1)  # add the channel axis, as AudioEncoder.forward does
print(f"Input: {test_audio.shape}")

with torch.no_grad():
    x1 = F.relu(audio_encoder.conv1(test_audio))
    print(f"After conv1: {x1.shape}")
    
    x2 = F.relu(audio_encoder.conv2(x1))
    print(f"After conv2: {x2.shape}")
    
    x3 = F.relu(audio_encoder.conv3(x2))
    print(f"After conv3: {x3.shape}")
    
    x4 = F.relu(audio_encoder.conv4(x3))
    print(f"After conv4: {x4.shape}")
    
    # The fc/dropout/ReLU tail of the encoder is not replayed here.
    x5 = audio_encoder.pool(x4).squeeze(-1)
    print(f"After pooling: {x5.shape}")


In [None]:
### 4.4 Multimodal Fusion Model


In [None]:
class MultimodalFusionModel(nn.Module):
    """Late-fusion classifier over text, video and audio embeddings.

    The three encoder outputs are concatenated and passed through a small MLP
    that produces one logit per revenue class.
    """

    def __init__(self):
        super().__init__()
        print("Creating Multimodal Fusion Model...")

        # One encoder per modality.
        self.text_encoder = TextEncoder()
        self.video_encoder = VideoEncoder()
        self.audio_encoder = AudioEncoder()

        # Width of the concatenated feature vector fed to the MLP.
        total_dim = sum((
            Config.TEXT_EMBEDDING_DIM,
            Config.VIDEO_EMBEDDING_DIM,
            Config.AUDIO_EMBEDDING_DIM,
        ))
        print(f"Total input dimension for fusion: {total_dim}")

        hidden_dim = Config.FUSION_DIM
        bottleneck_dim = hidden_dim // 2
        classifier_stack = [
            nn.Linear(total_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, bottleneck_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(bottleneck_dim, Config.NUM_CLASSES),
        ]
        self.fusion_layers = nn.Sequential(*classifier_stack)

        # Optional attention-based fusion pieces. They are registered (and so
        # counted in the parameter total) but forward() does not call them.
        self.attention = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=8)
        self.modality_projection = nn.Linear(total_dim, hidden_dim)

        print(f"Complete multimodal model initialized with {sum(p.numel() for p in self.parameters()):,} parameters")

    def forward(self, text_input_ids, text_attention_mask, video_frames, audio):
        """Return class logits for a batch of (text, video, audio) inputs."""
        per_modality = [
            self.text_encoder(text_input_ids, text_attention_mask),
            self.video_encoder(video_frames),
            self.audio_encoder(audio),
        ]
        # Concatenate along the feature axis, then classify.
        return self.fusion_layers(torch.cat(per_modality, dim=1))

# Create the complete multimodal model
# This builds a fresh TextEncoder, VideoEncoder and AudioEncoder inside the fusion model.
model = MultimodalFusionModel()
print("✅ Complete Multimodal Fusion Model created successfully!")

# Show model architecture summary
# All figures below are read from Config, not measured from the instantiated model.
print("\n📊 Model Architecture Summary:")
print(f"Text Encoder (BERT): {Config.TEXT_EMBEDDING_DIM}-dim output")
print(f"Video Encoder (ResNet50): {Config.VIDEO_EMBEDDING_DIM}-dim output") 
print(f"Audio Encoder (1D CNN): {Config.AUDIO_EMBEDDING_DIM}-dim output")
print(f"Fusion Network: {Config.TEXT_EMBEDDING_DIM + Config.VIDEO_EMBEDDING_DIM + Config.AUDIO_EMBEDDING_DIM} → {Config.FUSION_DIM} → {Config.FUSION_DIM//2} → {Config.NUM_CLASSES}")
print(f"Final Output: {Config.NUM_CLASSES} revenue classes")


In [None]:
## 5. Dataset and Data Loading

Creating the dataset class to handle multimodal movie data.


In [None]:
class MovieDataset(Dataset):
    """Dataset class for movie data with multimodal inputs.

    Each item is a dict with the tokenized description, a fixed-size stack of
    trailer frames, a fixed-length audio clip and the integer class target.
    Frames and audio are always padded/truncated to the sizes in ``Config`` so
    that samples can be batched by the default collate function.
    """

    def __init__(self, dataframe, tokenizer, video_processor, transform=None, mode='train',
                 load_media=None):
        """
        Args:
            dataframe: rows with 'Description', 'Trailer' and optionally 'y'.
            tokenizer: callable tokenizer returning 'input_ids'/'attention_mask'.
            video_processor: object providing extract_video_id, download_video,
                extract_frames and extract_audio.
            transform: stored on the instance; not used by this class.
            mode: 'train', 'val' or 'test'.
            load_media: whether to download/decode trailers. ``None`` keeps the
                historical rule (only when ``mode == 'train'``).
        """
        self.df = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.video_processor = video_processor
        self.transform = transform
        self.mode = mode
        # NOTE(review): with the default rule, validation/test samples get
        # all-zero video and audio, so evaluation sees different inputs than
        # training. Pass load_media=True to evaluate on real media.
        self.load_media = (mode == 'train') if load_media is None else bool(load_media)

        # Video transforms
        self.video_transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((Config.VIDEO_FRAME_SIZE, Config.VIDEO_FRAME_SIZE)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])

        print(f"Dataset created with {len(self.df)} samples in {mode} mode")

    def __len__(self):
        return len(self.df)

    def _load_media(self, trailer_url):
        """Best-effort fetch of (frames, audio); either may be None on failure."""
        try:
            video_id = self.video_processor.extract_video_id(trailer_url)
            if not video_id:
                return None, None
            video_path = self.video_processor.download_video(trailer_url, video_id)
            if not video_path:
                return None, None
            frames = self.video_processor.extract_frames(video_path, Config.FRAMES_PER_VIDEO)
            audio, _ = self.video_processor.extract_audio(video_path)
            return frames, audio
        except Exception as e:
            # Keep training going, but make the failure visible.
            print(f"Warning: media processing failed for {trailer_url!r}: {e}")
            return None, None

    def _frames_to_tensor(self, frames, num_frames):
        """Transform frames and pad/truncate to exactly ``num_frames`` entries."""
        tensors = []
        if frames is not None:
            for frame in frames[:num_frames]:
                frame = np.asarray(frame)
                if frame.dtype != np.uint8:
                    # Non-uint8 input: values within [0, 1] are scaled to [0, 255].
                    scale = 255.0 if frame.size and frame.max() <= 1.0 else 1.0
                    frame = np.clip(frame * scale, 0, 255).astype(np.uint8)
                # uint8 frames are used as-is (a dark frame must not be rescaled).
                tensors.append(self.video_transform(frame))

        if len(tensors) < num_frames:
            # Missing frames are filled with a transformed all-black frame.
            size = Config.VIDEO_FRAME_SIZE
            placeholder = self.video_transform(np.zeros((size, size, 3), dtype=np.uint8))
            tensors.extend([placeholder] * (num_frames - len(tensors)))
        return torch.stack(tensors)

    @staticmethod
    def _fit_audio(audio, num_samples):
        """Return a float32 array of exactly ``num_samples`` (zero-padded/truncated)."""
        fitted = np.zeros(num_samples, dtype=np.float32)
        if audio is not None:
            clip = np.asarray(audio, dtype=np.float32).reshape(-1)[:num_samples]
            fitted[:len(clip)] = clip
        return fitted

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Text processing
        text = str(row['Description']) if pd.notna(row['Description']) else ""
        text_encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=Config.MAX_TEXT_LENGTH,
            return_tensors='pt'
        )

        # Video/audio processing (zeros when media is disabled or unavailable)
        frames, audio = (None, None)
        if self.load_media:
            frames, audio = self._load_media(row['Trailer'])

        video_tensor = self._frames_to_tensor(frames, Config.FRAMES_PER_VIDEO)
        audio_tensor = torch.from_numpy(
            self._fit_audio(audio, Config.AUDIO_SAMPLE_RATE * Config.AUDIO_DURATION)
        )

        # Target: cast explicitly, since the label column may be stored as float.
        has_label = 'y' in row and pd.notna(row['y'])
        target = int(row['y']) if has_label else 0

        return {
            # squeeze(0) removes only the batch axis added by return_tensors='pt'.
            'input_ids': text_encoding['input_ids'].squeeze(0),
            'attention_mask': text_encoding['attention_mask'].squeeze(0),
            'video_frames': video_tensor,
            'audio': audio_tensor,
            'target': torch.tensor(target, dtype=torch.long)
        }

print("✅ MovieDataset class created successfully!")


In [None]:
## 6. Training and Evaluation Classes


In [None]:
class ModelTrainer:
    """Training and evaluation utilities for a model taking
    (input_ids, attention_mask, video_frames, audio) and returning class logits."""

    def __init__(self, model, device):
        """Store the model and move it to ``device``."""
        self.model = model
        self.device = device
        self.model.to(device)
        print(f"Model moved to device: {device}")

    def _forward_batch(self, batch, criterion):
        """Move one batch dict to the device and return (outputs, target, loss)."""
        input_ids = batch['input_ids'].to(self.device)
        attention_mask = batch['attention_mask'].to(self.device)
        video_frames = batch['video_frames'].to(self.device)
        audio = batch['audio'].to(self.device)
        target = batch['target'].to(self.device)

        outputs = self.model(input_ids, attention_mask, video_frames, audio)
        return outputs, target, criterion(outputs, target)

    def train_epoch(self, dataloader, optimizer, criterion):
        """Run one optimisation pass over ``dataloader``.

        Returns:
            (average loss per batch, accuracy over all samples). Both are 0.0
            when the dataloader yields no batches.
        """
        self.model.train()
        total_loss = 0.0
        num_batches = 0
        predictions = []
        targets = []

        for batch in tqdm(dataloader, desc="Training"):
            optimizer.zero_grad()
            outputs, target, loss = self._forward_batch(batch, criterion)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1
            predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            targets.extend(target.cpu().tolist())

        # Guard against an empty loader instead of dividing by zero.
        avg_loss = total_loss / max(num_batches, 1)
        accuracy = accuracy_score(targets, predictions) if targets else 0.0
        return avg_loss, accuracy

    def evaluate(self, dataloader, criterion):
        """Evaluate without gradient tracking.

        Returns:
            (average loss per batch, metrics dict, predictions, targets), where
            predictions and targets are lists of Python ints.
        """
        self.model.eval()
        total_loss = 0.0
        num_batches = 0
        predictions = []
        targets = []

        with torch.no_grad():
            for batch in tqdm(dataloader, desc="Evaluating"):
                outputs, target, loss = self._forward_batch(batch, criterion)

                total_loss += loss.item()
                num_batches += 1
                predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
                targets.extend(target.cpu().tolist())

        avg_loss = total_loss / max(num_batches, 1)
        metrics = self.calculate_metrics(targets, predictions)
        return avg_loss, metrics, predictions, targets

    def calculate_metrics(self, y_true, y_pred):
        """Calculate accuracy plus weighted/macro F1, precision and recall.

        ``zero_division=0`` scores classes without predictions/samples as 0
        without emitting warnings.
        """
        metrics = {'accuracy': accuracy_score(y_true, y_pred)}
        scorers = (('f1', f1_score), ('precision', precision_score), ('recall', recall_score))
        for name, scorer in scorers:
            for average in ('weighted', 'macro'):
                metrics[f'{name}_{average}'] = scorer(
                    y_true, y_pred, average=average, zero_division=0
                )
        return metrics

    def plot_confusion_matrix(self, y_true, y_pred, class_names):
        """Plot and return the confusion matrix.

        Class ids are assumed to be ``0 .. len(class_names) - 1`` in the order
        of ``class_names``; pinning the labels keeps the matrix aligned with
        the tick labels even when some class is absent from the data.
        """
        cm = confusion_matrix(y_true, y_pred, labels=list(range(len(class_names))))
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names, yticklabels=class_names)
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.tight_layout()
        plt.show()
        return cm

print("✅ ModelTrainer class created successfully!")


In [None]:
## 7. Data Preparation and Training Pipeline


In [None]:
def prepare_data(data_path, split_option='option1'):
    """Load the movie CSV and return stratified (train_df, val_df, test_df).

    Rows missing 'Description', 'Trailer' or 'Verdict' are dropped, as are rows
    whose verdict is not a key of ``LABEL_MAPPING``. The returned frames keep
    the feature columns (without 'Verdict') plus an integer 'y' label column.

    Args:
        data_path: path of the CSV file.
        split_option: key of ``Config.SPLIT_OPTIONS`` (KeyError if unknown).
    """
    print(f"Loading data from: {data_path}")

    # Load data; copy so the label column is added to an independent frame.
    df = pd.read_csv(data_path)
    df = df.dropna(subset=['Description', 'Trailer', 'Verdict']).copy()

    # Label mapping. Surrounding whitespace is stripped first so values such
    # as "Hit " are not silently discarded.
    df['y'] = df['Verdict'].astype(str).str.strip().map(LABEL_MAPPING)
    unmapped = int(df['y'].isna().sum())
    if unmapped:
        print(f"Dropping {unmapped} rows with unrecognised Verdict values")
    df = df.dropna(subset=['y'])
    # map() yields floats whenever a value was unmapped; class ids must be ints.
    df['y'] = df['y'].astype(int)

    # Get split ratios
    train_ratio, val_ratio, test_ratio = Config.SPLIT_OPTIONS[split_option]
    print(f"Using split option {split_option}: {train_ratio}-{val_ratio}-{test_ratio}")

    # Split data
    X = df.drop(['y', 'Verdict'], axis=1)
    y = df['y']

    # First split: train vs (val + test)
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=(val_ratio + test_ratio),
        random_state=42, stratify=y
    )

    # Second split: val vs test, using the test share of the held-out part.
    test_share = test_ratio / (val_ratio + test_ratio)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=test_share,
        random_state=42, stratify=y_temp
    )

    # Combine X and y back
    train_df = pd.concat([X_train, y_train], axis=1)
    val_df = pd.concat([X_val, y_val], axis=1)
    test_df = pd.concat([X_test, y_test], axis=1)

    print(f"Data split results:")
    print(f"  Train: {len(train_df)} samples ({len(train_df)/len(df):.1%})")
    print(f"  Validation: {len(val_df)} samples ({len(val_df)/len(df):.1%})")
    print(f"  Test: {len(test_df)} samples ({len(test_df)/len(df):.1%})")

    return train_df, val_df, test_df

def create_data_loaders(train_df, val_df, test_df, tokenizer, video_processor):
    """Wrap the three splits in MovieDataset objects and batch them.

    Only the training loader shuffles. Returns
    (train_loader, val_loader, test_loader).
    """
    split_frames = (('train', train_df), ('val', val_df), ('test', test_df))

    # Build every dataset first, then the loaders.
    datasets = [
        (mode, MovieDataset(frame, tokenizer, video_processor, mode=mode))
        for mode, frame in split_frames
    ]
    train_loader, val_loader, test_loader = (
        DataLoader(dataset, batch_size=Config.BATCH_SIZE,
                   shuffle=(mode == 'train'), num_workers=2)
        for mode, dataset in datasets
    )

    return train_loader, val_loader, test_loader

print("✅ Data preparation functions created!")


In [None]:
## 8. Main Training Function


In [None]:
def train_model(model, train_loader, val_loader, device, split_name="option1"):
    """Train the multimodal model with early stopping and plot the curves.

    The weights with the best validation accuracy are saved to
    ``best_multimodal_model_{split_name}.pth``; the curves are saved to
    ``training_curves_{split_name}.png``.

    Args:
        model: model to train (moved to ``device`` by ModelTrainer).
        train_loader, val_loader: batch iterables for training/validation.
        device: torch device.
        split_name: label used in log messages and output file names.

    Returns:
        The ModelTrainer wrapping the trained model.
    """

    print(f"🚀 Starting training for {split_name}")

    # Setup training
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=Config.LEARNING_RATE, weight_decay=0.01)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.5)

    trainer = ModelTrainer(model, device)

    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint (otherwise a run stuck at 0 accuracy would never save one).
    best_val_acc = float('-inf')
    patience_counter = 0
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []
    lr_history = []

    for epoch in range(Config.NUM_EPOCHS):
        print(f"\n📊 Epoch {epoch+1}/{Config.NUM_EPOCHS}")

        # Learning rate in effect for this epoch (read before the scheduler steps).
        lr_history.append(optimizer.param_groups[0]['lr'])

        # Training
        train_loss, train_acc = trainer.train_epoch(train_loader, optimizer, criterion)

        # Validation
        val_loss, val_metrics, _, _ = trainer.evaluate(val_loader, criterion)
        val_acc = val_metrics['accuracy']

        # Learning rate scheduling
        scheduler.step(val_loss)

        # Save metrics
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accs.append(train_acc)
        val_accs.append(val_acc)

        print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
        print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
        print(f"Val F1 (weighted): {val_metrics['f1_weighted']:.4f}")
        print(f"Val F1 (macro): {val_metrics['f1_macro']:.4f}")

        # Early stopping
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            torch.save(model.state_dict(), f'best_multimodal_model_{split_name}.pth')
            print("💾 Saved best model!")
        else:
            patience_counter += 1

        if patience_counter >= Config.PATIENCE:
            print(f"⏹️ Early stopping triggered after {epoch+1} epochs")
            break

    # Plot training curves
    plt.figure(figsize=(15, 5))

    plt.subplot(1, 3, 1)
    plt.plot(train_losses, label='Train', marker='o')
    plt.plot(val_losses, label='Validation', marker='s')
    plt.title(f'Training Loss - {split_name}')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)

    plt.subplot(1, 3, 2)
    plt.plot(train_accs, label='Train', marker='o')
    plt.plot(val_accs, label='Validation', marker='s')
    plt.title(f'Training Accuracy - {split_name}')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid(True)

    plt.subplot(1, 3, 3)
    # Per-epoch history, so reductions made by the scheduler are visible.
    plt.plot(lr_history, marker='o')
    plt.title(f'Learning Rate - {split_name}')
    plt.xlabel('Epoch')
    plt.ylabel('Learning Rate')
    plt.grid(True)

    plt.tight_layout()
    plt.savefig(f'training_curves_{split_name}.png', dpi=300, bbox_inches='tight')
    plt.show()

    return trainer

print("✅ Training function created!")


In [None]:
## 9. Evaluation Function


In [None]:
def evaluate_model(trainer, test_loader, split_name="option1"):
    """Evaluate the best checkpoint of a split on the test set.

    Loads ``best_multimodal_model_{split_name}.pth``, prints aggregate and
    per-class metrics, saves the confusion matrix plot to
    ``confusion_matrix_{split_name}.png`` and writes everything to
    ``evaluation_results_{split_name}.json``.

    All per-class outputs are computed over every class in ``LABEL_MAPPING``,
    so they stay aligned with the class names even when a class is missing
    from the test data or from the predictions.

    Returns:
        dict with 'split_name', 'test_metrics', 'confusion_matrix',
        'predictions', 'targets' and 'classification_report'.
    """

    print(f"📈 Evaluating model on test set for {split_name}...")

    # Load best model; map_location places the weights on the trainer's device.
    state_dict = torch.load(f'best_multimodal_model_{split_name}.pth',
                            map_location=trainer.device)
    trainer.model.load_state_dict(state_dict)

    # Evaluate
    test_loss, test_metrics, predictions, targets = trainer.evaluate(test_loader, nn.CrossEntropyLoss())

    # evaluate() returns Python lists; element-wise comparisons below need arrays.
    predictions = np.asarray(predictions, dtype=int)
    targets = np.asarray(targets, dtype=int)

    class_names = list(LABEL_MAPPING.keys())
    label_ids = [LABEL_MAPPING[name] for name in class_names]

    # Print metrics
    print(f"\n{'='*70}")
    print(f"🎯 TEST SET EVALUATION RESULTS - {split_name.upper()}")
    print(f"{'='*70}")
    print(f"📊 MAIN METRICS:")
    print(f"   Test Loss: {test_loss:.4f}")
    print(f"   Accuracy: {test_metrics['accuracy']:.4f} ({test_metrics['accuracy']*100:.2f}%)")
    print(f"   F1 Score (Weighted): {test_metrics['f1_weighted']:.4f}")
    print(f"   F1 Score (Macro): {test_metrics['f1_macro']:.4f}")
    print(f"   Precision (Weighted): {test_metrics['precision_weighted']:.4f}")
    print(f"   Precision (Macro): {test_metrics['precision_macro']:.4f}")
    print(f"   Recall (Weighted): {test_metrics['recall_weighted']:.4f}")
    print(f"   Recall (Macro): {test_metrics['recall_macro']:.4f}")

    # Classification report
    print(f"\n📈 DETAILED CLASSIFICATION REPORT:")
    print(classification_report(targets, predictions, labels=label_ids,
                                target_names=class_names, zero_division=0))

    # Enhanced confusion matrix visualization
    print(f"\n🎯 CONFUSION MATRIX ANALYSIS:")
    cm = confusion_matrix(targets, predictions, labels=label_ids)

    # Plot enhanced confusion matrix
    plt.figure(figsize=(14, 10))

    # Create heatmap
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names,
                cbar_kws={'label': 'Number of Samples'})
    plt.title(f'Confusion Matrix - {split_name.upper()}', fontsize=16, fontweight='bold')
    plt.xlabel('Predicted Revenue Category', fontsize=12)
    plt.ylabel('Actual Revenue Category', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()

    # Save confusion matrix plot
    plt.savefig(f'confusion_matrix_{split_name}.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Print confusion matrix as numbers
    print(f"\nConfusion Matrix Values:")
    print("Rows = Actual Revenue Category, Columns = Predicted Revenue Category")
    print(f"{'':>12}", end="")
    for name in class_names:
        print(f"{name[:8]:>8}", end="")
    print()

    for i, name in enumerate(class_names):
        print(f"{name[:12]:>12}", end="")
        for j in range(len(class_names)):
            print(f"{cm[i,j]:>8}", end="")
        print()

    # Per-class detailed metrics
    print(f"\n📊 PER-CLASS DETAILED METRICS:")
    per_class_f1 = f1_score(targets, predictions, labels=label_ids, average=None, zero_division=0)
    per_class_precision = precision_score(targets, predictions, labels=label_ids, average=None, zero_division=0)
    per_class_recall = recall_score(targets, predictions, labels=label_ids, average=None, zero_division=0)
    supports = [int(np.sum(targets == label_id)) for label_id in label_ids]

    print(f"{'Class':<15} {'Precision':<10} {'Recall':<10} {'F1-Score':<10} {'Support':<10}")
    print("-" * 55)

    for i, class_name in enumerate(class_names):
        print(f"{class_name:<15} {per_class_precision[i]:<10.3f} {per_class_recall[i]:<10.3f} {per_class_f1[i]:<10.3f} {supports[i]:<10}")

    # Rank only classes that occur in the test set; an absent class has F1 = 0
    # by construction and would otherwise always be reported as the worst.
    ranked = [i for i, support in enumerate(supports) if support > 0] or list(range(len(class_names)))
    best_idx = max(ranked, key=lambda i: per_class_f1[i])
    worst_idx = min(ranked, key=lambda i: per_class_f1[i])

    # Model performance summary
    print(f"\n🏆 PERFORMANCE SUMMARY:")
    print(f"   Total Test Samples: {len(targets)}")
    print(f"   Correct Predictions: {int(np.sum(targets == predictions))}")
    print(f"   Incorrect Predictions: {int(np.sum(targets != predictions))}")
    print(f"   Best Performing Class: {class_names[best_idx]}")
    print(f"   Worst Performing Class: {class_names[worst_idx]}")

    # Save results (plain Python types throughout, so they serialise directly)
    results = {
        'split_name': split_name,
        'test_metrics': test_metrics,
        'confusion_matrix': cm.tolist(),
        'predictions': predictions.tolist(),
        'targets': targets.tolist(),
        'classification_report': classification_report(targets, predictions,
                                                       labels=label_ids,
                                                       target_names=class_names,
                                                       output_dict=True,
                                                       zero_division=0)
    }

    # Save to JSON
    with open(f'evaluation_results_{split_name}.json', 'w') as f:
        serializable_results = {
            'split_name': results['split_name'],
            # Metric values may be numpy scalars; store them as plain floats.
            'test_metrics': {key: float(value) for key, value in results['test_metrics'].items()},
            'confusion_matrix': results['confusion_matrix'],
            'predictions': [int(x) for x in results['predictions']],
            'targets': [int(x) for x in results['targets']],
            'classification_report': results['classification_report']
        }
        json.dump(serializable_results, f, indent=2)

    print(f"\n💾 Files saved:")
    print(f"   📄 evaluation_results_{split_name}.json")
    print(f"   📊 confusion_matrix_{split_name}.png")
    print(f"\n✅ Evaluation completed successfully!")

    return results

print("✅ Evaluation function created!")


In [None]:
## 10. Complete Training Pipeline

Now let's run the complete training and evaluation pipeline for all three data split options.


In [None]:
# Set device
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"🔧 Using device: {device}")

# Initialize components
print("\n🚀 Initializing components...")
# Tokenizer for the same 'bert-base-uncased' checkpoint that TextEncoder loads by default.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Rebinds the notebook-level `video_processor` created in the earlier test cell.
video_processor = YouTubeVideoProcessor()

# Data path - update this to your dataset path
# Relative path, resolved against the notebook's working directory.
data_path = 'Data/TMRDB.csv'  # Update this path

print("\n✅ All components initialized successfully!")
print("Ready to start training!")


In [None]:
# Main training loop for all split options
def run_complete_training(split_options=None):
    """Train and evaluate a fresh model for each requested data split.

    For every split option this prepares the data, builds the data loaders,
    trains a new ``MultimodalFusionModel`` and evaluates it on the test
    loader. A failure in one split is reported (message plus traceback) and
    the remaining splits still run.

    Relies on the notebook-level globals ``data_path``, ``tokenizer``,
    ``video_processor`` and ``device``.

    Args:
        split_options: Iterable of keys into ``Config.SPLIT_OPTIONS`` (a
            single string is treated as one key). Defaults to
            ``('option1', 'option2', 'option3')``.

    Returns:
        dict mapping each successfully completed split option to the results
        returned by ``evaluate_model``. Failed splits are omitted.
    """
    import traceback  # local import: only needed to report per-split failures

    if split_options is None:
        split_options = ('option1', 'option2', 'option3')
    elif isinstance(split_options, str):
        # Guard against iterating over the characters of a single key.
        split_options = (split_options,)

    print("🎬 MULTIMODAL MOVIE REVENUE PREDICTION SYSTEM")
    print("=" * 70)

    all_results = {}
    failed_splits = {}  # split option -> short description of the error

    for split_option in split_options:
        print(f"\n{'='*30} {split_option.upper()} {'='*30}")

        try:
            # Prepare data
            train_df, val_df, test_df = prepare_data(data_path, split_option)

            # Create data loaders
            train_loader, val_loader, test_loader = create_data_loaders(
                train_df, val_df, test_df, tokenizer, video_processor
            )

            # Initialize fresh model for each split
            model = MultimodalFusionModel()
            print(f"\n📊 Model initialized with {sum(p.numel() for p in model.parameters()):,} parameters")

            # Train model
            trainer = train_model(model, train_loader, val_loader, device, split_option)

            # Evaluate model
            results = evaluate_model(trainer, test_loader, split_option)
            all_results[split_option] = results

            print(f"\n✅ Completed {split_option}")

        except Exception as e:
            # Best-effort by design: one broken split must not abort the others.
            # The traceback is shown so the failure can still be diagnosed.
            print(f"❌ Error in {split_option}: {e}")
            traceback.print_exc()
            # Keep only a string so the exception (and its frames) can be freed.
            failed_splits[split_option] = f"{type(e).__name__}: {e}"
            continue

    # Compare results across splits
    print(f"\n{'='*70}")
    print("📊 COMPARISON ACROSS DIFFERENT DATA SPLITS")
    print(f"{'='*70}")

    comparison_data = []
    for split_option, results in all_results.items():
        ratios = Config.SPLIT_OPTIONS[split_option]
        metrics = results['test_metrics']
        comparison_data.append({
            'Split': f"{ratios[0]*100:.0f}-{ratios[1]*100:.0f}-{ratios[2]*100:.0f}",
            'Accuracy': f"{metrics['accuracy']:.4f}",
            'F1 (Weighted)': f"{metrics['f1_weighted']:.4f}",
            'F1 (Macro)': f"{metrics['f1_macro']:.4f}",
            'Precision (W)': f"{metrics['precision_weighted']:.4f}",
            'Recall (W)': f"{metrics['recall_weighted']:.4f}"
        })

    comparison_saved = False
    if comparison_data:
        comparison_df = pd.DataFrame(comparison_data)
        print(comparison_df.to_string(index=False))

        # Save comparison
        comparison_df.to_csv('split_comparison.csv', index=False)
        comparison_saved = True
        print("\n💾 Comparison saved to 'split_comparison.csv'")
    else:
        print("⚠️ No split completed successfully; nothing to compare.")

    # Only advertise files for splits that actually finished.
    if all_results:
        print("\n🎉 Training completed! Generated files:")
        for split_option in all_results.keys():
            print(f"   - best_multimodal_model_{split_option}.pth")
            print(f"   - evaluation_results_{split_option}.json")
            print(f"   - training_curves_{split_option}.png")
            print(f"   - confusion_matrix_{split_option}.png")
        if comparison_saved:
            print("   - split_comparison.csv")

    if failed_splits:
        print("\n❌ Splits that failed:")
        for split_option, reason in failed_splits.items():
            print(f"   - {split_option}: {reason}")

    return all_results

# To run the complete training, uncomment the line below:
# all_results = run_complete_training()


In [None]:
## 11. Quick Demo/Test (Optional)

You can run this section to test individual components or train on a smaller sample.


In [None]:
# Demo: Test Confusion Matrix and Evaluation Display
def demo_confusion_matrix():
    """Exercise the metric printing and confusion-matrix plotting on synthetic labels.

    Random targets and predictions are drawn for the classes named by
    ``LABEL_MAPPING``; about 60% of the predictions are then overwritten with
    the true label so the metrics look plausible. Nothing is trained or
    loaded. The function only prints metrics and shows a heatmap.

    Returns:
        tuple: ``(cm, accuracy, f1_weighted)`` where ``cm`` is the confusion
        matrix computed over all class indices.
    """
    print("🎯 DEMO: Testing Confusion Matrix and Evaluation Display")
    print("=" * 60)

    class_names = list(LABEL_MAPPING.keys())
    n_classes = len(class_names)
    # Explicit label list keeps every report and matrix at n_classes entries,
    # even if some class happens to be absent from the sampled data.
    class_labels = list(range(n_classes))
    n_samples = 100

    # Local generator: same stream as np.random.seed(42) but without
    # reseeding the global NumPy RNG used by the rest of the notebook.
    rng = np.random.RandomState(42)

    # Generate dummy targets and predictions
    targets = rng.randint(0, n_classes, n_samples)
    predictions = rng.randint(0, n_classes, n_samples)

    # Force roughly 60% of the predictions to be correct (chance matches come on top)
    correct_mask = rng.random_sample(n_samples) < 0.6
    predictions[correct_mask] = targets[correct_mask]

    # Calculate metrics
    accuracy = accuracy_score(targets, predictions)
    f1_weighted = f1_score(targets, predictions, average='weighted')
    f1_macro = f1_score(targets, predictions, average='macro')
    precision_weighted = precision_score(targets, predictions, average='weighted')
    recall_weighted = recall_score(targets, predictions, average='weighted')

    # Print metrics
    print(f"📊 EVALUATION METRICS:")
    print(f"   Accuracy: {accuracy:.4f}")
    print(f"   F1 Score (Weighted): {f1_weighted:.4f}")
    print(f"   F1 Score (Macro): {f1_macro:.4f}")
    print(f"   Precision (Weighted): {precision_weighted:.4f}")
    print(f"   Recall (Weighted): {recall_weighted:.4f}")

    # Classification report
    print(f"\n📈 CLASSIFICATION REPORT:")
    print(classification_report(targets, predictions,
                                labels=class_labels, target_names=class_names))

    # Confusion Matrix
    print(f"\n🎯 CONFUSION MATRIX:")
    cm = confusion_matrix(targets, predictions, labels=class_labels)

    # Display confusion matrix as heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names,
                cbar_kws={'label': 'Number of Samples'})
    plt.title('Confusion Matrix - Demo', fontsize=16, fontweight='bold')
    plt.xlabel('Predicted Revenue Category', fontsize=12)
    plt.ylabel('Actual Revenue Category', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

    # Display confusion matrix as numbers
    print(f"\nConfusion Matrix (Numbers):")
    print("Rows = Actual, Columns = Predicted")
    print(f"{'':>12}", end="")
    for name in class_names:
        print(f"{name[:8]:>8}", end="")
    print()

    for i, name in enumerate(class_names):
        print(f"{name[:12]:>12}", end="")
        for j in range(n_classes):
            print(f"{cm[i,j]:>8}", end="")
        print()

    # Per-class metrics (one entry per class index, in class_labels order)
    print(f"\n📊 PER-CLASS DETAILED METRICS:")
    per_class_f1 = f1_score(targets, predictions, labels=class_labels, average=None)
    per_class_precision = precision_score(targets, predictions, labels=class_labels, average=None)
    per_class_recall = recall_score(targets, predictions, labels=class_labels, average=None)

    print(f"{'Class':<12} {'Precision':<10} {'Recall':<10} {'F1-Score':<10} {'Support':<10}")
    print("-" * 52)

    for i, class_name in enumerate(class_names):
        # Support = number of true samples carrying class index i
        support = np.sum(targets == i)
        print(f"{class_name:<12} {per_class_precision[i]:<10.3f} {per_class_recall[i]:<10.3f} {per_class_f1[i]:<10.3f} {support:<10}")

    print(f"\n✅ Demo completed! All visualization functions are working.")

    return cm, accuracy, f1_weighted

# Run the demo
print("🚀 Running confusion matrix demo...")
demo_cm, demo_acc, demo_f1 = demo_confusion_matrix()


In [None]:
# Simple Test Function to Display Output Values
def test_evaluation_display():
    """Smoke-test the ModelTrainer display helpers on synthetic labels.

    Builds a CPU ``ModelTrainer`` around a fresh ``MultimodalFusionModel``,
    draws seeded random targets/predictions over 8 classes, forces 65% of the
    predictions to match, and then runs ``plot_confusion_matrix`` and
    ``calculate_metrics`` on them while printing the results.

    Returns:
        tuple: ``(targets, predictions, cm)`` where ``cm`` is whatever
        ``ModelTrainer.plot_confusion_matrix`` returned.
    """
    print("🧪 TESTING EVALUATION DISPLAY FUNCTIONS")
    print("=" * 50)

    # Only the trainer's helper methods are exercised, so a CPU trainer suffices.
    trainer = ModelTrainer(MultimodalFusionModel(), torch.device('cpu'))

    # Seeded synthetic labels
    np.random.seed(42)
    n_samples = 80
    targets = np.random.randint(0, 8, n_samples)
    predictions = np.random.randint(0, 8, n_samples)

    # Overwrite a random 65% of the predictions with the true label
    n_forced = int(0.65 * n_samples)
    forced_idx = np.random.choice(n_samples, size=n_forced, replace=False)
    predictions[forced_idx] = targets[forced_idx]

    class_names = list(LABEL_MAPPING.keys())

    # 1) Confusion-matrix plot produced by the trainer itself
    print("📊 Testing ModelTrainer.plot_confusion_matrix function:")
    cm = trainer.plot_confusion_matrix(targets, predictions, class_names)

    # 2) Reference metrics computed directly with scikit-learn
    accuracy = accuracy_score(targets, predictions)
    f1_weighted, f1_macro = (
        f1_score(targets, predictions, average=mode) for mode in ('weighted', 'macro')
    )

    print(f"\n📈 SAMPLE EVALUATION METRICS:")
    print(f"   Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"   F1 Score (Weighted): {f1_weighted:.4f}")
    print(f"   F1 Score (Macro): {f1_macro:.4f}")

    # 3) Metrics as computed by the trainer's calculate_metrics helper
    print(f"\n🔍 Testing ModelTrainer.calculate_metrics function:")
    test_metrics = trainer.calculate_metrics(targets, predictions)
    for metric, value in test_metrics.items():
        print(f"   {metric}: {value:.4f}")

    # Closing checklist of what the reader should have seen
    closing_lines = (
        "\n✅ All display functions are working correctly!",
        "🎯 You should see:",
        "   • A confusion matrix heatmap above",
        "   • Numerical accuracy and F1 scores",
        "   • All evaluation metrics",
    )
    for line in closing_lines:
        print(line)

    return targets, predictions, cm

# Run the test
test_targets, test_predictions, test_cm = test_evaluation_display()


In [None]:
## 📊 Troubleshooting Visualization Issues

If you cannot see the confusion matrix or evaluation outputs, try these solutions:

### 1. **Ensure Matplotlib Backend is Set Correctly**
```python
import matplotlib
matplotlib.use('module://matplotlib_inline.backend_inline')  # Inline backend for Jupyter notebooks
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100
```

### 2. **Force Display with explicit show()**
```python
# After any plotting command, add:
plt.tight_layout()
plt.show()
```

### 3. **Check Jupyter Magic Commands**
Make sure you have this at the top of your notebook:
```python
%matplotlib inline
```

### 4. **Alternative: Save and Display Images**
If plots still don't show, the functions now save images to files:
- `confusion_matrix_[split_name].png`
- `training_curves_[split_name].png`

### 5. **Test Display Function**
Run the test functions above to verify everything works:
- `demo_confusion_matrix()` - Shows example confusion matrix
- `test_evaluation_display()` - Tests ModelTrainer functions

### 6. **Check for Output Suppression**
If running programmatically, outputs might be suppressed. Make sure to:
- Run cells individually to see outputs
- Check that print statements are executing
- Look for saved files in your directory

### What You Should See:
✅ **Confusion Matrix**: Heatmap with 8x8 grid (revenue categories)  
✅ **Metrics**: Accuracy, F1, Precision, Recall values  
✅ **Classification Report**: Per-class performance breakdown  
✅ **Performance Summary**: Best/worst performing classes


In [None]:
# Quick test with dummy data
def quick_test():
    """Run one forward pass of MultimodalFusionModel on random inputs.

    Creates a batch of two random text-id, attention-mask, video and audio
    tensors sized from ``Config``, feeds them through a freshly constructed
    model in eval mode with gradients disabled, and prints the input/output
    shapes and the argmax class per sample.

    Returns:
        bool: always ``True`` once the forward pass and printing have finished.
    """
    print("🧪 Running quick test with dummy data...")

    # Sizes taken from the shared configuration
    batch_size = 2
    seq_len = Config.MAX_TEXT_LENGTH
    frame_side = Config.VIDEO_FRAME_SIZE
    audio_len = Config.AUDIO_SAMPLE_RATE * Config.AUDIO_DURATION

    # Random stand-ins for each modality
    dummy_text_ids = torch.randint(0, 1000, (batch_size, seq_len))
    dummy_attention_mask = torch.ones(batch_size, seq_len)
    dummy_video = torch.randn(
        batch_size, Config.FRAMES_PER_VIDEO, 3, frame_side, frame_side
    )
    dummy_audio = torch.randn(batch_size, audio_len)

    # Forward pass only: eval mode, no gradient tracking
    model = MultimodalFusionModel()
    model.eval()
    with torch.no_grad():
        outputs = model(
            dummy_text_ids,
            dummy_attention_mask,
            dummy_video,
            dummy_audio,
        )

    predicted = torch.argmax(outputs, dim=1).numpy()

    print(f"✅ Model test successful!")
    print(f"   Input shapes:")
    print(f"     Text: {dummy_text_ids.shape}")
    print(f"     Video: {dummy_video.shape}")
    print(f"     Audio: {dummy_audio.shape}")
    print(f"   Output shape: {outputs.shape}")
    print(f"   Predicted classes: {predicted}")

    return True

# Run quick test
test_result = quick_test()


In [None]:
## 12. Usage Instructions

### To train the complete system:

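1. **Update the data path** in the component-initialization cell of section 10 (`data_path = 'Data/TMRDB.csv'`) to point to your `TMRDB.csv` file
2. **Uncomment the training line** at the end of the `run_complete_training` cell in section 10: `# all_results = run_complete_training()`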
3. **Run all cells** in sequence

### What the system does:

- **Downloads YouTube trailers** automatically
- **Extracts video frames** using OpenCV
- **Extracts audio** using librosa
- **Processes text** using BERT
- **Trains multimodal fusion** model
- **Evaluates on all metrics** (accuracy, F1, precision, recall, confusion matrix)
- **Tests 3 different data splits** (70-20-10, 75-15-10, 80-10-10)

### Output files generated:

- `best_multimodal_model_[split].pth` - Trained model weights
- `evaluation_results_[split].json` - Detailed metrics
- `training_curves_[split].png` - Training visualizations  
- `split_comparison.csv` - Performance comparison
- `confusion_matrix_[split].png` - Confusion matrix plot for each split

### Model Architecture Summary:

```
📊 Total Parameters: ~140M
🧠 Text Encoder (BERT): 768-dim → Revenue Classes
🎬 Video Encoder (ResNet50): 2048-dim → Revenue Classes  
🎵 Audio Encoder (1D CNN): 1024-dim → Revenue Classes
🔗 Fusion Network: 3840-dim → 512 → 256 → 8 classes
```

### Revenue Categories:
0. Disaster, 1. Flop, 2. Successful, 3. Average
4. Hit, 5. Outstanding, 6. Superhit, 7. Blockbuster
