In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import librosa
import numpy as np
import pandas as pd

from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import os

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# -----------------------
# 1. Data Loading Functions
# -----------------------

def load_text_data(text_dir):
    """Load every CSV found under the train/test/validation folders of ``text_dir``.

    Each CSV is read with ``pd.read_csv`` and tagged with a ``split`` column
    holding the name of the sub-folder it came from. Sub-folders that do not
    exist (or are not directories) are skipped. Files are visited in sorted
    name order so the row order of the result does not depend on the order in
    which the filesystem happens to list the directory.

    Args:
        text_dir: Directory expected to contain ``train``, ``test`` and/or
            ``validation`` sub-directories with ``.csv`` files.

    Returns:
        One DataFrame with all rows concatenated (index reset), or an empty
        DataFrame (no columns) when no CSV file was found.
    """
    frames = []

    for split in ['train', 'test', 'validation']:
        split_path = os.path.join(text_dir, split)
        # isdir (not exists): a stray file named like a split must not reach listdir.
        if not os.path.isdir(split_path):
            continue

        # Sorted so repeated runs concatenate the files in the same order.
        for file_name in sorted(os.listdir(split_path)):
            if not file_name.endswith('.csv'):
                continue
            frame = pd.read_csv(os.path.join(split_path, file_name))
            frame['split'] = split
            frames.append(frame)

    if not frames:
        print("No text data found!")
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

def load_speech_data(audio_dir):
    """Build a table of speech clips by scanning ``audio_dir`` for ``.wav`` files.

    Only file names that split into exactly three ``_``-separated parts are
    used; the second part is taken as the word and the third part (with
    ``.wav`` removed) as the emotion. Any other file is silently ignored.
    Files are visited in sorted name order so the result is reproducible.

    Args:
        audio_dir: Directory containing the audio files (not searched
            recursively).

    Returns:
        DataFrame with columns ``word``, ``emotion`` and ``speech_path``.
        The columns are present even when no file matched, so callers can
        index them without a ``KeyError``.
    """
    columns = ['word', 'emotion', 'speech_path']
    records = []

    for file_name in sorted(os.listdir(audio_dir)):
        if not file_name.endswith('.wav'):
            continue

        parts = file_name.split('_')
        if len(parts) != 3:
            # Name does not follow the three-part pattern; skip it.
            continue

        records.append({
            'word': parts[1],
            'emotion': parts[2].replace('.wav', ''),
            'speech_path': os.path.join(audio_dir, file_name),
        })

    # Passing `columns` keeps the schema stable for an empty record list.
    return pd.DataFrame(records, columns=columns)

In [None]:
def _first_value_per_pair(words, emotions, values):
    """Map each (word, emotion) pair to the value of its first occurrence."""
    first_seen = {}
    for word, emotion, value in zip(words, emotions, values):
        # Missing keys can never match the other modality; drop them.
        if pd.isna(word) or pd.isna(emotion):
            continue
        first_seen.setdefault((word, emotion), value)
    return first_seen


def create_combined_dataset(text_df, speech_df):
    """Combine text and speech datasets by matching (word, emotion) pairs.

    The word/emotion columns of ``text_df`` are the columns named ``word`` and
    ``emotion`` when both exist, otherwise the first and second column. The
    text content is taken from a ``text`` column, else a ``sentence`` column,
    else the third column, unless that column is the ``split`` bookkeeping
    column or one of the key columns, in which case the word itself is used.

    For every pair present in both inputs, one row is emitted using the first
    matching text row and the first matching speech row. Rows follow the order
    in which pairs first appear in ``text_df``, so the result is deterministic.

    Args:
        text_df: Text table; needs at least two columns to be usable.
        speech_df: Speech table with ``word``, ``emotion`` and ``speech_path``
            columns.

    Returns:
        DataFrame with columns ``word``, ``emotion``, ``text_content`` (always
        a ``str``) and ``speech_path``. Empty, but with those columns, when
        nothing matches or either input lacks the required columns.
    """
    output_columns = ['word', 'emotion', 'text_content', 'speech_path']

    # Diagnostics on the incoming text table.
    print(f"Text DataFrame columns: {text_df.columns.tolist()}")
    print(f"Text DataFrame shape: {text_df.shape}")
    print("Sample text data:")
    print(text_df.head())

    # Guard against the column-less frames the loaders can produce when no
    # data was found; positional/label indexing would raise on them.
    speech_required = {'word', 'emotion', 'speech_path'}
    if text_df.shape[1] < 2 or not speech_required.issubset(speech_df.columns):
        print("Cannot combine datasets: required word/emotion columns are missing.")
        return pd.DataFrame(columns=output_columns)

    column_names = list(text_df.columns)
    if 'word' in column_names and 'emotion' in column_names:
        word_pos = column_names.index('word')
        emotion_pos = column_names.index('emotion')
    else:
        # Assume first column is word, second is emotion
        word_pos, emotion_pos = 0, 1

    # Locate the text content column (check for various column names).
    if 'text' in column_names:
        content_pos = column_names.index('text')
    elif 'sentence' in column_names:
        content_pos = column_names.index('sentence')
    elif (len(column_names) > 2
          and column_names[2] != 'split'
          and 2 not in (word_pos, emotion_pos)):
        content_pos = 2
    else:
        # No usable text column: fall back to the word itself.
        content_pos = word_pos

    text_first = _first_value_per_pair(
        text_df.iloc[:, word_pos],
        text_df.iloc[:, emotion_pos],
        text_df.iloc[:, content_pos],
    )
    speech_first = _first_value_per_pair(
        speech_df['word'], speech_df['emotion'], speech_df['speech_path']
    )

    print(f"Text combinations found: {len(text_first)}")
    print(f"Speech combinations found: {len(speech_first)}")

    # Dicts keep insertion order, so this follows text_df's row order.
    common_pairs = [pair for pair in text_first if pair in speech_first]
    print(f"Common combinations: {len(common_pairs)}")

    combined_data = [
        {
            'word': word,
            'emotion': emotion,
            'text_content': str(text_first[(word, emotion)]),
            'speech_path': speech_first[(word, emotion)],
        }
        for word, emotion in common_pairs
    ]

    return pd.DataFrame(combined_data, columns=output_columns)

In [None]:
# -----------------------
# 2. Preprocessing Functions
# -----------------------

def extract_mfcc(file_path, n_mfcc=40, max_len=100):
    """Turn an audio file into a fixed-length MFCC + delta + delta-delta matrix.

    The audio is loaded with ``librosa.load(..., sr=None)``, MFCCs and their
    first- and second-order deltas are stacked and transposed to
    ``(time, n_mfcc * 3)``, then the time axis is zero-padded or truncated to
    ``max_len`` frames. The matrix is finally standardised with a single
    global mean/std computed over the padded/truncated matrix (skipped when
    the std is not positive).

    Args:
        file_path: Path of the audio file to read.
        n_mfcc: Number of MFCC coefficients; the feature width is three times
            this value.
        max_len: Number of time frames in the returned matrix.

    Returns:
        Array of shape ``(max_len, n_mfcc * 3)``. If anything fails while
        processing the file, the error is printed and an all-zero array of
        that shape is returned instead.
    """
    try:
        signal, sample_rate = librosa.load(file_path, sr=None)
        coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
        first_order = librosa.feature.delta(coefficients)
        second_order = librosa.feature.delta(coefficients, order=2)

        # Rows become time frames, columns become features.
        frames = np.vstack([coefficients, first_order, second_order]).T

        n_frames = frames.shape[0]
        if n_frames >= max_len:
            frames = frames[:max_len, :]
        else:
            missing = max_len - n_frames
            frames = np.pad(frames, ((0, missing), (0, 0)), mode='constant')

        # Global standardisation; left untouched for a constant matrix.
        spread = np.std(frames)
        if spread > 0:
            frames = (frames - np.mean(frames)) / spread

        return frames
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        # Best-effort fallback: a zero matrix of the expected shape.
        return np.zeros((max_len, n_mfcc * 3))

In [None]:
# -----------------------
# 3. Dataset Class
# -----------------------

class MultimodalDataset(Dataset):
    """Dataset yielding tokenised text, MFCC features and an encoded label.

    Each item is built on the fly from one row of ``df``, which must provide
    the columns ``text_content``, ``speech_path`` and ``emotion``.

    Args:
        df: Table of samples, accessed positionally via ``iloc``.
        tokenizer: Callable invoked with ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors='pt'``; its result is indexed
            with ``'input_ids'`` and ``'attention_mask'``.
            Presumably a Hugging Face tokenizer returning tensors with a
            leading batch dimension of 1 — confirm if another tokenizer is used.
        label_encoder: Object whose ``transform`` maps a list of emotion
            labels to integer class ids.
        max_text_length: Length the text is padded/truncated to.
        max_audio_length: Number of MFCC frames requested per clip.
    """

    def __init__(self, df, tokenizer, label_encoder, max_text_length=128, max_audio_length=100):
        self.df = df
        self.tokenizer = tokenizer
        self.label_encoder = label_encoder
        self.max_text_length = max_text_length
        self.max_audio_length = max_audio_length

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Build the sample at positional index ``idx``.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` (leading batch
            dimension removed), ``mfcc`` (float32) and ``label`` (long).
        """
        row = self.df.iloc[idx]

        # Process text
        text_encoding = self.tokenizer(
            str(row['text_content']),
            truncation=True,
            padding='max_length',
            max_length=self.max_text_length,
            return_tensors='pt'
        )

        # Process audio
        mfcc = extract_mfcc(row['speech_path'], max_len=self.max_audio_length)

        # Process label
        label = self.label_encoder.transform([row['emotion']])[0]

        return {
            # squeeze(0) drops only the batch dimension; a bare squeeze() would
            # also collapse the sequence dimension when max_text_length == 1.
            'input_ids': text_encoding['input_ids'].squeeze(0),
            'attention_mask': text_encoding['attention_mask'].squeeze(0),
            'mfcc': torch.tensor(mfcc, dtype=torch.float32),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# -----------------------
# 4. Model Architecture
# -----------------------

class MultimodalEmotionModel(nn.Module):
    """Emotion classifier fusing a pretrained text encoder with an audio CNN+LSTM.

    Text branch: ``AutoModel.from_pretrained(text_model_name)``.
    Audio branch: two ``Conv1d -> ReLU -> BatchNorm1d`` stages over the time
    axis followed by a bidirectional LSTM whose final forward and backward
    hidden states are concatenated.
    Both feature vectors are concatenated, passed through a two-layer MLP with
    dropout, and projected to ``num_classes`` logits.

    Args:
        num_classes: Number of output classes.
        text_model_name: Name/path handed to ``AutoModel.from_pretrained``.
        lstm_hidden: Hidden size of each LSTM direction.
        audio_feature_dim: Feature width of the MFCC input (channel count of
            the first convolution).
    """

    def __init__(self, num_classes=5, text_model_name='bert-base-uncased',
                 lstm_hidden=128, audio_feature_dim=120):
        super().__init__()

        # Text encoder (BERT)
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        text_hidden_size = self.text_encoder.config.hidden_size

        # Audio encoder (CNN + LSTM)
        self.audio_cnn = nn.Sequential(
            nn.Conv1d(audio_feature_dim, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Conv1d(128, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(64)
        )

        self.audio_lstm = nn.LSTM(64, lstm_hidden, batch_first=True, bidirectional=True)

        # Fusion layer
        fusion_input_dim = text_hidden_size + (lstm_hidden * 2)  # *2 for bidirectional
        self.fusion_layer = nn.Sequential(
            nn.Linear(fusion_input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2)
        )

        # Classifier
        self.classifier = nn.Linear(128, num_classes)

    def forward(self, input_ids, attention_mask, mfcc):
        """Compute class logits for a batch.

        Args:
            input_ids: Token ids passed to the text encoder.
            attention_mask: Attention mask passed to the text encoder.
            mfcc: Audio features laid out as [batch_size, time, features].

        Returns:
            Logits of shape [batch_size, num_classes].
        """
        # Text encoding
        text_outputs = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Prefer the pooled output; encoders without a pooling head do not
        # provide one, so fall back to the first token's hidden state.
        text_features = getattr(text_outputs, 'pooler_output', None)
        if text_features is None:
            text_features = text_outputs.last_hidden_state[:, 0]

        # Audio encoding
        # mfcc: [batch_size, time, features] -> [batch_size, features, time]
        audio_cnn_input = mfcc.permute(0, 2, 1)
        audio_cnn_out = self.audio_cnn(audio_cnn_input)  # [batch_size, 64, time]

        # Back to [batch_size, time, features] for LSTM
        audio_lstm_input = audio_cnn_out.permute(0, 2, 1)
        _, (h_n, _) = self.audio_lstm(audio_lstm_input)

        # Concatenate forward and backward hidden states
        audio_features = torch.cat((h_n[-2], h_n[-1]), dim=1)  # [batch_size, lstm_hidden*2]

        # Fusion
        combined_features = torch.cat((text_features, audio_features), dim=1)
        fusion_output = self.fusion_layer(combined_features)

        # Classification
        return self.classifier(fusion_output)

In [None]:
# -----------------------
# 5. Training Functions
# -----------------------

def train_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask, mfcc)`` and
            returning per-class scores of shape [batch, classes].
        dataloader: Iterable of dict batches with keys ``input_ids``,
            ``attention_mask``, ``mfcc`` and ``label``. It does not need to
            support ``len()``.
        optimizer: Optimiser stepped once per batch.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device every batch tensor is moved to.

    Returns:
        ``(avg_loss, accuracy)``: the mean of the per-batch loss values and
        the fraction of correctly classified samples. ``(0.0, 0.0)`` when the
        dataloader yields no samples.
    """
    model.train()
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    num_batches = 0

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        mfcc = batch['mfcc'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, mfcc)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # detach(): the prediction is bookkeeping only, keep it off the graph.
        predicted = outputs.detach().argmax(dim=1)
        total_predictions += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()
        num_batches += 1

    # Nothing was processed: report zeros rather than dividing by zero.
    if num_batches == 0 or total_predictions == 0:
        return 0.0, 0.0

    avg_loss = total_loss / num_batches
    accuracy = correct_predictions / total_predictions
    return avg_loss, accuracy

def evaluate(model, dataloader, criterion, device):
    """Measure loss and accuracy over ``dataloader`` without updating the model.

    The model is switched to eval mode and all forward passes run under
    ``torch.no_grad()``.

    Args:
        model: Module called as ``model(input_ids, attention_mask, mfcc)`` and
            returning per-class scores of shape [batch, classes].
        dataloader: Iterable of dict batches with keys ``input_ids``,
            ``attention_mask``, ``mfcc`` and ``label``. It does not need to
            support ``len()``.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device every batch tensor is moved to.

    Returns:
        ``(avg_loss, accuracy)``: the mean of the per-batch loss values and
        the fraction of correctly classified samples. ``(0.0, 0.0)`` when the
        dataloader yields no samples.
    """
    model.eval()
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    num_batches = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            mfcc = batch['mfcc'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask, mfcc)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            total_predictions += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()
            num_batches += 1

    # Nothing was processed: report zeros rather than dividing by zero.
    if num_batches == 0 or total_predictions == 0:
        return 0.0, 0.0

    avg_loss = total_loss / num_batches
    accuracy = correct_predictions / total_predictions
    return avg_loss, accuracy

In [None]:
# -----------------------
# 6. Load and Prepare Data
# -----------------------

# Set paths (relative to the notebook's working directory)
text_dir = 'dataset/text'
audio_dir = 'dataset/speech'

# Load datasets
print("Loading text dataset...")
text_df = load_text_data(text_dir)
print(f"Text dataset shape: {text_df.shape}")

print("\nLoading speech dataset...")
speech_df = load_speech_data(audio_dir)
print(f"Speech dataset shape: {speech_df.shape}")

# Combine datasets
print("\nCombining datasets...")
combined_df = create_combined_dataset(text_df, speech_df)
print(f"Combined dataset shape: {combined_df.shape}")

# Fail fast: every later cell needs at least one matched sample, and stopping
# here gives a clearer error than the failure the next cell would hit.
if combined_df.empty:
    raise ValueError("No matching data found between text and speech datasets!")

print("\nSample combined data:")
print(combined_df.head())


In [None]:
# -----------------------
# 7. Prepare for Training (run this cell after checking the data above)
# -----------------------

# One seed for both the train/validation split and the batch shuffling, so a
# Restart & Run All reproduces the same split and the same batch order.
SEED = 42

# Prepare labels
label_encoder = LabelEncoder()
combined_df['emotion_encoded'] = label_encoder.fit_transform(combined_df['emotion'])

print(f"Emotion classes: {label_encoder.classes_}")
print(f"Number of classes: {len(label_encoder.classes_)}")

# Split data (stratified so both parts keep the class proportions)
train_df, val_df = train_test_split(combined_df, test_size=0.2, random_state=SEED,
                                   stratify=combined_df['emotion_encoded'])

print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Create datasets and dataloaders
train_dataset = MultimodalDataset(train_df, tokenizer, label_encoder)
val_dataset = MultimodalDataset(val_df, tokenizer, label_encoder)

# A dedicated, seeded generator makes the shuffle order independent of
# whatever else has consumed torch's global RNG before this cell ran.
shuffle_generator = torch.Generator().manual_seed(SEED)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True,
                          generator=shuffle_generator)
val_loader = DataLoader(val_dataset, batch_size=8)

print("Datasets and dataloaders created successfully!")



In [None]:
# -----------------------
# 8. Initialize Model and Training
# -----------------------

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Seed torch before building the model: the audio, fusion and classifier
# layers are randomly initialised, so without this every kernel restart
# starts training from different weights.
torch.manual_seed(42)

model = MultimodalEmotionModel(
    num_classes=len(label_encoder.classes_),
    text_model_name='bert-base-uncased'
).to(device)

# A single learning rate is applied to every parameter, pretrained or not.
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

print("Model initialized successfully!")

In [None]:
# -----------------------
# 9. Training Loop
# -----------------------

num_epochs = 20
best_val_acc = 0

# Train for a fixed number of epochs, checkpointing whenever the validation
# accuracy strictly improves on the best value seen so far.
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)

    # Per-epoch report, one line per argument.
    print(
        f"Epoch {epoch+1}/{num_epochs}",
        f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}",
        f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}",
        "-" * 50,
        sep="\n",
    )

    if best_val_acc < val_acc:
        # New best: remember it and overwrite the saved weights.
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'best_multimodal_model.pth')

print(f"Training completed! Best validation accuracy: {best_val_acc:.4f}")
