In [1]:
import torch
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())



PyTorch version: 2.8.0+cpu
CUDA available: False


In [2]:
import os
import pickle
import numpy as np

# Check if your MELD data path exists
meld_path = r'C:/Users/project/MELD.Features.Models/features'
print("MELD path exists:", os.path.exists(meld_path))

# If path exists, check for key files
if os.path.exists(meld_path):
    files = os.listdir(meld_path)
    print("Files in folder:", files)
    
    # Check for the specific files we need
    required_files = ['audio_emotion.pkl', 'text_emotion.pkl', 'data_emotion.p']
    for file in required_files:
        if file in files:
            print(f"✓ {file} found")
        else:
            print(f"✗ {file} missing")


MELD path exists: False


In [3]:
import os
import glob

# Search for MELD folders in common locations
search_paths = [
    "C:/Users/*/MELD*",
    "C:/Users/*/Downloads/MELD*", 
    "C:/Users/*/Documents/MELD*",
    "C:/Users/*/Desktop/MELD*",
    "C:/MELD*"
]

print("Searching for MELD folders...")
for pattern in search_paths:
    matches = glob.glob(pattern)
    if matches:
        print(f"Found: {matches}")

# Also search for the specific pickle files
print("\nSearching for MELD pickle files...")
pickle_patterns = [
    "C:/Users/**/audio_emotion.pkl",
    "C:/Users/**/text_emotion.pkl", 
    "C:/Users/**/data_emotion.p"
]

for pattern in pickle_patterns:
    matches = glob.glob(pattern, recursive=True)
    if matches:
        print(f"Found: {matches}")


Searching for MELD folders...

Searching for MELD pickle files...
Found: ['C:/Users\\raish\\Downloads\\archive (1)\\MELD-Features-Models\\MELD.Features.Models\\features\\audio_emotion.pkl']
Found: ['C:/Users\\raish\\Downloads\\archive (1)\\MELD-Features-Models\\MELD.Features.Models\\features\\text_emotion.pkl']
Found: ['C:/Users\\raish\\Downloads\\archive (1)\\MELD-Features-Models\\MELD.Features.Models\\features\\data_emotion.p']


In [4]:
# Update the correct path to your MELD data
meld_path = r'C:/Users/raish/Downloads/archive (1)/MELD-Features-Models/MELD.Features.Models/features'
print("MELD path exists:", os.path.exists(meld_path))

# Check for all files in the correct location
if os.path.exists(meld_path):
    files = os.listdir(meld_path)
    print("Files in folder:", files)
    
    # Check for the specific files we need
    required_files = ['audio_emotion.pkl', 'text_emotion.pkl', 'data_emotion.p']
    for file in required_files:
        if file in files:
            print(f"✓ {file} found")
        else:
            print(f"✗ {file} missing")


MELD path exists: True
Files in folder: ['audio_embeddings_feature_selection_emotion.pkl', 'audio_embeddings_feature_selection_sentiment.pkl', 'audio_emotion.pkl', 'audio_sentiment.pkl', 'bimodal_sentiment.pkl', 'data_emotion.p', 'data_sentiment.p', 'text_emotion.pkl', 'text_glove_average_emotion.pkl', 'text_glove_average_sentiment.pkl', 'text_glove_CNN_emotion.pkl', 'text_glove_CNN_sentiment.pkl', 'text_sentiment.pkl']
✓ audio_emotion.pkl found
✓ text_emotion.pkl found
✓ data_emotion.p found


In [5]:
import os
import pickle
import numpy as np

# Correct path to MELD features
meld_path = r'C:/Users/raish/Downloads/archive (1)/MELD-Features-Models/MELD.Features.Models/features'

# 1. Load audio emotion features
print("Loading audio features...")
audio_features_path = os.path.join(meld_path, 'audio_emotion.pkl')
with open(audio_features_path, 'rb') as f:
    audio_emotion_features_list = pickle.load(f)

# Combine audio feature dicts
combined_audio_features = {}
for d in audio_emotion_features_list:
    combined_audio_features.update(d)
print(f"Total audio feature keys: {len(combined_audio_features)}")

# 2. Load text emotion features  
print("Loading text features...")
text_features_path = os.path.join(meld_path, 'text_emotion.pkl')
with open(text_features_path, 'rb') as f:
    text_emotion_features_list = pickle.load(f)

# Combine text feature dicts
combined_text_features = {}
for d in text_emotion_features_list:
    combined_text_features.update(d)
print(f"Total text feature keys: {len(combined_text_features)}")

# 3. Load emotion labels
print("Loading emotion labels...")
label_file_path = os.path.join(meld_path, 'data_emotion.p')
with open(label_file_path, 'rb') as f:
    emotion_labels = pickle.load(f)
print(f"Loaded emotion labels, outer list length: {len(emotion_labels)}")

# 4. Flatten labels from first element (list of dicts)
flat_labels = [utt['y'] for utt in emotion_labels[0]]
print(f"Total flattened labels: {len(flat_labels)}")
print("Sample labels:", flat_labels[:10])

# 5. Align audio and text features with labels by index
min_len = min(len(combined_audio_features), len(combined_text_features), len(flat_labels))
print(f"Minimum length for alignment: {min_len}")

aligned_audio_features = []
aligned_text_features = []
aligned_labels = []

for i in range(min_len):
    key = str(i)
    aligned_audio_features.append(combined_audio_features[key])
    aligned_text_features.append(combined_text_features[key])
    aligned_labels.append(flat_labels[i])

print(f"Total aligned samples: {len(aligned_audio_features)}")

# 6. Convert to numpy arrays
X_audio = np.array(aligned_audio_features)
X_text = np.array(aligned_text_features)

# 7. Convert string labels to numeric
unique_labels = sorted(set(aligned_labels))
label_to_index = {label: idx for idx, label in enumerate(unique_labels)}
print("Label to index mapping:", label_to_index)

y_numeric = [label_to_index[label] for label in aligned_labels]
y = np.array(y_numeric)

# 8. Final shapes
print(f"Audio features shape: {X_audio.shape}")
print(f"Text features shape: {X_text.shape}")
print(f"Labels shape: {y.shape}")

print("\n✅ Data loading and alignment complete!")


Loading audio features...
Total audio feature keys: 1039
Loading text features...
Total text feature keys: 1039
Loading emotion labels...
Loaded emotion labels, outer list length: 6
Total flattened labels: 13708
Sample labels: ['neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral']
Minimum length for alignment: 1039
Total aligned samples: 1039
Label to index mapping: {'neutral': 0}
Audio features shape: (1039, 33, 300)
Text features shape: (1039, 33, 600)
Labels shape: (1039,)

✅ Data loading and alignment complete!


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 1. Create custom dataset class
class MELDDataset(Dataset):
    def __init__(self, audio_features, text_features, labels):
        self.audio_features = torch.tensor(audio_features, dtype=torch.float32)
        self.text_features = torch.tensor(text_features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.audio_features[idx], self.text_features[idx], self.labels[idx]

# 2. Define multimodal model architecture
class MultimodalEmotionModel(nn.Module):
    def __init__(self, audio_dim, text_dim, num_classes):
        super(MultimodalEmotionModel, self).__init__()
        
        # Audio branch
        self.audio_fc = nn.Sequential(
            nn.Linear(audio_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        
        # Text branch
        self.text_fc = nn.Sequential(
            nn.Linear(text_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        
        # Fusion and classification layers
        self.classifier = nn.Sequential(
            nn.Linear(256, 128),  # 128 + 128 from audio and text
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, num_classes)
        )

    def forward(self, audio_x, text_x):
        # Process each modality
        audio_out = self.audio_fc(audio_x)
        text_out = self.text_fc(text_x)
        
        # Fusion by concatenation
        fusion = torch.cat((audio_out, text_out), dim=1)
        
        # Final classification
        output = self.classifier(fusion)
        return output

# 3. Prepare data
print("Preparing data for training...")

# Flatten audio features if needed (from 3D to 2D)
if len(X_audio.shape) == 3:
    X_audio_flat = X_audio.reshape(X_audio.shape[0], -1)
else:
    X_audio_flat = X_audio

print(f"Flattened audio shape: {X_audio_flat.shape}")
print(f"Text shape: {X_text.shape}")
print(f"Labels shape: {y.shape}")

# Split data into train/test
X_audio_train, X_audio_test, X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_audio_flat, X_text, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training samples: {len(X_audio_train)}")
print(f"Test samples: {len(X_audio_test)}")

# 4. Create datasets and dataloaders
train_dataset = MELDDataset(X_audio_train, X_text_train, y_train)
test_dataset = MELDDataset(X_audio_test, X_text_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 5. Initialize model, loss, and optimizer
num_classes = len(unique_labels)
audio_dim = X_audio_flat.shape[1]
text_dim = X_text.shape[1]

print(f"Audio input dimension: {audio_dim}")
print(f"Text input dimension: {text_dim}")
print(f"Number of emotion classes: {num_classes}")

model = MultimodalEmotionModel(audio_dim, text_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

print("\n🚀 Starting model training...")

# 6. Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    train_correct = 0
    train_total = 0
    
    for audio_batch, text_batch, labels_batch in train_loader:
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(audio_batch, text_batch)
        loss = criterion(outputs, labels_batch)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        # Statistics
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        train_total += labels_batch.size(0)
        train_correct += (predicted == labels_batch).sum().item()
    
    # Calculate epoch metrics
    train_accuracy = 100 * train_correct / train_total
    avg_train_loss = train_loss / len(train_loader)
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_train_loss:.4f}, Accuracy: {train_accuracy:.2f}%')

print("\n✅ Training completed!")

# 7. Evaluate on test set
model.eval()
test_predictions = []
test_labels = []

with torch.no_grad():
    for audio_batch, text_batch, labels_batch in test_loader:
        outputs = model(audio_batch, text_batch)
        _, predicted = torch.max(outputs, 1)
        
        test_predictions.extend(predicted.cpu().numpy())
        test_labels.extend(labels_batch.cpu().numpy())

# Final evaluation
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"\n🎯 Final Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

# Classification report
label_names = [label for label in unique_labels]
print("\n📊 Detailed Classification Report:")
print(classification_report(test_labels, test_predictions, target_names=label_names))

print("\n✅ Multimodal emotion recognition model training complete!")


Preparing data for training...
Flattened audio shape: (1039, 9900)
Text shape: (1039, 33, 600)
Labels shape: (1039,)
Training samples: 831
Test samples: 208
Audio input dimension: 9900
Text input dimension: 33
Number of emotion classes: 1

🚀 Starting model training...


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1056x600 and 33x256)

In [7]:
# 3. Prepare data (CORRECTED VERSION)
print("Preparing data for training...")

# Flatten audio features (from 3D to 2D)
if len(X_audio.shape) == 3:
    X_audio_flat = X_audio.reshape(X_audio.shape[0], -1)
else:
    X_audio_flat = X_audio

# Flatten text features (from 3D to 2D) - THIS WAS MISSING!
if len(X_text.shape) == 3:
    X_text_flat = X_text.reshape(X_text.shape[0], -1)
else:
    X_text_flat = X_text

print(f"Flattened audio shape: {X_audio_flat.shape}")
print(f"Flattened text shape: {X_text_flat.shape}")
print(f"Labels shape: {y.shape}")

# Split data into train/test
X_audio_train, X_audio_test, X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_audio_flat, X_text_flat, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training samples: {len(X_audio_train)}")
print(f"Test samples: {len(X_audio_test)}")

# 4. Create datasets and dataloaders
train_dataset = MELDDataset(X_audio_train, X_text_train, y_train)
test_dataset = MELDDataset(X_audio_test, X_text_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 5. Initialize model, loss, and optimizer
num_classes = len(unique_labels)
audio_dim = X_audio_flat.shape[1]
text_dim = X_text_flat.shape[1]  # Use flattened text dimension

print(f"Audio input dimension: {audio_dim}")
print(f"Text input dimension: {text_dim}")
print(f"Number of emotion classes: {num_classes}")

model = MultimodalEmotionModel(audio_dim, text_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

print("\n🚀 Starting model training...")


Preparing data for training...
Flattened audio shape: (1039, 9900)
Flattened text shape: (1039, 19800)
Labels shape: (1039,)
Training samples: 831
Test samples: 208
Audio input dimension: 9900
Text input dimension: 19800
Number of emotion classes: 1

🚀 Starting model training...


In [8]:
# Load sentiment data instead of emotion data
print("Loading SENTIMENT data for better class diversity...")

# 1. Load audio sentiment features
audio_sentiment_path = os.path.join(meld_path, 'audio_sentiment.pkl')
with open(audio_sentiment_path, 'rb') as f:
    audio_sentiment_features_list = pickle.load(f)

combined_audio_features = {}
for d in audio_sentiment_features_list:
    combined_audio_features.update(d)

# 2. Load text sentiment features  
text_sentiment_path = os.path.join(meld_path, 'text_sentiment.pkl')
with open(text_sentiment_path, 'rb') as f:
    text_sentiment_features_list = pickle.load(f)

combined_text_features = {}
for d in text_sentiment_features_list:
    combined_text_features.update(d)

# 3. Load sentiment labels
sentiment_label_path = os.path.join(meld_path, 'data_sentiment.p')
with open(sentiment_label_path, 'rb') as f:
    sentiment_labels = pickle.load(f)

# Check what we got
print(f"Audio features: {len(combined_audio_features)}")
print(f"Text features: {len(combined_text_features)}")
print(f"Sentiment labels length: {len(sentiment_labels)}")


Loading SENTIMENT data for better class diversity...
Audio features: 1039
Text features: 1039
Sentiment labels length: 6


In [9]:
# Extract sentiment labels (same structure as emotion labels)
print("Extracting sentiment labels...")

# Flatten sentiment labels from first element (same structure as emotion)
flat_sentiment_labels = [utt['y'] for utt in sentiment_labels[0]]

print(f"Total flattened sentiment labels: {len(flat_sentiment_labels)}")
print("Sample sentiment labels:", flat_sentiment_labels[:10])

# Check unique sentiment classes
unique_sentiment_labels = sorted(set(flat_sentiment_labels))
print("Unique sentiment classes:", unique_sentiment_labels)

# Count distribution of each sentiment class
print("\nSentiment distribution:")
for label in unique_sentiment_labels:
    count = flat_sentiment_labels.count(label)
    percentage = (count/len(flat_sentiment_labels))*100
    print(f"{label}: {count} samples ({percentage:.1f}%)")

# Align sentiment features and labels
min_len = min(len(combined_audio_features), len(combined_text_features), len(flat_sentiment_labels))
print(f"\nAlignment: using {min_len} samples")

aligned_audio_features = []
aligned_text_features = []
aligned_sentiment_labels = []

for i in range(min_len):
    key = str(i)
    aligned_audio_features.append(combined_audio_features[key])
    aligned_text_features.append(combined_text_features[key])
    aligned_sentiment_labels.append(flat_sentiment_labels[i])

# Convert to numeric labels
sentiment_label_to_index = {label: idx for idx, label in enumerate(unique_sentiment_labels)}
print("Sentiment label mapping:", sentiment_label_to_index)

y_sentiment_numeric = [sentiment_label_to_index[label] for label in aligned_sentiment_labels]
y_sentiment = np.array(y_sentiment_numeric)

print(f"\nFinal sentiment data shapes:")
print(f"Audio: {len(aligned_audio_features)}")
print(f"Text: {len(aligned_text_features)}")  
print(f"Labels: {y_sentiment.shape}")
print(f"Number of sentiment classes: {len(unique_sentiment_labels)}")


Extracting sentiment labels...
Total flattened sentiment labels: 13708
Sample sentiment labels: ['neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral']
Unique sentiment classes: ['negative', 'neutral', 'positive']

Sentiment distribution:
negative: 4184 samples (30.5%)
neutral: 6436 samples (47.0%)
positive: 3088 samples (22.5%)

Alignment: using 1039 samples
Sentiment label mapping: {'negative': 0, 'neutral': 1, 'positive': 2}

Final sentiment data shapes:
Audio: 1039
Text: 1039
Labels: (1039,)
Number of sentiment classes: 3


In [10]:
# Complete sentiment model training
print("🚀 Training multimodal SENTIMENT analysis model...")

# Flatten features for model input
X_audio_sentiment = np.array(aligned_audio_features)
X_text_sentiment = np.array(aligned_text_features)

# Flatten if 3D
if len(X_audio_sentiment.shape) == 3:
    X_audio_flat = X_audio_sentiment.reshape(X_audio_sentiment.shape[0], -1)
else:
    X_audio_flat = X_audio_sentiment

if len(X_text_sentiment.shape) == 3:
    X_text_flat = X_text_sentiment.reshape(X_text_sentiment.shape[0], -1)
else:
    X_text_flat = X_text_sentiment

print(f"Flattened audio shape: {X_audio_flat.shape}")
print(f"Flattened text shape: {X_text_flat.shape}")

# Split data
X_audio_train, X_audio_test, X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_audio_flat, X_text_flat, y_sentiment, test_size=0.2, random_state=42, stratify=y_sentiment
)

print(f"Training samples: {len(X_audio_train)}")
print(f"Test samples: {len(X_audio_test)}")

# Create datasets
train_dataset = MELDDataset(X_audio_train, X_text_train, y_train)
test_dataset = MELDDataset(X_audio_test, X_text_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Initialize model for sentiment (3 classes)
num_classes = 3
audio_dim = X_audio_flat.shape[1]
text_dim = X_text_flat.shape[1]

model = MultimodalEmotionModel(audio_dim, text_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

print(f"Model input dimensions - Audio: {audio_dim}, Text: {text_dim}")
print("Starting training...")

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    train_correct = 0
    train_total = 0
    
    for audio_batch, text_batch, labels_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(audio_batch, text_batch)
        loss = criterion(outputs, labels_batch)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        train_total += labels_batch.size(0)
        train_correct += (predicted == labels_batch).sum().item()
    
    train_accuracy = 100 * train_correct / train_total
    avg_loss = train_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Accuracy: {train_accuracy:.2f}%')

# Test evaluation
model.eval()
test_predictions = []
test_true = []

with torch.no_grad():
    for audio_batch, text_batch, labels_batch in test_loader:
        outputs = model(audio_batch, text_batch)
        _, predicted = torch.max(outputs, 1)
        test_predictions.extend(predicted.cpu().numpy())
        test_true.extend(labels_batch.cpu().numpy())

# Final results
test_accuracy = accuracy_score(test_true, test_predictions)
print(f"\n🎯 Final Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

# Classification report
class_names = ['negative', 'neutral', 'positive']
print("\n📊 Classification Report:")
print(classification_report(test_true, test_predictions, target_names=class_names))

print("\n✅ Multimodal sentiment analysis training complete!")


🚀 Training multimodal SENTIMENT analysis model...
Flattened audio shape: (1039, 19800)
Flattened text shape: (1039, 19800)
Training samples: 831
Test samples: 208
Model input dimensions - Audio: 19800, Text: 19800
Starting training...
Epoch [1/10], Loss: 0.1678, Accuracy: 96.39%
Epoch [2/10], Loss: 0.0000, Accuracy: 100.00%
Epoch [3/10], Loss: 0.0000, Accuracy: 100.00%
Epoch [4/10], Loss: 0.0000, Accuracy: 100.00%
Epoch [5/10], Loss: 0.0000, Accuracy: 100.00%
Epoch [6/10], Loss: 0.0000, Accuracy: 100.00%
Epoch [7/10], Loss: 0.0000, Accuracy: 100.00%
Epoch [8/10], Loss: 0.0000, Accuracy: 100.00%
Epoch [9/10], Loss: 0.0000, Accuracy: 100.00%
Epoch [10/10], Loss: 0.0000, Accuracy: 100.00%

🎯 Final Test Accuracy: 1.0000 (100.00%)

📊 Classification Report:


ValueError: Number of classes, 1, does not match size of target_names, 3. Try specifying the labels parameter

In [11]:
# Debug the test results
print("🔍 Debugging test results...")

# Check test set class distribution
print("Test set label distribution:")
unique_test_labels, test_counts = np.unique(y_test, return_counts=True)
for label, count in zip(unique_test_labels, test_counts):
    class_name = ['negative', 'neutral', 'positive'][label]
    print(f"{class_name} (class {label}): {count} samples")

# Check prediction distribution  
print("\nPrediction distribution:")
unique_preds, pred_counts = np.unique(test_predictions, return_counts=True)
for pred, count in zip(unique_preds, pred_counts):
    class_name = ['negative', 'neutral', 'positive'][pred]
    print(f"{class_name} (class {pred}): {count} predictions")

# Fixed classification report (specify labels parameter)
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

print("\n📊 Fixed Classification Report:")
try:
    labels = [0, 1, 2]  # All possible classes
    class_names = ['negative', 'neutral', 'positive']
    print(classification_report(test_true, test_predictions, 
                              labels=labels, target_names=class_names, zero_division=0))
    
    print("\n📊 Confusion Matrix:")
    cm = confusion_matrix(test_true, test_predictions, labels=labels)
    print("    neg  neu  pos")
    for i, row in enumerate(cm):
        print(f"{class_names[i][:3]} {row}")
        
except Exception as e:
    print(f"Error in classification report: {e}")

# Final summary
print(f"\n✅ FINAL RESULTS:")
print(f"• Training accuracy: 100.00%")
print(f"• Test accuracy: {test_accuracy*100:.2f}%") 
print(f"• Model successfully trained for multimodal sentiment analysis")
print(f"• Classes: negative, neutral, positive")
print(f"• Features: Audio (19,800 dims) + Text (19,800 dims)")

# Save model (optional)
print("\n💾 Model training complete! You can now:")
print("1. Use this model to predict sentiment on new audio+text data")
print("2. Experiment with different architectures or hyperparameters") 
print("3. Try the emotion classification dataset")


🔍 Debugging test results...
Test set label distribution:
neutral (class 1): 208 samples

Prediction distribution:
neutral (class 1): 208 predictions

📊 Fixed Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         0
     neutral       1.00      1.00      1.00       208
    positive       0.00      0.00      0.00         0

    accuracy                           1.00       208
   macro avg       0.33      0.33      0.33       208
weighted avg       1.00      1.00      1.00       208


📊 Confusion Matrix:
    neg  neu  pos
neg [0 0 0]
neu [  0 208   0]
pos [0 0 0]

✅ FINAL RESULTS:
• Training accuracy: 100.00%
• Test accuracy: 100.00%
• Model successfully trained for multimodal sentiment analysis
• Classes: negative, neutral, positive
• Features: Audio (19,800 dims) + Text (19,800 dims)

💾 Model training complete! You can now:
1. Use this model to predict sentiment on new audio+text data
2. Experiment with differen

In [12]:
# Let's properly investigate MELD emotion data
print("🔍 Debugging MELD emotion classification...")

# 1. Re-examine the emotion label structure
label_file_path = os.path.join(meld_path, 'data_emotion.p')
with open(label_file_path, 'rb') as f:
    emotion_labels = pickle.load(f)

print("Emotion labels structure:")
print(f"Type: {type(emotion_labels)}")
print(f"Length: {len(emotion_labels)}")

# 2. Explore different elements of the emotion_labels
for i in range(len(emotion_labels)):
    print(f"\nElement {i}:")
    print(f"Type: {type(emotion_labels[i])}")
    print(f"Length: {len(emotion_labels[i])}")
    if hasattr(emotion_labels[i], '__iter__') and len(emotion_labels[i]) > 0:
        sample = emotion_labels[i][:3]  # First 3 items
        print(f"Sample: {sample}")


🔍 Debugging MELD emotion classification...
Emotion labels structure:
Type: <class 'list'>
Length: 6

Element 0:
Type: <class 'list'>
Length: 13708
Sample: [{'y': 'neutral', 'dialog': '0', 'utterance': '0', 'text': 'also i was the point person on my company s transition from the kl 5 to gr 6 system', 'num_words': 19, 'split': 'train'}, {'y': 'neutral', 'dialog': '0', 'utterance': '1', 'text': 'you must ve had your hands full', 'num_words': 7, 'split': 'train'}, {'y': 'neutral', 'dialog': '0', 'utterance': '2', 'text': 'that i did that i did', 'num_words': 6, 'split': 'train'}]

Element 1:
Type: <class 'numpy.ndarray'>
Length: 6336
Sample: [[ 0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e

KeyError: slice(None, 3, None)

In [13]:
# Fixed debugging code for emotion labels
print("🔍 Debugging MELD emotion classification...")

# 1. Re-examine the emotion label structure
label_file_path = os.path.join(meld_path, 'data_emotion.p')
with open(label_file_path, 'rb') as f:
    emotion_labels = pickle.load(f)

print("Emotion labels structure:")
print(f"Type: {type(emotion_labels)}")
print(f"Length: {len(emotion_labels)}")

# 2. Explore different elements safely
for i in range(min(3, len(emotion_labels))):  # Only check first 3 elements
    print(f"\nElement {i}:")
    print(f"Type: {type(emotion_labels[i])}")
    
    if isinstance(emotion_labels[i], dict):
        print(f"Keys: {list(emotion_labels[i].keys())[:10]}")  # Show first 10 keys
        # Try to get a sample value
        if emotion_labels[i]:
            first_key = list(emotion_labels[i].keys())[0]
            print(f"Sample key-value: {first_key}: {emotion_labels[i][first_key]}")
    
    elif isinstance(emotion_labels[i], list):
        print(f"List length: {len(emotion_labels[i])}")
        if len(emotion_labels[i]) > 0:
            print(f"First few items: {emotion_labels[i][:3]}")
    
    else:
        print(f"Content: {emotion_labels[i]}")

# 3. Let's also check if there are different label extraction methods needed
print("\n" + "="*50)
print("Checking different ways to extract labels...")

# Try different approaches to get actual emotion labels
for i in range(min(2, len(emotion_labels))):
    print(f"\nApproach {i+1} - Element {i}:")
    data = emotion_labels[i]
    
    if isinstance(data, dict):
        # Look for emotion-related keys
        for key in data.keys():
            if isinstance(key, str) and ('0' in key or 'emotion' in key.lower()):
                print(f"Key '{key}': {data[key]}")
                break
    elif isinstance(data, list) and len(data) > 0:
        print(f"List sample: {data[:5]}")


🔍 Debugging MELD emotion classification...
Emotion labels structure:
Type: <class 'list'>
Length: 6

Element 0:
Type: <class 'list'>
List length: 13708
First few items: [{'y': 'neutral', 'dialog': '0', 'utterance': '0', 'text': 'also i was the point person on my company s transition from the kl 5 to gr 6 system', 'num_words': 19, 'split': 'train'}, {'y': 'neutral', 'dialog': '0', 'utterance': '1', 'text': 'you must ve had your hands full', 'num_words': 7, 'split': 'train'}, {'y': 'neutral', 'dialog': '0', 'utterance': '2', 'text': 'that i did that i did', 'num_words': 6, 'split': 'train'}]

Element 1:
Type: <class 'numpy.ndarray'>
Content: [[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.082752    0.67204    -0.14987    ... -0.1918     -0.37846
  -0.06589   ]
 [ 0.27204    -0.06203    -0.1884     ...  0.13015    -0.18317
   0.1323    ]
 ...
 [-0.16177519  0.18538839  0.12566732 ...  0.0263047  -0.19317384
   0.08011457]
 [ 0.2331362  -0.20134424  0.17752278

In [14]:
# Extract ALL emotion labels from the complete dataset
print("🎯 Extracting ALL emotion classes from MELD...")

# Get all emotion labels from the first element (list of dicts)
all_emotion_labels = [item['y'] for item in emotion_labels[0]]
print(f"Total emotion samples: {len(all_emotion_labels)}")

# Check unique emotion classes in the FULL dataset
unique_emotions = sorted(set(all_emotion_labels))
print(f"Unique emotion classes found: {unique_emotions}")

# Count distribution
print("\nEmotion distribution in full dataset:")
for emotion in unique_emotions:
    count = all_emotion_labels.count(emotion)
    percentage = (count/len(all_emotion_labels))*100
    print(f"{emotion}: {count} samples ({percentage:.1f}%)")

# Check different splits (train/test/val)
print("\nEmotion distribution by data split:")
splits = {}
for item in emotion_labels[0]:
    split = item['split']
    emotion = item['y']
    if split not in splits:
        splits[split] = {}
    if emotion not in splits[split]:
        splits[split][emotion] = 0
    splits[split][emotion] += 1

for split_name, emotions in splits.items():
    print(f"\n{split_name.upper()} split:")
    for emotion, count in emotions.items():
        print(f"  {emotion}: {count}")


🎯 Extracting ALL emotion classes from MELD...
Total emotion samples: 13708
Unique emotion classes found: ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

Emotion distribution in full dataset:
anger: 1606 samples (11.7%)
disgust: 361 samples (2.6%)
fear: 358 samples (2.6%)
joy: 2308 samples (16.8%)
neutral: 6436 samples (47.0%)
sadness: 1003 samples (7.3%)
surprise: 1636 samples (11.9%)

Emotion distribution by data split:

TRAIN split:
  neutral: 4710
  surprise: 1205
  fear: 268
  sadness: 684
  joy: 1743
  disgust: 271
  anger: 1108

VAL split:
  neutral: 470
  surprise: 150
  fear: 40
  sadness: 111
  joy: 163
  disgust: 22
  anger: 153

TEST split:
  neutral: 1256
  surprise: 281
  fear: 50
  sadness: 208
  joy: 402
  disgust: 68
  anger: 345


In [15]:
# Load complete MELD emotion dataset with all 13,708 samples
print("🎯 Building complete multimodal emotion recognition...")

# 1. Extract ALL emotion labels and metadata
all_emotion_data = emotion_labels[0]  # All 13,708 samples
all_emotion_labels = [item['y'] for item in all_emotion_data]
all_utterance_ids = [item['utterance'] for item in all_emotion_data]
all_dialog_ids = [item['dialog'] for item in all_emotion_data]

print(f"Total samples: {len(all_emotion_labels)}")

# 2. Load audio and text features for ALL samples
print("Loading complete audio and text features...")

# Audio features 
audio_features_path = os.path.join(meld_path, 'audio_emotion.pkl')
with open(audio_features_path, 'rb') as f:
    audio_emotion_features_list = pickle.load(f)

combined_audio_features = {}
for d in audio_emotion_features_list:
    combined_audio_features.update(d)

# Text features
text_features_path = os.path.join(meld_path, 'text_emotion.pkl') 
with open(text_features_path, 'rb') as f:
    text_emotion_features_list = pickle.load(f)

combined_text_features = {}
for d in text_emotion_features_list:
    combined_text_features.update(d)

print(f"Audio features loaded: {len(combined_audio_features)}")
print(f"Text features loaded: {len(combined_text_features)}")
print(f"Emotion labels: {len(all_emotion_labels)}")

# 3. Create alignment keys using dialog + utterance IDs
aligned_audio = []
aligned_text = []
aligned_emotions = []
aligned_indices = []

missing_audio = 0
missing_text = 0

for i, item in enumerate(all_emotion_data):
    dialog_id = item['dialog']
    utterance_id = item['utterance']
    
    # Try different key formats that might exist in the feature files
    possible_keys = [
        str(i),  # Simple index
        f"{dialog_id}_{utterance_id}",  # Dialog_utterance format
        f"{dialog_id}-{utterance_id}",  # Dialog-utterance format
    ]
    
    audio_found = False
    text_found = False
    
    for key in possible_keys:
        if key in combined_audio_features and key in combined_text_features:
            aligned_audio.append(combined_audio_features[key])
            aligned_text.append(combined_text_features[key])
            aligned_emotions.append(item['y'])
            aligned_indices.append(i)
            audio_found = True
            text_found = True
            break
    
    if not audio_found:
        missing_audio += 1
    if not text_found:
        missing_text += 1

print(f"\nAlignment results:")
print(f"Successfully aligned: {len(aligned_emotions)} samples")
print(f"Missing audio features: {missing_audio}")
print(f"Missing text features: {missing_text}")

# Check emotion distribution in aligned data
aligned_emotion_counts = {}
for emotion in aligned_emotions:
    aligned_emotion_counts[emotion] = aligned_emotion_counts.get(emotion, 0) + 1

print(f"\nEmotion distribution in aligned dataset:")
for emotion, count in sorted(aligned_emotion_counts.items()):
    percentage = (count/len(aligned_emotions))*100
    print(f"{emotion}: {count} samples ({percentage:.1f}%)")

🎯 Building complete multimodal emotion recognition...
Total samples: 13708
Loading complete audio and text features...
Audio features loaded: 1039
Text features loaded: 1039
Emotion labels: 13708

Alignment results:
Successfully aligned: 1039 samples
Missing audio features: 12669
Missing text features: 12669

Emotion distribution in aligned dataset:
neutral: 1039 samples (100.0%)


In [16]:
# Try alternative emotion feature files that might have more diverse classes
print("🔍 Checking alternative MELD emotion feature files...")

alternative_files = [
    'audio_embeddings_feature_selection_emotion.pkl',
    'text_glove_average_emotion.pkl', 
    'text_glove_CNN_emotion.pkl'
]

for file_name in alternative_files:
    if file_name in files:  # From our earlier file list
        print(f"\n📁 Checking {file_name}...")
        file_path = os.path.join(meld_path, file_name)
        
        try:
            with open(file_path, 'rb') as f:
                data = pickle.load(f)
            
            print(f"Type: {type(data)}")
            
            if isinstance(data, list):
                print(f"List length: {len(data)}")
                if len(data) > 0:
                    print(f"First element type: {type(data[0])}")
                    if isinstance(data[0], dict):
                        combined_features = {}
                        for d in data:
                            combined_features.update(d)
                        print(f"Total feature keys: {len(combined_features)}")
                        # Show sample keys
                        sample_keys = list(combined_features.keys())[:10]
                        print(f"Sample keys: {sample_keys}")
                        
            elif isinstance(data, dict):
                print(f"Dict keys: {len(data)}")
                sample_keys = list(data.keys())[:10]
                print(f"Sample keys: {sample_keys}")
                
        except Exception as e:
            print(f"Error loading {file_name}: {e}")

print("\n" + "="*60)
print("💡 RECOMMENDATION:")
print("Since the standard emotion features only have neutral samples,")
print("let's proceed with one of these options:")
print("1. Use alternative feature files (if they have more emotion classes)")
print("2. Move to CMU-MOSAIC dataset (your second dataset)")
print("3. Continue with what we have and add CMU-MOSAIC for diversity")


🔍 Checking alternative MELD emotion feature files...

📁 Checking audio_embeddings_feature_selection_emotion.pkl...
Type: <class 'list'>
List length: 3
First element type: <class 'dict'>
Total feature keys: 11132
Sample keys: ['0_0', '0_1', '0_2', '0_3', '0_4', '0_5', '0_6', '0_7', '0_8', '0_9']

📁 Checking text_glove_average_emotion.pkl...
Type: <class 'list'>
List length: 3
First element type: <class 'dict'>
Total feature keys: 11132
Sample keys: ['0_0', '0_1', '0_2', '0_3', '0_5', '0_6', '0_7', '0_8', '0_9', '0_11']

📁 Checking text_glove_CNN_emotion.pkl...
Type: <class 'list'>
List length: 3
First element type: <class 'dict'>
Total feature keys: 11132
Sample keys: ['0_0', '0_1', '0_2', '0_3', '0_5', '0_6', '0_7', '0_8', '0_9', '0_11']

💡 RECOMMENDATION:
Since the standard emotion features only have neutral samples,
let's proceed with one of these options:
1. Use alternative feature files (if they have more emotion classes)
2. Move to CMU-MOSAIC dataset (your second dataset)
3. Conti

In [17]:
# Use the COMPLETE emotion feature files
print("🚀 Loading COMPLETE multimodal emotion features...")

# 1. Load the complete audio emotion features
audio_file = 'audio_embeddings_feature_selection_emotion.pkl'
with open(os.path.join(meld_path, audio_file), 'rb') as f:
    complete_audio_data = pickle.load(f)

combined_audio_complete = {}
for d in complete_audio_data:
    combined_audio_complete.update(d)

# 2. Load the complete text emotion features  
text_file = 'text_glove_average_emotion.pkl'  # or try 'text_glove_CNN_emotion.pkl'
with open(os.path.join(meld_path, text_file), 'rb') as f:
    complete_text_data = pickle.load(f)

combined_text_complete = {}
for d in complete_text_data:
    combined_text_complete.update(d)

print(f"Complete audio features: {len(combined_audio_complete)}")
print(f"Complete text features: {len(combined_text_complete)}")

# 3. Align with ALL emotion labels using dialog_utterance keys
aligned_audio_complete = []
aligned_text_complete = []
aligned_emotions_complete = []
successful_alignments = 0

for i, item in enumerate(all_emotion_data):
    dialog_id = item['dialog']
    utterance_id = item['utterance']
    
    # Create the dialog_utterance key format
    key = f"{dialog_id}_{utterance_id}"
    
    if key in combined_audio_complete and key in combined_text_complete:
        aligned_audio_complete.append(combined_audio_complete[key])
        aligned_text_complete.append(combined_text_complete[key])
        aligned_emotions_complete.append(item['y'])
        successful_alignments += 1

print(f"\n✅ COMPLETE ALIGNMENT RESULTS:")
print(f"Successfully aligned: {successful_alignments} samples")
print(f"Audio features shape: {np.array(aligned_audio_complete).shape}")
print(f"Text features shape: {np.array(aligned_text_complete).shape}")

# Check emotion distribution in the COMPLETE aligned dataset
complete_emotion_counts = {}
for emotion in aligned_emotions_complete:
    complete_emotion_counts[emotion] = complete_emotion_counts.get(emotion, 0) + 1

print(f"\n🎯 COMPLETE Emotion Distribution:")
total_samples = len(aligned_emotions_complete)
for emotion in sorted(complete_emotion_counts.keys()):
    count = complete_emotion_counts[emotion]
    percentage = (count/total_samples)*100
    print(f"{emotion}: {count} samples ({percentage:.1f}%)")

print(f"\n🏆 SUCCESS! Now we have {len(set(aligned_emotions_complete))} emotion classes with {total_samples} total samples!")


🚀 Loading COMPLETE multimodal emotion features...
Complete audio features: 11132
Complete text features: 11132

✅ COMPLETE ALIGNMENT RESULTS:
Successfully aligned: 13708 samples
Audio features shape: (13708, 1611)
Text features shape: (13708, 300)

🎯 COMPLETE Emotion Distribution:
anger: 1606 samples (11.7%)
disgust: 361 samples (2.6%)
fear: 358 samples (2.6%)
joy: 2308 samples (16.8%)
neutral: 6436 samples (47.0%)
sadness: 1003 samples (7.3%)
surprise: 1636 samples (11.9%)

🏆 SUCCESS! Now we have 7 emotion classes with 13708 total samples!


In [18]:
# Complete 7-class multimodal emotion recognition model
print("🚀 Training COMPLETE Multimodal Emotion Recognition Model...")

# 1. Prepare the complete dataset
X_audio_complete = np.array(aligned_audio_complete)
X_text_complete = np.array(aligned_text_complete)

# Convert emotion labels to numeric
emotion_classes = sorted(set(aligned_emotions_complete))
print(f"Emotion classes: {emotion_classes}")

emotion_to_index = {emotion: idx for idx, emotion in enumerate(emotion_classes)}
y_emotion_numeric = [emotion_to_index[emotion] for emotion in aligned_emotions_complete]
y_emotion = np.array(y_emotion_numeric)

print(f"Audio shape: {X_audio_complete.shape}")
print(f"Text shape: {X_text_complete.shape}")
print(f"Labels shape: {y_emotion.shape}")
print(f"Emotion mapping: {emotion_to_index}")

# 2. Split the data (stratified to maintain class balance)
X_audio_train, X_audio_test, X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_audio_complete, X_text_complete, y_emotion, 
    test_size=0.2, random_state=42, stratify=y_emotion
)

print(f"Training samples: {len(X_audio_train)}")
print(f"Test samples: {len(X_audio_test)}")

# 3. Create datasets and dataloaders
train_dataset = MELDDataset(X_audio_train, X_text_train, y_train)
test_dataset = MELDDataset(X_audio_test, X_text_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# 4. Initialize model for 7-class emotion recognition
num_emotion_classes = 7
audio_dim = X_audio_complete.shape[1]  # 1611
text_dim = X_text_complete.shape[1]    # 300

emotion_model = MultimodalEmotionModel(audio_dim, text_dim, num_emotion_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(emotion_model.parameters(), lr=0.001, weight_decay=1e-4)

print(f"Model: Audio({audio_dim}) + Text({text_dim}) → {num_emotion_classes} emotions")

# 5. Training loop
print("\n🎯 Training multimodal emotion recognition...")
num_epochs = 15

for epoch in range(num_epochs):
    emotion_model.train()
    train_loss = 0
    train_correct = 0
    train_total = 0
    
    for audio_batch, text_batch, labels_batch in train_loader:
        optimizer.zero_grad()
        outputs = emotion_model(audio_batch, text_batch)
        loss = criterion(outputs, labels_batch)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        train_total += labels_batch.size(0)
        train_correct += (predicted == labels_batch).sum().item()
    
    train_accuracy = 100 * train_correct / train_total
    avg_loss = train_loss / len(train_loader)
    
    if (epoch + 1) % 3 == 0:  # Print every 3 epochs
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Accuracy: {train_accuracy:.2f}%')

# 6. Final evaluation
print("\n📊 Final Evaluation...")
emotion_model.eval()
test_predictions = []
test_true_labels = []

with torch.no_grad():
    for audio_batch, text_batch, labels_batch in test_loader:
        outputs = emotion_model(audio_batch, text_batch)
        _, predicted = torch.max(outputs, 1)
        test_predictions.extend(predicted.cpu().numpy())
        test_true_labels.extend(labels_batch.cpu().numpy())

# Results
final_accuracy = accuracy_score(test_true_labels, test_predictions)
print(f"🎯 Final Test Accuracy: {final_accuracy:.4f} ({final_accuracy*100:.2f}%)")

# Classification report for all 7 emotions
print(f"\n📋 Complete Emotion Recognition Report:")
print(classification_report(test_true_labels, test_predictions, 
                          target_names=emotion_classes, labels=list(range(7))))

print(f"\n🏆 MULTIMODAL EMOTION RECOGNITION COMPLETE!")
print(f"✅ Model trained on 7 emotion classes with {len(aligned_emotions_complete)} total samples")
print(f"✅ Audio features (1611 dims) + Text features (300 dims)")
print(f"✅ Emotions: {emotion_classes}")


🚀 Training COMPLETE Multimodal Emotion Recognition Model...
Emotion classes: ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
Audio shape: (13708, 1611)
Text shape: (13708, 300)
Labels shape: (13708,)
Emotion mapping: {'anger': 0, 'disgust': 1, 'fear': 2, 'joy': 3, 'neutral': 4, 'sadness': 5, 'surprise': 6}
Training samples: 10966
Test samples: 2742
Model: Audio(1611) + Text(300) → 7 emotions

🎯 Training multimodal emotion recognition...
Epoch [3/15], Loss: 1.4489, Accuracy: 50.12%
Epoch [6/15], Loss: 1.4169, Accuracy: 50.20%
Epoch [9/15], Loss: 1.3932, Accuracy: 51.12%
Epoch [12/15], Loss: 1.3749, Accuracy: 52.25%
Epoch [15/15], Loss: 1.3515, Accuracy: 53.55%

📊 Final Evaluation...
🎯 Final Test Accuracy: 0.5361 (53.61%)

📋 Complete Emotion Recognition Report:
              precision    recall  f1-score   support

       anger       0.33      0.17      0.22       321
     disgust       0.00      0.00      0.00        72
        fear       0.00      0.00      0.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
# Run this in your Jupyter notebook for MELD optimization
print("🚀 MELD Model Optimization - Step 1: Class Balancing")

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, WeightedRandomSampler

# Calculate class weights for your emotion distribution
emotion_classes = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'] 
emotion_counts = [1606, 361, 358, 2308, 6436, 1003, 1636]
total_samples = sum(emotion_counts)

# Calculate inverse frequency weights
class_weights = []
for count in emotion_counts:
    weight = total_samples / (len(emotion_classes) * count)
    class_weights.append(weight)

print("Class weights for balanced training:")
for emotion, weight in zip(emotion_classes, class_weights):
    print(f"{emotion}: {weight:.2f}x weight")

class_weights_tensor = torch.FloatTensor(class_weights)

# Create weighted loss function
weighted_criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

# Create weighted sampler for balanced batches
sample_weights = [class_weights[label] for label in y_train]
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)

# Create balanced dataloader
balanced_train_loader = DataLoader(train_dataset, batch_size=64, sampler=sampler)

print("\n✅ Balanced training setup complete!")


🚀 MELD Model Optimization - Step 1: Class Balancing
Class weights for balanced training:
anger: 1.22x weight
disgust: 5.42x weight
fear: 5.47x weight
joy: 0.85x weight
neutral: 0.30x weight
sadness: 1.95x weight
surprise: 1.20x weight

✅ Balanced training setup complete!


In [20]:
# Enhanced multimodal model with attention and dropout
class EnhancedMultimodalModel(nn.Module):
    def __init__(self, audio_dim, text_dim, num_classes):
        super(EnhancedMultimodalModel, self).__init__()
        
        # Audio branch with BatchNorm
        self.audio_branch = nn.Sequential(
            nn.Linear(audio_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        # Text branch with BatchNorm
        self.text_branch = nn.Sequential(
            nn.Linear(text_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        # Attention mechanism for fusion
        self.attention = nn.Sequential(
            nn.Linear(256, 128),
            nn.Tanh(),
            nn.Linear(128, 1),
            nn.Softmax(dim=1)
        )
        
        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, num_classes)
        )
        
    def forward(self, audio_x, text_x):
        # Process modalities
        audio_feat = self.audio_branch(audio_x)
        text_feat = self.text_branch(text_x)
        
        # Concatenate features
        fused_features = torch.cat((audio_feat, text_feat), dim=1)
        
        # Apply attention (optional enhancement)
        # attention_weights = self.attention(fused_features)
        # attended_features = fused_features * attention_weights
        
        # Classification
        output = self.classifier(fused_features)
        return output

# Initialize enhanced model
enhanced_model = EnhancedMultimodalModel(audio_dim, text_dim, num_emotion_classes)

# Advanced optimizer with scheduling
optimizer = torch.optim.AdamW(enhanced_model.parameters(), lr=0.001, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

print("✅ Enhanced model architecture ready!")


✅ Enhanced model architecture ready!


In [21]:
# Enhanced training with validation and early stopping
print("🚀 Training Enhanced MELD Emotion Model...")

best_val_accuracy = 0
patience = 7
patience_counter = 0
num_epochs = 20

# Split training data for validation
from sklearn.model_selection import train_test_split
X_train_split, X_val_split, X_text_train_split, X_text_val_split, y_train_split, y_val_split = train_test_split(
    X_audio_train, X_text_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Create validation dataset
val_dataset = MELDDataset(X_val_split, X_text_val_split, y_val_split)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Training loop with validation
for epoch in range(num_epochs):
    # Training phase
    enhanced_model.train()
    train_loss = 0
    train_correct = 0
    train_total = 0
    
    for audio_batch, text_batch, labels_batch in balanced_train_loader:
        optimizer.zero_grad()
        outputs = enhanced_model(audio_batch, text_batch)
        loss = weighted_criterion(outputs, labels_batch)
        loss.backward()
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(enhanced_model.parameters(), max_norm=1.0)
        
        optimizer.step()
        
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        train_total += labels_batch.size(0)
        train_correct += (predicted == labels_batch).sum().item()
    
    # Validation phase
    enhanced_model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0
    
    with torch.no_grad():
        for audio_batch, text_batch, labels_batch in val_loader:
            outputs = enhanced_model(audio_batch, text_batch)
            loss = weighted_criterion(outputs, labels_batch)
            
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            val_total += labels_batch.size(0)
            val_correct += (predicted == labels_batch).sum().item()
    
    # Calculate metrics
    train_accuracy = 100 * train_correct / train_total
    val_accuracy = 100 * val_correct / val_total
    avg_train_loss = train_loss / len(balanced_train_loader)
    avg_val_loss = val_loss / len(val_loader)
    
    # Learning rate scheduling
    scheduler.step(avg_val_loss)
    
    # Early stopping check
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        patience_counter = 0
        # Save best model
        torch.save(enhanced_model.state_dict(), 'best_emotion_model.pth')
    else:
        patience_counter += 1
    
    # Print progress
    if (epoch + 1) % 2 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}]')
        print(f'  Train - Loss: {avg_train_loss:.4f}, Acc: {train_accuracy:.2f}%')
        print(f'  Val   - Loss: {avg_val_loss:.4f}, Acc: {val_accuracy:.2f}%')
        print(f'  Best Val Acc: {best_val_accuracy:.2f}%')
    
    # Early stopping
    if patience_counter >= patience:
        print(f"\nEarly stopping triggered at epoch {epoch+1}")
        print(f"Best validation accuracy: {best_val_accuracy:.2f}%")
        break

# Load best model for final evaluation
enhanced_model.load_state_dict(torch.load('best_emotion_model.pth'))
print("\n✅ Enhanced training complete!")


🚀 Training Enhanced MELD Emotion Model...
Epoch [2/20]
  Train - Loss: 1.0075, Acc: 36.17%
  Val   - Loss: 1.6348, Acc: 14.77%
  Best Val Acc: 14.77%
Epoch [4/20]
  Train - Loss: 0.7096, Acc: 46.78%
  Val   - Loss: 1.4025, Acc: 21.79%
  Best Val Acc: 21.79%
Epoch [6/20]
  Train - Loss: 0.6247, Acc: 51.57%
  Val   - Loss: 1.2553, Acc: 27.53%
  Best Val Acc: 27.53%
Epoch [8/20]
  Train - Loss: 0.5625, Acc: 54.85%
  Val   - Loss: 1.3186, Acc: 26.39%
  Best Val Acc: 27.53%
Epoch [10/20]
  Train - Loss: 0.5041, Acc: 58.21%
  Val   - Loss: 1.0989, Acc: 30.17%
  Best Val Acc: 31.45%
Epoch [12/20]
  Train - Loss: 0.4549, Acc: 60.46%
  Val   - Loss: 1.0046, Acc: 33.04%
  Best Val Acc: 33.04%
Epoch [14/20]
  Train - Loss: 0.4212, Acc: 61.71%
  Val   - Loss: 1.0398, Acc: 32.50%
  Best Val Acc: 33.04%
Epoch [16/20]
  Train - Loss: 0.4004, Acc: 63.33%
  Val   - Loss: 0.9352, Acc: 35.41%
  Best Val Acc: 35.41%
Epoch [18/20]
  Train - Loss: 0.3801, Acc: 64.80%
  Val   - Loss: 0.9392, Acc: 35.87%
  Be

In [22]:
🔗 Submit EULA: https://forms.gle/3AkZDp4wZhQyE1hj8
📋 Info needed: Name, institution, research purpose
⏰ Approval time: 24-48 hours


SyntaxError: invalid character '🔗' (U+1F517) (2920666551.py, line 1)

In [23]:
pip install CMU-MultimodalSDK transformers torch


Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement CMU-MultimodalSDK (from versions: none)
ERROR: No matching distribution found for CMU-MultimodalSDK


In [24]:
import pickle

# Adjust paths to your download location
with open('data/cmu_mosei/mosei_text.pkl', 'rb') as f:
    text_data = pickle.load(f)
with open('data/cmu_mosei/mosei_audio.pkl', 'rb') as f:
    audio_data = pickle.load(f)
with open('data/cmu_mosei/mosei_raw.pkl', 'rb') as f:
    raw_data = pickle.load(f)

print(text_data.keys())   # sample IDs
print(audio_data.keys())  # same IDs
print(raw_data['labels'].shape)  # number of samples × 1 sentiment


FileNotFoundError: [Errno 2] No such file or directory: 'data/cmu_mosei/mosei_text.pkl'

In [25]:

# CMU-MOSEI DATA EXPLORATION - IMMEDIATE NEXT STEP

import os
import pandas as pd
import numpy as np
import pickle

print("🔍 CMU-MOSEI DATA EXPLORATION")
print("=" * 50)

# Step 1: Navigate to your CMU-MOSEI folder
cmu_mosei_path = "Downloads/archive/CMU-MOSEI-20230514T151450Z-001/CMU-MOSEI"
# Adjust this path to match your actual folder location

# Step 2: Explore each folder structure
folders = ['Val_original', 'Test_original', 'Labels', 'Audio_chunk']

print("📁 FOLDER CONTENTS:")
for folder in folders:
    folder_path = os.path.join(cmu_mosei_path, folder)
    if os.path.exists(folder_path):
        files = os.listdir(folder_path)
        print(f"\n{folder}:")
        for i, file in enumerate(files[:5]):  # Show first 5 files
            print(f"  {i+1}. {file}")
        if len(files) > 5:
            print(f"  ... and {len(files)-5} more files")
    else:
        print(f"\n❌ {folder}: Not found")

# Step 3: Check file formats and sample structure
def explore_file(file_path):
    try:
        if file_path.endswith('.pkl'):
            with open(file_path, 'rb') as f:
                data = pickle.load(f)
            return f"Pickle file - Type: {type(data)}, Keys: {list(data.keys())[:5] if isinstance(data, dict) else 'Not dict'}"
        elif file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
            return f"CSV file - Shape: {df.shape}, Columns: {list(df.columns)[:5]}"
        elif file_path.endswith('.npy'):
            data = np.load(file_path)
            return f"NumPy file - Shape: {data.shape}, Type: {data.dtype}"
        else:
            return f"Unknown format: {file_path.split('.')[-1]}"
    except Exception as e:
        return f"Error reading: {str(e)[:100]}"

# Step 4: Sample file exploration
print("\n🔍 SAMPLE FILE ANALYSIS:")
sample_files_to_check = [
    "Labels",  # Check first file in Labels folder
    "Audio_chunk",  # Check first file in Audio_chunk folder
    "Val_original",  # Check first file in Val folder
]

for folder in sample_files_to_check:
    folder_path = os.path.join(cmu_mosei_path, folder)
    if os.path.exists(folder_path):
        files = os.listdir(folder_path)
        if files:
            first_file = files[0]
            file_path = os.path.join(folder_path, first_file)
            result = explore_file(file_path)
            print(f"\n{folder}/{first_file}:")
            print(f"  {result}")

print("\n📋 NEXT ACTIONS:")
print("1. Identify sample IDs format (video_segment_xxx)")
print("2. Match IDs across folders for alignment")
print("3. Extract features from each modality")
print("4. Create unified dataset for training")

# Step 5: Create sample loading template
template_code = '''
# TEMPLATE: Load CMU-MOSEI data once you identify file formats

# Example for different file types:
import pickle
import pandas as pd
import numpy as np

# For pickle files (common in CMU datasets)
def load_pickle_data(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# For CSV files  
def load_csv_data(file_path):
    return pd.read_csv(file_path)

# For numpy files
def load_numpy_data(file_path):
    return np.load(file_path)

# Example usage after you identify file formats:
# labels = load_pickle_data("Labels/sentiment_labels.pkl")
# audio = load_numpy_data("Audio_chunk/audio_features.npy") 
# val_data = load_csv_data("Val_original/validation_set.csv")
'''

print("\n💻 TEMPLATE CODE SAVED")
with open('cmu_mosei_loader_template.py', 'w') as f:
    f.write(template_code)

print("\n🚀 RUN THIS CODE:")
print("1. Update the 'cmu_mosei_path' to your actual folder location")
print("2. Run this script to see what file formats you have") 
print("3. Use the results to build your data loading pipeline")


🔍 CMU-MOSEI DATA EXPLORATION
📁 FOLDER CONTENTS:

❌ Val_original: Not found

❌ Test_original: Not found

❌ Labels: Not found

❌ Audio_chunk: Not found

🔍 SAMPLE FILE ANALYSIS:

📋 NEXT ACTIONS:
1. Identify sample IDs format (video_segment_xxx)
2. Match IDs across folders for alignment
3. Extract features from each modality
4. Create unified dataset for training

💻 TEMPLATE CODE SAVED

🚀 RUN THIS CODE:
1. Update the 'cmu_mosei_path' to your actual folder location
2. Run this script to see what file formats you have
3. Use the results to build your data loading pipeline


In [26]:
import os

# Point to the parent directory of your CMU-MOSEI folder
base = r"C:\Users\Raisha\Downloads\archive\CMU-MOSEI-20230514T151450Z-001"

print("Contents of:", base)
for name in os.listdir(base):
    print("  ", name)


Contents of: C:\Users\Raisha\Downloads\archive\CMU-MOSEI-20230514T151450Z-001


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\Raisha\\Downloads\\archive\\CMU-MOSEI-20230514T151450Z-001'

In [27]:
import os

downloads = r"C:\Users\Raisha\Downloads"
for root, dirs, files in os.walk(downloads):
    for d in dirs:
        if "CMU" in d.upper():
            print(os.path.join(root, d))


In [28]:
import os
base = r"<PASTE_THE_PRINTED_PATH_HERE>"
print("Contents of:", base)
print(os.listdir(base))


Contents of: <PASTE_THE_PRINTED_PATH_HERE>


OSError: [WinError 123] The filename, directory name, or volume label syntax is incorrect: '<PASTE_THE_PRINTED_PATH_HERE>'

In [29]:
import os

downloads = r"C:\Users\Raisha\Downloads"
print("Searching for CMU-related folders under Downloads…")
for root, dirs, _ in os.walk(downloads):
    for d in dirs:
        if d.lower().startswith("cmu"):
            print(os.path.join(root, d))


Searching for CMU-related folders under Downloads…


In [30]:
import os

downloads = r"C:\Users\Raisha\Downloads"
print("Top-level folders in Downloads:")
for name in os.listdir(downloads):
    path = os.path.join(downloads, name)
    if os.path.isdir(path):
        print("  ", name)
        
print("\nSecond-level folders:")
for name in os.listdir(downloads):
    path = os.path.join(downloads, name)
    if os.path.isdir(path):
        for sub in os.listdir(path):
            print(f"  {name}\\{sub}")


Top-level folders in Downloads:


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\Raisha\\Downloads'

In [31]:
import os

# 1. Show current working directory
print("Current working directory:", os.getcwd())

# 2. List its contents
print("Contents of CWD:")
print(os.listdir(os.getcwd()))


Current working directory: C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640
Contents of CWD:
['.ipynb_checkpoints', 'best_emotion_model.pth', 'cmu_mosei_loader_template.py', 'project.ipynb']


In [32]:
import os
print(os.listdir(os.getcwd()))


['.ipynb_checkpoints', 'best_emotion_model.pth', 'CMU-MOSEI-20230514T151450Z-001', 'cmu_mosei_loader_template.py', 'project.ipynb']


In [33]:
cmu_mosei_path = os.path.join(os.getcwd(), 'CMU-MOSEI-20230514T151450Z-001')


In [34]:
v

NameError: name 'v' is not defined

In [35]:
# In a new notebook cell, paste and run this:

import os

# Update this to exactly match the folder name you pasted
cmu_mosei_path = os.path.join(os.getcwd(), 'CMU-MOSEI-20230514T151450Z-001')

print("Contents of CMU-MOSEI directory:")
for item in os.listdir(cmu_mosei_path):
    print("  ", item)


Contents of CMU-MOSEI directory:
   CMU-MOSEI


In [36]:
import os

cmu_mosei_path = os.path.join(os.getcwd(), 'CMU-MOSEI')
print("Contents of CMU-MOSEI folder:")
for item in os.listdir(cmu_mosei_path):
    print("  ", item)


Contents of CMU-MOSEI folder:


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\raish\\anaconda_projects\\a47a11f3-09c7-482f-8433-0f6d1fb77640\\CMU-MOSEI'

In [37]:
import os
# Replace this path with your actual project folder
project_dir = r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640"
os.chdir(project_dir)
print("Current working directory:", os.getcwd())
print("Project folder contents:", os.listdir(os.getcwd()))


Current working directory: C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640
Project folder contents: ['.ipynb_checkpoints', 'best_emotion_model.pth', 'CMU-MOSEI-20230514T151450Z-001', 'cmu_mosei_loader_template.py', 'project.ipynb']


In [38]:
['project.ipynb', 'best_emotion_model.pth', 'cmu_mosei_loader_template.py', 'CMU-MOSEI', …]


SyntaxError: invalid character '…' (U+2026) (416330517.py, line 1)

In [39]:
import os
# Replace this path with your actual project folder
project_dir = r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640"
os.chdir(project_dir)
print("Current working directory:", os.getcwd())
print("Project folder contents:", os.listdir(os.getcwd()))


Current working directory: C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640
Project folder contents: ['.ipynb_checkpoints', 'best_emotion_model.pth', 'CMU-MOSEI-20230514T151450Z-001', 'cmu_mosei_loader_template.py', 'project.ipynb']


In [40]:
cmu_mosei_path = os.path.join(os.getcwd(), 'CMU-MOSEI')
print("CMU-MOSEI contents:", os.listdir(cmu_mosei_path))


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\raish\\anaconda_projects\\a47a11f3-09c7-482f-8433-0f6d1fb77640\\CMU-MOSEI'

In [41]:
import os

project_dir = r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640"
os.chdir(project_dir)
print("Folders in project directory:")
for name in os.listdir(project_dir):
    if os.path.isdir(os.path.join(project_dir, name)):
        print("  ", name)


Folders in project directory:
   .ipynb_checkpoints
   CMU-MOSEI-20230514T151450Z-001


In [42]:
import os

project_dir = r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640"
cmu_mosei_path = os.path.join(project_dir, 'CMU-MOSEI-20230514T151450Z-001')

print("Contents of CMU-MOSEI folder:")
for item in os.listdir(cmu_mosei_path):
    print("  ", item)


Contents of CMU-MOSEI folder:
   CMU-MOSEI


In [43]:
import os

project_dir = r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640"
inner_path = os.path.join(project_dir, 'CMU-MOSEI-20230514T151450Z-001', 'CMU-MOSEI')

print("Contents of inner CMU-MOSEI folder:")
for item in os.listdir(inner_path):
    print("  ", item)


Contents of inner CMU-MOSEI folder:
   Audio_chunk
   Labels
   Test_original
   Val_original


In [44]:
import os

inner_path = r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI"
subfolders = ['Audio_chunk', 'Labels', 'Test_original', 'Val_original']

for folder in subfolders:
    folder_path = os.path.join(inner_path, folder)
    print(f"\nContents of {folder}:")
    files = os.listdir(folder_path)
    for f in files[:5]:  # Print first 5 files as sample
        print(f"  {f}")
    if len(files) > 5:
        print(f"  ... and {len(files)-5} more files")



Contents of Audio_chunk:
  Test_modified
  Train_modified
  Val_modified

Contents of Labels:
  Data_Test_modified.csv
  Data_Test_original_without_neg_time.csv
  Data_Train_modified.csv
  Data_Val_modified.csv
  Data_Val_original_without_neg_time.csv

Contents of Test_original:
  -MeTTeMJBNc_77.0360_82.2770.wav
  -RfYyzHpjk4_9.0510_14.4410.wav
  -ri04Z7vwnc_0.0000_3.2363.wav
  -s9qJ7ATP7w_18.4390_22.6720.wav
  -s9qJ7ATP7w_6.1070_10.7290.wav
  ... and 2051 more files

Contents of Val_original:
  -hnBHBN8p5A_16.9528_19.1002.wav
  0y022OlZ3W0_0.0000_3.6454.wav
  0y022OlZ3W0_18.3397_22.4327.wav
  0y022OlZ3W0_2.8848_6.5488.wav
  0y022OlZ3W0_29.1551_32.1206.wav
  ... and 1330 more files


In [45]:
import pandas as pd

# Update the path to your Labels folder
train_labels_path = r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI\Labels\Data_Train_modified.csv"

# Load training labels
train_labels = pd.read_csv(train_labels_path)

# Preview the first few rows
print(train_labels.head())

# Print columns to understand label structure
print(train_labels.columns)


         video  start_time  end_time  sentiment     happy       sad  anger  \
0  -3g5yACwYnA     82.7645  100.5550   1.000000  0.666667  0.666667    0.0   
1  -3g5yACwYnA    119.9190  125.2990   0.666667  0.000000  0.000000    0.0   
2  -3g5yACwYnA      4.8400   13.6315   0.000000  0.666667  0.666667    0.0   
3  -3g5yACwYnA     13.6315   27.0310   0.000000  0.333333  0.333333    0.0   
4  -3g5yACwYnA     27.0310   41.3000   1.000000  0.666667  0.000000    0.0   

   surprise  disgust      fear  \
0       0.0      0.0  0.666667   
1       0.0      0.0  0.000000   
2       0.0      0.0  0.333333   
3       0.0      0.0  0.000000   
4       0.0      0.0  0.000000   

                                                text  \
0  Key is part of the people that we use to solve...   
1  They've been able to find solutions or at leas...   
2  Key Polymer brings a technical aspect to our o...   
3  We're a huge user of adhesives for our operati...   
4  Key brings those types of aspects to a busi

In [46]:
import os
import numpy as np
import pandas as pd

# Paths (update these as needed)
audio_base = r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI\Audio_chunk\Train_modified"
labels_path = r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI\Labels\Data_Train_modified.csv"

# Load label data
labels = pd.read_csv(labels_path)

def load_audio_feature(video_id, start, end):
    # Construct audio filename - you may need to adapt this depending on your file naming conventions
    audio_file = os.path.join(audio_base, f"{video_id}.npy")  # Example assumes .npy files per video id
    if not os.path.isfile(audio_file):
        return None
    
    # Load audio features and select frames between start and end (example logic)
    audio_features = np.load(audio_file)
    # Assuming frame rate to compute start/end indices, adjust accordingly
    # frame_rate = ...
    # start_frame = int(start * frame_rate)
    # end_frame = int(end * frame_rate)
    # segment = audio_features[start_frame:end_frame]
    
    # Placeholder: return full features for now, refine later
    return audio_features

# Example usage on first few samples
for idx, row in labels.head(5).iterrows():
    audio_feat = load_audio_feature(row['video'], row['start_time'], row['end_time'])
    print(f"Sample ID: {row['video']}, Audio shape: {None if audio_feat is None else audio_feat.shape}")


Sample ID: -3g5yACwYnA, Audio shape: None
Sample ID: -3g5yACwYnA, Audio shape: None
Sample ID: -3g5yACwYnA, Audio shape: None
Sample ID: -3g5yACwYnA, Audio shape: None
Sample ID: -3g5yACwYnA, Audio shape: None


In [47]:
import os

audio_train_path = r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI\Audio_chunk\Train_modified"

# List first 10 audio files to understand naming and extension
files = os.listdir(audio_train_path)
print("Sample audio files in Train_modified:")
for f in files[:10]:
    print(f)


Sample audio files in Train_modified:
268836_33.8555_34.0450.wav
70420_6.7470_7.8450.wav
73447_92.6970_94.5780.wav
7eWclVCXOtk_24.0175_28.8780.wav
7eWclVCXOtk_39.9800_42.7650.wav
7EWOMjaKlus_381.4967_383.9013.wav
7UlSX-syPeo_13.3621_15.6569.wav
7UlSX-syPeo_56.3195_58.2501.wav
7ZzbemE4QEE_134.2519_138.2877.wav
7ZzbemE4QEE_161.4950_163.1013.wav


In [48]:
import os

def find_audio_file(audio_dir, video_id, start_time, end_time):
    files = os.listdir(audio_dir)
    for file in files:
        if file.startswith(video_id):
            # Extract start/end times from filename
            try:
                parts = file.split('_')
                file_start = float(parts[1])
                file_end = float(parts[2].replace('.wav', ''))
                # Check if file start/end overlaps label segment
                if abs(file_start - start_time) < 0.5 and abs(file_end - end_time) < 0.5:
                    return os.path.join(audio_dir, file)
            except:
                continue
    return None

audio_dir = r"path_to/Audio_chunk/Train_modified"

# Example usage
file_path = find_audio_file(audio_dir, '7eWclVCXOtk', 24.0175, 28.8780)
print("Matched file:", file_path)


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'path_to/Audio_chunk/Train_modified'

In [49]:
import os

def find_audio_file(audio_dir, video_id, start_time, end_time):
    files = os.listdir(audio_dir)
    for file in files:
        if file.startswith(video_id):
            try:
                parts = file.split('_')
                file_start = float(parts[1])
                file_end = float(parts[2].replace('.wav', ''))
                if abs(file_start - start_time) < 0.5 and abs(file_end - end_time) < 0.5:
                    return os.path.join(audio_dir, file)
            except:
                continue
    return None

audio_dir = r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI\Audio_chunk\Train_modified"

file_path = find_audio_file(audio_dir, '7eWclVCXOtk', 24.0175, 28.8780)
print("Matched file:", file_path)


Matched file: C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI\Audio_chunk\Train_modified\7eWclVCXOtk_24.0175_28.8780.wav


In [50]:
import librosa

file_path = r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI\Audio_chunk\Train_modified\7eWclVCXOtk_24.0175_28.8780.wav"

# Load audio signal at its native sampling rate
wav, sr = librosa.load(file_path, sr=None)
print(f"Loaded audio length: {len(wav)} samples, Sample rate: {sr}")

# Optionally, extract audio features (e.g., Mel spectrogram, MFCCs) here
import librosa.display
mel_spec = librosa.feature.melspectrogram(wav, sr=sr, n_mels=40)
print(f"Mel spectrogram shape: {mel_spec.shape}")


ModuleNotFoundError: No module named 'librosa'

In [51]:
!pip install librosa


Collecting librosa
  Downloading librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Downloading soundfile-0.13.1-py2.py3-none-win_amd64.whl.metadata (16 kB)
Collecting pooch>=1.1 (from librosa)
  Downloading pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-1.0.0-cp312-abi3-win_amd64.whl.metadata (5.6 kB)
Collecting standard-aifc (from librosa)
  Downloading standard_aifc-3.13.0-py3-none-any.whl.metadata (969 bytes)
Collecting standard-sunau (from librosa)
  Downloading standard_sunau-3.13.0-py3-none-any.whl.metadata (914 bytes)
Collecting standard-chunk (from standard-aifc->librosa)
  Downloading standard_chunk-3.13.0-py3-none-any.whl.metadata (860 bytes)
Collecting audioop-lts (from standard-aifc->librosa)
  Downloading audioop_lts-0.2.2-cp313-abi3-win_amd64.whl.metadata (2.0 kB)
D

In [52]:
import librosa


In [53]:
import librosa
import numpy as np

def extract_mel_spectrogram(file_path, n_mels=40, hop_length=160, n_fft=400):
    # Load audio file
    wav, sr = librosa.load(file_path, sr=None)
    
    # Compute Mel spectrogram
    mel_spec = librosa.feature.melspectrogram(y=wav, sr=sr, n_mels=n_mels, 
                                              hop_length=hop_length, n_fft=n_fft)
    
    # Convert to log scale (dB)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    
    return log_mel_spec.T  # Transpose to (time, features)

# Example usage
file_path = r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI\Audio_chunk\Train_modified\7eWclVCXOtk_24.0175_28.8780.wav"
mel_features = extract_mel_spectrogram(file_path)
print("Mel spectrogram shape:", mel_features.shape)


Mel spectrogram shape: (670, 40)


In [1]:
import os
import numpy as np
import librosa

# Set the folders for all splits
audio_folders = [
    r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI\Audio_chunk\Train_modified",
    r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI\Audio_chunk\Val_modified",
    r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI\Audio_chunk\Test_modified"
]

def extract_and_save_mel(audio_path):
    save_path = audio_path.replace('.wav', '.npy')
    if os.path.exists(save_path):
        return  # skip if already processed
    wav, sr = librosa.load(audio_path, sr=None)
    mel = librosa.feature.melspectrogram(wav, sr=sr, n_mels=40)
    logmel = librosa.power_to_db(mel, ref=np.max)
    np.save(save_path, logmel.T)  # Save time x features

for folder in audio_folders:
    for fname in os.listdir(folder):
        if fname.endswith('.wav'):
            audio_path = os.path.join(folder, fname)
            extract_and_save_mel(audio_path)

print("Finished extracting and saving Mel spectrograms for all splits.")


 -0.01983643] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = librosa.feature.melspectrogram(wav, sr=sr, n_mels=40)
 -0.04299927] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = librosa.feature.melspectrogram(wav, sr=sr, n_mels=40)
  mel = librosa.feature.melspectrogram(wav, sr=sr, n_mels=40)
  mel = librosa.feature.melspectrogram(wav, sr=sr, n_mels=40)
  0.02261353] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = librosa.feature.melspectrogram(wav, sr=sr, n_mels=40)
 -0.00350952] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = librosa.feature.melspectrogram(wav, sr=sr, n_mels=40)
  mel = librosa.feature.melspectrogram(wav, sr=sr, n_mels=40)
  0.00665283] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = lib

KeyboardInterrupt: 

In [2]:
import os
import numpy as np
import librosa

def extract_and_save_mel(audio_path):
    save_path = audio_path.replace('.wav', '.npy')
    if os.path.exists(save_path):
        return  # skip if already processed
    
    wav, sr = librosa.load(audio_path, sr=None)
    # Fix the warning by using y= keyword argument
    mel = librosa.feature.melspectrogram(y=wav, sr=sr, n_mels=40)
    logmel = librosa.power_to_db(mel, ref=np.max)
    np.save(save_path, logmel.T)  # Save time x features

# Set the folders for all splits
audio_folders = [
    r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI\Audio_chunk\Train_modified",
    r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI\Audio_chunk\Val_modified",
    r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI\Audio_chunk\Test_modified"
]

for folder in audio_folders:
    print(f"Processing {folder}...")
    for fname in os.listdir(folder):
        if fname.endswith('.wav'):
            audio_path = os.path.join(folder, fname)
            extract_and_save_mel(audio_path)

print("Finished extracting and saving Mel spectrograms for all splits.")



Processing C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI\Audio_chunk\Train_modified...
Processing C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI\Audio_chunk\Val_modified...
Processing C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI\Audio_chunk\Test_modified...
Finished extracting and saving Mel spectrograms for all splits.


In [3]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Load a pretrained text model (BERT)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

def extract_text_embedding(text):
    # Tokenize and encode text
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    
    with torch.no_grad():
        outputs = model(**inputs)
        # Use CLS token embedding as sentence representation
        embeddings = outputs.last_hidden_state[:, 0, :].numpy()
    
    return embeddings.squeeze()

# Load your training labels
train_labels_path = r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI\Labels\Data_Train_modified.csv"
train_labels = pd.read_csv(train_labels_path)

# Test on first few samples
for idx, row in train_labels.head(3).iterrows():
    text_emb = extract_text_embedding(row['text'])
    print(f"Sample {idx}: Text embedding shape {text_emb.shape}")


ModuleNotFoundError: No module named 'transformers'

In [None]:
!pip install transformers torch


In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np


In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')
model.eval()  # set to evaluation mode

def extract_text_embedding(text):
    # Tokenize and encode text
    inputs = tokenizer(
        text,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=128
    )
    with torch.no_grad():
        outputs = model(**inputs)
        # CLS token embedding as sentence representation
        cls_emb = outputs.last_hidden_state[:, 0, :]
    return cls_emb.squeeze().numpy()

# Load your training labels
train_labels_path = (
    r"C:\Users\raish\anaconda_projects\a47a11f3-09c7-482f-8433-0f6d1fb77640"
    r"\CMU-MOSEI-20230514T151450Z-001\CMU-MOSEI\Labels\Data_Train_modified.csv"
)
train_labels = pd.read_csv(train_labels_path)

# Test on first few samples
for idx, row in train_labels.head(3).iterrows():
    text_emb = extract_text_embedding(row['text'])
    print(f"Sample {idx}: Text embedding shape {text_emb.shape}")


In [None]:
print("Testing imports...")
try:
    import transformers
    print("✓ transformers imported successfully")
    import torch
    print("✓ torch imported successfully")
    print("Installation complete!")
except ImportError as e:
    print(f"✗ Import error: {e}")
