In [5]:
import torch
import h5py
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer

# --- You can run this code now ---

print("--- Phase 0: Establishing the Data Foundation ---")

# 1. Configuration: on-disk locations, split fractions and batch size
H5_FILE_PATH = "/home/poorna/data/dataset.h5"
LOCAL_MODEL_PATH = "/home/poorna/models/bert-base-uncased"
TRAIN_PCT = 0.8
VAL_PCT = 0.1  # whatever remains after the train and val splits becomes the test split
BATCH_SIZE = 32

# 2. Load the tokenizer; its pad token ID is the value used to pad text batches
print(f"Loading tokenizer from: {LOCAL_MODEL_PATH}")
tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_PATH)
PAD_ID = tokenizer.pad_token_id
print(f"Tokenizer loaded. PAD_ID set to: {PAD_ID}")

# 3. Define the custom PyTorch Dataset
class EEGMetaTextH5Dataset(Dataset):
    """
    Loads EEG, Metadata, and Text (input_ids) samples from an HDF5 file.

    The file must expose three datasets under the keys ``'eeg'``, ``'metadata'``
    and ``'input_ids'``; sample ``i`` is row ``i`` of each of them.

    The HDF5 handle is opened lazily on the first ``__getitem__`` call. The
    handle is dropped when the dataset is pickled, so a dataset that has already
    been read from can still be handed to spawn-based DataLoader workers; each
    copy then opens its own handle on first access.

    Args:
        h5_path: Path of the HDF5 file to read.
    """
    def __init__(self, h5_path):
        self.h5_path = h5_path
        self.h5_file = None
        # Read the sample count once with a short-lived handle that is closed immediately.
        with h5py.File(self.h5_path, 'r') as f:
            self.n_samples = f['eeg'].shape[0]

    def __len__(self):
        """Return the number of samples (first-axis length of the 'eeg' dataset)."""
        return self.n_samples

    def __getitem__(self, idx):
        """
        Return sample ``idx`` as ``(eeg, meta, text)`` tensors.

        ``eeg`` and ``meta`` are converted to float32, ``text`` to int64.
        """
        # Open the handle on first use rather than in __init__ so that every
        # process that reads samples ends up with a handle of its own.
        if self.h5_file is None:
            self.h5_file = h5py.File(self.h5_path, 'r')

        # Load data using the confirmed keys
        eeg = torch.from_numpy(self.h5_file['eeg'][idx].astype('float32'))
        meta = torch.from_numpy(self.h5_file['metadata'][idx].astype('float32'))
        text = torch.from_numpy(self.h5_file['input_ids'][idx].astype('int64'))

        return eeg, meta, text

    def close(self):
        """Close the lazily opened HDF5 handle, if any. Safe to call repeatedly."""
        handle = getattr(self, 'h5_file', None)
        self.h5_file = None
        if handle is not None:
            handle.close()

    def __getstate__(self):
        """Pickle everything except the open file handle (it is reopened lazily)."""
        state = self.__dict__.copy()
        state['h5_file'] = None
        return state

    def __del__(self):
        # Best-effort cleanup: a destructor must never raise, and at interpreter
        # shutdown the underlying library may already be partially torn down.
        try:
            self.close()
        except Exception:
            pass

# 4. Define the custom Collate Function for batching
def collate_multimodal_batch(batch, pad_id=None):
    """
    Collate ``(eeg, meta, text)`` samples into one batch.

    EEG and metadata tensors are stacked along a new leading batch axis, so they
    must have identical shapes across samples. Text tensors may differ in length
    and are right-padded to the longest sequence in the batch.

    Args:
        batch: Non-empty sequence of ``(eeg, meta, text)`` tensor triples.
        pad_id: Value used to pad the text sequences. When omitted, the
            notebook-level ``PAD_ID`` is used, which is what happens when a
            DataLoader calls this function with the batch as its only argument.

    Returns:
        Tuple ``(eeg_batch, meta_batch, text_padded)``; the first two are cast
        to float32.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    if len(batch) == 0:
        raise ValueError("collate_multimodal_batch received an empty batch")
    if pad_id is None:
        pad_id = PAD_ID

    # Transpose the list of triples into three parallel tuples.
    eeg_items, meta_items, text_items = zip(*batch)

    # EEG and Metadata are fixed-size, so we can stack them
    eeg_batch = torch.stack(eeg_items, dim=0)
    meta_batch = torch.stack(meta_items, dim=0)

    # Text (input_ids) is variable-length and needs padding
    text_padded = pad_sequence(list(text_items), batch_first=True, padding_value=pad_id)

    return eeg_batch.float(), meta_batch.float(), text_padded

# 5. Build the dataset, carve out the three splits, and wrap each one in a DataLoader
print("\nCreating Dataset and splitting into train/val/test sets...")
dataset = EEGMetaTextH5Dataset(H5_FILE_PATH)

# Train and val sizes are floored; the test split absorbs the remainder so the
# three sizes always add up to N.
N = len(dataset)
n_train, n_val = int(N * TRAIN_PCT), int(N * VAL_PCT)
n_test = N - (n_train + n_val)

# A dedicated, seeded generator makes the split reproducible.
g = torch.Generator().manual_seed(42)
split_lengths = [n_train, n_val, n_test]
train_ds, val_ds, test_ds = random_split(dataset, split_lengths, generator=g)
print(f"Split sizes -> Train: {len(train_ds)}, Val: {len(val_ds)}, Test: {len(test_ds)}")

print("Creating DataLoaders...")
# Only the training loader shuffles; everything else is shared between the three loaders.
loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=collate_multimodal_batch)
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_ds, shuffle=False, **loader_kwargs)
print("DataLoaders created successfully.")

# 6. Sanity check: pull a single training batch and report the tensor shapes
print("\n--- Verification: Inspecting one batch from train_loader ---")
first_batch = next(iter(train_loader))
eeg_b, meta_b, text_b = first_batch
print("EEG batch shape:", eeg_b.shape)
print("Metadata batch shape:", meta_b.shape)
print("Text batch shape:", text_b.shape)

--- Phase 0: Establishing the Data Foundation ---
Loading tokenizer from: /home/poorna/models/bert-base-uncased
Tokenizer loaded. PAD_ID set to: 0

Creating Dataset and splitting into train/val/test sets...
Split sizes -> Train: 22400, Val: 2800, Test: 2800
Creating DataLoaders...
DataLoaders created successfully.

--- Verification: Inspecting one batch from train_loader ---
EEG batch shape: torch.Size([32, 62, 400])
Metadata batch shape: torch.Size([32, 3])
Text batch shape: torch.Size([32, 64])


In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# --- You can run this code now ---

# Banner marking the start of the model-definition cell.
print("--- Phase 1: Defining the Multimodal Model Architecture ---")

# Every model component (attention, metadata encoder, EEG encoder, decoder and the
# Seq2Seq wrapper) is defined in this one cell, so re-running the cell redefines
# the whole architecture in one go.

# Component 1: The Attention Mechanism
class LuongAttention(nn.Module):
    """
    Luong "general" attention: the encoder states are passed through a learned
    linear map and scored against the decoder state with a dot product.
    """
    def __init__(self, enc_dim, dec_dim):
        super().__init__()
        # Projects encoder features into the decoder's hidden space for scoring.
        self.attn = nn.Linear(enc_dim, dec_dim)

    def forward(self, decoder_hidden, encoder_outputs):
        """
        Args:
            decoder_hidden: ``[1, B, dec_dim]`` query state.
            encoder_outputs: ``[T_enc, B, enc_dim]`` encoder states.

        Returns:
            ``(context, weights)`` with shapes ``[B, 1, enc_dim]`` and ``[B, T_enc]``.
        """
        # Work batch-first so torch.bmm can be applied per sample.
        projected_keys = self.attn(encoder_outputs).transpose(0, 1)   # [B, T_enc, dec_dim]
        values = encoder_outputs.transpose(0, 1)                      # [B, T_enc, enc_dim]
        query = decoder_hidden.permute(1, 2, 0)                       # [B, dec_dim, 1]

        energies = torch.bmm(projected_keys, query)                   # [B, T_enc, 1]
        # Normalise over the encoder time axis.
        attn_weights = torch.softmax(energies, dim=1)

        context = torch.bmm(attn_weights.transpose(1, 2), values)     # [B, 1, enc_dim]
        return context, attn_weights.squeeze(-1)

# Component 2: The new MetadataEncoder
class MetadataEncoder(nn.Module):
    """
    Turns a ``[color_id, category_id, motion_value]`` metadata row into one
    feature vector: two learned embeddings followed by the raw motion value.
    """
    def __init__(self, num_colors, num_categories, color_emb_dim=16, category_emb_dim=32):
        super().__init__()
        self.color_embedding = nn.Embedding(num_colors, color_emb_dim)
        self.category_embedding = nn.Embedding(num_categories, category_emb_dim)
        # Width of the vector produced by forward(): both embeddings plus one
        # extra slot for the motion scalar.
        self.output_dim = color_emb_dim + category_emb_dim + 1

    def forward(self, metadata):
        """
        Args:
            metadata: ``[B, 3]`` tensor whose columns are color ID, category ID
                and motion value.

        Returns:
            ``[B, output_dim]`` tensor.
        """
        color_col, category_col, motion_col = metadata[:, 0], metadata[:, 1], metadata[:, 2]

        pieces = (
            # The IDs arrive inside a float tensor; embeddings need integer indices.
            self.color_embedding(color_col.long()),
            self.category_embedding(category_col.long()),
            # The motion value stays a float and just gains a feature axis.
            motion_col.unsqueeze(1),
        )
        return torch.cat(pieces, dim=1)

# Component 3: The EEGEncoder (unchanged)
class EEGEncoder(nn.Module):
    """
    Convolutional front-end followed by a bidirectional GRU.

    Two Conv1d -> BatchNorm1d -> ReLU -> Dropout stages (kernel 5, stride 1,
    padding 2, so the time length is preserved) feed a multi-layer
    bidirectional GRU that runs over the time axis.
    """
    def __init__(self, in_channels=62, conv_dim=128, enc_hidden=256, num_layers=2, dropout=0.2):
        super().__init__()
        # Width of the first conv stage is fixed; only the second stage follows conv_dim.
        stem_channels = 128

        stages = []
        for c_in, c_out in ((in_channels, stem_channels), (stem_channels, conv_dim)):
            stages.extend([
                nn.Conv1d(c_in, c_out, kernel_size=5, stride=1, padding=2),
                nn.BatchNorm1d(c_out),
                nn.ReLU(inplace=True),
                nn.Dropout(dropout),
            ])
        self.conv = nn.Sequential(*stages)

        # nn.GRU only applies dropout between stacked layers, so it is disabled
        # for a single-layer encoder.
        rnn_dropout = dropout if num_layers > 1 else 0
        self.rnn = nn.GRU(
            input_size=conv_dim,
            hidden_size=enc_hidden,
            num_layers=num_layers,
            bidirectional=True,
            dropout=rnn_dropout,
        )

    def forward(self, eeg):
        """
        Args:
            eeg: ``[B, in_channels, T]`` signal batch.

        Returns:
            The GRU's ``(outputs, hidden)`` pair: ``[T, B, 2 * enc_hidden]`` and
            ``[2 * num_layers, B, enc_hidden]``.
        """
        features = self.conv(eeg)
        # Conv output is [B, conv_dim, T]; the GRU is time-major: [T, B, conv_dim].
        time_major = features.permute(2, 0, 1)
        return self.rnn(time_major)

# Component 4: The Decoder
# (single-step GRU decoder with attention over the EEG encoder outputs; its initial
# hidden state comes from a bridge layer over the fused EEG + metadata features)

class Decoder(nn.Module):
    """
    One-step-at-a-time GRU decoder with Luong attention over the encoder outputs.

    ``init_hidden`` maps the fused encoder/metadata features to the initial
    hidden state; ``forward`` consumes one token per call.
    """
    def __init__(self, vocab_size, emb_dim, enc_hidden, dec_hidden, meta_features_dim, num_layers, pad_id, dropout):
        super().__init__()

        # Exposed so that callers can size their logit buffers.
        self.vocab_size = vocab_size

        # Encoder features are twice enc_hidden wide (forward + backward direction).
        enc_dim = 2 * enc_hidden
        rnn_dropout = dropout if num_layers > 1 else 0

        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_id)
        self.attention = LuongAttention(enc_dim, dec_hidden)
        # Each step feeds the token embedding concatenated with the attention context.
        self.rnn = nn.GRU(emb_dim + enc_dim, dec_hidden, num_layers, dropout=rnn_dropout)
        self.fc_out = nn.Linear(dec_hidden, vocab_size)
        self.dropout = nn.Dropout(dropout)
        # Maps [encoder summary ; metadata features] to the decoder's hidden size.
        self.bridge = nn.Linear(enc_dim + meta_features_dim, dec_hidden)

    def init_hidden(self, combined_features):
        """Project the fused features through the bridge and squash with tanh."""
        projected = self.bridge(combined_features)
        return torch.tanh(projected)

    def forward(self, token, decoder_hidden, encoder_outputs):
        """
        Run a single decoding step.

        Args:
            token: ``[B]`` IDs of the previous tokens.
            decoder_hidden: ``[num_layers, B, dec_hidden]`` current hidden state.
            encoder_outputs: ``[T_enc, B, enc_dim]`` encoder states to attend over.

        Returns:
            ``(prediction, hidden)``: vocabulary logits ``[B, vocab_size]`` and
            the updated hidden state.
        """
        step_tokens = token.unsqueeze(0)                                  # [1, B]
        embedded = self.dropout(self.embedding(step_tokens))              # [1, B, emb_dim]

        # Attention is queried with the top layer's hidden state only.
        query = decoder_hidden[-1].unsqueeze(0)
        context, _ = self.attention(query, encoder_outputs)               # [B, 1, enc_dim]

        rnn_input = torch.cat((embedded, context.permute(1, 0, 2)), dim=2)
        output, hidden = self.rnn(rnn_input, decoder_hidden)

        return self.fc_out(output.squeeze(0)), hidden

# Component 5: The main Seq2Seq class tying it all together
class Seq2Seq(nn.Module):
    """
    EEG + metadata -> text model.

    An ``EEGEncoder`` summarises the signal, a ``MetadataEncoder`` embeds the
    structured metadata, and a ``Decoder`` (initialised from both) emits the
    token sequence one step at a time.
    """
    def __init__(self, text_vocab_size, num_colors, num_categories, enc_hidden=256, dec_hidden=256, 
                 pad_id=0, dropout=0.2, color_emb_dim=16, category_emb_dim=32):
        super().__init__()
        self.encoder = EEGEncoder(enc_hidden=enc_hidden, dropout=dropout)
        self.meta_encoder = MetadataEncoder(num_colors, num_categories, color_emb_dim, category_emb_dim)

        # Token embedding width (256) and decoder depth (2) are fixed here.
        self.decoder = Decoder(
            vocab_size=text_vocab_size,
            emb_dim=256,
            enc_hidden=enc_hidden,
            dec_hidden=dec_hidden,
            meta_features_dim=self.meta_encoder.output_dim,
            num_layers=2,
            pad_id=pad_id,
            dropout=dropout,
        )

    def forward(self, eeg, metadata, target_text, teacher_forcing_ratio=0.5):
        """
        Args:
            eeg: EEG batch passed to the encoder.
            metadata: Metadata batch passed to the metadata encoder.
            target_text: ``[B, T_text]`` token IDs; column 0 seeds the decoder.
            teacher_forcing_ratio: Per-step probability of feeding the ground-truth
                token instead of the model's own argmax prediction.

        Returns:
            Logits of shape ``[B, T_text - 1, vocab_size]`` for positions 1..T_text-1.
        """
        encoder_outputs, encoder_hidden = self.encoder(eeg)
        meta_features = self.meta_encoder(metadata)

        # Even entries of the bidirectional hidden state are the forward direction,
        # odd entries the backward one; join them per layer along the feature axis.
        per_layer_summary = torch.cat([encoder_hidden[0::2], encoder_hidden[1::2]], dim=2)

        # Every decoder layer receives the same metadata vector.
        layer_count = self.decoder.rnn.num_layers
        meta_per_layer = meta_features.unsqueeze(0).repeat(layer_count, 1, 1)

        fused = torch.cat([per_layer_summary, meta_per_layer], dim=2)
        decoder_hidden = self.decoder.init_hidden(fused)

        batch_size, target_len = eeg.shape[0], target_text.shape[1]
        decoder_input = target_text[:, 0]

        # One row per predicted position (the seed token itself is never predicted).
        step_logits = torch.zeros(
            target_len - 1, batch_size, self.decoder.vocab_size, device=eeg.device
        )

        for step in range(target_len - 1):
            logits, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            step_logits[step] = logits

            use_teacher = torch.rand(1).item() < teacher_forcing_ratio
            greedy_choice = logits.argmax(1)
            decoder_input = target_text[:, step + 1] if use_teacher else greedy_choice

        # Time-major -> batch-major.
        return step_logits.permute(1, 0, 2)

# Confirmation banner: these lines run only after every class statement in this
# cell has executed without raising.
print("\n--- Phase 1: Model Architecture Defined Successfully ---")
print("All classes (LuongAttention, MetadataEncoder, EEGEncoder, Decoder, Seq2Seq) are now defined.")

--- Phase 1: Defining the Multimodal Model Architecture ---

--- Phase 1: Model Architecture Defined Successfully ---
All classes (LuongAttention, MetadataEncoder, EEGEncoder, Decoder, Seq2Seq) are now defined.


In [10]:
import torch

# --- You can run this corrected code now ---

print("--- Phase 2: Verifying the Complete Architecture ---")

# 1. Define Model Parameters
# The vocabulary size is taken from the tokenizer loaded in Phase 0 (the inference
# cell derives it the same way), so the output layer always matches the token IDs.
TEXT_VOCAB_SIZE = tokenizer.vocab_size
NUM_COLORS = 77       # number of rows in the color-ID embedding table
NUM_CATEGORIES = 53   # number of rows in the category-ID embedding table

# 2. Instantiate the Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Seq2Seq(
    text_vocab_size=TEXT_VOCAB_SIZE,
    num_colors=NUM_COLORS,
    num_categories=NUM_CATEGORIES,
    pad_id=PAD_ID
).to(device)
print(f"New dual-input model instantiated on '{device}'.")
print(f"Total parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

# 3. Perform the Forward Pass
eeg_b, meta_b, text_b = next(iter(train_loader))
eeg_b, meta_b, text_b = eeg_b.to(device), meta_b.to(device), text_b.to(device)

# An out-of-range ID only surfaces as an opaque indexing error inside the embedding
# lookup, so check the batch against the table sizes up front.
assert 0 <= meta_b[:, 0].min() and meta_b[:, 0].max() < NUM_COLORS, \
    "color ID outside the range covered by NUM_COLORS"
assert 0 <= meta_b[:, 1].min() and meta_b[:, 1].max() < NUM_CATEGORIES, \
    "category ID outside the range covered by NUM_CATEGORIES"

print("\n--- Performing Smoke Test Forward Pass ---")
# Run in eval mode: in train mode BatchNorm would fold this batch into its running
# statistics even under no_grad, so the smoke test would alter the untrained model.
model.eval()
with torch.no_grad():
    logits = model(eeg_b, meta_b, text_b)
model.train()

# One prediction per target position except the first (seed) token.
expected_shape = (text_b.shape[0], text_b.shape[1] - 1, TEXT_VOCAB_SIZE)
assert tuple(logits.shape) == expected_shape, \
    f"unexpected logits shape {tuple(logits.shape)}, expected {expected_shape}"

print("\nSmoke test PASSED.")
print("Output logits shape:", logits.shape)

--- Phase 2: Verifying the Complete Architecture ---
New dual-input model instantiated on 'cuda'.
Total parameters: 19,016,618

--- Performing Smoke Test Forward Pass ---

Smoke test PASSED.
Output logits shape: torch.Size([32, 63, 30522])


In [None]:
# --- A Quick Guide to Our Tensor Dimensions ---

# Let's define the meaning of each letter we see in the shapes:
# B = Batch Size (e.g., 32)
# C = EEG Channels (e.g., 62)
# T_eeg = EEG Timesteps (e.g., 400)
# M = Number of Metadata Features (e.g., 3 for color, category, motion)
# T_text = Max Text Sequence Length in the Batch (e.g., 64)
# V = Text Vocabulary Size (e.g., 30522)

# --- Tensors coming FROM the DataLoader ---
# eeg_b.shape:      [B, C, T_eeg] -> e.g., [32, 62, 400]
# meta_b.shape:     [B, M]        -> e.g., [32, 3]
# text_b.shape:     [B, T_text]   -> e.g., [32, 64]

# --- Tensor coming FROM the Model ---
# logits.shape:     [B, T_text - 1, V] -> e.g., [32, 63, 30522]
#
# Why T_text - 1?
# The model's task is to predict the *next* token. For an input text
# sequence of length N, there are N-1 "next tokens" to predict.
# Example: For input [<sos>, "the", "cat", <eos>], the model predicts
# ["the", "cat", <eos>], which is 3 predictions.

In [11]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
import time
import math

# --- You can run this code now ---

print("--- Phase 3: Launching the Training Protocol ---")

# This cell relies on 'model', 'train_loader', 'val_loader', 'PAD_ID', and 'device'
# having been defined by the earlier cells.

# 1. Define Loss Function, Optimizer, and Scheduler
# Padded target positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=1e-2)
# Multiply the learning rate by 0.2 once the validation loss has failed to improve
# for more than `patience` consecutive epochs.
# The `verbose` flag is not passed: it is deprecated in recent PyTorch releases.
# Read the current rate from optimizer.param_groups[0]['lr'] if it needs logging.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.2, patience=2)

# 2. Define a utility function for timing
def format_time(seconds):
    """Format a duration in seconds as ``"<minutes>m <seconds>s"``, dropping fractions."""
    whole_minutes, leftover_seconds = divmod(seconds, 60)
    return f"{int(whole_minutes)}m {int(leftover_seconds)}s"

# 3. Define the Training Function for one epoch (updated for metadata)
def train_one_epoch(model, loader, optimizer, criterion, device=None,
                    teacher_forcing_ratio=0.5, max_grad_norm=1.0):
    """
    Train ``model`` for one pass over ``loader``.

    Args:
        model: Module called as ``model(eeg, meta, text, teacher_forcing_ratio=...)``
            and returning logits of shape ``[B, T - 1, V]``.
        loader: Iterable of ``(eeg, meta, text)`` batches that also supports ``len()``.
        optimizer: Optimizer over the model's parameters.
        criterion: Loss taking ``([N, V] logits, [N] targets)``.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter, so the function does not depend on a
            notebook-level ``device`` variable.
        teacher_forcing_ratio: Passed through to the model's forward call.
        max_grad_norm: Gradient-norm clipping threshold.

    Returns:
        The mean of the per-batch losses as a Python float.
    """
    if device is None:
        device = next(model.parameters()).device

    model.train()
    total_loss = 0.0
    for eeg_b, meta_b, txt_b in loader:
        eeg_b, meta_b, txt_b = eeg_b.to(device), meta_b.to(device), txt_b.to(device)

        optimizer.zero_grad()
        # Pass all three inputs to the model
        logits = model(eeg_b, meta_b, txt_b, teacher_forcing_ratio=teacher_forcing_ratio)

        # Flatten batch and time; the targets are the text shifted left by one,
        # because position t of the logits predicts token t + 1.
        loss = criterion(logits.reshape(-1, logits.shape[-1]), txt_b[:, 1:].reshape(-1))

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(loader)

# 4. Define the Evaluation Function (updated for metadata)
@torch.no_grad()
def evaluate(model, loader, criterion, device=None, teacher_forcing_ratio=0.0):
    """
    Compute the mean per-batch loss of ``model`` over ``loader`` without gradients.

    Args:
        model: Module called as ``model(eeg, meta, text, teacher_forcing_ratio=...)``
            and returning logits of shape ``[B, T - 1, V]``.
        loader: Iterable of ``(eeg, meta, text)`` batches that also supports ``len()``.
        criterion: Loss taking ``([N, V] logits, [N] targets)``.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter, so the function does not depend on a
            notebook-level ``device`` variable.
        teacher_forcing_ratio: Passed through to the model. The default of 0.0
            means the decoder is always fed its own predictions.

    Returns:
        The mean of the per-batch losses as a Python float.
    """
    if device is None:
        device = next(model.parameters()).device

    model.eval()
    total_loss = 0.0
    for eeg_b, meta_b, txt_b in loader:
        eeg_b, meta_b, txt_b = eeg_b.to(device), meta_b.to(device), txt_b.to(device)
        # Pass all three inputs to the model
        logits = model(eeg_b, meta_b, txt_b, teacher_forcing_ratio=teacher_forcing_ratio)

        # Targets are the text shifted left by one (position t predicts token t + 1).
        loss = criterion(logits.reshape(-1, logits.shape[-1]), txt_b[:, 1:].reshape(-1))
        total_loss += loss.item()

    return total_loss / len(loader)

# 5. The Main Training Loop
EPOCHS = 20  # You can adjust this number
best_val_loss = float('inf')

print("\n--- Starting Training ---")
for epoch in range(1, EPOCHS + 1):
    start_time = time.time()
    
    train_loss = train_one_epoch(model, train_loader, optimizer, criterion)
    val_loss = evaluate(model, val_loader, criterion)
    
    # Update the learning rate based on validation loss
    scheduler.step(val_loss)
    
    end_time = time.time()
    
    print(f"\nEpoch {epoch:02d}/{EPOCHS} | Time: {format_time(end_time - start_time)}")
    print(f"\tTrain Loss: {train_loss:.4f}")
    print(f"\t Val. Loss: {val_loss:.4f} | Val. Perplexity: {math.exp(val_loss):7.4f}")
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'eeg-meta-text-best-model.pt')
        print("\t-> Validation loss improved, saving new best model.")

print("\n--- Training Complete ---")

--- Phase 3: Launching the Training Protocol ---





--- Starting Training ---

Epoch 01/20 | Time: 6m 0s
	Train Loss: 5.1146
	 Val. Loss: 4.5608 | Val. Perplexity: 95.6605
	-> Validation loss improved, saving new best model.

Epoch 02/20 | Time: 5m 53s
	Train Loss: 3.9787
	 Val. Loss: 4.6229 | Val. Perplexity: 101.7841

Epoch 03/20 | Time: 5m 55s
	Train Loss: 3.7195
	 Val. Loss: 4.5368 | Val. Perplexity: 93.3917
	-> Validation loss improved, saving new best model.

Epoch 04/20 | Time: 5m 55s
	Train Loss: 3.5073
	 Val. Loss: 4.5856 | Val. Perplexity: 98.0653

Epoch 05/20 | Time: 5m 54s
	Train Loss: 3.3098
	 Val. Loss: 4.4361 | Val. Perplexity: 84.4438
	-> Validation loss improved, saving new best model.

Epoch 06/20 | Time: 5m 53s
	Train Loss: 3.1079
	 Val. Loss: 4.6393 | Val. Perplexity: 103.4735

Epoch 07/20 | Time: 5m 55s
	Train Loss: 2.9481
	 Val. Loss: 4.4717 | Val. Perplexity: 87.5074

Epoch 08/20 | Time: 5m 58s
	Train Loss: 2.7686
	 Val. Loss: 4.4347 | Val. Perplexity: 84.3228
	-> Validation loss improved, saving new best model.


KeyboardInterrupt: 

In [14]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
import random

# --- You can run this code now ---

print("--- Phase 4: Inference with Dual-Input Model ---")

# 1. Setup: Load Tokenizer and your new best model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

LOCAL_MODEL_PATH = "/home/poorna/models/bert-base-uncased"
print(f"Loading tokenizer from local path: {LOCAL_MODEL_PATH}")
tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_PATH)

# Define all necessary constants for model instantiation
TEXT_VOCAB_SIZE = tokenizer.vocab_size
PAD_ID = tokenizer.pad_token_id
# The tokenizer's CLS / SEP tokens serve as the start- and end-of-sequence markers.
SOS_ID = tokenizer.cls_token_id
EOS_ID = tokenizer.sep_token_id
NUM_COLORS = 77
NUM_CATEGORIES = 53

# Rebuild the architecture exactly as it was built for training. That model was
# created without a `dropout` argument, so the Seq2Seq default is kept here too.
# This cell assumes the Seq2Seq class and its components are already defined.
model = Seq2Seq(
    text_vocab_size=TEXT_VOCAB_SIZE,
    num_colors=NUM_COLORS,
    num_categories=NUM_CATEGORIES,
    pad_id=PAD_ID,
).to(device)

# Load the checkpoint written by the training loop.
# NOTE(review): torch.load unpickles the file, so only load checkpoints you
# produced yourself or otherwise trust.
checkpoint_path = 'eeg-meta-text-best-model.pt'
print(f"Loading saved weights from: {checkpoint_path}")
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
# Inference only: switch off dropout and use the BatchNorm running statistics.
model.eval()
print("Best dual-input model loaded successfully.")


# 2. Define the Updated Inference Function
@torch.no_grad()
def generate_with_metadata(model, eeg_signal, meta_signal, beam_width=5, max_len=100,
                           sos_id=None, eos_id=None):
    """
    Generate a token sequence for one sample with beam search.

    Args:
        model: Model exposing ``encoder``, ``meta_encoder`` and ``decoder`` (with
            ``init_hidden`` and ``rnn.num_layers``).
        eeg_signal: EEG tensor for a single sample (no batch axis).
        meta_signal: Metadata tensor for a single sample (no batch axis).
        beam_width: Number of hypotheses kept after every step; must be >= 1.
            Values larger than the vocabulary are accepted (each expansion then
            simply considers every token).
        max_len: Maximum number of decoding steps.
        sos_id: Start token ID; defaults to the notebook-level ``SOS_ID``.
        eos_id: End token ID; defaults to the notebook-level ``EOS_ID``.

    Returns:
        The token IDs (list of ints) of the best-scoring hypothesis without the
        start token. The end token is included when the hypothesis reached it.

    Raises:
        ValueError: If ``beam_width`` is smaller than 1.
    """
    if beam_width < 1:
        raise ValueError(f"beam_width must be >= 1, got {beam_width}")
    if sos_id is None:
        sos_id = SOS_ID
    if eos_id is None:
        eos_id = EOS_ID

    model.eval()

    # Run on whatever device the model lives on instead of relying on a global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else eeg_signal.device

    eeg_signal = eeg_signal.unsqueeze(0).to(run_device)
    meta_signal = meta_signal.unsqueeze(0).to(run_device)

    # Encode both inputs and fuse them into the decoder's initial hidden state.
    encoder_outputs, encoder_hidden = model.encoder(eeg_signal)
    meta_features = model.meta_encoder(meta_signal)

    num_layers = model.decoder.rnn.num_layers
    # Even entries are the forward direction, odd entries the backward one.
    forward_h = encoder_hidden[0::2]
    backward_h = encoder_hidden[1::2]
    encoder_hidden_cat = torch.cat([forward_h, backward_h], dim=2)

    meta_features_repeated = meta_features.unsqueeze(0).repeat(num_layers, 1, 1)
    combined_features = torch.cat([encoder_hidden_cat, meta_features_repeated], dim=2)

    decoder_hidden = model.decoder.init_hidden(combined_features)

    # Each beam is (token IDs so far, summed log-probability, decoder hidden state).
    beams = [([sos_id], 0.0, decoder_hidden)]

    for _ in range(max_len):
        candidates = []
        for seq, score, hidden in beams:
            # Finished hypotheses are carried over unchanged and keep competing.
            if seq[-1] == eos_id:
                candidates.append((seq, score, hidden))
                continue

            input_token = torch.tensor([seq[-1]], device=run_device)
            # The decoder needs the token, the hidden state and the encoder outputs (for attention).
            prediction, new_hidden = model.decoder(input_token, hidden, encoder_outputs)

            log_probs = F.log_softmax(prediction, dim=-1).reshape(-1)
            # topk raises when k exceeds the number of entries, so cap it.
            k = min(beam_width, log_probs.numel())
            top_log_probs, top_ids = torch.topk(log_probs, k)

            # One host transfer per tensor instead of one .item() call per candidate.
            for token_id, token_log_prob in zip(top_ids.tolist(), top_log_probs.tolist()):
                candidates.append((seq + [token_id], score + token_log_prob, new_hidden))

        beams = sorted(candidates, key=lambda beam: beam[1], reverse=True)[:beam_width]

        # Stop as soon as the current best hypothesis is complete.
        if beams[0][0][-1] == eos_id:
            break

    return beams[0][0][1:]

# 3. Run inference on up to NUM_SAMPLES random test samples
NUM_SAMPLES = 10
# 'test_ds' must already be in memory from the Phase 0 data setup.
test_set_size = len(test_ds)
# random.sample raises ValueError when asked for more items than exist, so cap
# the request at the size of the test split.
num_to_show = min(NUM_SAMPLES, test_set_size)
random_indices = random.sample(range(test_set_size), num_to_show)

print(f"\n--- Running Inference on {num_to_show} Random Samples ---")

for i, idx in enumerate(random_indices):
    # Each test item is an (eeg, metadata, token IDs) triple.
    eeg_sample, meta_sample, true_text_ids = test_ds[idx]

    # Decode with beam search, conditioning on both the EEG and the metadata.
    predicted_ids_beam = generate_with_metadata(model, eeg_sample, meta_sample, beam_width=5)

    # Decode the IDs back to human-readable text
    true_text = tokenizer.decode(true_text_ids, skip_special_tokens=True)
    predicted_text_beam = tokenizer.decode(predicted_ids_beam, skip_special_tokens=True)

    # The reported index is a position inside test_ds, not inside the full dataset.
    print(f"\n--- Sample {i+1}/{num_to_show} (Index: {idx}) ---")
    print(f"GROUND TRUTH: {true_text}")
    print(f"MODEL PREDICTION: {predicted_text_beam}")

--- Phase 4: Inference with Dual-Input Model ---
Loading tokenizer from local path: /home/poorna/models/bert-base-uncased
Loading saved weights from: eeg-meta-text-best-model.pt
Best dual-input model loaded successfully.

--- Running Inference on 10 Random Samples ---

--- Sample 1/10 (Index: 539) ---
GROUND TRUTH: water sprays onto a bunch of ripe bananas.. tone : calm
MODEL PREDICTION: close - up of a bunch of ripe yellow bananas.. tone : calm

--- Sample 2/10 (Index: 2286) ---
GROUND TRUTH: colorful fish swim around vibrant coral in a clear ocean.. tone : serene
MODEL PREDICTION: colorful hot air balloons float in a clear blue sky.. tone : serene

--- Sample 3/10 (Index: 597) ---
GROUND TRUTH: motorcyclists race around a track during a competition.. tone : energetic
MODEL PREDICTION: a motorcyclist rides a snowy mountain road.. tone : energetic

--- Sample 4/10 (Index: 870) ---
GROUND TRUTH: three panda cubs playfully tumble on a grassy lawn.. tone : playful
MODEL PREDICTION: a gian