In [None]:
import h5py
import torch
import os
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# --- 1. Global Configurations and Device Allocation ---
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BASE_PATH = '/kaggle/input/brain-to-text-25/t15_copyTask_neuralData/hdf5_data_final'

# Character-level vocabulary: a 'BLANK' placeholder at index 0, then space,
# the lowercase letters a-z and the apostrophe.
# Every symbol is listed exactly once. A duplicated symbol would get two
# indices, but CHAR_MAP can only point at one of them, so the other index
# would become an output class that never appears in any target sequence.
CHARS = ['BLANK'] + list(" abcdefghijklmnopqrstuvwxyz'")
CHAR_MAP = {c: i for i, c in enumerate(CHARS)}

# Dynamic discovery of all available training HDF5 files.
# Sorted so that the file order does not depend on filesystem traversal order.
TRAIN_FILES = sorted(str(p) for p in Path(BASE_PATH).rglob('data_train.hdf5'))

# --- 2. Robust Data Engineering and Signal Preprocessing ---
class NeuralSignalDataset(Dataset):
    """
    In-memory dataset of (neural features, character-index target) pairs.

    Every top-level group of every readable HDF5 file is treated as one trial.
    A trial is kept only when it has an ``input_features`` dataset and a text
    label (looked up under the names in ``LABEL_KEYS``, first match wins) that
    yields at least one character present in ``CHAR_MAP``.

    Loading is best-effort: a trial that raises while being read or decoded is
    skipped, and the number of skipped trials is reported once at the end.

    Args:
        file_paths: Iterable of HDF5 file paths. Paths that do not exist are
            ignored.
    """

    # Dataset names probed, in priority order, for a trial's text label.
    LABEL_KEYS = ('transcription', 'sentence', 'phonemes')

    def __init__(self, file_paths):
        self.samples = []
        skipped_trials = 0
        for path in tqdm(file_paths, desc="Preprocessing Neural Data"):
            if not os.path.exists(path):
                continue
            with h5py.File(path, 'r') as hf:
                for key in hf.keys():
                    try:
                        sample = self._load_trial(hf[key])
                    except Exception:
                        # Deliberately best-effort: one malformed trial must
                        # not abort the whole load, but it is counted so the
                        # loss of data is visible.
                        skipped_trials += 1
                        continue
                    if sample is not None:
                        self.samples.append(sample)

        if skipped_trials:
            print(f"Warning: Skipped {skipped_trials} trials that could not be read or decoded.")
        if len(self.samples) == 0:
            print("Warning: No valid samples found. Verify HDF5 internal structure.")

    @classmethod
    def _load_trial(cls, trial):
        """Return ``(features, target_indices)`` for one trial group, or None if it has no usable label."""
        # Load high-dimensional neural input features
        features = trial['input_features'][()]

        # Z-score normalisation over the whole trial (a single mean/std for
        # all elements); the epsilon keeps a constant signal from dividing by 0.
        features = (features - np.mean(features)) / (np.std(features) + 1e-6)

        # Only the first label dataset that exists is consulted.
        text_data = None
        for label_key in cls.LABEL_KEYS:
            if label_key in trial:
                text_data = cls._decode_label(trial[label_key][()])
                break

        if not text_data:
            return None

        # Characters outside the vocabulary are dropped rather than rejected.
        target_indices = [CHAR_MAP[c] for c in text_data if c in CHAR_MAP]
        if not target_indices:
            return None
        return features, target_indices

    @staticmethod
    def _decode_label(raw_val):
        """
        Convert a raw HDF5 label value to a lowercase string.

        * ``bytes``: decoded as UTF-8.
        * integer ndarray: each positive element is taken as a character code
          and the characters are concatenated; zero/negative entries are
          treated as padding and dropped.
          NOTE(review): assumes integer labels are zero-padded character
          codes - confirm against the dataset specification.
        * any other ndarray: elements are decoded/stringified and joined with
          single spaces.
        * anything else: ``str(raw_val)``.
        """
        if isinstance(raw_val, bytes):
            return raw_val.decode('utf-8').lower()
        if isinstance(raw_val, np.ndarray):
            if np.issubdtype(raw_val.dtype, np.integer):
                # Stringifying each integer would yield digit text such as
                # "104 105", from which no real characters could be recovered.
                codes = raw_val.ravel().tolist()
                return "".join(chr(code) for code in codes if code > 0).lower()
            return " ".join(
                x.decode('utf-8') if isinstance(x, bytes) else str(x) for x in raw_val
            ).lower()
        return str(raw_val).lower()

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a ``(float32 features, int64 targets)`` tensor pair."""
        features, target = self.samples[idx]
        return torch.tensor(features).float(), torch.tensor(target).long()

# --- 3. Advanced Transformer-based Architecture ---

class NeuralTransformerModel(nn.Module):
    """
    Transformer encoder that maps a feature sequence to per-step log-probabilities
    over the character vocabulary.

    Args:
        input_dim: Size of the last dimension of the input features.
        hidden_dim: Width of the encoder (``d_model``); the feed-forward
            sub-layer is four times this wide.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, input_dim=512, hidden_dim=256, nhead=8, num_layers=6):
        super().__init__()

        # Linear map from the raw feature width to the encoder width.
        self.feature_projection = nn.Linear(input_dim, hidden_dim)

        # batch_first=True: the encoder consumes (batch, sequence, feature) tensors.
        layer_template = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=nhead,
            dim_feedforward=4 * hidden_dim,
            dropout=0.2,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            layer_template, num_layers=num_layers
        )

        # One output unit per vocabulary entry.
        self.final_classifier = nn.Linear(hidden_dim, len(CHARS))

    def forward(self, x):
        """Return log-softmax scores over the vocabulary, normalised along dimension 2."""
        projected = self.feature_projection(x)
        encoded = self.transformer_encoder(projected)
        scores = self.final_classifier(encoded)
        return torch.log_softmax(scores, dim=2)

# --- 4. Optimization and Training Routine ---
def execute_training_pipeline(num_epochs=50, learning_rate=1e-4, weight_decay=0.01, shuffle=True):
    """
    Build the training dataset, then train a NeuralTransformerModel with CTC loss.

    Samples are processed one at a time (batch size 1), so no padding is needed.

    Args:
        num_epochs: Number of full passes over the dataset.
        learning_rate: AdamW learning rate.
        weight_decay: AdamW weight-decay coefficient.
        shuffle: When True, visit the samples in a fresh random order every
            epoch. Samples are stored in file order, so without shuffling each
            epoch replays the same file-by-file sequence of updates.

    Returns:
        The trained model, or None when the dataset contains no samples.
    """
    print(f"Status: Initializing Dataset with {len(TRAIN_FILES)} files...")
    dataset = NeuralSignalDataset(TRAIN_FILES)

    num_samples = len(dataset)
    if num_samples == 0:
        return None

    model = NeuralTransformerModel().to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    # blank=0 matches the 'BLANK' entry at index 0 of the vocabulary;
    # zero_infinity zeroes the loss (and gradient) of samples whose target
    # cannot be aligned to the input instead of propagating inf.
    criterion = nn.CTCLoss(blank=0, zero_infinity=True)

    print(f"Status: Optimization commenced on {num_samples} samples ({num_epochs} Epochs Target)...")
    model.train()

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        if shuffle:
            visit_order = torch.randperm(num_samples).tolist()
        else:
            visit_order = range(num_samples)

        for sample_idx in visit_order:
            features, targets = dataset[sample_idx]
            # Add the batch dimension expected by the model / loss.
            features = features.unsqueeze(0).to(DEVICE)
            targets = targets.unsqueeze(0).to(DEVICE)

            optimizer.zero_grad()
            output = model(features).permute(1, 0, 2)  # [Seq, Batch, Class]

            input_lengths = torch.tensor([output.size(0)], dtype=torch.long)
            target_lengths = torch.tensor([targets.size(1)], dtype=torch.long)

            loss = criterion(output, targets, input_lengths, target_lengths)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        print(f"Epoch [{epoch+1}/{num_epochs}] - Average Training Loss: {epoch_loss / num_samples:.4f}")

    return model

# --- 5. Inference and Submission Generation ---

def generate_submission(model):
    """
    Decode every test trial with greedy CTC decoding and write ``submission.csv``.

    Test files named ``data_test.hdf5`` are discovered under ``BASE_PATH`` and
    processed in sorted path order; within a file, trials are ordered by the
    integer after the first underscore in their key. Row ids in the CSV are
    simply the running index of the predictions in that order.

    Args:
        model: Trained model, or None. When None, an error is printed and
            nothing is written.

    Returns:
        None.
    """
    if model is None:
        print("Error: No trained model available for inference.")
        return

    model.eval()
    predictions = []
    test_files = sorted(Path(BASE_PATH).rglob("data_test.hdf5"))

    print("Status: Commencing Inference on Test Partitions...")
    with torch.no_grad():
        for file_path in tqdm(test_files):
            with h5py.File(file_path, "r") as hf:
                trial_keys = sorted(hf.keys(), key=lambda k: int(k.split('_')[1]) if '_' in k else 0)
                for key in trial_keys:
                    x = torch.from_numpy(hf[key]["input_features"][()]).float().unsqueeze(0).to(DEVICE)
                    # Training normalises with np.std, which is the population
                    # standard deviation. torch.std applies Bessel's correction
                    # by default, so it is disabled here to apply the same
                    # scaling at inference time.
                    x = (x - x.mean()) / (x.std(unbiased=False) + 1e-6)

                    logits = model(x)
                    # Greedy CTC decoding: best class per step, collapse
                    # repeats, then drop the blank (index 0).
                    best_path = torch.argmax(logits[0], dim=-1).unique_consecutive().tolist()
                    decoded_str = "".join(CHARS[i] for i in best_path if i != 0).strip()
                    # An empty prediction is replaced by a placeholder word.
                    predictions.append(decoded_str if decoded_str else "silence")

    submission_df = pd.DataFrame({"id": range(len(predictions)), "text": predictions})
    submission_df.to_csv("submission.csv", index=False)
    print("Final Status: Submission file 'submission.csv' generated successfully.")

# --- System Execution ---
# Run the full train-then-predict pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    # execute_training_pipeline() returns None when no samples were loaded;
    # generate_submission() reports that case and writes nothing.
    optimized_model = execute_training_pipeline()
    generate_submission(optimized_model)

Status: Initializing Dataset with 45 files...


Preprocessing Neural Data:   0%|          | 0/45 [00:00<?, ?it/s]

Status: Optimization commenced on 8072 samples (50 Epochs Target)...
