# In [None]:
import h5py
import torch
import os
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from torch.cuda.amp import GradScaler, autocast

# --- 1. Global configuration ---
# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Directory that is searched recursively for the HDF5 data files.
BASE_PATH = '/kaggle/input/brain-to-text-25/t15_copyTask_neuralData/hdf5_data_final'
# Output vocabulary: index 0 is the CTC blank, followed by space, a-z and the
# apostrophe. Each symbol is listed exactly once; a duplicated symbol would
# create a class index that CHAR_MAP can never produce as a target.
CHARS = ['BLANK'] + list(" abcdefghijklmnopqrstuvwxyz'")
# Reverse lookup: symbol -> class index.
CHAR_MAP = {c: i for i, c in enumerate(CHARS)}

# --- 2. Dataset and batching ---
class NeuralSignalDataset(Dataset):
    """In-memory dataset of (neural features, character-id target) pairs.

    Every existing HDF5 file in ``file_paths`` is opened and each of its
    top-level groups is read. A group yields one sample when it provides an
    ``input_features`` dataset and a label under ``transcription`` or
    ``sentence`` (checked in that order) that maps to at least one id in
    ``CHAR_MAP``. Groups that raise while being read are skipped and counted.

    Attributes:
        samples: list of ``(features, target)`` tuples; ``features`` is a
            float32 NumPy array, ``target`` a list of int class ids.
        skipped: number of groups that could not be read.
    """

    # Label datasets to look for, in priority order.
    LABEL_KEYS = ('transcription', 'sentence')

    def __init__(self, file_paths):
        """Load all usable samples from ``file_paths`` (missing files are ignored)."""
        self.samples = []
        self.skipped = 0
        for path in tqdm(file_paths, desc="Loading Data"):
            if not os.path.exists(path):
                continue
            with h5py.File(path, 'r') as hf:
                for key in hf.keys():
                    try:
                        sample = self._read_sample(hf[key])
                    except Exception:
                        # Loading is deliberately best effort: one malformed
                        # trial must not abort the run. Unlike a bare
                        # ``except`` this still lets KeyboardInterrupt and
                        # SystemExit propagate, and the skip is counted.
                        self.skipped += 1
                        continue
                    if sample is not None:
                        self.samples.append(sample)
        if self.skipped:
            print(f"Skipped {self.skipped} unreadable trial(s) while loading data.")

    def _read_sample(self, group):
        """Return ``(features, target)`` for one HDF5 group, or None if it has no usable label."""
        features = group['input_features'][()]
        # Standardise the signal with the statistics of the whole trial
        # (important to keep the loss well behaved); the epsilon guards
        # against a constant signal.
        features = (features - np.mean(features)) / (np.std(features) + 1e-6)

        text = ""
        for label_key in self.LABEL_KEYS:
            if label_key in group:
                text = self._decode_label(group[label_key][()])
                break

        # Characters outside the vocabulary are dropped silently.
        target = [CHAR_MAP[c] for c in text if c in CHAR_MAP]
        if not target:
            return None
        # float32 is what the model consumes; storing it avoids keeping a
        # wider copy of every trial in memory.
        return features.astype(np.float32), target

    @staticmethod
    def _decode_label(val):
        """Convert a raw label value read from HDF5 into lower-case text."""
        if isinstance(val, bytes):
            return val.decode('utf-8').lower()
        if isinstance(val, np.ndarray) and np.issubdtype(val.dtype, np.integer):
            # NOTE(review): assumes an integer array holds character codes
            # padded with zeros -- confirm against the dataset schema.
            # Without this branch ``str()`` would produce the array repr
            # (e.g. "[104 105]"), of which only the spaces are in CHAR_MAP.
            return "".join(chr(code) for code in val.ravel().tolist() if code > 0).lower()
        return str(val).lower()

    def __len__(self):
        """Number of loaded samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``(float32 feature tensor, int64 target tensor)``."""
        features, target = self.samples[idx]
        return torch.tensor(features).float(), torch.tensor(target).long()

# Collate helper that equalises sample lengths within a batch (padding)
def collate_fn(batch):
    """Pad a list of ``(features, target)`` pairs into batch tensors.

    Args:
        batch: sequence of ``(features, target)`` tensor pairs, where the
            first dimension of each tensor is its length.

    Returns:
        A 4-tuple ``(inputs_padded, targets_padded, input_lens, target_lens)``.
        Both padded tensors are batch-first and filled with zeros past each
        sample's own length; the two length tensors hold the unpadded lengths.
    """
    features, labels = map(list, zip(*batch))

    # Record the true lengths before padding hides them.
    feature_lengths = torch.tensor(list(map(len, features)))
    label_lengths = torch.tensor(list(map(len, labels)))

    padded_features = pad_sequence(features, batch_first=True)
    padded_labels = pad_sequence(labels, batch_first=True, padding_value=0)

    return padded_features, padded_labels, feature_lengths, label_lengths

# --- 3. Model: Transformer encoder with a learned positional embedding ---
class NeuralTransformerModel(nn.Module):
    """Frame-wise character classifier built on a Transformer encoder.

    Input frames are projected to ``hidden_dim``, summed with a learned
    positional embedding, passed through the encoder stack and classified
    per frame. The output is a log-probability distribution per frame.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: model width used by the encoder.
        nhead: number of attention heads.
        num_layers: number of encoder layers.
        max_len: longest sequence (in frames) the positional embedding covers.
        num_classes: number of output classes; defaults to ``len(CHARS)``.
    """

    def __init__(self, input_dim=512, hidden_dim=256, nhead=8, num_layers=6,
                 max_len=3000, num_classes=None):
        super().__init__()
        if num_classes is None:
            num_classes = len(CHARS)
        self.max_len = max_len

        self.feature_projection = nn.Linear(input_dim, hidden_dim)
        # Learned per-position vectors: lets the model tell which frame came
        # before which.
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, hidden_dim))

        encoder_layers = nn.TransformerEncoderLayer(
            d_model=hidden_dim, nhead=nhead, dim_feedforward=hidden_dim*4,
            dropout=0.2, batch_first=True
        )
        # Nested tensors are only ever used when a padding mask is supplied;
        # disabling them keeps the output the same length as the input.
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layers, num_layers=num_layers, enable_nested_tensor=False
        )
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, lengths=None):
        """Compute per-frame log-probabilities.

        Args:
            x: tensor of shape ``(batch, time, input_dim)``.
            lengths: optional 1-D tensor with the unpadded length of each
                sequence. When given, frames at or beyond a sequence's length
                are excluded as attention keys. When omitted, every frame is
                attended to.

        Returns:
            Tensor of shape ``(batch, time, num_classes)`` holding
            log-softmax scores over the class dimension.

        Raises:
            ValueError: if ``time`` exceeds ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.max_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the positional embedding "
                f"size max_len={self.max_len}."
            )

        x = self.feature_projection(x) + self.pos_embedding[:, :seq_len, :]

        padding_mask = None
        if lengths is not None:
            # True marks a padded frame that must not be attended to.
            positions = torch.arange(seq_len, device=x.device)
            padding_mask = positions.unsqueeze(0) >= lengths.to(x.device).unsqueeze(1)

        x = self.transformer_encoder(x, src_key_padding_mask=padding_mask)
        return self.classifier(x).log_softmax(2)

# --- 4. Training (mini-batches with a held-out validation split) ---
def execute_training_pipeline():
    """Train a ``NeuralTransformerModel`` with CTC loss and return it.

    Collects every ``data_train.hdf5`` below ``BASE_PATH``, splits the samples
    90/10 into training and validation subsets, and trains for a fixed number
    of epochs with AdamW and a one-cycle learning-rate schedule. The mean
    training and validation loss are printed after each epoch.

    Returns:
        The trained model, located on ``DEVICE``.

    Raises:
        RuntimeError: if the training subset would be empty.
    """
    train_files = [str(p) for p in Path(BASE_PATH).rglob('data_train.hdf5')]
    dataset = NeuralSignalDataset(train_files)

    # Split into train and validation so it is visible whether the model learns.
    train_size = int(0.9 * len(dataset))
    if train_size == 0:
        # Fail early with a clear message instead of an obscure sampler,
        # scheduler or division error further down.
        raise RuntimeError(
            f"No usable training samples found under {BASE_PATH!r} "
            f"({len(train_files)} file(s), {len(dataset)} sample(s))."
        )
    train_ds, val_ds = random_split(dataset, [train_size, len(dataset)-train_size])

    train_loader = DataLoader(train_ds, batch_size=16, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_ds, batch_size=16, shuffle=False, collate_fn=collate_fn)

    model = NeuralTransformerModel().to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.05)
    criterion = nn.CTCLoss(blank=0, zero_infinity=True)

    # Mixed precision for speed; only meaningful (and only enabled) on CUDA.
    use_amp = DEVICE.type == "cuda"
    scaler = GradScaler(enabled=use_amp)
    epochs = 30  # fewer epochs, but more effective ones
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=3e-4,
                                                    steps_per_epoch=len(train_loader), epochs=epochs)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            inputs, targets, in_lens, tar_lens = [b.to(DEVICE) if isinstance(b, torch.Tensor) else b for b in batch]
            optimizer.zero_grad()

            with autocast(enabled=use_amp):
                output = model(inputs).permute(1, 0, 2)  # CTC loss expects (T, N, C)
                loss = criterion(output, targets, in_lens, tar_lens)

            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # guard against exploding gradients
            scale_before = scaler.get_scale()
            scaler.step(optimizer)
            scaler.update()
            # The scaler lowers its scale exactly when it skipped the
            # optimizer step (inf/NaN gradients); only advance the schedule
            # when the optimizer really stepped.
            if scaler.get_scale() >= scale_before:
                scheduler.step()
            total_loss += loss.item()

        # Validation pass: same loss, no gradient tracking, dropout disabled.
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                inputs, targets, in_lens, tar_lens = [b.to(DEVICE) if isinstance(b, torch.Tensor) else b for b in batch]
                with autocast(enabled=use_amp):
                    output = model(inputs).permute(1, 0, 2)
                    val_loss += criterion(output, targets, in_lens, tar_lens).item()

        print(
            f"Epoch {epoch+1}/{epochs} - Train Loss: {total_loss/len(train_loader):.4f}"
            f" - Val Loss: {val_loss/max(len(val_loader), 1):.4f}"
        )

    return model

# --- 5. Build the submission file ---
def generate_submission(model):
    """Decode every test trial greedily and write ``submission.csv``.

    All ``data_test.hdf5`` files below ``BASE_PATH`` are visited in sorted
    path order, and the groups inside each file in order of the integer after
    the first underscore of their name. Each trial is normalised, run through
    ``model`` and decoded with greedy CTC decoding. An empty decode is
    replaced with the string ``"silence"``.

    Args:
        model: trained network mapping ``(1, time, features)`` to per-frame
            class scores; it is switched to eval mode.
    """
    def trial_index(key):
        # Order by the number after the first underscore; names without a
        # numeric suffix sort first instead of aborting the whole run.
        if '_' not in key:
            return 0
        try:
            return int(key.split('_')[1])
        except ValueError:
            return 0

    model.eval()
    predictions = []
    test_files = sorted(Path(BASE_PATH).rglob("data_test.hdf5"))
    with torch.no_grad():
        for f_path in tqdm(test_files):
            with h5py.File(f_path, "r") as hf:
                keys = sorted(hf.keys(), key=trial_index)
                for key in keys:
                    x = torch.from_numpy(hf[key]["input_features"][()]).float().unsqueeze(0).to(DEVICE)
                    # Population std (unbiased=False) matches the np.std
                    # normalisation applied to the training data.
                    x = (x - x.mean()) / (x.std(unbiased=False) + 1e-6)
                    logits = model(x)
                    # Greedy CTC decode: best class per frame, collapse
                    # repeats, then drop blanks (index 0).
                    best_path = torch.argmax(logits[0], dim=-1).unique_consecutive().tolist()
                    decoded = "".join(CHARS[i] for i in best_path if i != 0).strip()
                    predictions.append(decoded if decoded else "silence")

    pd.DataFrame({"id": range(len(predictions)), "text": predictions}).to_csv("submission.csv", index=False)
    print("Submission saved successfully!")

# Script entry point: train the model, then write the submission from it.
if __name__ == "__main__":
    # Kept as a module-level name so the trained model stays accessible
    # after the run.
    final_model = execute_training_pipeline()
    generate_submission(final_model)