In [None]:

import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' # Or 'max_split_size_mb:128'
import torch; import torch.nn as nn; import torch.optim as optim
from torch.utils.data import Dataset, DataLoader; from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer, AutoModel
import numpy as np; import pandas as pd; import random; import math; import time; import re
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from tqdm.auto import tqdm; import json
from torch.amp import autocast, GradScaler
import torch.multiprocessing as mp # Keep for set_start_method if needed for testing

# Report the runtime environment so the run log records the framework version
# and the hardware that was visible when the notebook was executed.
print(f"PyTorch Version: {torch.__version__}"); print(f"CUDA Available: {torch.cuda.is_available()}")
ngpus_available = torch.cuda.device_count(); print(f"GPUs Available: {ngpus_available}")
# DEVICE is the device later cells move the model and the batches to.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu'); print(f"Primary Device: {DEVICE}")
if ngpus_available == 0: print("WARNING: No GPUs detected.")
# NOTE(review): the DataLoaders built later use worker processes
# (DATALOADER_WORKERS); confirm that keeping tokenizer parallelism enabled is
# intended together with those workers.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

PyTorch Version: 2.6.0+cu124
CUDA Available: True
GPUs Available: 1
Primary Device: cuda


In [None]:


# --- File Paths ---
# CSV files read by load_data_from_csv_fn; each must provide 'source'/'target'
# (or 's'/'t') columns. Paths are relative to the notebook's working directory.
TRAIN_FILE_PATH = 'team16_ta_train.csv'
VALID_FILE_PATH = 'team16_ta_valid.csv'
TEST_FILE_PATH  = 'team16_ta_test.csv'

# --- GloVe Configuration ---
# The Kaggle dataset copy is preferred when it exists; otherwise the local file
# name is used, which is the only case in which the download helper fetches the zip.
GLOVE_ZIP_URL_CONFIG = "http://nlp.stanford.edu/data/glove.6B.zip"
KAGGLE_GLOVE_FILE_PATH_CONFIG = "/kaggle/input/mineassignment/glove.6B.100d.txt"
LOCAL_GLOVE_FILENAME_CONFIG = "glove.6B.100d.txt"
GLOVE_FILE_TO_USE_CONFIG = KAGGLE_GLOVE_FILE_PATH_CONFIG if os.path.exists(KAGGLE_GLOVE_FILE_PATH_CONFIG) else LOCAL_GLOVE_FILENAME_CONFIG
# Must match the vector width of the chosen GloVe file; vectors of any other
# length are ignored when the file is read.
GLOVE_DIM_CONFIG = 100

# --- IndicBERT Configuration ---
# Hugging Face model id used for the target-side tokenizer and input embeddings.
INDIC_BERT_MODEL_NAME_CONFIG = "ai4bharat/indic-bert"

# --- Special Tokens for English Vocabulary (Constants for global use) ---
# The indices below are the first four entries of every Vocabulary instance.
PAD_TOKEN_ENG = "<pad>"; SOS_TOKEN_ENG = "<sos>"; EOS_TOKEN_ENG = "<eos>"; UNK_TOKEN_ENG = "<unk>"
PAD_IDX_ENG = 0; SOS_IDX_ENG = 1; EOS_IDX_ENG = 2; UNK_IDX_ENG = 3

# --- CHECKPOINTING AND EPOCH CONTROL ---
# MAX_EPOCHS_PER_RUN and TOTAL_EPOCHS_TARGET are only printed in this cell; the
# code that consumes them is presumably the resumable training loop - TODO confirm.
MAX_EPOCHS_PER_RUN = 1
TOTAL_EPOCHS_TARGET = 5
CHECKPOINT_DIR = "/kaggle/working/"
if not os.path.exists(CHECKPOINT_DIR):
    os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Output locations for checkpoints and the training history, all inside CHECKPOINT_DIR.
LAST_CHECKPOINT_FILE = os.path.join(CHECKPOINT_DIR, "nmt_dp_last_checkpoint.pt")
BEST_LOSS_MODEL_FILE = os.path.join(CHECKPOINT_DIR, "nmt_dp_best_loss.pt")
BEST_BLEU_MODEL_FILE = os.path.join(CHECKPOINT_DIR, "nmt_dp_best_bleu4.pt")
HISTORY_FILE = os.path.join(CHECKPOINT_DIR, "nmt_dp_training_history.json")

# Hyperparameters Dictionary
# Single source of configuration for the later cells.
# NOTE(review): the model-initialisation cell passes only the ENC_* values
# (ENC_HEADS, ENC_PF_DIM, ENC_DROPOUT) to the model; DEC_HEADS, DEC_PF_DIM and
# DEC_DROPOUT are not read by any code visible in this notebook section.
hyperparameters = {
    'TRAIN_FILE': TRAIN_FILE_PATH, 'VALID_FILE': VALID_FILE_PATH, 'TEST_FILE': TEST_FILE_PATH,
    'GLOVE_ZIP_URL': GLOVE_ZIP_URL_CONFIG,
    'GLOVE_FILE_TO_USE': GLOVE_FILE_TO_USE_CONFIG,
    'LOCAL_GLOVE_FILENAME_FOR_DOWNLOAD_CHECK': LOCAL_GLOVE_FILENAME_CONFIG,
    'GLOVE_DIM': GLOVE_DIM_CONFIG,
    'INDIC_BERT_MODEL_NAME': INDIC_BERT_MODEL_NAME_CONFIG,

    'HID_DIM': 128, 'ENC_LAYERS': 2, 'DEC_LAYERS': 2,
    'ENC_HEADS': 4, 'DEC_HEADS': 4, 'ENC_PF_DIM': 512, 'DEC_PF_DIM': 512,
    'ENC_DROPOUT': 0.1, 'DEC_DROPOUT': 0.1, 'LEARNING_RATE': 0.0005,
    'CLIP': 1.0,
    # Batch size handed to every DataLoader (i.e. per forward/backward pass).
    'TOTAL_BATCH_SIZE': 32,
    'ACCUMULATION_STEPS': 4,
    # Upper bound on sequence length applied when the dataset builds its tensors.
    'MAX_LEN': 64,
    'SEED': 1234,
    'DATALOADER_WORKERS': 8,
    'LOG_INTERVAL': 50,
    'NUM_GPUS': ngpus_available
}

# --- Reproducibility ---
# Seed every RNG the notebook uses: Python, NumPy, and PyTorch (CPU and all GPUs).
random.seed(hyperparameters['SEED'])
np.random.seed(hyperparameters['SEED'])
torch.manual_seed(hyperparameters['SEED'])
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(hyperparameters['SEED'])

# Echo the effective configuration into the cell output.
print(f"Configuration Loaded. Checkpoint Dir: {CHECKPOINT_DIR}")
print(f"GloVe File to use: {hyperparameters['GLOVE_FILE_TO_USE']}")
print(f"TOTAL_BATCH_SIZE (for DataLoader): {hyperparameters['TOTAL_BATCH_SIZE']}")
print(f"ACCUMULATION_STEPS: {hyperparameters['ACCUMULATION_STEPS']}")
print(f"DataLoader workers: {hyperparameters['DATALOADER_WORKERS']}")
print(f"Max epochs this run: {MAX_EPOCHS_PER_RUN}, Total target epochs for all runs: {TOTAL_EPOCHS_TARGET}")

Configuration Loaded. Checkpoint Dir: /kaggle/working/
GloVe File to use: glove.6B.100d.txt
TOTAL_BATCH_SIZE (for DataLoader): 32
ACCUMULATION_STEPS: 4
DataLoader workers: 8
Max epochs this run: 1, Total target epochs for all runs: 5


In [None]:
# ==============================================================================
# CELL 3: ALL HELPER FUNCTIONS AND CLASS DEFINITIONS
# (Includes IndicBERT loading, Vocab, Data Loaders, Model Definition)
# ==============================================================================

# --- GloVe Download Logic ---
def _download_glove_if_needed():
    """Ensure the configured GloVe text file exists, downloading it when allowed.

    The file named by ``hyperparameters['GLOVE_FILE_TO_USE']`` is used as-is when
    it already exists. A download is attempted only when that path equals the
    local file name (``LOCAL_GLOVE_FILENAME_FOR_DOWNLOAD_CHECK``) and the file is
    missing; any other missing path is reported as fatal.

    The download shells out to ``wget`` and ``unzip`` and writes into the current
    working directory. The temporary archive is removed on every exit path, so a
    failed download or extraction does not leave a partial zip behind.

    Returns:
        bool: True when the GloVe file is available afterwards, False otherwise.
    """
    target_glove_txt_file = hyperparameters['GLOVE_FILE_TO_USE']
    local_glove_name = hyperparameters['LOCAL_GLOVE_FILENAME_FOR_DOWNLOAD_CHECK']

    if os.path.exists(target_glove_txt_file):
        print(f"Using existing GloVe file: {target_glove_txt_file}")
        return True

    if target_glove_txt_file != local_glove_name:
        # A missing non-local path (e.g. the Kaggle input path) is never downloaded.
        print(f"FATAL: GloVe file {target_glove_txt_file} not found, and conditions for download not met.")
        return False

    print(f"Local GloVe file '{target_glove_txt_file}' not found.")
    print(f"Downloading GloVe from {hyperparameters['GLOVE_ZIP_URL']}...")
    _zip_f = "glove_download.zip"
    try:
        _dl_status = os.system(f"wget -q {hyperparameters['GLOVE_ZIP_URL']} -O {_zip_f}")
        if _dl_status != 0:
            print(f"ERROR: Failed to download GloVe zip (status: {_dl_status}).")
            return False

        print("Unzipping GloVe...")
        # First try to extract only the one file that is needed.
        _uz_status = os.system(f"unzip -o -q {_zip_f} {local_glove_name} -d .")
        if not os.path.exists(target_glove_txt_file):
            print(f"Specific file extract failed (status {_uz_status}). Trying generic unzip from {_zip_f}...")
            _uz_status = os.system(f"unzip -o -q {_zip_f} -d .")
            if not os.path.exists(target_glove_txt_file):
                print(f"ERROR: Failed to find '{target_glove_txt_file}' after unzipping (status {_uz_status}). Check zip contents and paths.")
                return False  # Unzip/extraction failed

        print(f"'{target_glove_txt_file}' successfully extracted/downloaded.")
        return True
    finally:
        # Clean up the archive on success and on every failure path.
        if os.path.exists(_zip_f):
            os.remove(_zip_f)

# Abort the notebook early when the GloVe vectors cannot be made available.
if not _download_glove_if_needed():
    raise FileNotFoundError("GloVe setup failed. Cannot proceed.")

# --- Load Global IndicBERT Components (Tokenizer, Pad ID, Embedding Info) ---
# Globals defined here and used by later cells:
#   indic_tokenizer_global       - target-side tokenizer
#   INDIC_PAD_ID_HINDI           - padding id used for target tensors and the loss
#   INDIC_EMBEDDING_WEIGHTS_CPU  - copy of the pretrained input-embedding matrix
#   INDIC_EMBEDDING_DIM_HINDI / INDIC_VOCAB_SIZE_HINDI - its two dimensions
print(f"\nLoading global IndicBERT tokenizer: {hyperparameters['INDIC_BERT_MODEL_NAME']}...")
try:
    indic_tokenizer_global = AutoTokenizer.from_pretrained(hyperparameters['INDIC_BERT_MODEL_NAME'])
    INDIC_PAD_ID_HINDI = indic_tokenizer_global.pad_token_id
    if INDIC_PAD_ID_HINDI is None:
        # Fall back to 0 so padding and ignore_index still have a concrete value.
        print("Warning: indic_tokenizer_global.pad_token_id is None. Defaulting to 0 for Hindi padding.")
        INDIC_PAD_ID_HINDI = 0
    print(f"Global IndicBERT tokenizer loaded. Pad ID for Hindi: {INDIC_PAD_ID_HINDI}")

    print(f"Loading IndicBERT model ({hyperparameters['INDIC_BERT_MODEL_NAME']}) for embeddings info...")
    # The full model is loaded only to copy out its input-embedding matrix.
    _indic_bert_model_cpu = AutoModel.from_pretrained(hyperparameters['INDIC_BERT_MODEL_NAME'])
    _indic_bert_embeddings_layer = _indic_bert_model_cpu.get_input_embeddings()
    # clone() detaches the copy from the model so the model can be deleted below.
    INDIC_EMBEDDING_WEIGHTS_CPU = _indic_bert_embeddings_layer.weight.data.clone()
    INDIC_EMBEDDING_DIM_HINDI = INDIC_EMBEDDING_WEIGHTS_CPU.size(1)
    INDIC_VOCAB_SIZE_HINDI = INDIC_EMBEDDING_WEIGHTS_CPU.size(0)
    print(f"IndicBERT: Vocab Size={INDIC_VOCAB_SIZE_HINDI}, Embedding Dim={INDIC_EMBEDDING_DIM_HINDI}")
    del _indic_bert_model_cpu, _indic_bert_embeddings_layer # Free memory
except Exception as e:
    # Log a hint for the most likely cause, then re-raise so the cell still fails.
    print(f"FATAL: Failed to load IndicBERT components: {e}. Internet connection might be required.")
    raise

# --- English Tokenizer (Uses global special tokens from Cell 2: PAD_TOKEN_ENG etc.) ---
def tokenize_eng_fn(text):
    """Lower-case and tokenize an English sentence on whitespace and punctuation.

    The punctuation marks ``? . ! , ¿ -`` become separate tokens; double quotes
    and backslashes are dropped. Splitting uses ``str.split()`` with no
    separator, so any whitespace (spaces, tabs, newlines) delimits tokens and an
    empty or quote-only input yields ``[]`` instead of a spurious ``['']`` token.

    Args:
        text: Sentence to tokenize; converted with ``str()`` first.

    Returns:
        list[str]: The tokens, possibly empty.
    """
    text = str(text).lower()
    # Surround each punctuation mark with spaces so it splits off as its own token.
    text = re.sub(r"([?.!,¿-])", r" \1 ", text)
    # Replace runs of quotes, backslashes and spaces with a single space.
    text = re.sub(r'[\\\" \\\"]+', " ", text)
    return text.split()

# --- Vocabulary Class for English ---
class Vocabulary:
    """Bidirectional token/index mapping that starts with the four special tokens.

    New tokens receive consecutive indices in first-seen order, continuing after
    the indices reserved for the pad/sos/eos/unk tokens.
    """

    def __init__(self, name, tokenizer_func):
        """Create a vocabulary holding only the special tokens.

        Args:
            name: Label stored on the instance.
            tokenizer_func: Callable turning a sentence into a list of tokens.
        """
        self.name = name
        self.tokenizer_func = tokenizer_func
        self.token2index = {
            PAD_TOKEN_ENG: PAD_IDX_ENG,
            SOS_TOKEN_ENG: SOS_IDX_ENG,
            EOS_TOKEN_ENG: EOS_IDX_ENG,
            UNK_TOKEN_ENG: UNK_IDX_ENG,
        }
        self.index2token = dict((index, token) for token, index in self.token2index.items())
        self.n_tokens = len(self.token2index)

    def add_sentence(self, sentence):
        """Tokenize ``sentence`` and register every token."""
        for token in self.tokenizer_func(sentence):
            self.add_token(token)

    def add_token(self, token):
        """Register ``token`` under the next free index unless already known."""
        if token in self.token2index:
            return
        new_index = self.n_tokens
        self.token2index[token] = new_index
        self.index2token[new_index] = token
        self.n_tokens = new_index + 1

    def tokens_to_indices(self, tokens):
        """Map tokens to indices; unknown tokens map to the unk index."""
        lookup = self.token2index.get
        return [lookup(token, UNK_IDX_ENG) for token in tokens]

    def indices_to_tokens(self, indices, remove_special=True):
        """Map indices back to tokens.

        Args:
            indices: Iterable of ints or of objects exposing ``.item()``.
            remove_special: When true, pad and sos are skipped and decoding
                stops at the first eos.

        Returns:
            list[str]: Decoded tokens; unknown indices decode to the unk token.
        """
        decoded = []
        for raw_index in indices:
            index = raw_index if isinstance(raw_index, int) else raw_index.item()
            if remove_special:
                if index == EOS_IDX_ENG:
                    break
                if index in (PAD_IDX_ENG, SOS_IDX_ENG):
                    continue
            decoded.append(self.index2token.get(index, UNK_TOKEN_ENG))
        return decoded

# --- Load GloVe Embeddings for English Vocab ---
def load_glove_embeddings_fn(glove_file_path, vocab, embedding_dim):
    """Build an embedding matrix for ``vocab`` initialised from a GloVe text file.

    Rows start as uniform noise in [-0.1, 0.1); the pad row is zeroed, and every
    vocabulary token (other than pad/sos/eos) found in the GloVe file is
    overwritten with its pretrained vector.

    Only vectors for words present in the vocabulary are parsed and kept, which
    avoids holding the whole GloVe table in memory. Blank lines, lines whose
    values are not numeric, and vectors of the wrong width are skipped.

    Args:
        glove_file_path: Path to the GloVe ``.txt`` file (UTF-8, one word per line).
        vocab: Vocabulary exposing ``token2index`` and ``n_tokens``.
        embedding_dim: Expected vector width.

    Returns:
        torch.Tensor: Float tensor of shape ``(vocab.n_tokens, embedding_dim)``.

    Raises:
        FileNotFoundError: If ``glove_file_path`` does not exist.
    """
    print(f"Loading GloVe embeddings from: {glove_file_path}...")
    embeddings_index = {}
    wanted_tokens = vocab.token2index
    try:
        with open(glove_file_path, 'r', encoding='utf-8') as f:
            for line in tqdm(f, desc="Reading GloVe", unit="vecs", mininterval=1.0):
                values = line.split()
                if not values:
                    # A blank line would otherwise fail on values[0].
                    continue
                word = values[0]
                if word not in wanted_tokens:
                    # Skip parsing vectors that can never be looked up below.
                    continue
                try:
                    coefs = np.asarray(values[1:], dtype='float32')
                except ValueError:
                    continue
                if len(coefs) == embedding_dim:
                    embeddings_index[word] = coefs
    except FileNotFoundError:
        print(f"FATAL: GloVe file {glove_file_path} not found.")
        raise

    embedding_matrix = np.random.uniform(-0.1,0.1,(vocab.n_tokens,embedding_dim)).astype(np.float32)
    embedding_matrix[PAD_IDX_ENG] = np.zeros(embedding_dim, dtype=np.float32)
    loaded_count = 0
    for token, i in vocab.token2index.items():
        if token in [PAD_TOKEN_ENG,SOS_TOKEN_ENG,EOS_TOKEN_ENG]:
            continue
        embedding_vector = embeddings_index.get(token)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            loaded_count += 1
    print(f"Initialized {loaded_count}/{vocab.n_tokens - 4} actual words from GloVe.")
    return torch.tensor(embedding_matrix, dtype=torch.float)

# --- Load Data from CSV ---
def load_data_from_csv_fn(file_path, max_rows=None):
    """Read parallel sentences from a CSV file.

    The file must provide ``source``/``target`` columns, or ``s``/``t`` columns
    which are renamed. Rows with a missing value in either column are dropped.

    Args:
        file_path: Path of the CSV file.
        max_rows: Optional cap on the number of rows read (passed as ``nrows``).

    Returns:
        tuple[list[str], list[str]]: Source sentences and target sentences.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the expected columns are absent.
    """
    try:
        frame = pd.read_csv(file_path, nrows=max_rows)
        has_long_names = 'source' in frame.columns and 'target' in frame.columns
        if not has_long_names:
            has_short_names = 's' in frame.columns and 't' in frame.columns
            if not has_short_names:
                raise ValueError(f"CSV '{file_path}' needs 'source'/'target' columns.")
            frame = frame.rename(columns={'s':'source', 't':'target'})
        frame = frame.dropna(subset=['source', 'target'])
        src_sents = frame['source'].astype(str).tolist()
        trg_sents = frame['target'].astype(str).tolist()
        row_cap_note = f" (max_r={max_rows})" if max_rows else ""
        print(f"Loaded {len(src_sents)}p from {file_path}" + row_cap_note)
        return src_sents, trg_sents
    except FileNotFoundError:
        print(f"FATAL: File {file_path} not found.")
        raise
    except Exception as e:
        # Any other failure (including the ValueError above) is logged and re-raised.
        print(f"Error loading {file_path}: {e}")
        raise

# --- Build English Vocab & Load English Embeddings (Main process, once) ---
# Only the source side of the training split feeds the English vocabulary.
print("\nLoading training data for Eng vocab build...")
_vocab_train_src, _ = load_data_from_csv_fn(
    hyperparameters['TRAIN_FILE'],
    max_rows=hyperparameters.get('MAX_ROWS_DEBUG', None),
)
if not _vocab_train_src:
    raise RuntimeError("Vocab training data empty.")

eng_vocab_global = Vocabulary("eng", tokenizer_func=tokenize_eng_fn)
print("Building English vocabulary...")
for _vocab_sentence in tqdm(_vocab_train_src, total=len(_vocab_train_src), desc="Build EngV"):
    eng_vocab_global.add_sentence(_vocab_sentence)
print(f"Eng vocab size: {eng_vocab_global.n_tokens}")
# Release the sentence list (and the loop variable); the DataLoader cell reloads the CSV.
del _vocab_train_src, _vocab_sentence

# GloVe-initialised weight matrix aligned with eng_vocab_global's indices.
english_embedding_weights_cpu = load_glove_embeddings_fn(
    hyperparameters['GLOVE_FILE_TO_USE'],
    eng_vocab_global,
    hyperparameters['GLOVE_DIM'],
)

# --- Dataset Class (uses globals: SOS_TOKEN_ENG, EOS_TOKEN_ENG, PAD_IDX_ENG etc from Cell 2) ---
class TranslationDataset(Dataset):
    """Map-style dataset turning sentence pairs into index tensors.

    The source side is tokenized with ``src_tok_fn`` and indexed through the
    source vocabulary, wrapped in sos/eos. The target side is tokenized with the
    IndicBERT tokenizer and returned twice: as decoder input (cls id + pieces)
    and as decoder output (pieces + sep id).
    """

    def __init__(self, src_s_list, trg_s_list, src_v_g, ind_tok_g, src_tok_fn, max_l):
        """Store the sentence lists, vocabulary, tokenizers and length cap."""
        self.src_sentences = src_s_list
        self.trg_sentences = trg_s_list
        self.src_vocab = src_v_g
        self.indic_tokenizer = ind_tok_g
        self.src_tokenizer_func = src_tok_fn
        self.max_len = max_l

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.src_sentences)

    def __getitem__(self, idx):
        """Return the raw texts and the three index tensors for pair ``idx``."""
        src_str = self.src_sentences[idx]
        trg_str = self.trg_sentences[idx]

        # Leave room for sos/eos inside the max_len budget.
        src_body = self.src_tokenizer_func(src_str)[:self.max_len - 2]
        src_toks = [SOS_TOKEN_ENG, *src_body, EOS_TOKEN_ENG]
        src_ids_t = torch.tensor(self.src_vocab.tokens_to_indices(src_toks), dtype=torch.long)

        # Leave room for the single cls (input) / sep (output) id.
        tokenizer = self.indic_tokenizer
        sub_tokens = tokenizer.tokenize(trg_str)[:self.max_len - 1]
        sub_ids = tokenizer.convert_tokens_to_ids(sub_tokens)
        trg_in_ids_t = torch.tensor([tokenizer.cls_token_id] + sub_ids, dtype=torch.long)
        trg_out_ids_t = torch.tensor(sub_ids + [tokenizer.sep_token_id], dtype=torch.long)

        return {
            "src_text_orig": src_str,  # kept for evaluation and examples
            "trg_text_orig": trg_str,
            "src_indices": src_ids_t,
            "trg_input_ids": trg_in_ids_t,
            "trg_output_ids": trg_out_ids_t,
        }

# --- Collate Function (uses globals PAD_IDX_ENG, INDIC_PAD_ID_HINDI) ---
def collate_fn_wrapper(batch_list, eng_pad_val, ind_pad_val):
    """Collate dataset items into one padded batch.

    Args:
        batch_list: List of dicts as produced by ``TranslationDataset.__getitem__``.
        eng_pad_val: Padding value for the source index tensors.
        ind_pad_val: Padding value for both target index tensors.

    Returns:
        dict: ``src_text``/``trg_text`` (lists of the original strings) and
        ``src``/``trg_input``/``trg_output`` (batch-first padded tensors).
    """
    def gather(field):
        # Pull one field out of every sample, preserving batch order.
        return [sample[field] for sample in batch_list]

    def pad(field, pad_value):
        return pad_sequence(gather(field), batch_first=True, padding_value=pad_value)

    return {
        "src_text": gather("src_text_orig"),
        "trg_text": gather("trg_text_orig"),
        "src": pad("src_indices", eng_pad_val),
        "trg_input": pad("trg_input_ids", ind_pad_val),
        "trg_output": pad("trg_output_ids", ind_pad_val),
    }

def top_level_collate_fn(batch_list_arg):
    """Module-level collate function binding the global padding ids.

    Delegates to ``collate_fn_wrapper`` with the English pad index for the
    source side and the IndicBERT pad id for the target side.
    """
    return collate_fn_wrapper(
        batch_list_arg,
        eng_pad_val=PAD_IDX_ENG,
        ind_pad_val=INDIC_PAD_ID_HINDI,
    )

# --- Model Architecture Classes (PositionalEncoding, Seq2SeqTransformer) ---
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to batch-first embeddings.

    The table is precomputed for ``max_len_v`` positions and stored as the
    non-trainable buffer ``pe_b`` with shape ``(1, max_len_v, d_model_v)``.
    """

    def __init__(self, d_model_v, dropout_v, max_len_v=5000):
        """Precompute the sine/cosine table and create the dropout layer.

        Args:
            d_model_v: Embedding width.
            dropout_v: Dropout probability applied after adding positions.
            max_len_v: Number of positions in the precomputed table.
        """
        super().__init__()
        # One frequency per pair of channels.
        inv_freq = torch.exp(-torch.arange(0, d_model_v, 2) * math.log(10000) / d_model_v)
        positions = torch.arange(0, max_len_v).reshape(max_len_v, 1)
        angles = positions * inv_freq

        table = torch.zeros((max_len_v, d_model_v))
        table[:, 0::2] = torch.sin(angles)  # even channels
        table[:, 1::2] = torch.cos(angles)  # odd channels

        self.drop = nn.Dropout(dropout_v)
        self.register_buffer('pe_b', table.unsqueeze(0))

    def forward(self, x_emb):
        """Add the first ``x_emb.size(1)`` position rows to ``x_emb``, then apply dropout."""
        seq_len = x_emb.size(1)
        positional = self.pe_b[:, :seq_len, :].to(x_emb.device)
        return self.drop(x_emb + positional)

class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer over (optionally pretrained) embeddings.

    Source and target ids are embedded, projected to the model width when the
    embedding width differs, scaled by ``sqrt(d_model)``, given positional
    encodings, and passed through ``nn.Transformer`` (batch-first, GELU). A final
    linear layer maps decoder states to target-vocabulary logits.
    """

    def __init__(self, n_enc_l_v, n_dec_l_v, d_s_emb_v, d_t_emb_v, d_mod_v, nhead_v, s_vsz_v, t_vsz_v, d_ff_v, drop_v,
                 s_emb_w_v, t_emb_w_v, eng_p_idx_v, ind_p_idx_v, max_pe_l_v):
        """Build the embeddings, projections, positional encoding and Transformer.

        Args:
            n_enc_l_v / n_dec_l_v: Number of encoder / decoder layers.
            d_s_emb_v / d_t_emb_v: Source / target embedding widths.
            d_mod_v: Transformer model width.
            nhead_v: Number of attention heads.
            s_vsz_v / t_vsz_v: Source / target vocabulary sizes.
            d_ff_v: Feed-forward width inside the Transformer layers.
            drop_v: Dropout probability (positional encoding and Transformer).
            s_emb_w_v / t_emb_w_v: Optional pretrained embedding weights; when
                given they are copied in and the embedding is frozen.
            eng_p_idx_v / ind_p_idx_v: Source / target padding indices.
            max_pe_l_v: Number of positions in the positional-encoding table.
        """
        super().__init__()
        self.dm = d_mod_v
        self.eng_p_idx = eng_p_idx_v
        self.ind_p_idx = ind_p_idx_v

        self.s_emb = nn.Embedding(s_vsz_v, d_s_emb_v, padding_idx=eng_p_idx_v)
        if s_emb_w_v is not None:
            self.s_emb.weight.data.copy_(s_emb_w_v)
            self.s_emb.weight.requires_grad = False

        self.t_emb = nn.Embedding(t_vsz_v, d_t_emb_v, padding_idx=ind_p_idx_v)
        if t_emb_w_v is not None:
            self.t_emb.weight.data.copy_(t_emb_w_v)
            self.t_emb.weight.requires_grad = False

        # Project only when the embedding width differs from the model width.
        if d_s_emb_v != d_mod_v:
            self.proj_s = nn.Linear(d_s_emb_v, d_mod_v)
        else:
            self.proj_s = nn.Identity()
        if d_t_emb_v != d_mod_v:
            self.proj_t = nn.Linear(d_t_emb_v, d_mod_v)
        else:
            self.proj_t = nn.Identity()

        # One positional-encoding module is shared by source and target.
        self.pos_enc_mod = PositionalEncoding(d_mod_v, drop_v, max_pe_l_v)
        self.tf_mod = nn.Transformer(
            d_model=d_mod_v,
            nhead=nhead_v,
            num_encoder_layers=n_enc_l_v,
            num_decoder_layers=n_dec_l_v,
            dim_feedforward=d_ff_v,
            dropout=drop_v,
            batch_first=True,
            activation=nn.GELU(),
        )
        self.fc_layer = nn.Linear(d_mod_v, t_vsz_v)

    def _gen_sq_mask(self, sz_v, dev_v):
        """Return a ``(sz_v, sz_v)`` additive causal mask: 0 where allowed, -inf elsewhere."""
        visible = (torch.triu(torch.ones(sz_v, sz_v, device=dev_v)) == 1).transpose(0, 1)
        additive = visible.float()
        additive = additive.masked_fill(visible == 0, float('-inf'))
        return additive.masked_fill(visible == 1, 0.)

    def _gen_pad_mask(self, seq_v, pidx_v):
        """Return a boolean mask that is True wherever ``seq_v`` equals the pad index."""
        return seq_v == pidx_v

    def forward(self, src_v, trg_in_v):
        """Compute target-vocabulary logits for every decoder position.

        Args:
            src_v: Source id tensor (batch-first).
            trg_in_v: Decoder-input id tensor (batch-first).

        Returns:
            Tensor of logits with the target vocabulary as last dimension.
        """
        dev = src_v.device
        src_pad_mask = self._gen_pad_mask(src_v, self.eng_p_idx).to(dev)
        trg_pad_mask = self._gen_pad_mask(trg_in_v, self.ind_p_idx).to(dev)

        scale = math.sqrt(self.dm)
        src_projected = self.proj_s(self.s_emb(src_v))
        trg_projected = self.proj_t(self.t_emb(trg_in_v))
        src_final = self.pos_enc_mod(src_projected * scale)
        trg_final = self.pos_enc_mod(trg_projected * scale)

        causal_mask = self._gen_sq_mask(trg_final.shape[1], dev)
        decoded = self.tf_mod(
            src_final,
            trg_final,
            src_key_padding_mask=src_pad_mask,
            tgt_key_padding_mask=trg_pad_mask,
            memory_key_padding_mask=src_pad_mask,
            tgt_mask=causal_mask,
        )
        return self.fc_layer(decoded)

def count_parameters_fn(model_arg):
    """Return the total number of elements across the model's trainable parameters."""
    trainable_total = 0
    for param in model_arg.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

print("Cell 3: All class/function definitions completed (Dataset, Collate, Model).")

def train_epoch_dp(model,loader,opt,crit,clip,dev,ep_n,log_i,acc_steps,scaler_arg): # As in your file
    """Run one training epoch with gradient accumulation and optional AMP.

    Note: a later notebook cell defines a function with the same name; whichever
    definition was executed last is the one in effect.

    Args:
        model: Model called as ``model(src, trg_input)``; may be ``nn.DataParallel``.
        loader: DataLoader yielding dicts with ``src``, ``trg_input``, ``trg_output``.
        opt: Optimizer.
        crit: Loss function returning a scalar (already averaged) loss.
        clip: Max gradient norm for ``clip_grad_norm_``.
        dev: ``torch.device`` batches are moved to.
        ep_n: Epoch number, used only in log output.
        log_i: Progress-bar postfix is refreshed every ``log_i`` batches.
        acc_steps: Number of batches accumulated per optimizer step.
        scaler_arg: GradScaler, or None to train without autocast/scaling.

    Returns:
        float: Sample-weighted average of the per-batch loss (0.0 for an empty loader).
    """
    print(f"DEBUG [train_dp]: E{ep_n}. DL len:{len(loader)}.")
    if len(loader) == 0:
        print(f"DEBUG [train_dp]: E{ep_n} DL EMPTY.")
        return 0.

    model.train()
    ep_lt = 0.
    n_s_e = 0
    n_batches = len(loader)
    pb = tqdm(loader, desc=f"E{ep_n}Trn", leave=True, bar_format='{l_bar}{bar:10}{r_bar}')

    for idx, b in enumerate(pb):
        # Gradients are cleared at the start of every accumulation group.
        if idx % acc_steps == 0:
            opt.zero_grad(set_to_none=True)

        s, ti, to = b["src"].to(dev), b["trg_input"].to(dev), b["trg_output"].to(dev)
        if idx == 0:
            print(f"DEBUG [train_dp]: E{ep_n} B{idx} src_shape:{s.shape}")

        with autocast(device_type=dev.type, enabled=(scaler_arg is not None)):
            lgs = model(s, ti)
            loss = crit(lgs.view(-1, lgs.shape[-1]), to.view(-1))
            if isinstance(model, nn.DataParallel):
                loss = loss.mean()
            # Scale so the accumulated gradient is the mean over the group.
            # NOTE(review): a trailing group with fewer than acc_steps batches is
            # still divided by acc_steps, so its update is proportionally smaller.
            loss = loss / acc_steps

        if scaler_arg:
            scaler_arg.scale(loss).backward()
        else:
            loss.backward()

        # Undo the accumulation scaling to recover this batch's own loss.
        abl_i = loss.item() * acc_steps
        ep_lt += abl_i * s.size(0)
        n_s_e += s.size(0)

        if (idx + 1) % acc_steps == 0 or (idx + 1) == n_batches:
            if scaler_arg:
                scaler_arg.unscale_(opt)  # clip on true (unscaled) gradients
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            if scaler_arg:
                scaler_arg.step(opt)
                scaler_arg.update()
            else:
                opt.step()

        if idx % log_i == (log_i - 1) or idx == n_batches - 1:
            # abl_i is already a batch average; dividing it by the batch size
            # again (as before) under-reported the displayed loss.
            pb.set_postfix_str(f"B_AvgL:{abl_i:.4f}")

    print(f"DEBUG [train_dp]: E{ep_n} done. Samps:{n_s_e}")
    return ep_lt / n_s_e if n_s_e > 0 else 0.

# ** translate_sentence_greedy_dp DEFINITION MUST BE HERE, BEFORE evaluate_dp **
def translate_sentence_greedy_dp(src_t,model_u,trg_tk,dev_m,max_l_tr):
    """Greedy-decode one source sequence into target tokens.

    Decoding starts from the tokenizer's cls id and appends the arg-max id of the
    last position until the sep id is produced or ``max_l_tr`` steps were taken.
    The model is switched to eval mode and is not switched back.

    Args:
        src_t: Source id tensor with a batch dimension of 1.
        model_u: Model called as ``model_u(src, trg_input)`` returning logits.
        trg_tk: Target tokenizer (``cls_token_id``, ``sep_token_id``,
            ``sep_token``, ``convert_ids_to_tokens``).
        dev_m: ``torch.device`` to decode on.
        max_l_tr: Maximum number of generated ids.

    Returns:
        list[str]: Generated tokens without the leading cls and a trailing sep.
    """
    model_u.eval()
    start_id = trg_tk.cls_token_id
    stop_id = trg_tk.sep_token_id
    src_t = src_t.to(dev_m)
    generated = [start_id]

    with torch.no_grad():
        # Autocast only on CUDA and only when a global, enabled `scaler` exists.
        amp_on = (dev_m.type=='cuda' and 'scaler' in globals() and scaler is not None and scaler.is_enabled())
        with autocast(device_type=dev_m.type, enabled=amp_on):
            for _step in range(max_l_tr):
                # Re-run the model on the whole prefix generated so far.
                prefix = torch.tensor(generated, dtype=torch.long).unsqueeze(0).to(dev_m)
                step_logits = model_u(src_t, prefix)
                next_id = step_logits.argmax(2)[:, -1].item()
                generated.append(next_id)
                if next_id == stop_id:
                    break

    out_tokens = trg_tk.convert_ids_to_tokens(generated[1:])
    if out_tokens and out_tokens[-1] == trg_tk.sep_token:
        out_tokens = out_tokens[:-1]
    return out_tokens

def evaluate_dp(model,loader,crit,dev, indic_tokenizer_b=None, max_l_ev=None):
    """Compute the average validation loss over ``loader`` (no translation/BLEU).

    Note: a later notebook cell defines a function with the same name; whichever
    definition was executed last is the one in effect.

    Autocast is enabled only on CUDA and only when a module-level ``scaler``
    exists and is enabled. The loss is evaluated inside the same autocast region
    as the forward pass, matching how the training step computes it, instead of
    applying the criterion to reduced-precision logits outside of autocast.

    Args:
        model: Model called as ``model(src, trg_input)``; may be ``nn.DataParallel``.
        loader: DataLoader yielding dicts with ``src``, ``trg_input``, ``trg_output``.
        crit: Loss function returning a scalar loss.
        dev: ``torch.device`` batches are moved to.
        indic_tokenizer_b: Unused; accepted for call compatibility.
        max_l_ev: Unused; accepted for call compatibility.

    Returns:
        float: Sample-weighted average loss (0.0 for an empty loader).
    """
    print(f"DEBUG [eval_dp - Loss Only]: DL len:{len(loader)}.")
    if len(loader) == 0:
        print(f"DEBUG [eval_dp - Loss Only]: DL EMPTY.")
        return 0.0

    model.eval()
    ep_lt = 0.
    n_s_e = 0
    # The global scaler does not change during evaluation, so decide once.
    amp_on = (dev.type=='cuda' and 'scaler' in globals() and scaler is not None and scaler.is_enabled())
    pbar = tqdm(loader, desc="Validating (Loss Only)", leave=False, bar_format='{l_bar}{bar:10}{r_bar}')

    with torch.no_grad():
        for idx_ev, b in enumerate(pbar):
            if idx_ev == 0:
                print(f"DEBUG [eval_dp - Loss Only]: B{idx_ev}")
            s, ti, to = b["src"].to(dev), b["trg_input"].to(dev), b["trg_output"].to(dev)
            with autocast(device_type=dev.type, enabled=amp_on):
                lgs_o = model(s, ti)
                loss = crit(lgs_o.view(-1, lgs_o.shape[-1]), to.view(-1))
            if isinstance(model, nn.DataParallel):
                loss = loss.mean()
            ep_lt += loss.item() * s.size(0)
            n_s_e += s.size(0)

    print(f"DEBUG [eval_dp - Loss Only]: Done. Samps:{n_s_e}")
    avg_l = ep_lt / n_s_e if n_s_e > 0 else 0.0
    return avg_l

print("Cell 3: All functions and classes defined, including train/eval/translate helpers.")

Local GloVe file 'glove.6B.100d.txt' not found.
Downloading GloVe from http://nlp.stanford.edu/data/glove.6B.zip...
Unzipping GloVe...
'glove.6B.100d.txt' successfully extracted/downloaded.

Loading global IndicBERT tokenizer: ai4bharat/indic-bert...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

Global IndicBERT tokenizer loaded. Pad ID for Hindi: 0
Loading IndicBERT model (ai4bharat/indic-bert) for embeddings info...


pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

IndicBERT: Vocab Size=200000, Embedding Dim=128

Loading training data for Eng vocab build...
Loaded 70000p from team16_ta_train.csv
Building English vocabulary...


Build EngV:   0%|          | 0/70000 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/135M [00:00<?, ?B/s]

Eng vocab size: 45025
Loading GloVe embeddings from: glove.6B.100d.txt...


Reading GloVe: 0vecs [00:00, ?vecs/s]

Initialized 32942/45021 actual words from GloVe.
Cell 3: All class/function definitions completed (Dataset, Collate, Model).
Cell 3: All functions and classes defined, including train/eval/translate helpers.


In [None]:
# ==============================================================================
# CELL 4: PYTORCH DATASET AND DATALOADER INSTANTIATION
# (num_workers comes from hyperparameters['DATALOADER_WORKERS'])
# ==============================================================================

print("\nLoading full datasets for DataLoaders...")
# Optional debug cap on rows; validation/test use one fifth of it when it is set.
_max_rows_dl_cell4 = hyperparameters.get('MAX_ROWS_DEBUG', None)
train_src_sents_dl, train_trg_sents_dl = load_data_from_csv_fn(hyperparameters['TRAIN_FILE'], max_rows=_max_rows_dl_cell4) # fn from Cell 3
valid_src_sents_dl, valid_trg_sents_dl = load_data_from_csv_fn(hyperparameters['VALID_FILE'], max_rows=_max_rows_dl_cell4//5 if _max_rows_dl_cell4 else None)
test_src_sents_dl, test_trg_sents_dl = load_data_from_csv_fn(hyperparameters['TEST_FILE'], max_rows=_max_rows_dl_cell4//5 if _max_rows_dl_cell4 else None)

# Fail fast on empty splits; the test split is not checked here.
if not train_src_sents_dl: raise RuntimeError("Training data for DataLoader is empty.")
if not valid_src_sents_dl: raise RuntimeError("Validation data for DataLoader is empty.") # Add check

# All three splits share the English vocabulary, the IndicBERT tokenizer and MAX_LEN.
train_dataset_obj = TranslationDataset(train_src_sents_dl,train_trg_sents_dl,eng_vocab_global,indic_tokenizer_global,tokenize_eng_fn,hyperparameters['MAX_LEN'])
valid_dataset_obj = TranslationDataset(valid_src_sents_dl,valid_trg_sents_dl,eng_vocab_global,indic_tokenizer_global,tokenize_eng_fn,hyperparameters['MAX_LEN'])
test_dataset_obj  = TranslationDataset(test_src_sents_dl,test_trg_sents_dl,eng_vocab_global,indic_tokenizer_global,tokenize_eng_fn,hyperparameters['MAX_LEN'])

print(f"DEBUG: Length of train_dataset_obj: {len(train_dataset_obj)}") # DEBUG
print(f"DEBUG: Length of valid_dataset_obj: {len(valid_dataset_obj)}") # DEBUG

dl_workers_cell4 = hyperparameters['DATALOADER_WORKERS']
dl_pin_mem_cell4 = True if DEVICE.type == 'cuda' else False
# Worker processes are not kept alive between iterations of a loader.
dl_persist_cell4 = False

# Only the training loader shuffles; validation and test keep file order.
train_dataloader_obj = DataLoader(train_dataset_obj, batch_size=hyperparameters['TOTAL_BATCH_SIZE'], shuffle=True,
                              collate_fn=top_level_collate_fn, num_workers=dl_workers_cell4, # top_level_collate_fn from Cell 3
                              pin_memory=dl_pin_mem_cell4, persistent_workers=dl_persist_cell4)
valid_dataloader_obj = DataLoader(valid_dataset_obj, batch_size=hyperparameters['TOTAL_BATCH_SIZE'], shuffle=False,
                              collate_fn=top_level_collate_fn, num_workers=dl_workers_cell4,
                              pin_memory=dl_pin_mem_cell4, persistent_workers=dl_persist_cell4)
test_dataloader_obj  = DataLoader(test_dataset_obj, batch_size=hyperparameters['TOTAL_BATCH_SIZE'], shuffle=False,
                              collate_fn=top_level_collate_fn, num_workers=dl_workers_cell4,
                              pin_memory=dl_pin_mem_cell4, persistent_workers=dl_persist_cell4)

print(f"DataLoaders created. DataLoader num_workers: {dl_workers_cell4}")
print(f"DEBUG: Length of train_dataloader_obj (num batches): {len(train_dataloader_obj)}") # DEBUG
print(f"DEBUG: Length of valid_dataloader_obj (num batches): {len(valid_dataloader_obj)}") # DEBUG

# Smoke test: fetch one batch from each loader. Failures (e.g. in the collate
# function) are caught and printed rather than stopping the cell.
if len(train_dataset_obj) > 0 and len(train_dataloader_obj) > 0 :
    print("DEBUG: Attempting to fetch sample batch from train_dataloader_obj...")
    try:
        _sample_train_batch = next(iter(train_dataloader_obj))
        print(f"DEBUG: Fetched sample train batch. Keys: {_sample_train_batch.keys()}, Src shape: {_sample_train_batch['src'].shape}")
    except Exception as e_fetch_train:
        print(f"ERROR fetching from train_dataloader_obj: {e_fetch_train}")
else:
    print("DEBUG: train_dataloader_obj or train_dataset_obj is empty. Skipping sample fetch.")

if len(valid_dataset_obj) > 0 and len(valid_dataloader_obj) > 0:
    print("DEBUG: Attempting to fetch sample batch from valid_dataloader_obj...")
    try:
        _sample_valid_batch = next(iter(valid_dataloader_obj))
        print(f"DEBUG: Fetched sample valid batch. Keys: {_sample_valid_batch.keys()}, Src shape: {_sample_valid_batch['src'].shape}")
    except Exception as e_fetch_valid:
        print(f"ERROR fetching from valid_dataloader_obj: {e_fetch_valid}")
else:
    print("DEBUG: valid_dataloader_obj or valid_dataset_obj is empty. Skipping sample fetch.")


Loading full datasets for DataLoaders...
Loaded 70000p from team16_ta_train.csv
Loaded 20000p from team16_ta_valid.csv
Loaded 10000p from team16_ta_test.csv
DEBUG: Length of train_dataset_obj: 70000
DEBUG: Length of valid_dataset_obj: 20000
DataLoaders created. DataLoader num_workers: 8
DEBUG: Length of train_dataloader_obj (num batches): 2188
DEBUG: Length of valid_dataloader_obj (num batches): 625
DEBUG: Attempting to fetch sample batch from train_dataloader_obj...




DEBUG: Fetched sample train batch. Keys: dict_keys(['src_text', 'trg_text', 'src', 'trg_input', 'trg_output']), Src shape: torch.Size([32, 41])
DEBUG: Attempting to fetch sample batch from valid_dataloader_obj...
DEBUG: Fetched sample valid batch. Keys: dict_keys(['src_text', 'trg_text', 'src', 'trg_input', 'trg_output']), Src shape: torch.Size([32, 47])


In [None]:
# ==============================================================================
# CELL 5: MODEL INITIALIZATION (DataParallel)
# ==============================================================================
# PositionalEncoding and Seq2SeqTransformer class definitions are in CELL 3

print("\nInitializing model...")
# DEVICE from Cell 1, english_embedding_weights_cpu & INDIC_EMBEDDING_WEIGHTS_CPU from Cell 3
eng_emb_weights_gpu = english_embedding_weights_cpu.to(DEVICE)
ind_emb_weights_gpu = INDIC_EMBEDDING_WEIGHTS_CPU.to(DEVICE)

# Seq2SeqTransformer uses definitions from CELL 3
# NOTE(review): heads, feed-forward width and dropout are taken from the ENC_*
# hyperparameters only and apply to encoder and decoder alike; the DEC_HEADS,
# DEC_PF_DIM and DEC_DROPOUT entries are not used here.
model_base_instance = Seq2SeqTransformer(
    n_enc_l_v=hyperparameters['ENC_LAYERS'], n_dec_l_v=hyperparameters['DEC_LAYERS'],
    d_s_emb_v=hyperparameters['GLOVE_DIM'], d_t_emb_v=INDIC_EMBEDDING_DIM_HINDI, # From Cell 3
    d_mod_v=hyperparameters['HID_DIM'], nhead_v=hyperparameters['ENC_HEADS'],
    s_vsz_v=eng_vocab_global.n_tokens, t_vsz_v=INDIC_VOCAB_SIZE_HINDI, # From Cell 3
    d_ff_v=hyperparameters['ENC_PF_DIM'], drop_v=hyperparameters['ENC_DROPOUT'],
    s_emb_w_v=eng_emb_weights_gpu, t_emb_w_v=ind_emb_weights_gpu,
    eng_p_idx_v=PAD_IDX_ENG, ind_p_idx_v=INDIC_PAD_ID_HINDI, # Globals
    max_pe_l_v=hyperparameters['MAX_LEN'] + 5 # MAX_LEN from hyperparameters
).to(DEVICE) # Move the base model to the primary device

# model_obj_train is what the training code calls; model_base_instance always
# refers to the unwrapped module.
model_obj_train = model_base_instance # Assign to model_obj_train before potential DP wrapping
if torch.cuda.device_count() > 1 and DEVICE.type == 'cuda':
    print(f"Using nn.DataParallel for {torch.cuda.device_count()} GPUs.")
    model_obj_train = nn.DataParallel(model_base_instance) # model_obj_train is now DP wrapped
else:
    print(f"Training on single device ({DEVICE}). No DataParallel wrapping for training object.")

# Count parameters on the base model
# (count_parameters_fn counts only parameters with requires_grad=True, so the
# frozen pretrained embeddings are excluded from this number.)
print(f'The base model has {count_parameters_fn(model_base_instance):,} trainable parameters.')

optimizer = optim.Adam(model_obj_train.parameters(), lr=hyperparameters['LEARNING_RATE'])
# Padding positions in the target output are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=INDIC_PAD_ID_HINDI) # INDIC_PAD_ID_HINDI from Cell 3
print("Model, Optimizer, and Criterion initialized.")


Initializing model...
Training on single device (cuda). No DataParallel wrapping for training object.
The base model has 26,739,136 trainable parameters.
Model, Optimizer, and Criterion initialized.


In [None]:
# ==============================================================================
# CELL 6 (or 3): TRAINING, EVALUATION, TRANSLATION FUNCTION DEFINITIONS
# (Adding debug prints)
# ==============================================================================

def train_epoch_dp(model_inst, dataloader_inst, optimizer_inst, criterion_inst,
                   clip_val, device_val, current_epoch_num, log_interval_val,
                   accumulation_steps=1, scaler=None):
    """Run one training epoch with gradient accumulation and optional AMP.

    Note: this redefines a function of the same name from an earlier cell;
    whichever definition was executed last is the one in effect.

    Args:
        model_inst: Model called as ``model_inst(src, trg_input)``; may be
            ``nn.DataParallel``.
        dataloader_inst: DataLoader yielding dicts with ``src``, ``trg_input``,
            ``trg_output``.
        optimizer_inst: Optimizer.
        criterion_inst: Loss function returning a scalar (already averaged) loss.
        clip_val: Max gradient norm for ``clip_grad_norm_``.
        device_val: ``torch.device`` batches are moved to.
        current_epoch_num: Epoch number, used only in log output.
        log_interval_val: Progress-bar postfix is refreshed every this many batches.
        accumulation_steps: Number of batches accumulated per optimizer step.
        scaler: GradScaler, or None to train without autocast/scaling.

    Returns:
        float: Sample-weighted average of the per-batch loss (0.0 for an empty loader).
    """
    # ***** DEBUG PRINT *****
    print(f"DEBUG [train_epoch_dp]: Entered for Epoch {current_epoch_num}. DataLoader has {len(dataloader_inst)} batches.")
    if len(dataloader_inst) == 0:
        print(f"DEBUG [train_epoch_dp]: DataLoader is EMPTY for Epoch {current_epoch_num}. Skipping training loop.")
        return 0.0

    model_inst.train()
    epoch_loss_total = 0.0
    num_samples_epoch = 0
    num_batches = len(dataloader_inst)
    pbar_train = tqdm(dataloader_inst, desc=f"E{current_epoch_num} Train", leave=True, bar_format='{l_bar}{bar:10}{r_bar}')

    for batch_idx, batch in enumerate(pbar_train):
        # ***** DEBUG PRINT *****
        if batch_idx == 0:  # Print only for the first batch of the epoch
            print(f"DEBUG [train_epoch_dp]: Processing first batch of Epoch {current_epoch_num}.")

        # Gradients are cleared at the start of every accumulation group.
        if batch_idx % accumulation_steps == 0:
            optimizer_inst.zero_grad(set_to_none=True)

        src = batch["src"].to(device_val)
        trg_input = batch["trg_input"].to(device_val)
        trg_output = batch["trg_output"].to(device_val)

        with autocast(device_type=device_val.type, enabled=(scaler is not None)):
            output_logits = model_inst(src, trg_input)
            loss = criterion_inst(output_logits.view(-1, output_logits.shape[-1]), trg_output.view(-1))
            if isinstance(model_inst, nn.DataParallel):
                loss = loss.mean()
            # Scale so the accumulated gradient is the mean over the group.
            # NOTE(review): a trailing group with fewer than accumulation_steps
            # batches is still divided by accumulation_steps, so its update is
            # proportionally smaller.
            loss = loss / accumulation_steps

        if scaler:
            scaler.scale(loss).backward()
        else:
            loss.backward()

        # Undo the accumulation scaling to recover this batch's own loss.
        actual_batch_loss_item = loss.item() * accumulation_steps
        epoch_loss_total += actual_batch_loss_item * src.size(0)
        num_samples_epoch += src.size(0)

        if (batch_idx + 1) % accumulation_steps == 0 or (batch_idx + 1) == num_batches:
            if scaler:
                scaler.unscale_(optimizer_inst)  # clip on true (unscaled) gradients
            torch.nn.utils.clip_grad_norm_(model_inst.parameters(), clip_val)
            if scaler:
                scaler.step(optimizer_inst)
                scaler.update()
            else:
                optimizer_inst.step()

        if batch_idx % log_interval_val == (log_interval_val - 1) or batch_idx == num_batches - 1:
            # The batch loss is already an average; dividing it by the batch
            # size again (as before) under-reported the displayed loss.
            pbar_train.set_postfix_str(f"B_AvgL:{actual_batch_loss_item:.4f}")

    # ***** DEBUG PRINT *****
    print(f"DEBUG [train_epoch_dp]: Finished Epoch {current_epoch_num}. Num samples processed: {num_samples_epoch}")
    return epoch_loss_total / num_samples_epoch if num_samples_epoch > 0 else 0.0


def evaluate_dp(model, loader, crit, dev, indic_tokenizer_b=None, max_l_ev=None):
    """Return the average loss of `model` over `loader` (no translation / BLEU).

    Args:
        model: Callable module invoked as ``model(src, trg_input)``; may be
            wrapped in ``nn.DataParallel``. It is switched to eval mode.
        loader: Sized iterable of batches; each batch is a mapping with the
            keys ``"src"``, ``"trg_input"`` and ``"trg_output"``.
        crit: Loss callable applied to ``(logits_2d, targets_1d)``.
        dev: ``torch.device`` the batch tensors are moved to.
        indic_tokenizer_b: Unused; kept for backward compatibility.
        max_l_ev: Unused; kept for backward compatibility.

    Returns:
        float: Sum of ``batch_loss * batch_size`` divided by the number of
        samples seen, or ``0.0`` when the loader is empty.
    """
    print(f"DEBUG [eval_dp - Loss Only]: DL len:{len(loader)}.")
    if len(loader) == 0:
        print(f"DEBUG [eval_dp - Loss Only]: DL EMPTY.")
        return 0.0  # Only return loss

    model.eval()
    ep_lt = 0.
    n_s_e = 0
    pbar = tqdm(loader, desc="Validating (Loss Only)", leave=False, bar_format='{l_bar}{bar:10}{r_bar}')

    # AMP is mirrored from the training cell: it is on only when running on CUDA
    # and a module-level `scaler` exists and is enabled. The condition does not
    # change between batches, so it is evaluated once.
    _amp_scaler = globals().get('scaler')
    amp_on = bool(dev.type == 'cuda' and _amp_scaler is not None and _amp_scaler.is_enabled())

    with torch.no_grad():
        for idx_ev, b in enumerate(pbar):
            if idx_ev == 0:
                print(f"DEBUG [eval_dp - Loss Only]: B{idx_ev}")
            s = b["src"].to(dev)
            ti = b["trg_input"].to(dev)
            to = b["trg_output"].to(dev)

            with autocast(device_type=dev.type, enabled=amp_on):
                lgs_o = model(s, ti)

            # Under autocast the logits can be half precision. Cast to float32
            # before the criterion so the validation loss has the same numeric
            # precision as the training loss (which is computed inside autocast).
            loss = crit(lgs_o.float().reshape(-1, lgs_o.shape[-1]), to.reshape(-1))
            if isinstance(model, nn.DataParallel):
                loss = loss.mean()
            ep_lt += loss.item() * s.size(0)
            n_s_e += s.size(0)

    print(f"DEBUG [eval_dp - Loss Only]: Done. Samps:{n_s_e}")
    avg_l = ep_lt / n_s_e if n_s_e > 0 else 0.0

    return avg_l  # Return only average loss
# NOTE: translate_sentence_greedy_dp is defined in Cell 3 and is unchanged.

In [None]:
# ==============================================================================
# CELL 7: MAIN TRAINING LOOP (DataParallel - Resumable, Val Loss Only)
# ==============================================================================
# Flow: build model/optimizer/criterion -> set up AMP -> resume from the last
# checkpoint when one exists -> train at most MAX_EPOCHS_PER_RUN epochs (never
# past TOTAL_EPOCHS_TARGET), writing the checkpoint and history file after every
# epoch so that the cell can simply be re-run to continue.

# Kept at module level on purpose: evaluate_dp looks `scaler` up via globals()
# to decide whether to run validation under autocast.
scaler = None
amp_active = False

if __name__ == '__main__':
    print(f"\n--- Preparing for Training with Device: {DEVICE} ---")

    # --- Model, optimizer, criterion ---
    model_base_instance = Seq2SeqTransformer(  # From Cell 3
        n_enc_l_v=hyperparameters['ENC_LAYERS'], n_dec_l_v=hyperparameters['DEC_LAYERS'],
        d_s_emb_v=hyperparameters['GLOVE_DIM'], d_t_emb_v=INDIC_EMBEDDING_DIM_HINDI,
        d_mod_v=hyperparameters['HID_DIM'], nhead_v=hyperparameters['ENC_HEADS'],
        s_vsz_v=eng_vocab_global.n_tokens, t_vsz_v=INDIC_VOCAB_SIZE_HINDI,
        d_ff_v=hyperparameters['ENC_PF_DIM'], drop_v=hyperparameters['ENC_DROPOUT'],
        s_emb_w_v=english_embedding_weights_cpu.to(DEVICE),
        t_emb_w_v=INDIC_EMBEDDING_WEIGHTS_CPU.to(DEVICE),
        eng_p_idx_v=PAD_IDX_ENG, ind_p_idx_v=INDIC_PAD_ID_HINDI,
        max_pe_l_v=hyperparameters['MAX_LEN'] + 5
    ).to(DEVICE)
    print(f"Base model created. Params: {count_parameters_fn(model_base_instance):,}")  # from Cell 3

    optimizer = optim.Adam(model_base_instance.parameters(), lr=hyperparameters['LEARNING_RATE'])
    criterion = nn.CrossEntropyLoss(ignore_index=INDIC_PAD_ID_HINDI)  # From Cell 3

    # --- AMP gradient scaler ---
    # torch.amp.GradScaler takes the device as `device` (first positional
    # argument). The previous `device_type=` keyword raised TypeError on every
    # run, so the "older API" fallback was always taken.
    if DEVICE.type == 'cuda':
        scaler = torch.amp.GradScaler('cuda', enabled=True)
        if scaler.is_enabled():
            amp_active = True
            print("AMP GradScaler enabled.")
        else:
            scaler = None
            print("Warn: AMP GradScaler NOT enabled.")
    else:
        print("AMP not used on CPU.")

    # --- Checkpoint loading (resume) ---
    current_epoch_to_start_from = 1
    best_val_loss_overall = float('inf')
    training_history_list = []
    if os.path.exists(LAST_CHECKPOINT_FILE):
        print(f"Resuming from: {LAST_CHECKPOINT_FILE}")
        ckpt = torch.load(LAST_CHECKPOINT_FILE, map_location=DEVICE)
        model_base_instance.load_state_dict(ckpt['model_state_dict'])
        optimizer.load_state_dict(ckpt['optimizer_state_dict'])
        current_epoch_to_start_from = ckpt['epoch'] + 1
        best_val_loss_overall = ckpt.get('best_valid_loss', float('inf'))
        if scaler is not None and ckpt.get('scaler_state_dict'):
            scaler.load_state_dict(ckpt['scaler_state_dict'])
            print("Scaler state loaded.")
        if os.path.exists(HISTORY_FILE):
            with open(HISTORY_FILE, 'r') as hf_r:
                training_history_list = json.load(hf_r)
            print(f"Hist loaded ({len(training_history_list)} eps).")
            # The history may know a better validation loss than the checkpoint.
            vl_h = [e['val_loss'] for e in training_history_list if 'val_loss' in e]
            if vl_h:
                best_val_loss_overall = min(best_val_loss_overall, min(vl_h))
        print(f"Resume: Next E{current_epoch_to_start_from}. Overall BestValL:{best_val_loss_overall:.4f}")
    else:
        print("No ckpt. Starting fresh (E1).")

    # Number of the last epoch that has been fully trained and checkpointed.
    last_completed_epoch = current_epoch_to_start_from - 1

    # --- Optional DataParallel wrapping ---
    # State dicts are always taken from model_base_instance so that saved
    # weights do not depend on whether the wrapper is used.
    model_active_for_training = model_base_instance
    if hyperparameters['NUM_GPUS'] > 1 and DEVICE.type == 'cuda':
        print(f"Wrapping with nn.DataParallel for {hyperparameters['NUM_GPUS']} GPUs.")
        model_active_for_training = nn.DataParallel(model_base_instance)
    else:
        print(f"Single device train ({DEVICE}).")

    # --- Training session ---
    epochs_this_run = min(MAX_EPOCHS_PER_RUN, TOTAL_EPOCHS_TARGET - (current_epoch_to_start_from - 1))
    if epochs_this_run <= 0:
        print(f"Target {TOTAL_EPOCHS_TARGET} eps met (next E{current_epoch_to_start_from}). No new train.")
    else:
        print(f"Run {epochs_this_run} eps this session (E{current_epoch_to_start_from} to E{current_epoch_to_start_from+epochs_this_run-1}).")
        ep_pbar_outer = tqdm(range(current_epoch_to_start_from, current_epoch_to_start_from + epochs_this_run),
                             desc="SessionEPs", initial=current_epoch_to_start_from - 1, total=TOTAL_EPOCHS_TARGET)

        for ep_disp_num in ep_pbar_outer:  # ep_disp_num is 1-indexed
            ep_pbar_outer.set_description(f"Ep {ep_disp_num}/{TOTAL_EPOCHS_TARGET}")
            _lst = time.time()

            tr_l_ep = train_epoch_dp(
                model_active_for_training, train_dataloader_obj, optimizer, criterion,
                hyperparameters['CLIP'], DEVICE, ep_disp_num, hyperparameters['LOG_INTERVAL'],
                hyperparameters['ACCUMULATION_STEPS'], scaler if amp_active else None
            )

            # evaluate_dp only returns the loss
            val_l_ep = evaluate_dp(model_active_for_training, valid_dataloader_obj, criterion, DEVICE)

            _let = time.time()
            _tm, _ts = divmod(int(_let - _lst), 60)

            ep_sts = {'epoch': ep_disp_num, 'train_loss': round(tr_l_ep, 4), 'val_loss': round(val_l_ep, 4),
                      'time_m': _tm, 'time_s': _ts}
            # Replace any existing entry for this epoch (e.g. from a re-run).
            training_history_list = [e for e in training_history_list if e.get('epoch') != ep_disp_num]
            training_history_list.append(ep_sts)
            training_history_list.sort(key=lambda x: x['epoch'])

            # The "best" model is selected on validation loss only.
            if val_l_ep < best_val_loss_overall:
                best_val_loss_overall = val_l_ep
                torch.save(model_base_instance.state_dict(), BEST_LOSS_MODEL_FILE)
                print(f"E{ep_disp_num}:New OverallBestValL! Saved '{BEST_LOSS_MODEL_FILE}'")

            # min(..., 700) keeps math.exp from overflowing on a diverged loss.
            print(f'E:{ep_disp_num:02}|T:{_tm}m{_ts}s|TrL:{tr_l_ep:.4f}(PPL:{math.exp(min(tr_l_ep,700)):.2f})|ValL:{val_l_ep:.4f}(PPL:{math.exp(min(val_l_ep,700)):.2f})')
            print("-" * 80)

            ckpt_s = {'epoch': ep_disp_num, 'model_state_dict': model_base_instance.state_dict(),
                      'optimizer_state_dict': optimizer.state_dict(),
                      'best_valid_loss': best_val_loss_overall}
            if scaler and amp_active:
                ckpt_s['scaler_state_dict'] = scaler.state_dict()
            torch.save(ckpt_s, LAST_CHECKPOINT_FILE)
            with open(HISTORY_FILE, 'w') as hfw:
                json.dump(training_history_list, hfw, indent=2)
            last_completed_epoch = ep_disp_num
            print(f"Ckpt&hist saved for E{ep_disp_num}.")
            if DEVICE.type == 'cuda':
                torch.cuda.empty_cache()
            if ep_disp_num >= TOTAL_EPOCHS_TARGET:
                print(f"\nTARGET {TOTAL_EPOCHS_TARGET} EPS REACHED.")
                break

        _last_comp_ep_sess = current_epoch_to_start_from + epochs_this_run - 1
        print(f"\n--- TrainSessDone ({epochs_this_run} eps). Total Overall Trained: {_last_comp_ep_sess} ---")

    # --- Post-session message ---
    # The last completed epoch is already tracked in memory (it equals the
    # 'epoch' stored in the checkpoint just written or just loaded), so the
    # checkpoint does not need to be deserialised a second time.
    _final_comp_ep_overall = last_completed_epoch

    if _final_comp_ep_overall >= TOTAL_EPOCHS_TARGET:
        print(f"\nOverall train target {TOTAL_EPOCHS_TARGET} met (last E trained: {_final_comp_ep_overall}). Run Cell 8 for final test.")
    else:
        print(f"\nRe-run Cell 7 for more eps. Trained up to E{_final_comp_ep_overall}. Target: {TOTAL_EPOCHS_TARGET}.")

else:
    print("Cell 7 not run as main.")


--- Preparing for Training with Device: cuda ---
Base model created. Params: 26,739,136
Older GradScaler API.
AMP GradScaler enabled.
Resuming from: /kaggle/working/nmt_dp_last_checkpoint.pt
Scaler state loaded.
Hist loaded (4 eps).
Resume: Next E5. Overall BestValL:3.1143
Single device train (cuda).
Run 1 eps this session (E5 to E5).


SessionEPs:  80%|########  | 4/5 [00:00<?, ?it/s]

DEBUG [train_epoch_dp]: Entered for Epoch 5. DataLoader has 2188 batches.


E5 Train:   0%|          | 0/2188 [00:00<?, ?it/s]

DEBUG [train_epoch_dp]: Processing first batch of Epoch 5.
DEBUG [train_epoch_dp]: Finished Epoch 5. Num samples processed: 70000
DEBUG [eval_dp - Loss Only]: DL len:625.


Validating (Loss Only):   0%|          | 0/625 [00:00<?, ?it/s]

DEBUG [eval_dp - Loss Only]: B0
DEBUG [eval_dp - Loss Only]: Done. Samps:20000
E5:New OverallBestValL! Saved '/kaggle/working/nmt_dp_best_loss.pt'
E:05|T:3m52s|TrL:3.1834(PPL:24.13)|ValL:3.0486(PPL:21.09)
--------------------------------------------------------------------------------
Ckpt&hist saved for E5.

TARGET 5 EPS REACHED.

--- TrainSessDone (1 eps). Total Overall Trained: 5 ---

Overall train target 5 met (last E trained: 5). Run Cell 8 for final test.


In [None]:
# ==============================================================================
# CELL 8: FINAL TESTING AND REPORTING (Run AFTER ALL training epochs are done)
# ==============================================================================
# Loads the best saved weights, evaluates loss + BLEU-1..4 on the test set,
# prints a few example translations and writes a JSON report.
#
# NOTE(review): identifiers and report labels say "HI"/"HINDI", but the data
# files are named team16_ta_* -- confirm the intended target-language label.


def evaluate_dp_with_bleu(model, loader, crit, dev, indic_tok_b, max_l_ev):
    """Evaluate average loss and corpus BLEU-1..4 of `model` over `loader`.

    Args:
        model: Module invoked as ``model(src, trg_input)``; may be wrapped in
            ``nn.DataParallel`` (the underlying module is used for decoding).
        loader: Sized iterable of batches with the keys ``"src"``,
            ``"trg_input"``, ``"trg_output"`` and ``"trg_text"``.
        crit: Loss callable applied to ``(logits_2d, targets_1d)``.
        dev: ``torch.device`` used for the batch tensors and for decoding.
        indic_tok_b: Tokenizer; its ``tokenize`` builds the BLEU references and
            it is forwarded to ``translate_sentence_greedy_dp``.
        max_l_ev: Maximum length forwarded to ``translate_sentence_greedy_dp``.

    Returns:
        tuple: ``(avg_loss, bleu1, bleu2, bleu3, bleu4)``; all zeros for an
        empty loader.
    """
    print(f"DEBUG [eval_dp_WITH_BLEU]: DL len:{len(loader)}.")  # DEBUG
    if len(loader) == 0:
        print(f"DEBUG [eval_dp_WITH_BLEU]: DL EMPTY.")
        return 0., 0., 0., 0., 0.

    model.eval()
    ep_lt = 0.
    n_s_e = 0
    hyps, refs_list_of_lists = [], []
    pb = tqdm(loader, desc="Final Testing (BLEU)", leave=False, bar_format='{l_bar}{bar:10}{r_bar}')

    # Loop invariants: the unwrapped model used for decoding and the AMP switch
    # (on only for CUDA with an enabled module-level `scaler`).
    m_tr = model.module if isinstance(model, nn.DataParallel) else model
    _amp_scaler = globals().get('scaler')
    amp_eval_enabled = bool(dev.type == 'cuda' and _amp_scaler is not None and _amp_scaler.is_enabled())

    with torch.no_grad():
        for idx_ev, b in enumerate(pb):
            if idx_ev == 0:
                print(f"DEBUG [eval_dp_WITH_BLEU]: B{idx_ev}")  # DEBUG
            s = b["src"].to(dev)
            ti = b["trg_input"].to(dev)
            to = b["trg_output"].to(dev)
            tgt_txt_l = b["trg_text"]

            with autocast(device_type=dev.type, enabled=amp_eval_enabled):
                lgs_o = model(s, ti)
            # Logits can be half precision under autocast; compute the loss in
            # float32 so it is comparable with the training loss.
            loss = crit(lgs_o.float().reshape(-1, lgs_o.shape[-1]), to.reshape(-1))
            if isinstance(model, nn.DataParallel):
                loss = loss.mean()
            ep_lt += loss.item() * s.size(0)
            n_s_e += s.size(0)

            for i in range(s.size(0)):
                # translate_sentence_greedy_dp is defined in Cell 3. `s` is
                # already on `dev`, so the function's own device argument is
                # used rather than the global DEVICE.
                h_t = translate_sentence_greedy_dp(s[i:i+1, :], m_tr, indic_tok_b, dev, max_l_ev)
                hyps.append(h_t)
                refs_list_of_lists.append([indic_tok_b.tokenize(tgt_txt_l[i])])

    print(f"DEBUG [eval_dp_WITH_BLEU]: Done. Samps:{n_s_e}")
    avg_l = ep_lt / n_s_e if n_s_e > 0 else 0.

    b1, b2, b3, b4 = 0., 0., 0., 0.
    if hyps and refs_list_of_lists:
        sm = SmoothingFunction().method1
        try:
            b1 = corpus_bleu(refs_list_of_lists, hyps, weights=(1, 0, 0, 0), smoothing_function=sm)
            b2 = corpus_bleu(refs_list_of_lists, hyps, weights=(0.5, 0.5, 0, 0), smoothing_function=sm)
            # Uniform BLEU-3 weights must sum to 1 (previously 0.33 * 3 = 0.99).
            b3 = corpus_bleu(refs_list_of_lists, hyps, weights=(1/3, 1/3, 1/3, 0), smoothing_function=sm)
            b4 = corpus_bleu(refs_list_of_lists, hyps, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=sm)
        except ZeroDivisionError:
            print("Final Test BLEU ZeroDiv.")
    return avg_l, b1, b2, b3, b4


def _get_report_examples_final(dset, model_rep, tok, dev, hp, n=5, pfx=""):
    """Translate the first `n` samples of `dset` and return them as report dicts.

    Each sample is expected to provide ``"src_text_orig"``, ``"trg_text_orig"``
    and ``"src_indices"``. Returns an empty list when `dset` is empty.
    """
    exl = []
    model_rep.eval()
    na = min(n, len(dset))
    if na == 0:
        print(f"{pfx} DSet empty")
        return exl
    print(f"Gen {na} {pfx} exs...")
    for i in tqdm(range(na), desc=f"Gen{pfx}Ex", leave=False, bar_format='{l_bar}{bar:10}{r_bar}'):
        sample = dset[i]
        st = sample["src_text_orig"]
        tt = sample["trg_text_orig"]
        sten = sample["src_indices"].unsqueeze(0).to(dev)
        tr_tok = translate_sentence_greedy_dp(sten, model_rep, tok, dev, hp['MAX_LEN'])  # from CELL 3
        tr_txt = tok.decode(tok.convert_tokens_to_ids(tr_tok), skip_special_tokens=True)
        exl.append({"Source (EN)": st, "Target (HI - Ground Truth)": tt, "Model Output (HI - Translated)": tr_txt})
    return exl


def _print_ex_final(exd):
    """Print one example dict produced by `_get_report_examples_final`."""
    print(f"EN:{exd['Source (EN)']}\nHI-T:{exd['Target (HI - Ground Truth)']}\nHI-M:{exd['Model Output (HI - Translated)']}\n")


def _embedding_frozen_flag(base_model, attr_names):
    """Return whether the first embedding found under `attr_names` is frozen.

    Returns ``not weight.requires_grad`` for the first attribute that exists and
    has a ``weight``; returns ``'NA'`` when none of the names is present.
    """
    for attr_name in attr_names:
        weight = getattr(getattr(base_model, attr_name, None), 'weight', None)
        if weight is not None:
            return not weight.requires_grad
    return 'NA'


def _create_report_final(hp_dict, eng_v_obj, ind_v_sz, ind_emb_d, fin_mdl_obj, hist_l, tl, tb1, tb2, tb3, tb4, trn_ex_l, tst_ex_l, n_gpu):
    """Assemble the JSON-serialisable final report dictionary.

    `ind_emb_d` is accepted for backward compatibility and is not used.
    """
    _base_m = fin_mdl_obj.module if isinstance(fin_mdl_obj, nn.DataParallel) else fin_mdl_obj
    # The earlier code tested hasattr(..., 's_e') / 't_e' but then read
    # .s_emb / .t_emb, so the flag was 'NA' or raised AttributeError. Both
    # spellings are probed here and the same attribute is tested and read.
    return {"info": {"task": "EN-HI NMT(DP)", "ts": time.strftime("%Y%m%d%H%M"), "gpus": n_gpu},
            "hp": hp_dict, "total_eps_trained": len(hist_l) if hist_l else "NA",
            "voc": {"eng": eng_v_obj.n_tokens, "ind": ind_v_sz},
            "mdl": {"p": count_parameters_fn(_base_m),  # count_parameters_fn is from Cell 3
                    "eng_frz": _embedding_frozen_flag(_base_m, ('s_emb', 's_e')),
                    "ind_frz": _embedding_frozen_flag(_base_m, ('t_emb', 't_e'))},
            "hist": hist_l, "test_res": {"L": tl, "PPL": math.exp(min(tl, 700)), "B1": tb1, "B2": tb2, "B3": tb3, "B4": tb4},
            "train_ex": trn_ex_l, "test_ex": tst_ex_l}


if __name__ == '__main__':
    print("\n--- FINAL TESTING AND REPORTING ---")

    # --- Essential globals check ---
    # test_dataset_obj is included because it is needed for the report
    # examples; checking it up front avoids failing after the full evaluation.
    _required_globals = ('DEVICE', 'hyperparameters', 'eng_vocab_global', 'test_dataloader_obj',
                         'train_dataset_obj', 'test_dataset_obj', 'indic_tokenizer_global')
    _missing_globals = [g_name for g_name in _required_globals if g_name not in globals()]
    if _missing_globals:
        print("ERROR: Essential components not found. Re-run Cells 1-6.")
        print(f"Missing: {', '.join(_missing_globals)}")
    else:
        # BEST_LOSS_MODEL_FILE defined in Cell 2; the loss-based best model is
        # preferred, with the BLEU-based file as a fallback from older runs.
        final_model_path_to_load = BEST_LOSS_MODEL_FILE
        if not os.path.exists(final_model_path_to_load) and os.path.exists(BEST_BLEU_MODEL_FILE):
            print(f"Warning: '{BEST_LOSS_MODEL_FILE}' not found, attempting to load from '{BEST_BLEU_MODEL_FILE}' (if it exists from a previous run).")
            final_model_path_to_load = BEST_BLEU_MODEL_FILE

        print(f"Final Test: Loading best model from '{final_model_path_to_load}'")

        final_test_model_instance = Seq2SeqTransformer(  # Def from Cell 3
            n_enc_l_v=hyperparameters['ENC_LAYERS'], n_dec_l_v=hyperparameters['DEC_LAYERS'],
            d_s_emb_v=hyperparameters['GLOVE_DIM'], d_t_emb_v=INDIC_EMBEDDING_DIM_HINDI,
            d_mod_v=hyperparameters['HID_DIM'], nhead_v=hyperparameters['ENC_HEADS'],
            s_vsz_v=eng_vocab_global.n_tokens, t_vsz_v=INDIC_VOCAB_SIZE_HINDI,
            d_ff_v=hyperparameters['ENC_PF_DIM'], drop_v=hyperparameters['ENC_DROPOUT'],
            s_emb_w_v=english_embedding_weights_cpu.to(DEVICE),
            t_emb_w_v=INDIC_EMBEDDING_WEIGHTS_CPU.to(DEVICE),
            eng_p_idx_v=PAD_IDX_ENG, ind_p_idx_v=INDIC_PAD_ID_HINDI,
            max_pe_l_v=hyperparameters['MAX_LEN'] + 5
        ).to(DEVICE)

        if os.path.exists(final_model_path_to_load):
            final_test_model_instance.load_state_dict(torch.load(final_model_path_to_load, map_location=DEVICE))
            print(f"Best model loaded: '{final_model_path_to_load}'.")
            _test_crit_final = nn.CrossEntropyLoss(ignore_index=INDIC_PAD_ID_HINDI)

            print("\n--- Final Evaluation on TEST SET (with BLEU) ---")
            test_L_final, test_B1_final, test_B2_final, test_B3_final, test_B4_final = evaluate_dp_with_bleu(
                final_test_model_instance, test_dataloader_obj, _test_crit_final,
                DEVICE, indic_tokenizer_global, hyperparameters['MAX_LEN']
            )
            print(f'FINAL Test Loss: {test_L_final:.4f} | Test PPL: {math.exp(min(test_L_final,700)):.2f}')
            print(f'FINAL Test BLEU: B@1={test_B1_final:.4f} | B@2={test_B2_final:.4f} | B@3={test_B3_final:.4f} | B@4={test_B4_final:.4f}')

            # --- Training history for the report ---
            _final_hist_rep = []
            if os.path.exists(HISTORY_FILE):
                with open(HISTORY_FILE, 'r') as _hfr:
                    _final_hist_rep = json.load(_hfr)
            else:
                print(f"Warn: Hist file '{HISTORY_FILE}' missing.")

            # --- Example translations ---
            print("\n--- FINAL Report Train Exs ---")
            trn_ex_l_f = _get_report_examples_final(train_dataset_obj, final_test_model_instance, indic_tokenizer_global, DEVICE, hyperparameters, 5, "RepTr")
            for _ex in trn_ex_l_f:
                _print_ex_final(_ex)
            print("\n--- FINAL Report Test Exs ---")
            tst_ex_l_f = _get_report_examples_final(test_dataset_obj, final_test_model_instance, indic_tokenizer_global, DEVICE, hyperparameters, 5, "RepTs")
            for _ex in tst_ex_l_f:
                _print_ex_final(_ex)

            # --- JSON report ---
            final_json_data = _create_report_final(
                hyperparameters, eng_vocab_global, INDIC_VOCAB_SIZE_HINDI, INDIC_EMBEDDING_DIM_HINDI,
                final_test_model_instance, _final_hist_rep,
                test_L_final, test_B1_final, test_B2_final, test_B3_final, test_B4_final,
                trn_ex_l_f, tst_ex_l_f, hyperparameters['NUM_GPUS']
            )
            fin_rep_fname_json = "nmt_exp_FINAL_REPORT_DP_v5.json"  # Increment version
            with open(os.path.join(CHECKPOINT_DIR, fin_rep_fname_json), "w", encoding="utf-8") as fj_rep_out:
                json.dump(final_json_data, fj_rep_out, ensure_ascii=False, indent=2)
            print(f"\nFinal report saved: {os.path.join(CHECKPOINT_DIR,fin_rep_fname_json)}")
        else:
            print(f"Skipping final test: Best model '{final_model_path_to_load}' NOT found.")
    print("--- End Final Test & Report Cell ---")
else:
    print("Cell 8 not run as main.")


--- FINAL TESTING AND REPORTING ---
Final Test: Loading best model from '/kaggle/working/nmt_dp_best_loss.pt'
Best model loaded: '/kaggle/working/nmt_dp_best_loss.pt'.

--- Final Evaluation on TEST SET (with BLEU) ---
DEBUG [eval_dp_WITH_BLEU]: DL len:313.


Final Testing (BLEU):   0%|          | 0/313 [00:00<?, ?it/s]

DEBUG [eval_dp_WITH_BLEU]: B0
DEBUG [eval_dp_WITH_BLEU]: Done. Samps:10000
FINAL Test Loss: 3.0348 | Test PPL: 20.80
FINAL Test BLEU: B@1=0.2507 | B@2=0.1469 | B@3=0.0924 | B@4=0.0564

--- FINAL Report Train Exs ---
Gen 5 RepTr exs...


GenRepTrEx:   0%|          | 0/5 [00:00<?, ?it/s]

EN:Then there was a loud thud.
HI-T:அப்போது கரடி ஒன்று உறுமும் சத்தம் கேட்டது.
HI-M:பனனர அதல அதலம அதகரதத.

EN:The financial assistance is provided to newly married couples of whom one spouse should be from Scheduled caste, Scheduled tribe and the other from a different Community.
HI-T:பிரிவு-1 புதுமணத் தம்பதியரில் ஒருவர் ஆதிதிராவிடர் அல்லது பழங்குடியினராக இருந்து பிற இனத்தவரை மணந்து கொண்டால் நிதியுதவி வழங்கப்படும்.
HI-M:இநதபபடடரகளககக மனனரகளகககபபடடவரகளகககபபடடவரகளககக வணடம.

EN:Congress government
HI-T:காங்கிரஸ் அரசின் பலம்
HI-M:அரசயல கஙகரஸ

EN:Vacancy: Manager
HI-T:பணியிடம்: சண்டீகர்
HI-M:வணடம

EN:In-Principle approval given for Law Amendments during 31stMeeting of the GST Council
HI-T:ஜிஎஸ்டி குழுமத்தின் 31-வது கூட்டத்தின் போது சட்டத்திருத்தங்களுக்கு கொள்கை அளவில் ஒப்புதல்
HI-M:சடடதததறகக சடடததறககக சடடமனறம, சடடமனறம சடடமனறம


--- FINAL Report Test Exs ---
Gen 5 RepTs exs...


GenRepTsEx:   0%|          | 0/5 [00:00<?, ?it/s]

EN:I listen to everything which she says.
HI-T:அவள் என்ன சொல்கிறாள் என்று அனைவரும் கேட்போம்.
HI-M:எனககக நன நனம எனற தரவததர.

EN:Marker levels are not reported for this printer.
HI-T:இந்த அச்சடிப்பிக்கு மார்க்கர் நிலைகள் அறிக்கையிடப்படவில்லை.
HI-M:இதல, இநதபபடடளளத.

EN:There are 15 countries.
HI-T:இதில் 15 நாடுகள் இடம் பெற்றுள்ளன.
HI-M:இதல 20 நடகளல இரககறத.

EN:She takes over as the Madras HC chief justice after incumbent Indira Banerjee was elevated as a judge of the Supreme Court.
HI-T:உயர்நீதிமன்ற தலைமை நீதிபதியாக இருந்த இந்திரா பானர்ஜி உச்சநீதிமன்ற நீதிபதியாக பதவி உயர்வு பெற்றுள்ளார். இதைத்தொடர்ந்து மும்பை உயர்நீதிமன்ற பொறுப்பு தலைமை நீதிபதியான தஹில் ரமாணி, சென்னை உயர்நீதிமன்ற தலைமை நீதிபதியாக நியமிக்கப்பட்டுள்ளார்.
HI-M:இதல, ப. க. க. க. க. க. க. க. க. க. க. க. க. க. க. க. க. க. க. க. க. க. க. க. க. க. க. க. க. க. க

EN:We are following the government order.
HI-T:அரசின் உத்தரவுக்கு கட்டுப்பட்டு இருக்கிறோம்.
HI-M:இதறகக அரசஙகக அரசஙகக வணடம.


Final report saved: /kaggle/working/nmt_exp_FI