# Dataloader


In [1]:
!pip install h5py
!pip install tensorboardX
import json
import h5py
import os
import time
import torch.optim as optim
from torch.utils.data import Subset
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter


Collecting tensorboardX
  Downloading tensorboardx-2.6.4-py3-none-any.whl.metadata (6.2 kB)
Downloading tensorboardx-2.6.4-py3-none-any.whl (87 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboardX
Successfully installed tensorboardX-2.6.4


In [2]:
# %%
# 1) Imports
import json
import h5py
import torch
import torch.utils.data as data

# %%
# 2) Dataset Class
class Dataloader(data.Dataset):
    def __init__(self, input_json_file_path, input_ques_h5_path):
        super(Dataloader, self).__init__()
        
        print('Reading', input_json_file_path)
        # Load JSON vocab map
        with open(input_json_file_path) as input_file:
            data_dict = json.load(input_file)
        
        # Build ix_to_word (int → str)
        self.ix_to_word = {}
        for k, w in data_dict['ix_to_word'].items():
            self.ix_to_word[int(k)] = w
        
        # Add special tokens
        self.UNK_token = 0
        if 0 not in self.ix_to_word:
            self.ix_to_word[0] = '<UNK>'
        else:
            raise Exception("Index 0 already exists in ix_to_word")
        
        self.EOS_token = len(self.ix_to_word)
        self.ix_to_word[self.EOS_token] = '<EOS>'
        self.PAD_token = len(self.ix_to_word)
        self.ix_to_word[self.PAD_token] = '<PAD>'
        self.SOS_token = len(self.ix_to_word)
        self.ix_to_word[self.SOS_token] = '<SOS>'
        self.vocab_size = len(self.ix_to_word)
        
        # Load HDF5
        print('Loading HDF5 from', input_ques_h5_path)
        qa_data = h5py.File(input_ques_h5_path, 'r')
        
        # Train split
        ques_train_raw = torch.from_numpy(qa_data['ques_train'][...].astype('int64'))
        ques_len_train = torch.from_numpy(qa_data['ques_length_train'][...].astype('int64'))
        label_train_raw = torch.from_numpy(qa_data['ques1_train'][...].astype('int64'))
        label_len_train = torch.from_numpy(qa_data['ques1_length_train'][...].astype('int64'))
        self.train_id = torch.from_numpy(qa_data['ques_cap_id_train'][...].astype('int64'))
        print('Training samples:', ques_train_raw.size(0))
        
        # Test split
        ques_test_raw = torch.from_numpy(qa_data['ques_test'][...].astype('int64'))
        ques_len_test = torch.from_numpy(qa_data['ques_length_test'][...].astype('int64'))
        label_test_raw = torch.from_numpy(qa_data['ques1_test'][...].astype('int64'))
        label_len_test = torch.from_numpy(qa_data['ques1_length_test'][...].astype('int64'))
        self.test_id = torch.from_numpy(qa_data['ques_cap_id_test'][...].astype('int64'))
        print('Test samples:', ques_test_raw.size(0))
        
        qa_data.close()
        
        # Process + add SOS/EOS/PAD
        q_tr, ql_tr = self.process_data(ques_train_raw, ques_len_train)
        l_tr, ll_tr = self.process_data(label_train_raw, label_len_train)
        q_ts, ql_ts = self.process_data(ques_test_raw, ques_len_test)
        l_ts, ll_ts = self.process_data(label_test_raw, label_len_test)
        
        # Concatenate splits
        self.ques = torch.cat([q_tr, q_ts], dim=0)
        self.len = torch.cat([ql_tr, ql_ts], dim=0)
        self.label = torch.cat([l_tr, l_ts], dim=0)
        self.label_len = torch.cat([ll_tr, ll_ts], dim=0)
        self.id = torch.cat([self.train_id, self.test_id], dim=0)
        
        # Verify and truncate to ensure consistent sizes
        print('ques shape:', self.ques.shape)
        print('label shape:', self.label.shape)
        print('len shape:', self.len.shape)
        print('label_len shape:', self.label_len.shape)
        print('id shape:', self.id.shape)
        min_size = min(self.ques.size(0), self.label.size(0), self.len.size(0), 
                      self.label_len.size(0), self.id.size(0))
        if min_size < self.ques.size(0):
            print(f"Truncating to {min_size} samples to ensure consistency")
            self.ques = self.ques[:min_size]
            self.label = self.label[:min_size]
            self.len = self.len[:min_size]
            self.label_len = self.label_len[:min_size]
            self.id = self.id[:min_size]
        
        self.seq_length = self.ques.size(1)
        # Verify sequence length consistency
        if self.ques.size(1) != self.label.size(1):
            raise ValueError(f"Sequence length mismatch: ques={self.ques.size(1)}, label={self.label.size(1)}")
    
    def process_data(self, data, data_len):
        N, L = data.size()
        new_L = L + 2  # for SOS and EOS
        new_data = torch.full((N, new_L), fill_value=self.PAD_token, dtype=torch.long)
        new_len = data_len.clone() + 2
        for i in range(N):
            l = data_len[i]
            # SOS
            new_data[i, 0] = self.SOS_token
            # Copy tokens
            new_data[i, 1:1+l] = data[i, :l]
            # EOS
            new_data[i, 1+l] = self.EOS_token
        return new_data, new_len
    
    def __len__(self):
        return self.len.size(0)
    
    def __getitem__(self, idx):
        return (
            self.ques[idx],       # LongTensor [seq_len]
            self.len[idx],        # LongTensor [1]
            self.label[idx],      # LongTensor [seq_len]
            self.label_len[idx],  # LongTensor [1]
            self.id[idx]          # LongTensor [1]
        )
    
    def getVocabSize(self):
        return self.vocab_size
    
    def getSeqLength(self):
        return self.seq_length

# %%
# 3) Instantiate & wrap in DataLoader
JSON_PATH = '/kaggle/input/qqp-processed/quora_data_prepro.json'
H5_PATH = '/kaggle/input/qqp-processed/quora_data_prepro.h5'

dataset = Dataloader(JSON_PATH, H5_PATH)
loader = data.DataLoader(dataset, batch_size=64, shuffle=True, num_workers=2, drop_last=True)

# %%
# 4) Quick sanity check
batch = next(iter(loader))
ques, ques_len, label, label_len, ids = batch
print('ques:', ques.shape)
print('ques_len:', ques_len.shape)
print('label:', label.shape)
print('label_len:', label_len.shape)
print('ids:', ids.shape)
print('vocab size:', dataset.getVocabSize())
print('seq length:', dataset.getSeqLength())

Reading /kaggle/input/qqp-processed/quora_data_prepro.json
Loading HDF5 from /kaggle/input/qqp-processed/quora_data_prepro.h5
Training samples: 100000
Test samples: 30000
ques shape: torch.Size([130000, 28])
label shape: torch.Size([130000, 28])
len shape: torch.Size([130000])
label_len shape: torch.Size([130000])
id shape: torch.Size([130000])
ques: torch.Size([64, 28])
ques_len: torch.Size([64])
label: torch.Size([64, 28])
label_len: torch.Size([64])
ids: torch.Size([64])
vocab size: 27699
seq length: 28


In [3]:
import h5py
with h5py.File(H5_PATH, 'r') as f:
    print(f['ques_train'].shape, f['ques1_train'].shape)
    print(f['ques_test'].shape, f['ques1_test'].shape)

(100000, 26) (100000, 26)
(30000, 26) (30000, 26)


In [4]:
# %% [markdown]
# ## 1) Utils: one_hot, prob2pred, decode_sequence, JointEmbeddingLoss

# %%
import torch

def one_hot(t, c):
    """
    t: LongTensor of shape [seq_len, batch_size]
    c: vocab size
    returns: FloatTensor [seq_len, batch_size, c]
    """
    return torch.zeros(*t.size(), c, device=t.device) \
                .scatter_(-1, t.unsqueeze(-1), 1.0)

def prob2pred(prob):
    """
    prob: Log‐prob Tensor [seq_len, batch]
    returns: LongTensor sampled indices [seq_len, batch]
    """
    # flatten to [seq_len*batch, vocab], sample, then reshape
    v = torch.exp(prob.view(-1, prob.size(-1)))
    samp = torch.multinomial(v, 1)
    return samp.view(prob.size(0), prob.size(1))

def decode_sequence(ix_to_word, seq):
    """
    ix_to_word: dict { idx: token_str }
    seq: LongTensor [batch, seq_len]
    returns: list of decoded strings (batch)
    """
    decoded = []
    B, L = seq.size()
    for i in range(B):
        words = []
        for j in range(L):
            ix = int(seq[i, j].item())
            w = ix_to_word.get(ix, '<UNK>')
            if w == '<EOS>':
                words.append(w)
                break
            if w not in ('<PAD>', '<SOS>'):
                words.append(w)
        decoded.append(" ".join(words))
    return decoded

def JointEmbeddingLoss(emb1, emb2, margin=1.0):
    """
    emb1, emb2: FloatTensor [batch, emb_dim]
    returns: scalar margin loss averaged over batch^2 pairs
    """
    # similarity matrix
    sims = emb1 @ emb2.t()          # [B, B]
    pos  = sims.diag().unsqueeze(1) # [B,1]
    # margin loss
    raw  = sims - pos + margin     # [B, B]
    raw.fill_diagonal_(0)           # zero out i==j
    return torch.clamp(raw, min=0).sum() / (emb1.size(0)**2)


In [5]:
# %% [markdown]
# ## 2) Model: ParaphraseGenerator (combines encoder, generator, discriminator branches)

# %%
import torch
import torch.nn as nn

class ParaphraseGenerator(nn.Module):
    def __init__(self, op):
        super().__init__()
        # shared encoder/discriminator
        self.emb_layer = nn.Sequential(
            nn.Linear(op['vocab_sz'], op['emb_hid_dim']),
            nn.Threshold(1e-6, 0),
            nn.Linear(op['emb_hid_dim'], op['emb_dim']),
            nn.Threshold(1e-6, 0)
        )
        self.enc_rnn = nn.GRU(op['emb_dim'], op['enc_rnn_dim'])
        self.enc_lin = nn.Sequential(
            nn.Dropout(op['enc_dropout']),
            nn.Linear(op['enc_rnn_dim'], op['enc_dim'])
        )
        # generator
        self.gen_emb = nn.Embedding(op['vocab_sz'], op['emb_dim'])
        self.gen_rnn = nn.LSTM(op['enc_dim'], op['gen_rnn_dim'])
        self.gen_lin = nn.Sequential(
            nn.Dropout(op['gen_dropout']),
            nn.Linear(op['gen_rnn_dim'], op['vocab_sz']),
            nn.LogSoftmax(dim=-1)
        )
        self.max_seq_len = op['max_seq_len']
        self.vocab_sz    = op['vocab_sz']

    def forward(self, phrase, sim_phrase=None, train=False):
        # phrase & sim_phrase: [T, B]
        if sim_phrase is None:
            sim_phrase = phrase
        # encode
        one_in = one_hot(phrase, self.vocab_sz)
        _, h = self.enc_rnn(self.emb_layer(one_in))
        enc = self.enc_lin(h)         # [1, B, enc_dim]
        # generate
        if train:
            emb_tgt = self.gen_emb(sim_phrase)
            inp_seq = torch.cat([enc, emb_tgt[:-1]], dim=0)
            out_rnn, _ = self.gen_rnn(inp_seq)
            out = self.gen_lin(out_rnn)
            # embeddings for loss
            emb_gt  = self.enc_lin(self.enc_rnn(self.emb_layer(one_hot(sim_phrase, self.vocab_sz)))[1])
            emb_gen = self.enc_lin(self.enc_rnn(self.emb_layer(torch.exp(out)))[1])
        else:
            words, h0 = [], None
            inp = enc
            for _ in range(self.max_seq_len):
                out_rnn, h0 = self.gen_rnn(inp, hx=h0)
                logits = self.gen_lin(out_rnn)
                words.append(logits)
                sampled = prob2pred(logits)
                inp = self.gen_emb(sampled)
            out = torch.cat(words, dim=0)
            emb_gt  = self.enc_lin(self.enc_rnn(self.emb_layer(one_hot(sim_phrase, self.vocab_sz)))[1])
            emb_gen = self.enc_lin(self.enc_rnn(self.emb_layer(torch.exp(out)))[1])
        return out, emb_gen.squeeze(0), emb_gt.squeeze(0)

In [6]:
# token-level cross-entropy
ce_loss = nn.CrossEntropyLoss(ignore_index=dataset.PAD_token)

# pair-wise discriminator margin loss
def JointEmbeddingLoss(emb1, emb2, margin=1.0):
    sims = emb1 @ emb2.t()
    pos  = sims.diag().unsqueeze(1)
    raw  = sims - pos + margin
    raw.fill_diagonal_(0)
    return torch.clamp(raw, min=0).sum() / (emb1.size(0)**2)

In [7]:
import math
from collections import defaultdict
import numpy as np

def vec_norm(seq, n=4):
    tokens = seq.split()
    ngrams = [{} for _ in range(n)]
    norms = [0.0] * n
    for i in range(n):
        for j in range(len(tokens) - i):
            ngram = tuple(tokens[j:j+i+1])
            ngrams[i][ngram] = ngrams[i].get(ngram, 0) + 1
            norms[i] += ngrams[i][ngram]
        norms[i] = norms[i] ** 0.5 if norms[i] > 0 else 1.0
    return ngrams, norms

def get_ngrams(seq, n):
    tokens = seq.split()
    return Counter(tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1))

# BLEU
from collections import Counter
import math

def get_ngrams(seq, n):
    """
    Extract n-grams from a sequence of tokens.
    
    Args:
        seq (str): Space-separated sequence of tokens.
        n (int): N-gram order.
    
    Returns:
        Counter: Dictionary of n-grams and their counts.
    """
    tokens = seq.split()
    return Counter(tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1))

class BleuScorer:
    def __init__(self, n=4):
        self.n = n
        self.hyp = []
        self.refs = []
    
    def add_instance(self, hyp, refs):
        """
        Add a hypothesis and its reference(s).
        
        Args:
            hyp (str): Hypothesis sequence.
            refs (list): List of reference sequences.
        """
        self.hyp.append(hyp)
        self.refs.append(refs)
    
    def compute_score(self):
        """
        Compute BLEU scores for n-grams (1 to n).
        
        Returns:
            scores (list): BLEU scores for each n-gram order.
            None: Placeholder for compatibility.
        """
        scores = []
        for n in range(1, self.n + 1):
            total_precision = 0.0
            total_brevity_penalty = 0.0
            count = 0
            
            for hyp, refs in zip(self.hyp, self.refs):
                hyp_ngrams = get_ngrams(hyp, n)
                hyp_len = max(len(hyp.split()) - n + 1, 1)
                
                # Compute clipped counts
                ref_ngrams_list = [get_ngrams(ref, n) for ref in refs]
                ref_counts = Counter()
                for ref_ngrams in ref_ngrams_list:
                    for ng in ref_ngrams:
                        ref_counts[ng] = max(ref_counts[ng], ref_ngrams[ng])
                
                clipped_count = sum(min(hyp_ngrams[ng], ref_counts.get(ng, 0)) for ng in hyp_ngrams)
                precision = clipped_count / hyp_len if hyp_len > 0 else 0.0
                
                # Brevity penalty
                hyp_words = len(hyp.split())
                ref_lengths = [len(ref.split()) for ref in refs]
                closest_ref_len = min(ref_lengths, key=lambda x: abs(x - hyp_words))
                brevity_penalty = 1.0 if hyp_words >= closest_ref_len else math.exp(1 - closest_ref_len / hyp_words if hyp_words > 0 else 0)
                
                total_precision += precision
                total_brevity_penalty += brevity_penalty
                count += 1
            
            avg_precision = total_precision / count if count > 0 else 0.0
            avg_brevity_penalty = total_brevity_penalty / count if count > 0 else 1.0
            bleu_n = avg_precision * avg_brevity_penalty if avg_precision > 0 else 0.0
            scores.append(bleu_n)
        
        return scores, None

class Bleu:
    def __init__(self, n=4):
        self.n = n
        self.scorer = BleuScorer(n=n)
    
    def compute_score(self, gts, res):
        """
        Compute BLEU score for ground truth and predicted sequences.
        
        Args:
            gts (dict): {id: [ref_str, ...]} mapping IDs to lists of reference strings.
            res (dict): {id: [hyp_str]} mapping IDs to lists of hypothesis strings.
        
        Returns:
            scores (list): BLEU scores for n-grams (1 to n).
            None: Placeholder for compatibility.
        """
        for id in gts:
            self.scorer.add_instance(res[id][0], gts[id])
        return self.scorer.compute_score()

# CIDEr
def vec_norm(seq, n=4):
    """
    Convert a sequence (string) into a list of n-gram count dictionaries and their norms.
    
    Args:
        seq (str): Space-separated sequence of tokens (e.g., "how to learn python").
        n (int): Maximum n-gram order (e.g., 4 for 1- to 4-grams).
    
    Returns:
        ngrams (list): List of dictionaries, where ngrams[i] maps (i+1)-grams to counts.
        norms (list): List of norms for each n-gram order.
    """
    # Split string into tokens
    tokens = seq.split()
    ngrams = [{} for _ in range(n)]
    norms = [0.0] * n
    
    # Generate n-grams for each order (1 to n)
    for i in range(n):
        for j in range(len(tokens) - i):
            ngram = tuple(tokens[j:j+i+1])
            ngrams[i][ngram] = ngrams[i].get(ngram, 0) + 1
            norms[i] += ngrams[i][ngram]
        norms[i] = norms[i] ** 0.5 if norms[i] > 0 else 1.0
    
    return ngrams, norms

class CiderScorer:
    def __init__(self, n=4):
        self.n = n
        self.hyp = []
        self.refs = []
    
    def add_instance(self, hyp, refs):
        """
        Add a hypothesis and its reference(s) to the scorer.
        
        Args:
            hyp (str): Hypothesis sequence (e.g., "how to study python").
            refs (list): List of reference sequences (e.g., ["how to learn python"]).
        """
        self.hyp.append(hyp)
        self.refs.append(refs)
    
    def compute_score(self):
        """
        Compute CIDEr scores for all n-grams up to self.n.
        
        Returns:
            scores (list): CIDEr scores for n-grams (1 to n).
            None: Placeholder for compatibility.
        """
        scores = []
        for i in range(self.n):
            num = 0.0
            for hyp, refs in zip(self.hyp, self.refs):
                vh, nh = vec_norm(hyp, i+1)
                for ref in refs:
                    vr, nr = vec_norm(ref, i+1)
                    num += sum(min(vh[i].get(ng, 0), vr[i].get(ng, 0)) * vr[i].get(ng, 0)
                               for ng in set(vh[i]) | set(vr[i]))
                    den = nh[i] * nr[i] if nh[i] and nr[i] else 1.0
                    num /= den
            scores.append(num / len(self.hyp) if self.hyp else 0.0)
        return scores, None

class Cider:
    def __init__(self, n=4):
        self.n = n
        self.scorer = CiderScorer(n=n)
    
    def compute_score(self, gts, res):
        """
        Compute CIDEr score for ground truth (gts) and predicted (res) sequences.
        
        Args:
            gts (dict): {id: [ref_str, ...]} mapping IDs to lists of reference strings.
            res (dict): {id: [hyp_str]} mapping IDs to lists of hypothesis strings.
        
        Returns:
            score (float): Aggregated CIDEr score.
            scores (list): CIDEr scores for each n-gram order.
        """
        for id in gts:
            self.scorer.add_instance(res[id][0], gts[id])
        scores, _ = self.scorer.compute_score()
        return sum(scores) / len(scores) if scores else 0.0, scores
  

# ROUGE-L
class Rouge:
    def __init__(self,beta=1.2): self.beta=beta
    def _lcs(self,x,y):
        m,n=len(x),len(y); dp=[[0]*(n+1) for _ in range(m+1)]
        for i in range(m):
            for j in range(n):
                dp[i+1][j+1] = dp[i][j]+1 if x[i]==y[j] else max(dp[i][j+1],dp[i+1][j])
        return dp[m][n]
    def compute_score(self,gts,res):
        scores=[]
        for id in gts:
            h=res[id][0].split(); refs=[r.split() for r in gts[id]]
            prec=[self._lcs(r,h)/len(h) for r in refs]
            rec=[self._lcs(r,h)/len(r) for r in refs]
            pmax, rmax = max(prec), max(rec)
            scores.append((1+self.beta**2)*pmax*rmax/(rmax+self.beta**2*pmax) if pmax and rmax else 0.0)
        return float(np.mean(scores)), np.array(scores)
class RougeL(Rouge): pass

In [8]:
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Subset, DataLoader
from torch.utils.tensorboard import SummaryWriter

JSON_PATH = '/kaggle/input/qqp-processed/quora_data_prepro.json'
H5_PATH = '/kaggle/input/qqp-processed/quora_data_prepro.h5'
LOG_DIR = '/kaggle/working/logs'
SAVE_DIR = '/kaggle/working/save'
SAMPLE_DIR = '/kaggle/working/samples'

for d in (LOG_DIR, SAVE_DIR, SAMPLE_DIR): os.makedirs(d, exist_ok=True)

TIME = time.strftime("%Y%m%d_%H%M%S")
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparams
BATCH_SIZE = 64
NUM_EPOCHS = 5
TRAIN_SIZE = 100000
VAL_SIZE = 30000
LR = 1e-3

# Data
dataset = Dataloader(JSON_PATH, H5_PATH)
train_ds = Subset(dataset, range(TRAIN_SIZE))
val_ds = Subset(dataset, range(TRAIN_SIZE, TRAIN_SIZE+VAL_SIZE))
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, drop_last=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, drop_last=True)

# Model & optimizer
op = {"vocab_sz": dataset.vocab_size,
      "emb_hid_dim": 256, "emb_dim": 512,
      "enc_rnn_dim": 512, "enc_dim": 512, "enc_dropout": 0.5,
      "gen_rnn_dim": 512, "gen_dropout": 0.5,
      "max_seq_len": dataset.len.max().item()}
model = ParaphraseGenerator(op).to(DEVICE)
optimizer = optim.RMSprop(model.parameters(), lr=LR)
ce_loss = nn.CrossEntropyLoss(ignore_index=dataset.PAD_token)

# Logger
writer = SummaryWriter(os.path.join(LOG_DIR, TIME))

def dump_samples(ins, gts, preds, fname):
    with open(fname, 'w') as f:
        for i, g, p in zip(ins, gts, preds):
            f.write(f"IN : {i}\nGT : {g}\nPR : {p}\n---\n")

# Training
for epoch in range(1, NUM_EPOCHS+1):
    # Train
    model.train()
    sum_l1 = 0
    sum_l2 = 0
    cnt = 0
    all_in, all_gt, all_pr = [], [], []
    for inp, il, lbl, ll, ids in train_loader:
        inp, lbl = inp.t().to(DEVICE), lbl.t().to(DEVICE)
        optimizer.zero_grad()
        out, eg, et = model(inp, sim_phrase=lbl, train=True)
        l1 = ce_loss(out.permute(1,2,0), lbl.t())
        l2 = JointEmbeddingLoss(eg, et)
        (l1+l2).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        sum_l1 += l1.item()
        sum_l2 += l2.item()
        cnt += 1
        all_in += decode_sequence(dataset.ix_to_word, inp.t().cpu())
        all_gt += decode_sequence(dataset.ix_to_word, lbl.t().cpu())
        all_pr += decode_sequence(dataset.ix_to_word, torch.argmax(out, dim=-1).t().cpu())
    writer.add_scalar('L1/train', sum_l1/cnt, epoch)
    writer.add_scalar('L2/train', sum_l2/cnt, epoch)
    dump_samples(all_in[:5], all_gt[:5], all_pr[:5], f"{SAMPLE_DIR}/{TIME}_train{epoch}.txt")

    # Validation
    model.eval()
    v1 = 0
    v2 = 0
    vc = 0
    vin, vgt, vpr = [], [], []
    with torch.no_grad():
        for inp, il, lbl, ll, ids in val_loader:
            inp, lbl = inp.t().to(DEVICE), lbl.t().to(DEVICE)
            out, eg, et = model(inp, sim_phrase=lbl, train=False)
            l1 = ce_loss(out.permute(1,2,0), lbl.t())
            l2 = JointEmbeddingLoss(eg, et)
            v1 += l1.item()
            v2 += l2.item()
            vc += 1
            vin += decode_sequence(dataset.ix_to_word, inp.t().cpu())
            vgt += decode_sequence(dataset.ix_to_word, lbl.t().cpu())
            vpr += decode_sequence(dataset.ix_to_word, torch.argmax(out, dim=-1).t().cpu())
    writer.add_scalar('L1/val', v1/vc, epoch)
    writer.add_scalar('L2/val', v2/vc, epoch)

    # Metrics calculation
    bleu = Bleu(n=4)
    cider = Cider(n=4)
    rouge = RougeL()
    gts = {i: [gt] for i, gt in enumerate(vgt)}
    res = {i: [pr] for i, pr in enumerate(vpr)}
    gts = {i: [gt if isinstance(gt, str) else ' '.join(gt)] for i, gt in gts.items()}
    res = {i: [pr if isinstance(pr, str) else ' '.join(pr)] for i, pr in res.items()}
    bleu_score, _ = bleu.compute_score(gts, res)
    cider_score, _ = cider.compute_score(gts, res)
    rouge_score = rouge.compute_score(gts, res)[0]
    writer.add_scalar('BLEU/val', bleu_score[-1], epoch)
    writer.add_scalar('CIDEr/val', cider_score, epoch)
    writer.add_scalar('ROUGE-L/val', rouge_score, epoch)
    dump_samples(vin[:5], vgt[:5], vpr[:5], f"{SAMPLE_DIR}/{TIME}_val{epoch}.txt")

    # Checkpoint
    os.makedirs(f"{SAVE_DIR}/{TIME}", exist_ok=True)
    torch.save({'epoch': epoch, 'model': model.state_dict(), 'opt': optimizer.state_dict()},
               f"{SAVE_DIR}/{TIME}/epoch{epoch}.pt")
    print(f"Epoch {epoch} | Train L1={sum_l1/cnt:.4f} L2={sum_l2/cnt:.4f} | Val L1={v1/vc:.4f} L2={v2/vc:.4f} | BLEU-4={bleu_score[-1]:.4f} CIDEr={cider_score:.4f} ROUGE-L={rouge_score:.4f}")

2025-08-01 15:56:14.048622: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754063774.242245      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754063774.305983      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Reading /kaggle/input/qqp-processed/quora_data_prepro.json
Loading HDF5 from /kaggle/input/qqp-processed/quora_data_prepro.h5
Training samples: 100000
Test samples: 30000
ques shape: torch.Size([130000, 28])
label shape: torch.Size([130000, 28])
len shape: torch.Size([130000])
label_len shape: torch.Size([130000])
id shape: torch.Size([130000])
Epoch 1 | Train L1=3.5065 L2=0.1389 | Val L1=6.9555 L2=4.1730 | BLEU-4=0.0009 CIDEr=0.0000 ROUGE-L=0.2781
Epoch 2 | Train L1=2.9125 L2=0.0244 | Val L1=6.9303 L2=5.5830 | BLEU-4=0.0021 CIDEr=0.0000 ROUGE-L=0.3184
Epoch 3 | Train L1=2.6513 L2=0.0147 | Val L1=7.0239 L2=5.6058 | BLEU-4=0.0043 CIDEr=0.0000 ROUGE-L=0.3337
Epoch 4 | Train L1=2.4657 L2=0.0108 | Val L1=7.0324 L2=6.0703 | BLEU-4=0.0084 CIDEr=0.0000 ROUGE-L=0.3589
Epoch 5 | Train L1=2.3307 L2=0.0085 | Val L1=7.0412 L2=6.1699 | BLEU-4=0.0116 CIDEr=0.0000 ROUGE-L=0.3721
