In [1]:
# import random
# import torch
# from datasets import load_dataset
# from transformers import (
#     AutoTokenizer,
#     AutoModelForSeq2SeqLM,
#     DataCollatorForSeq2Seq,
#     Seq2SeqTrainer,
#     Seq2SeqTrainingArguments,
# )

# # -- Configuration --
# MODEL_NAME = "t5-small"
# MASK_TOKEN = "<extra_id_0>"  # T5 uses <extra_id_X> for masking
# MASK_RATIO = 0.3  # 30% tokens in the summary will be masked
# MAX_INPUT_LEN = 512
# MAX_TARGET_LEN = 128
# BATCH_SIZE = 4
# EPOCHS = 3
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# # -- Load model and tokenizer --
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)

# # -- Load dataset --
# dataset = load_dataset("xsum", split="train[:1000]")  # small subset for testing

# # -- Masking function --
# def mask_summary_tokens(summary):
#     words = summary.split()
#     if len(words) < 3:
#         return summary, summary
#     num_to_mask = max(1, int(len(words) * MASK_RATIO))
#     mask_indices = sorted(random.sample(range(len(words)), num_to_mask))
#     masked_summary = words.copy()
#     for idx in mask_indices:
#         masked_summary[idx] = MASK_TOKEN
#     return " ".join(masked_summary), summary

# # -- Preprocessing function --
# def preprocess(example):
#     masked_summary, original_summary = mask_summary_tokens(example["summary"])
#     prompt = f"summarize: {example['document']}"
#     return tokenizer(
#         prompt,
#         text_target=original_summary,
#         padding="max_length",
#         truncation=True,
#         max_length=MAX_INPUT_LEN,
#     )

# # -- Tokenize dataset --
# tokenized_dataset = dataset.map(preprocess, remove_columns=dataset.column_names, batched=False)

# # -- Data collator --
# data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# # -- Training args --
# training_args = Seq2SeqTrainingArguments(
#     output_dir="./t5-summarization-lld",
#     save_strategy="epoch",
#     learning_rate=5e-5,
#     per_device_train_batch_size=BATCH_SIZE,
#     num_train_epochs=EPOCHS,
#     weight_decay=0.01,
#     fp16=True if DEVICE == "cuda" else False,
#     logging_steps=10,
#     report_to="none",
# )

# # -- Trainer --
# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset,
#     tokenizer=tokenizer,
#     data_collator=data_collator,
# )

# # -- Start training --
# trainer.train()


In [2]:
# import torch
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# import numpy as np

# # -- Config --
# MODEL_NAME = "t5-small"
# NUM_STEPS = 10
# MASK_TOKEN = "<extra_id_0>"
# MAX_LEN = 4096
# TOP_K_REMASK = 0.3  # fraction of tokens to remask each step
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# # -- Load model & tokenizer --
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)
# model.eval()

# # -- Prepare masked input --
# def initialize_response(prompt, mask_token=MASK_TOKEN, max_response_tokens=20):
#     return f"{prompt} {mask_token}"

# # -- Token-wise confidence estimation --
# def get_token_confidences(logits, token_ids):
#     probs = torch.softmax(logits, dim=-1)
#     token_probs = torch.gather(probs, 2, token_ids.unsqueeze(-1)).squeeze(-1)
#     return token_probs

# # -- LLaDA-style iterative generation --
# def llda_generate(prompt_text, num_steps=NUM_STEPS, max_tokens=200):
#     input_text = initialize_response(prompt_text, max_response_tokens=max_tokens)
#     for step in range(num_steps):
#         inputs = tokenizer([input_text], return_tensors="pt", padding=True).to(DEVICE)

#         # Generate logits for each position
#         with torch.no_grad():
#             outputs = model.generate(
#                 **inputs,
#                 max_new_tokens=max_tokens,
#                 return_dict_in_generate=True,
#                 output_scores=True,
#             )

#         sequences = outputs.sequences  # shape: (1, seq_len)
#         decoded = tokenizer.decode(sequences[0], skip_special_tokens=True)
#         print(f"Step {step+1}:\n{decoded}\n")

#         # Estimate confidence for each generated token
#         if step < num_steps - 1:
#             scores = outputs.scores  # List[Tensor], each of shape (1, vocab_size)
#             token_ids = sequences[:, -len(scores):]
#             scores_tensor = torch.stack(scores, dim=1)  # (1, seq_len, vocab_size)
#             confidences = get_token_confidences(scores_tensor, token_ids)

#             # Decide which tokens to remask
#             confidences_np = confidences.squeeze(0).cpu().numpy()
#             num_remask = max(1, int(len(confidences_np) * TOP_K_REMASK))
#             remask_indices = np.argsort(confidences_np)[:num_remask]

#             # Reconstruct new input text with remasked tokens
#             decoded_tokens = tokenizer.convert_ids_to_tokens(token_ids[0])
#             for idx in remask_indices:
#                 decoded_tokens[idx] = MASK_TOKEN

#             remasked_text = " ".join(decoded_tokens)
#             input_text = f"{prompt_text} {remasked_text}"

#     return decoded

# # -- Example --
# prompt = "summarize: SCIENTISTS HAVE LEARNED TO SUPPLEMENT THE SENSE OF SIGHT IN NUMEROUS WAYS. In front of the tiny pupil of the eye they put, on Mount Palomar, a great monocle 200 inches in diameter, and with it see 2000 times farther into the depths of space. Or they look through a small pair of lenses arranged as a microscope into a drop of water or blood, and magnify by as much as 2000 diameters the living creatures there, many of which are among man’s most dangerous enemies. Or, if we want to see distant happenings on earth, they use some of the previously wasted electromagnetic waves to carry television images which they re-create as light by whipping tiny crystals on a screen with electrons in a vacuum. Or they can bring happenings of long ago and far away as colored motion pictures, by arranging silver atoms and color-absorbing molecules to force light waves into the patterns of original reality. Or if we want to see into the center of a steel casting or the chest of an injured child, they send the information on a beam of penetrating short-wave X rays, and then convert it back into images we can see on a screen or photograph. THUS ALMOST EVERY TYPE OF ELECTROMAGNETIC RADIATION YET DISCOVERED HAS BEEN USED TO EXTEND OUR SENSE OF SIGHT IN SOME WAY."
# final_output = llda_generate(prompt)
# print("Final Output:", final_output)


FRFR


In [3]:
# !pip install -q transformers

In [4]:
import pandas as pd
import torch
import random
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.amp import autocast, GradScaler  # Added for mixed precision
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Custom Dataset class for ThaiSum
class ThaiSumDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_input_length=512, max_target_length=128):
        self.texts = dataframe['body'].tolist()
        self.summaries = dataframe['summary'].tolist()
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        summary = str(self.summaries[idx])

        # Tokenize input (body)
        input_encoding = self.tokenizer(
            text,
            max_length=self.max_input_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Tokenize target (summary)
        target_encoding = self.tokenizer(
            summary,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'labels': target_encoding['input_ids'].squeeze()
        }

# q_t_given_0 function from LLADA
def q_t_given_0(input_ids, mask_token_id, t, N, tokenizer):
    """
    Remask tokens according to q_{t|0}: mask each token with probability s = t/N.
    """
    s = t / N
    special_tokens_mask = (input_ids == tokenizer.pad_token_id)  # Only protect pad tokens
    rand_mask = torch.bernoulli(torch.full(input_ids.shape, s)).bool().to(input_ids.device)
    mask_positions = rand_mask & ~special_tokens_mask

    masked_input = input_ids.clone()
    masked_input[mask_positions] = mask_token_id
    return masked_input, mask_positions

# Training step with LLADA diffusion and mixed precision
def training_step(model, tokenizer, batch, N, scaler):
    input_ids = batch['input_ids'].to(model.device)
    attention_mask = batch['attention_mask'].to(model.device)
    labels = batch['labels'].to(model.device)

    # Apply LLADA diffusion to input
    t = random.randint(1, N)
    masked_input_ids, _ = q_t_given_0(input_ids, tokenizer.pad_token_id, t, N, tokenizer)

    # Mixed precision forward pass
    with autocast(device_type='cuda'):
        outputs = model(
            input_ids=masked_input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss

    # Scale loss and backpropagate
    scaler.scale(loss).backward()
    return loss

# Training loop with mixed precision
def train_summarization_llada(model, tokenizer, dataset, optimizer, epochs=3, N=10, batch_size=8):
    model.train()
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    scaler = GradScaler()  # Initialize GradScaler for mixed precision

    for epoch in range(epochs):
        total_loss = 0.0
        for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
            optimizer.zero_grad()
            loss = training_step(model, tokenizer, batch, N, scaler)
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()
        avg_loss = total_loss / len(dataloader)
        print(f"[Epoch {epoch+1}] Avg Loss: {avg_loss:.4f}")

def main():
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load dataset
    df = pd.read_csv('/kaggle/input/thaisum-train-10000-1024-nlpfinal/train-10000-1024.csv')

    # Initialize tokenizer and model
    model_name = 'google/mt5-base'  # ~580M parameters, supports Thai
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

    # Prepare dataset
    dataset = ThaiSumDataset(df, tokenizer, max_input_length=1024, max_target_length=256)

    # Set hyperparameters
    epochs = 3
    batch_size = 2  # Kept for Kaggle compatibility
    N = 100  # For q_t_given_0
    learning_rate = 2e-5

    # Initialize optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Train the model
    train_summarization_llada(model, tokenizer, dataset, optimizer, epochs=epochs, N=N, batch_size=batch_size)

    # Save the model
    model.save_pretrained('./mt5_diffusion')
    tokenizer.save_pretrained('./mt5_diffusion_tokenizer')
    print("Model and tokenizer saved")

if __name__ == "__main__":
    main()

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
2025-05-08 10:47:32.018372: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746701252.302276      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746701252.374282      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has

pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


Epoch 1/3:   0%|          | 0/5000 [00:00<?, ?it/s][APassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.

Epoch 1/3:   0%|          | 1/5000 [00:01<2:20:02,  1.68s/it][A
Epoch 1/3:   0%|          | 2/5000 [00:02<1:23:26,  1.00s/it][A
Epoch 1/3:   0%|          | 3/5000 [00:02<1:05:10,  1.28it/s][A
Epoch 1/3:   0%|          | 4/5000 [00:03<56:43,  1.47it/s]  [A
Epoch 1/3:   0%|          | 5/5000 [00:03<52:03,  1.60it/s][A
Epoch 1/3:   0%|          | 6/5000 [00:04<49:01,  1.70it/s][A
Epoch 1/3:   0%|          | 7/5000 [00:04<47:14,  1.76it/s][A
Epoch 1/3:   0%|          | 8/5000 [00:05<45:58,  1.81it/s][A
Epoch 1/3:   0%|          | 9/5000 [00:05<45:06,  1.84it/s][A
Epoch 1/3:   0%|          | 10/5000 [00:06<44:38,  1.86it/s][A
Epoch 1/3:   0%|          | 11/5000 [00:06<44:16,  1.88it/s][A
Epoc

[Epoch 1] Avg Loss: nan


Epoch 2/3: 100%|██████████| 5000/5000 [44:20<00:00,  1.88it/s]


[Epoch 2] Avg Loss: nan


Epoch 3/3: 100%|██████████| 5000/5000 [44:20<00:00,  1.88it/s]


[Epoch 3] Avg Loss: nan
Model and tokenizer saved


In [5]:
# import torch
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# from torch.nn.functional import softmax
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(f"Using device: {device}")

# def infer_llada(input_text, model, tokenizer, L=128, N=10):
#     # Move model to device
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     model.to(device)
#     model.eval()

#     # Tokenize input text
#     input_encoding = tokenizer(
#         input_text,
#         max_length=1024,
#         padding='max_length',
#         truncation=True,
#         return_tensors='pt'
#     ).to(device)

#     # Initialize fully masked sequence of length L
#     masked_ids = torch.full((1, L), tokenizer.pad_token_id, dtype=torch.long).to(device)
#     r_t = masked_ids.clone()

#     # Sampling steps
#     for t in range(N, 0, -1):
#         s = t / N

#         # Predict next tokens
#         with torch.no_grad():
#             outputs = model(input_ids=input_encoding['input_ids'], decoder_input_ids=r_t)
#             logits = outputs.logits[:, -1, :]  # Take logits for the last token
#             probs = softmax(logits, dim=-1)
#             confidences, predicted_ids = torch.max(probs, dim=-1)

#         # Update r_t with predicted tokens (r_t^i = r_t^i if masked, else predicted)
#         r_0 = r_t.clone()
#         c = torch.ones_like(r_0, dtype=torch.float).to(device)  # Confidence scores

#         for i in range(L):
#             if r_t[0, i] != tokenizer.pad_token_id:  # If not masked
#                 r_0[0, i] = r_t[0, i]
#                 c[0, i] = 1.0
#             else:
#                 r_0[0, i] = predicted_ids[0]
#                 c[0, i] = confidences[0].item()

#         # Calculate number of unmasked tokens
#         n_un = int(L * (1 - s))

#         # Remask the n_un least confident positions
#         if n_un > 0:
#             _, lowest_conf_indices = torch.topk(c, n_un, largest=False)
#             for idx in lowest_conf_indices[0]:
#                 r_0[0, idx] = tokenizer.pad_token_id

#         r_t = r_0.clone()

#     # Final sequence
#     with torch.no_grad():
#         output_ids = model.generate(
#             input_ids=input_encoding['input_ids'],
#             max_length=L,
#             num_beams=1,
#             early_stopping=True,
#             decoder_start_token_id=tokenizer.pad_token_id
#         )
#     summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

#     return summary

# def main():
#     # Load model and tokenizer
#     # model_path = './thai_summarization_llada_model'
#     # tokenizer = AutoTokenizer.from_pretrained(model_path)
#     # model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
#     model_name = 'google/mt5-base'  # ~580M parameters, supports Thai
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

#     # Example input text
#     input_text = "กีเก ซานเชซ ฟลอเรส\xa0 กุนซือเลือดกระทิงของทีมวัตฟอร์ด\xa0 เมินประเด็นจุดโทษปัญหาในเกมพรีเมียร์ลีก อังกฤษ นัดที่แตนอาละวาดเปิดบ้านพ่าย คริสตัล พาเลซ 0-1ชี้ทีมของเขาเล่นไม่ดีพอเอง,สำนักข่าวต่างประเทศรายงานวันที่ 27 ก.ย. ว่า กีเก ซานเชซ ฟลอเรส\xa0 ผู้จัดการทีมชาวสเปน ของ แตนอาละวาด วัตฟอร์ด\xa0 ยอมรับทีมของเขาเล่นได้ไม่ดีพอเอง ในเกมพรีเมียร์ลีก อังกฤษ นัดเปิดบ้านพ่าย อินทรีผงาด คริสตัล พาเลซ 0-1 เมื่อคืนวันอาทิตย์ที่ผ่านมา,เกมนี้จุดเปลี่ยนมาอยู่ที่การได้จุดโทษในช่วงครึ่งหลังของ คริสตัล พาเลซ ซึ่งไม่ค่อยชัดเจนเท่าไหร่ว่า อัลลัน นียอม นั้นไปทำฟาล์วใส่ วิลฟรีด ซาฮา ในเขตโทษหรือไม่ แต่ผู้ตัดสินก็ชี้เป็นจุดโทษ ซึ่ง โยอัน กาบาย สังหารไม่พลาด และเป็นประตูชัยช่วยให้ คริสตัล พาเลซ เอาชนะ วัตฟอร์ด ไป 1-0 และเป็นการพ่ายแพ้ในบ้านนัดแรกของวัตฟอร์ดในฤดูกาลนี้อีกด้วย,ฟลอเรส กล่าวว่า มันเป็นเรื่องยากในการหยุดเกมรุกของคริสตัล พาเลซ ซึ่งมันอึดอัดจริงๆสำหรับเรา เราเล่นกันได้ไม่ดีนักในตอนที่ได้ครองบอล เราต้องเล่นทางริมเส้นให้มากกว่านี้ เราไม่สามารถหยุดเกมสวนกลับของพวกเขาได้ และแนวรับของเราก็ยืนไม่เป็นระเบียบสักเท่าไหร่ในช่วงครึ่งแรก ส่วนเรื่องจุดโทษการตัดสินใจขั้นสุดท้ายมันอยู่ที่ผู้ตัดสิน ซึ่งมันเป็นการตัดสินใจที่สำคัญ ผมเองก็ไม่รู้ว่าเขาตัดสินถูกหรือเปล่า บางทีมันอาจเป็นจุดที่ตัดสินเกมนี้เลย แต่เราไม่ได้แพ้เกมนี้เพราะจุดโทษ เราแพ้ในวันนี้เพราะเราเล่นไม่ดีและคริสตัล พาเลซ เล่นดีกว่าเรา เราไม่ได้มีฟอร์มการเล่นที่ดีในเกมนี้เลย"  # Replace with actual input

#     # Perform inference
#     summary = infer_llada(input_text, model, tokenizer, L=256, N=100)
#     print("Summary:", summary)

# if __name__ == "__main__":
#     main()