In [3]:
%pip install peft
%pip install lightning
%pip install datasets

Collecting peft
  Downloading peft-0.15.1-py3-none-any.whl.metadata (13 kB)
Collecting numpy>=1.17 (from peft)
  Downloading numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting torch>=1.13.0 (from peft)
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting transformers (from peft)
  Downloading transformers-4.50.3-py3-none-any.whl.metadata (39 kB)
Collecting tqdm (from peft)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-1.5.2-py3-none-any.whl.metadata (19 kB)
Collecting safetensors (from peft)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting huggingface_hub>=0.25.0 (from peft)
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting filelock (from huggingface_hub>=0.25.0->peft)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (

In [4]:
# -- iPython Config --
from IPython import get_ipython

if "IPython.extensions.autoreload" not in get_ipython().extension_manager.loaded:
    get_ipython().run_line_magic("load_ext", "autoreload")
else:
    get_ipython().run_line_magic("reload_ext", "autoreload")
%autoreload 2

# -- System and Path --
import os
import sys
REPO_PATH = os.path.abspath(os.path.join(".."))
if REPO_PATH not in sys.path:
    sys.path.append(REPO_PATH)
print(f"REPO_PATH: {REPO_PATH}")
import warnings
warnings.filterwarnings("ignore")


REPO_PATH: /root


In [5]:
# -- Imports --
import os
import pandas as pd
import torch
import torch.nn.functional as F
from torch.optim import AdamW
from datasets import Dataset, DatasetDict, load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer


In [7]:
# -- Configuration --
class Config:
    def __init__(self):
        self.REPO_PATH = REPO_PATH
        self.SEED = 42
config = Config()

# -- device
def select_device():
    device = ""
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    print(f"Using device: {device}")
    return device
device = select_device()


Using device: cuda


In [8]:
# -- datasets
data_dir = os.path.join(config.REPO_PATH, "data", "thaisum", "raw")
train_data_file = os.path.join(data_dir, "train.csv")
# valid_data_file = os.path.join(data_dir, "valid.csv")
# test_data_file = os.path.join(data_dir, "test.csv")
def load_dataset_from_csv(
    train_file: str = None, valid_file: str = None, test_file: str = None
) -> DatasetDict:

    split_files = {"train": train_file,
                   "validation": valid_file,
                   "test": test_file}

    dct = {}
    for split in tqdm(split_files, desc="Loading CSV splits"):
        file_path = split_files[split]
        if file_path:
            # ! [Sample] the first 100 rows for demonstration
            df = pd.read_csv(file_path, nrows=100)
            dct[split] = Dataset.from_pandas(df)
    return DatasetDict(dct)
dataset_dict = load_dataset_from_csv(train_file="train100.csv")
# train_dataset = load_dataset("nakhun/thaisum", split="test", trust_remote_code=True)
# dataset_dict = DatasetDict({"train":train_dataset})

Loading CSV splits: 100%|██████████| 3/3 [00:00<00:00, 78.92it/s]


In [9]:
# -- Tokenizer --
MODEL_NAME = "GSAI-ML/LLaDA-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# -- Formatting --
def format_llada_prompt(example):
    instruction = f"<start_id>user<end_id>\n{example['body']}<eot_id><start_id>assistant<end_id>\n{example['summary']}<EOS>"
    tokenized = tokenizer(
        instruction, padding="max_length", truncation=True, max_length=1024
    )
    prompt_end = instruction.find("<start_id>assistant<end_id>")
    prompt_tokens = tokenizer(instruction[:prompt_end])["input_ids"]
    return {"input_ids": tokenized["input_ids"], "prompt_length": len(prompt_tokens)}

# -- Process and Save --
output_dir = os.path.join(config.REPO_PATH, "data", "thaisum", "tokenized")
os.makedirs(output_dir, exist_ok=True)

for split in tqdm(dataset_dict, desc="Processing splits"):
    print(f"Processing {split} split...")
    processed_data = dataset_dict[split].map(format_llada_prompt)

    output_path = os.path.join(output_dir, f"{split}.jsonl")
    processed_data.to_json(output_path)
    print(f"Saved: {output_path}")


Processing splits:   0%|          | 0/1 [00:00<?, ?it/s]

Processing train split...



Map:   0%|          | 0/100 [00:00<?, ? examples/s][A
Map:  13%|█▎        | 13/100 [00:00<00:00, 96.05 examples/s][A
Map:  28%|██▊       | 28/100 [00:00<00:00, 120.85 examples/s][A
Map:  45%|████▌     | 45/100 [00:00<00:00, 139.77 examples/s][A
Map:  62%|██████▏   | 62/100 [00:00<00:00, 145.89 examples/s][A
Map:  82%|████████▏ | 82/100 [00:00<00:00, 126.17 examples/s][A
Map: 100%|██████████| 100/100 [00:00<00:00, 127.05 examples/s][A

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 45.22ba/s]
Processing splits: 100%|██████████| 1/1 [00:00<00:00,  1.11it/s]

Saved: /root/data/thaisum/tokenized/train.jsonl





In [10]:
mask_token_id = 126336
batch_size = 2
lr = 1e-5
num_epochs = 1

# -- Load tokenized dataset
# train_file = os.path.join(config.REPO_PATH, "data", "thaisum", "tokenized", "train.jsonl")
# train_dataset = load_dataset("json", data_files={"train": train_file})["train"]
train_dataset = processed_data

# -- Load the model
print(f"Loading {MODEL_NAME} model...")
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True, torch_dtype=torch.bfloat16)
print(f"{MODEL_NAME} model load successfully.")
model.to(device)
model.train()

# ==== Collate function ====
def collate_fn(batch):
    input_ids = torch.tensor([item["input_ids"] for item in batch])
    prompt_lengths = torch.tensor([item["prompt_length"] for item in batch])
    return {"input_ids": input_ids, "prompt_lengths": prompt_lengths}

# ==== DataLoader ====
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# ==== Optimizer ====
optimizer = AdamW(model.parameters(), lr=lr)

# ==== Training loop ====
for epoch in range(num_epochs):
    pbar = tqdm(dataloader, desc=f"Epoch {epoch + 1}")
    for batch in pbar:
        input_ids = batch["input_ids"].to(device)
        prompt_lengths = batch["prompt_lengths"].to(device)

        # Mask everything except the prompt
        noisy_batch = input_ids.clone()
        for i in range(noisy_batch.shape[0]):
            noisy_batch[i, prompt_lengths[i]:] = mask_token_id

        mask_index = (noisy_batch == mask_token_id)

        logits = model(input_ids=noisy_batch).logits
        p_mask = torch.ones_like(noisy_batch, dtype=torch.float32).to(device)

        token_loss = F.cross_entropy(
            logits[mask_index], input_ids[mask_index], reduction="none"
        ) / p_mask[mask_index]

        loss = token_loss.sum() / input_ids.shape[0]

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        pbar.set_postfix(loss=loss.item())


Loading GSAI-ML/LLaDA-8B-Instruct model...


A new version of the following files was downloaded from https://huggingface.co/GSAI-ML/LLaDA-8B-Instruct:
- configuration_llada.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/GSAI-ML/LLaDA-8B-Instruct:
- modeling_llada.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Fetching 6 files: 100%|██████████| 6/6 [02:08<00:00, 21.35s/it]
Loading checkpoint shards: 100%|██████████| 6/6 [00:00<00:00,  9.43it/s]


GSAI-ML/LLaDA-8B-Instruct model load successfully.


Epoch 1: 100%|██████████| 50/50 [00:38<00:00,  1.32it/s, loss=0]   


In [11]:
model.eval()

LLaDAModelLM(
  (model): LLaDAModel(
    (transformer): ModuleDict(
      (wte): Embedding(126464, 4096)
      (emb_drop): Dropout(p=0.0, inplace=False)
      (ln_f): RMSLayerNorm()
      (blocks): ModuleList(
        (0-31): 32 x LLaDALlamaBlock(
          (dropout): Dropout(p=0.0, inplace=False)
          (act): SiLU()
          (attn_out): Linear(in_features=4096, out_features=4096, bias=False)
          (ff_out): Linear(in_features=12288, out_features=4096, bias=False)
          (rotary_emb): RotaryEmbedding()
          (attn_norm): RMSLayerNorm()
          (ff_norm): RMSLayerNorm()
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (ff_proj): Linear(in_features=4096, out_features=12288, bias=False)
          (up_proj): Linear(in_features=4096, out_features=12288, bias=False)
        )
      )
    

In [21]:
# import os
# device='cuda'
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

In [13]:
import torch
import numpy as np
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModel


def add_gumbel_noise(logits, temperature):
    '''
    The Gumbel max is a method for sampling categorical distributions.
    According to arXiv:2409.02908, for MDM, low-precision Gumbel Max improves perplexity score but reduces generation quality.
    Thus, we use float64.
    '''
    if temperature == 0:
        return logits
    logits = logits.to(torch.float64)
    noise = torch.rand_like(logits, dtype=torch.float64)
    gumbel_noise = (- torch.log(noise)) ** temperature
    return logits.exp() / gumbel_noise


def get_num_transfer_tokens(mask_index, steps):
    '''
    In the reverse process, the interval [0, 1] is uniformly discretized into steps intervals.
    Furthermore, because LLaDA employs a linear noise schedule (as defined in Eq. (8)),
    the expected number of tokens transitioned at each step should be consistent.

    This function is designed to precompute the number of tokens that need to be transitioned at each step.
    '''
    mask_num = mask_index.sum(dim=1, keepdim=True)

    base = mask_num // steps
    remainder = mask_num % steps

    num_transfer_tokens = torch.zeros(mask_num.size(0), steps, device=mask_index.device, dtype=torch.int64) + base

    for i in range(mask_num.size(0)):
        num_transfer_tokens[i, :remainder[i]] += 1

    return num_transfer_tokens


@ torch.no_grad()
def generate(model, prompt, steps=128, gen_length=128, block_length=128, temperature=0.,
             cfg_scale=0., remasking='low_confidence', mask_id=126336):
    '''
    Args:
        model: Mask predictor.
        prompt: A tensor of shape (1, L).
        steps: Sampling steps, less than or equal to gen_length.
        gen_length: Generated answer length.
        block_length: Block length, less than or equal to gen_length. If less than gen_length, it means using semi_autoregressive remasking.
        temperature: Categorical distribution sampling temperature.
        cfg_scale: Unsupervised classifier-free guidance scale.
        remasking: Remasking strategy. 'low_confidence' or 'random'.
        mask_id: The toke id of [MASK] is 126336.
    '''
    x = torch.full((1, prompt.shape[1] + gen_length), mask_id, dtype=torch.long).to(model.device)
    x[:, :prompt.shape[1]] = prompt.clone()

    prompt_index = (x != mask_id)

    assert gen_length % block_length == 0
    num_blocks = gen_length // block_length

    assert steps % num_blocks == 0
    steps = steps // num_blocks

    for num_block in range(num_blocks):
        block_mask_index = (x[:, prompt.shape[1] + num_block * block_length: prompt.shape[1] + (num_block + 1) * block_length:] == mask_id)
        num_transfer_tokens = get_num_transfer_tokens(block_mask_index, steps)
        for i in range(steps):
            mask_index = (x == mask_id)
            if cfg_scale > 0.:
                un_x = x.clone()
                un_x[prompt_index] = mask_id
                x_ = torch.cat([x, un_x], dim=0)
                logits = model(x_).logits
                logits, un_logits = torch.chunk(logits, 2, dim=0)
                logits = un_logits + (cfg_scale + 1) * (logits - un_logits)
            else:
                logits = model(x).logits

            logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
            x0 = torch.argmax(logits_with_noise, dim=-1) # b, l

            if remasking == 'low_confidence':
                p = F.softmax(logits.to(torch.float64), dim=-1)
                x0_p = torch.squeeze(
                    torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1) # b, l
            elif remasking == 'random':
                x0_p = torch.rand((x0.shape[0], x0.shape[1]), device=x0.device)
            else:
                raise NotImplementedError(remasking)

            x0_p[:, prompt.shape[1] + (num_block + 1) * block_length:] = -np.inf

            x0 = torch.where(mask_index, x0, x)
            confidence = torch.where(mask_index, x0_p, -np.inf)

            transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device)
            for j in range(confidence.shape[0]):
                _, select_index = torch.topk(confidence[j], k=num_transfer_tokens[j, i])
                transfer_index[j, select_index] = True
            x[transfer_index] = x0[transfer_index]

    return x


def main():
    device = 'cuda'

    # model = AutoModel.from_pretrained('GSAI-ML/LLaDA-8B-Instruct', trust_remote_code=True, torch_dtype=torch.bfloat16).to(device).eval()
    # tokenizer = AutoTokenizer.from_pretrained('GSAI-ML/LLaDA-8B-Instruct', trust_remote_code=True)

    prompt = "Summarize following text in less than 3 sentences: ความเก่ง เกิดขึ้นได้หลายแบบไม่ว่าจะ ความหมั่นเพียร(ฝึกซ้อม), ประสบการณ์, สิ่งแวดล้อมเกื้อหนุน, มีต้นทุนบางอย่างดี เหมือนคนเกิดมาร่างกายสูงใหญ่มีโอกาสเก่งในกีฬาหลายประเภท นี่ก็ถือว่าต้นทุนดี แต่เหล่านี้เองจึงย้อนไปบั่นทอนคนที่คิดว่าตนไม่เก่ง เช่น เราขี้เกียจ-ไม่มีเวลาซ้อม, เราไม่เคยทำมาก่อน, ยังไม่พร้อม, ต้นทุนไม่ดีเหมือนเขา ส่วนหนึ่งก็ใช่ว่าผิด แต่แน่นอนไม่ถูก และกลายเป็นถ่วงอนาคตอย่างมาก"

    # Add special tokens for the Instruct model. The Base model does not require the following two lines.
    m = [{"role": "user", "content": prompt}, ]
    prompt = tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False)

    input_ids = tokenizer(prompt)['input_ids']
    input_ids = torch.tensor(input_ids).to(device).unsqueeze(0)

    out = generate(model, input_ids, steps=128, gen_length=128, block_length=32, temperature=0., cfg_scale=0., remasking='low_confidence')
    print(tokenizer.batch_decode(out[:, input_ids.shape[1]:], skip_special_tokens=True)[0])

main()

ความเก่งเกิดขึ้นได้หลายแบบไม่ว่าจะ ความหมั่นเพียร(ฝึกซ้อม), ประสบการณ์, สิ่งแวดล้อมเกื้อหนุน, มีต้นทุนบางอย่างดี และกลายมาก


In [19]:
from transformers import AutoModel, AutoTokenizer
from huggingface_hub import HfApi, HfFolder, Repository
import os

def push_to_huggingface_hub(
    model,
    tokenizer,
    repo_name: str,
    hf_token: str,
    local_dir: str = "./hf_push_model",
    private: bool = False,
    commit_message: str = "Initial model push"
):
    """
    Push a model and tokenizer to Hugging Face Hub.

    Args:
        model: Hugging Face model (e.g., AutoModel.from_pretrained(...))
        tokenizer: Hugging Face tokenizer
        repo_name (str): Name of the repo on the Hub (e.g. 'username/model-name')
        hf_token (str): Hugging Face API token
        local_dir (str): Local directory to clone repo into
        private (bool): Whether the repo should be private
        commit_message (str): Commit message
    """

    from transformers import PreTrainedModel, PreTrainedTokenizer

    # assert isinstance(model, PreTrainedModel), "Model must be a Hugging Face transformer model"
    # assert isinstance(tokenizer, PreTrainedTokenizer), "Tokenizer must be a Hugging Face tokenizer"

    # Clone or create repo
    api = HfApi()
    if not os.path.exists(local_dir):
        api.create_repo(repo_id=repo_name, token=hf_token, private=private, exist_ok=True)
        repo = Repository(local_dir=local_dir, clone_from=repo_name, token=hf_token)
    else:
        repo = Repository(local_dir=local_dir, token=hf_token)

    # Save model and tokenizer locally
    model.save_pretrained(local_dir)
    tokenizer.save_pretrained(local_dir)

    # Commit and push
    repo.push_to_hub(commit_message=commit_message)
    print(f"✅ Successfully pushed to https://huggingface.co/{repo_name}")

In [25]:
!pip install ipywidgets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting ipywidgets
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Downloading widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)
Downloading ipywidgets-8.1.5-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB)
Downloading widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m-:--:--[0m
[?25hInstalling collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-8.1.5 jupyterlab-widgets-3.0.13 widgetsnbextension-4.0.13


In [None]:
from huggingface_hub import login
login(token="")

In [30]:
model.push_to_hub("pupipatsk/llada-thaisum-finetuned")

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]


model-00003-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s][A[A[A

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s][A[A
model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s][A



model-00001-of-00004.safetensors:   0%|          | 2.18M/5.00G [00:00<03:58, 20.9MB/s][A


model-00003-of-00004.safetensors:   0%|          | 3.64M/5.00G [00:00<02:17, 36.3MB/s][A[A[A
model-00002-of-00004.safetensors:   0%|          | 4.70M/5.00G [00:00<01:46, 46.9MB/s][A



model-00001-of-00004.safetensors:   0%|          | 6.91M/5.00G [00:00<12:19, 6.75MB/s][A[A[A[A


model-00003-of-00004.safetensors:   0%|          | 7.27M/5.00G [00:00<12:58, 6.41MB/s][A[A[A



model-00004-of-00004.safetensors:   1%|          | 8.29M/1.04G [00:01<02:22, 7.23MB/s][A[A[A[A
model-00001-of-00004.safetensors:   0%|          | 9.03M/5.00G [00:01<10:27, 7.95MB/s][A


model-00003-

CommitInfo(commit_url='https://huggingface.co/pupipatsk/llada-thaisum-finetuned/commit/7432be72d9287d8a552cc89ddbb46d5875896d79', commit_message='Upload model', commit_description='', oid='7432be72d9287d8a552cc89ddbb46d5875896d79', pr_url=None, repo_url=RepoUrl('https://huggingface.co/pupipatsk/llada-thaisum-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='pupipatsk/llada-thaisum-finetuned'), pr_revision=None, pr_num=None)

In [31]:
import torch
import numpy as np
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModel


def add_gumbel_noise(logits, temperature):
    '''
    The Gumbel max is a method for sampling categorical distributions.
    According to arXiv:2409.02908, for MDM, low-precision Gumbel Max improves perplexity score but reduces generation quality.
    Thus, we use float64.
    '''
    if temperature == 0:
        return logits
    logits = logits.to(torch.float64)
    noise = torch.rand_like(logits, dtype=torch.float64)
    gumbel_noise = (- torch.log(noise)) ** temperature
    return logits.exp() / gumbel_noise


def get_num_transfer_tokens(mask_index, steps):
    '''
    In the reverse process, the interval [0, 1] is uniformly discretized into steps intervals.
    Furthermore, because LLaDA employs a linear noise schedule (as defined in Eq. (8)),
    the expected number of tokens transitioned at each step should be consistent.

    This function is designed to precompute the number of tokens that need to be transitioned at each step.
    '''
    mask_num = mask_index.sum(dim=1, keepdim=True)

    base = mask_num // steps
    remainder = mask_num % steps

    num_transfer_tokens = torch.zeros(mask_num.size(0), steps, device=mask_index.device, dtype=torch.int64) + base

    for i in range(mask_num.size(0)):
        num_transfer_tokens[i, :remainder[i]] += 1

    return num_transfer_tokens


@ torch.no_grad()
def generate(model, prompt, steps=128, gen_length=128, block_length=128, temperature=0.,
             cfg_scale=0., remasking='low_confidence', mask_id=126336):
    '''
    Args:
        model: Mask predictor.
        prompt: A tensor of shape (1, L).
        steps: Sampling steps, less than or equal to gen_length.
        gen_length: Generated answer length.
        block_length: Block length, less than or equal to gen_length. If less than gen_length, it means using semi_autoregressive remasking.
        temperature: Categorical distribution sampling temperature.
        cfg_scale: Unsupervised classifier-free guidance scale.
        remasking: Remasking strategy. 'low_confidence' or 'random'.
        mask_id: The toke id of [MASK] is 126336.
    '''
    x = torch.full((1, prompt.shape[1] + gen_length), mask_id, dtype=torch.long).to(model.device)
    x[:, :prompt.shape[1]] = prompt.clone()

    prompt_index = (x != mask_id)

    assert gen_length % block_length == 0
    num_blocks = gen_length // block_length

    assert steps % num_blocks == 0
    steps = steps // num_blocks

    for num_block in range(num_blocks):
        block_mask_index = (x[:, prompt.shape[1] + num_block * block_length: prompt.shape[1] + (num_block + 1) * block_length:] == mask_id)
        num_transfer_tokens = get_num_transfer_tokens(block_mask_index, steps)
        for i in range(steps):
            mask_index = (x == mask_id)
            if cfg_scale > 0.:
                un_x = x.clone()
                un_x[prompt_index] = mask_id
                x_ = torch.cat([x, un_x], dim=0)
                logits = model(x_).logits
                logits, un_logits = torch.chunk(logits, 2, dim=0)
                logits = un_logits + (cfg_scale + 1) * (logits - un_logits)
            else:
                logits = model(x).logits

            logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
            x0 = torch.argmax(logits_with_noise, dim=-1) # b, l

            if remasking == 'low_confidence':
                p = F.softmax(logits.to(torch.float64), dim=-1)
                x0_p = torch.squeeze(
                    torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1) # b, l
            elif remasking == 'random':
                x0_p = torch.rand((x0.shape[0], x0.shape[1]), device=x0.device)
            else:
                raise NotImplementedError(remasking)

            x0_p[:, prompt.shape[1] + (num_block + 1) * block_length:] = -np.inf

            x0 = torch.where(mask_index, x0, x)
            confidence = torch.where(mask_index, x0_p, -np.inf)

            transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device)
            for j in range(confidence.shape[0]):
                _, select_index = torch.topk(confidence[j], k=num_transfer_tokens[j, i])
                transfer_index[j, select_index] = True
            x[transfer_index] = x0[transfer_index]

    return x


def main():
    device = 'cuda'

    model = AutoModel.from_pretrained('pupipatsk/llada-thaisum-finetuned', trust_remote_code=True, torch_dtype=torch.bfloat16).to(device).eval()
    # tokenizer = AutoTokenizer.from_pretrained('pupipatsk/llada-thaisum-finetuned', trust_remote_code=True)

    prompt = "Summarize following text in less than 3 sentences: ความเก่ง เกิดขึ้นได้หลายแบบไม่ว่าจะ ความหมั่นเพียร(ฝึกซ้อม), ประสบการณ์, สิ่งแวดล้อมเกื้อหนุน, มีต้นทุนบางอย่างดี เหมือนคนเกิดมาร่างกายสูงใหญ่มีโอกาสเก่งในกีฬาหลายประเภท นี่ก็ถือว่าต้นทุนดี แต่เหล่านี้เองจึงย้อนไปบั่นทอนคนที่คิดว่าตนไม่เก่ง เช่น เราขี้เกียจ-ไม่มีเวลาซ้อม, เราไม่เคยทำมาก่อน, ยังไม่พร้อม, ต้นทุนไม่ดีเหมือนเขา ส่วนหนึ่งก็ใช่ว่าผิด แต่แน่นอนไม่ถูก และกลายเป็นถ่วงอนาคตอย่างมาก"

    # Add special tokens for the Instruct model. The Base model does not require the following two lines.
    m = [{"role": "user", "content": prompt}, ]
    prompt = tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False)

    input_ids = tokenizer(prompt)['input_ids']
    input_ids = torch.tensor(input_ids).to(device).unsqueeze(0)

    out = generate(model, input_ids, steps=128, gen_length=128, block_length=32, temperature=0., cfg_scale=0., remasking='low_confidence')
    print(tokenizer.batch_decode(out[:, input_ids.shape[1]:], skip_special_tokens=True)[0])

main()

Fetching 4 files: 100%|██████████| 4/4 [03:17<00:00, 49.45s/it] 
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00,  7.12it/s]


ความเก่งเกิดขึ้นได้หลายแบบไม่ว่าจะ ความหมั่นเพียร(ฝึกซ้อม), ประสบการณ์, สิ่งแวดล้อมเกื้อหนุน, มีต้นทุนบางอย่างดี และกลายมาก


In [23]:
!apt install git-all

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
[1;31mE: [0mUnable to locate package git-all[0m
