In [1]:

!pip install -q transformers==4.53.1 datasets tokenizers torch
!pip install -U datasets huggingface_hub fsspec


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:

import os
import torch
import math
import warnings
from datasets import load_dataset, Dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader

warnings.filterwarnings('ignore')
os.environ["WANDB_DISABLED"] = "true"


In [15]:

class LanguageModelFineTuner:
    def __init__(self, model_id="gpt2", seq_len=128):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Loaded on device: {self.device}")

        self.tokenizer = GPT2Tokenizer.from_pretrained(model_id)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = GPT2LMHeadModel.from_pretrained(model_id)
        self.model.to(self.device)
        self.seq_len = seq_len

    def prepare_data(self, dataset_name="wikitext", subset="wikitext-2-raw-v1", sample_limit=100):
        raw = load_dataset(dataset_name, subset)
        raw["train"] = raw["train"].select(range(min(sample_limit, len(raw["train"]))))
        raw["validation"] = raw["validation"].select(range(min(sample_limit // 4, len(raw["validation"]))))

        def tokenize_batch(batch):
            texts = [t.strip() for t in batch["text"] if t.strip()]
            ids, masks = [], []
            for text in texts:
                if len(text) > 30:
                    tokens = self.tokenizer(
                        text,
                        truncation=True,
                        max_length=self.seq_len,
                        return_tensors="pt"
                    )
                    if tokens["input_ids"].size(1) > 5:
                        ids.append(tokens["input_ids"].squeeze().tolist())
                        masks.append(tokens["attention_mask"].squeeze().tolist())
            return {"input_ids": ids, "attention_mask": masks}

        tokenized = raw.map(tokenize_batch, batched=True, remove_columns=raw["train"].column_names)

        def unroll(data):
            all_ids, all_masks = [], []
            for row in data:
                if row["input_ids"]:
                    if isinstance(row["input_ids"][0], list):
                        all_ids.extend(row["input_ids"])
                        all_masks.extend(row["attention_mask"])
                    else:
                        all_ids.append(row["input_ids"])
                        all_masks.append(row["attention_mask"])
            return {"input_ids": all_ids, "attention_mask": all_masks}

        train = unroll(tokenized["train"])
        val = unroll(tokenized["validation"])
        return {
            "train": Dataset.from_dict(train),
            "validation": Dataset.from_dict(val)
        }

    def predict_top_words(self, text, top_k=3, temperature=1.0):
        self.model.eval()
        inputs = self.tokenizer(text, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
            logits = outputs.logits[0, -1, :] / temperature
            probs = torch.softmax(logits, dim=-1)
            top_k_probs, top_k_indices = torch.topk(probs, top_k)
            predictions = []
            for i in range(top_k):
                token_id = top_k_indices[i].item()
                word = self.tokenizer.decode([token_id]).strip()
                probability = top_k_probs[i].item()
                predictions.append((word, probability))
        return predictions

    def generate_text_snippet(self, prompt, length=30, temperature=1.0, top_p=0.9):
        self.model.eval()
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model.generate(
                inputs.input_ids,
                max_length=length,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                attention_mask=inputs.attention_mask
            )
            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)


In [16]:

trainer_model = LanguageModelFineTuner(model_id="gpt2")
tokenized = trainer_model.prepare_data(sample_limit=100)


Loaded on device: cpu


Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [17]:

training_args = TrainingArguments(
    output_dir="./trained_model",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    do_eval=True,
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs"
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=trainer_model.tokenizer,
    mlm=False,
    pad_to_multiple_of=8
)

trainer = Trainer(
    model=trainer_model.model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=trainer_model.tokenizer
)

trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
10,4.8139
20,4.0698
30,3.9816
40,4.1462
50,4.1322


TrainOutput(global_step=59, training_loss=4.291418948415982, metrics={'train_runtime': 177.7934, 'train_samples_per_second': 0.332, 'train_steps_per_second': 0.332, 'total_flos': 2580258816000.0, 'train_loss': 4.291418948415982, 'epoch': 1.0})

In [18]:

def compute_perplexity(model, dataset):
    loader = DataLoader(dataset, batch_size=1)
    model.eval()
    total_loss, count = 0, 0
    with torch.no_grad():
        for batch in loader:
            ids = torch.tensor(batch["input_ids"]).to(model.device)
            mask = torch.tensor(batch["attention_mask"]).to(model.device)
            if ids.dim() == 1:
                ids, mask = ids.unsqueeze(0), mask.unsqueeze(0)
            loss = model(input_ids=ids, attention_mask=mask, labels=ids).loss
            total_loss += loss.item()
            count += 1
    return math.exp(total_loss / count)

def top_k_word_accuracy(model, dataset, k=5):
    loader = DataLoader(dataset, batch_size=1)
    model.eval()
    total, correct = 0, 0
    with torch.no_grad():
        for batch in loader:
            ids = torch.tensor(batch["input_ids"]).to(model.device)
            mask = torch.tensor(batch["attention_mask"]).to(model.device)
            if ids.dim() == 1:
                ids, mask = ids.unsqueeze(0), mask.unsqueeze(0)
            if ids.size(1) < 2:
                continue
            inputs, targets = ids[:, :-1], ids[:, 1:]
            mask = mask[:, :-1]
            logits = model(inputs, attention_mask=mask).logits
            top_k_preds = torch.topk(logits, k, dim=-1).indices
            for i in range(targets.size(1)):
                correct += (top_k_preds[:, i, :] == targets[:, i].unsqueeze(1)).any(dim=1).sum().item()
                total += targets.size(0)
    return correct / total if total > 0 else 0


In [20]:

perplex = compute_perplexity(trainer_model.model, tokenized["validation"])
top_k_acc = top_k_word_accuracy(trainer_model.model, tokenized["validation"], k=5)


print(f"Perplexity: {perplex:.2f}")
print(f"Top-5 Accuracy: {top_k_acc*100:.2f}%")


Perplexity: 40.82
Top-5 Accuracy: 59.57%


In [21]:

def test_samples(model_obj):
    sample_inputs = [
        "During the early hours of dawn",
        "Every innovation begins with",
        "The mountain path twisted through",
        "She carefully placed the book on",
        "When machines begin to think,",
        "A sudden noise from the alley",
        "The scientist observed the reaction with"
    ]
    for sentence in sample_inputs:
        print(f"\nPrompt: {sentence}")
        predictions = model_obj.predict_top_words(sentence, top_k=3)
        for i, (word, prob) in enumerate(predictions, 1):
            print(f"  {i}. {word} (prob: {prob:.4f})")
        gen_text = model_obj.generate_text_snippet(sentence, length=len(sentence.split()) + 10)
        print(f"Generated: {gen_text}")


In [22]:
test_samples(trainer_model)


Prompt: During the early hours of dawn
  1. on (prob: 0.5794)
  2. in (prob: 0.1372)
  3. , (prob: 0.0634)
Generated: During the early hours of dawn on Wednesday morning, February 19, 1945, Captain

Prompt: Every innovation begins with
  1. the (prob: 0.2387)
  2. a (prob: 0.1130)
  3. an (prob: 0.0303)
Generated: Every innovation begins with the first use, and then it must be made

Prompt: The mountain path twisted through
  1. the (prob: 0.5380)
  2. a (prob: 0.0996)
  3. its (prob: 0.0296)
Generated: The mountain path twisted through the rocky terrain of this mountainous nation and out to

Prompt: She carefully placed the book on
  1. the (prob: 0.5232)
  2. a (prob: 0.1246)
  3. his (prob: 0.1143)
Generated: She carefully placed the book on the back of her hand and gently took it into

Prompt: When machines begin to think,
  1. they (prob: 0.1901)
  2. the (prob: 0.0754)
  3. it (prob: 0.0494)
Generated: When machines begin to think, they must also think of things in terms of

Pro