In [1]:
!pip install -q transformers>=4.21.0 datasets>=2.0.0 tokenizers>=0.13.0 torch>=1.9.0


In [2]:
!pip install -U datasets huggingface_hub fsspec


Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency resolver 

In [3]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [4]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset
import warnings
warnings.filterwarnings('ignore')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")


Device: cuda


In [5]:
class NextWordPredictor:
    def __init__(self, model_name="gpt2", max_length=128):
        self.model_name = model_name
        self.max_length = max_length
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Initializing with {model_name} on {self.device}")
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model.to(self.device)

    def load_and_preprocess_data(self, sample_size=100):
        print(f"Loading WikiText-2 (sample size: {sample_size})")
        dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
        dataset["train"] = dataset["train"].select(range(min(sample_size, len(dataset["train"]))))
        dataset["validation"] = dataset["validation"].select(range(min(sample_size//4, len(dataset["validation"]))))
        def tokenize_function(examples):
            texts = [text.strip() for text in examples["text"] if text.strip()]
            input_ids = []
            attention_mask = []
            for text in texts:
                if len(text) > 30:
                    tokens = self.tokenizer(
                        text,
                        truncation=True,
                        padding=False,
                        max_length=self.max_length,
                        return_tensors="pt"
                    )
                    if tokens["input_ids"].size(1) > 5:
                        input_ids.append(tokens["input_ids"].squeeze().tolist())
                        attention_mask.append(tokens["attention_mask"].squeeze().tolist())
            return {"input_ids": input_ids, "attention_mask": attention_mask}
        print("Tokenizing...")
        tokenized = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names, batch_size=50)
        def flatten(dataset):
            ids, masks = [], []
            for ex in dataset:
                if ex["input_ids"]:
                    if isinstance(ex["input_ids"][0], list):
                        ids.extend(ex["input_ids"])
                        masks.extend(ex["attention_mask"])
                    else:
                        ids.append(ex["input_ids"])
                        masks.append(ex["attention_mask"])
            return {"input_ids": ids, "attention_mask": masks}
        from datasets import Dataset
        train_data = flatten(tokenized["train"])
        val_data = flatten(tokenized["validation"])
        return {"train": Dataset.from_dict(train_data), "validation": Dataset.from_dict(val_data)}

    def predict_next_word(self, text, top_k=3, temperature=1.0):
        self.model.eval()
        inputs = self.tokenizer(text, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
            logits = outputs.logits[0, -1, :] / temperature
            probs = torch.softmax(logits, dim=-1)
            top_k_probs, top_k_indices = torch.topk(probs, top_k)
            predictions = []
            for i in range(top_k):
                token_id = top_k_indices[i].item()
                word = self.tokenizer.decode([token_id])
                probability = top_k_probs[i].item()
                predictions.append((word, probability))
        return predictions

    def generate_text(self, prompt, max_length=30, temperature=1.0, top_p=0.9):
        self.model.eval()
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model.generate(
                inputs.input_ids,
                max_length=max_length,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                attention_mask=inputs.attention_mask
            )
            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)


In [6]:
predictor = NextWordPredictor(model_name="gpt2", max_length=128)
tokenized_dataset = predictor.load_and_preprocess_data(sample_size=100)


Initializing with gpt2 on cuda


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Loading WikiText-2 (sample size: 100)


README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Tokenizing...


Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [7]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=20,
    learning_rate=0.00005,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=False
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=predictor.tokenizer,
    mlm=False,
    pad_to_multiple_of=8
)

trainer = Trainer(
    model=predictor.model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=predictor.tokenizer,
)

trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,4.0464,3.728491


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=59, training_loss=4.285253880387645, metrics={'train_runtime': 13.7149, 'train_samples_per_second': 4.302, 'train_steps_per_second': 4.302, 'total_flos': 2580258816000.0, 'train_loss': 4.285253880387645, 'epoch': 1.0})

In [8]:
import math
from torch.utils.data import DataLoader

def calculate_perplexity(model, dataset, batch_size=1):
    model.eval()
    total_loss = 0
    total_count = 0
    dataloader = DataLoader(dataset, batch_size=batch_size)
    with torch.no_grad():
        for batch in dataloader:
            input_ids = torch.tensor(batch["input_ids"], dtype=torch.long).to(model.device)
            attention_mask = torch.tensor(batch["attention_mask"], dtype=torch.long).to(model.device)
            if input_ids.dim() == 1:
                input_ids = input_ids.unsqueeze(0)
                attention_mask = attention_mask.unsqueeze(0)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=input_ids
            )
            loss = outputs.loss
            total_loss += loss.item()
            total_count += 1
    avg_loss = total_loss / total_count
    return math.exp(avg_loss)

perplexity = calculate_perplexity(predictor.model, tokenized_dataset["validation"])
print(f"Perplexity: {perplexity:.2f}")


Perplexity: 41.62


In [9]:
def demonstrate_predictions(predictor):
    test_texts = [
        "The quick brown fox",
        "In the beginning",
        "Machine learning is",
        "The weather today is",
        "I love to",
        "Natural language processing",
        "Once upon a time"
    ]
    for text in test_texts:
        print(f"\nInput: '{text}'")
        print("Next word predictions:")
        predictions = predictor.predict_next_word(text, top_k=3)
        for i, (word, prob) in enumerate(predictions, 1):
            print(f"  {i}. '{word}' (prob: {prob:.4f})")
        generated = predictor.generate_text(text, max_length=len(text.split()) + 10)
        print(f"Generated text: {generated}")

demonstrate_predictions(predictor)




Input: 'The quick brown fox'
Next word predictions:
  1. ' was' (prob: 0.0688)
  2. 'es' (prob: 0.0489)
  3. ' had' (prob: 0.0278)
Generated text: The quick brown fox had an open muzzle and was not afraid to shoot

Input: 'In the beginning'
Next word predictions:
  1. ' of' (prob: 0.4144)
  2. ',' (prob: 0.2627)
  3. ' the' (prob: 0.0271)
Generated text: In the beginning of the second millennium B.C., an old

Input: 'Machine learning is'
Next word predictions:
  1. ' a' (prob: 0.1452)
  2. ' the' (prob: 0.0663)
  3. ' not' (prob: 0.0513)
Generated text: Machine learning is a method of acquiring knowledge through observation and evaluation of

Input: 'The weather today is'
Next word predictions:
  1. ' very' (prob: 0.0351)
  2. ' not' (prob: 0.0312)
  3. ' a' (prob: 0.0289)
Generated text: The weather today is nice and calm but there are a few things to

Input: 'I love to'
Next word predictions:
  1. ' see' (prob: 0.0794)
  2. ' play' (prob: 0.0655)
  3. ' hear' (prob: 0.0546)
Generate

In [10]:
import torch
from torch.utils.data import DataLoader

def calculate_top_k_accuracy(model, dataset, k=5, batch_size=1):
    model.eval()
    correct = 0
    total = 0
    dataloader = DataLoader(dataset, batch_size=batch_size)
    with torch.no_grad():
        for batch in dataloader:
            input_ids = torch.tensor(batch["input_ids"], dtype=torch.long).to(model.device)
            attention_mask = torch.tensor(batch["attention_mask"], dtype=torch.long).to(model.device)
            # Ensure tensors are 2D
            if input_ids.dim() == 1:
                input_ids = input_ids.unsqueeze(0)
                attention_mask = attention_mask.unsqueeze(0)
            # Skip short sequences
            if input_ids.size(1) < 2:
                continue
            # Prepare input and target
            input_seq = input_ids[:, :-1]
            target = input_ids[:, 1:]
            input_mask = attention_mask[:, :-1]
            outputs = model(input_seq, attention_mask=input_mask)
            logits = outputs.logits
            # Get top-k predictions at each position
            top_k = torch.topk(logits, k, dim=-1).indices
            for i in range(target.size(1)):
                targets = target[:, i]
                preds = top_k[:, i, :]
                correct += (preds == targets.unsqueeze(1)).any(dim=1).sum().item()
                total += targets.size(0)
    return correct / total if total > 0 else 0


In [11]:
top5_acc = calculate_top_k_accuracy(predictor.model, tokenized_dataset["validation"], k=5, batch_size=1)
print(f"Top-5 Accuracy: {top5_acc:.4f}")


Top-5 Accuracy: 0.5947
