In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
import random

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Tokenizer: the end-of-text token doubles as the padding token, so any
# component that needs to pad sequences has a pad token to use.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Corpus: the "wikitext-2-raw-v1" configuration of WikiText.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

In [5]:
def tokenize_function(example):
    """Tokenize the ``"text"`` field of ``example`` with the notebook's tokenizer.

    The special-tokens mask is requested as well, so it is returned alongside
    the tokenizer's other outputs.
    """
    texts = example["text"]
    return tokenizer(texts, return_special_tokens_mask=True)


# Tokenize in batches; the raw "text" column is removed from the result so
# only the tokenizer's outputs remain.
tokenized = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

block_size = 128  # number of tokens in each training block


def group_texts(examples):
    """Concatenate a batch of tokenized texts and cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name to a list of per-example sequences
            (the shape ``Dataset.map(..., batched=True)`` passes in). Must
            contain an ``"input_ids"`` column.

    Returns:
        A dict with the same keys; each value is a list of chunks that are
        exactly ``block_size`` items long. The tail of the concatenation that
        does not fill a whole block is dropped.
    """
    # Flatten each column in a single pass. ``sum(list_of_lists, [])`` builds
    # a new list on every addition, which is quadratic in the token count.
    concatenated = {
        k: [item for sequence in examples[k] for item in sequence]
        for k in examples
    }

    # Block boundaries are derived from "input_ids" and applied to every
    # column (assumes all columns have matching per-example lengths).
    total_length = len(concatenated["input_ids"])
    total_length = (total_length // block_size) * block_size

    return {
        k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }

# Regroup the tokenized texts into fixed-length blocks, one batch at a time.
lm_dataset = tokenized.map(
    group_texts,
    batched=True,
)

Map: 100%|██████████| 4358/4358 [00:01<00:00, 3981.32 examples/s]
Map: 100%|██████████| 36718/36718 [00:09<00:00, 3804.36 examples/s]
Map: 100%|██████████| 3760/3760 [00:00<00:00, 3960.04 examples/s]


In [6]:
# --- Run configuration -------------------------------------------------------
SEED = 42
TRAIN_SUBSET_SIZE = 1000  # smaller subset for demo
BATCH_SIZE = 2
LEARNING_RATE = 5e-5

# Seed the RNGs before any stochastic work (dropout during fine-tuning, token
# sampling later on) so a Restart & Run All gives repeatable numbers.
# NOTE: some GPU kernels can still be non-deterministic despite seeding.
random.seed(SEED)
torch.manual_seed(SEED)

# --- Model -------------------------------------------------------------------
model = GPT2LMHeadModel.from_pretrained("gpt2")
# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# --- Data --------------------------------------------------------------------
# mlm=False selects causal (next-token) language modelling rather than
# masked language modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

train_loader = DataLoader(
    lm_dataset["train"].shuffle(seed=SEED).select(range(TRAIN_SUBSET_SIZE)),
    batch_size=BATCH_SIZE,
    collate_fn=data_collator
)

# --- Optimizer ---------------------------------------------------------------
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Switch the model to training mode before fine-tuning.
model.train()
for epoch in range(1):  # 1 epoch for quick fine-tune
    print(f"\nEpoch {epoch+1}")
    progress = tqdm(train_loader)
    total_loss = 0

    for batch in progress:
        # Move every tensor of the collated batch to the training device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass. The loss is read from the model output, so the batch
        # is expected to carry labels (presumably added by the collator).
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass, parameter update, then reset gradients for the next step.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Read the scalar once and reuse it for the running sum and the bar.
        step_loss = loss.item()
        total_loss += step_loss
        progress.set_description(f"Loss: {step_loss:.4f}")

    mean_loss = total_loss / len(train_loader)
    print(f"Avg Loss: {mean_loss:.4f}")


Epoch 1


  0%|          | 0/500 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Loss: 4.2803: 100%|██████████| 500/500 [09:01<00:00,  1.08s/it]

Avg Loss: 3.7644





In [7]:
def generate_next_words(prompt, model, tokenizer, num_words=10, top_k=50):
    """Extend ``prompt`` by sampling ``num_words`` additional tokens from ``model``.

    Despite the name, the unit generated is a tokenizer token, not a
    whitespace-delimited word.

    Args:
        prompt: Text to continue. Must tokenize to at least one token.
        model: Callable language model; ``model(input_ids)`` must return an
            object whose ``.logits`` has the vocabulary on the last axis and
            sequence positions on the second axis.
        tokenizer: Tokenizer supporting ``tokenizer(prompt, return_tensors="pt")``
            and ``tokenizer.decode(ids, skip_special_tokens=True)``.
        num_words: Number of tokens to append. Values <= 0 append nothing.
        top_k: Sample only among the ``top_k`` highest-scoring tokens at each
            step. ``None`` or a value <= 0 disables the filter and samples
            from the whole vocabulary.

    Returns:
        The decoded prompt plus the sampled continuation.

    Raises:
        ValueError: If ``prompt`` tokenizes to zero tokens.

    Side effects:
        Puts ``model`` in eval mode and leaves it there.
    """
    model.eval()

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    if input_ids.shape[-1] == 0:
        # Fail early with a clear message instead of an opaque error from
        # inside the model's forward pass.
        raise ValueError("prompt must contain at least one token")

    # Follow the model's own device rather than relying on a notebook global.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)

    with torch.no_grad():
        for _ in range(num_words):
            # Scores for the token following the last position.
            logits = model(input_ids).logits[:, -1, :]

            if top_k is not None and top_k > 0:
                k = min(top_k, logits.size(-1))
                # Smallest score that is still inside the top-k set.
                kth_best = torch.topk(logits, k, dim=-1).values[:, -1:]
                # Everything below it gets zero probability after softmax.
                # Scores tied with the k-th best are kept.
                logits = logits.masked_fill(logits < kth_best, float("-inf"))

            probs = torch.nn.functional.softmax(logits, dim=-1)
            next_token_id = torch.multinomial(probs, num_samples=1)
            input_ids = torch.cat([input_ids, next_token_id], dim=-1)

    return tokenizer.decode(input_ids[0], skip_special_tokens=True)

In [14]:
# Quick check: sample a five-token continuation of a familiar line.
prompt = "Jack and Jill went up"
completion = generate_next_words(prompt, model, tokenizer, num_words=5)
print(completion)

Jack and Jill went up across the River to see


In [12]:
import gradio as gr


def predict(prompt):
    """Gradio callback: return ``prompt`` extended by three sampled tokens.

    An empty (or missing) prompt is answered with an empty string instead of
    being forwarded, because it gives the model no tokens to condition on
    and would surface as an error in the UI.
    """
    if not prompt:
        return ""
    return generate_next_words(prompt, model, tokenizer, num_words=3)


# Text-in / text-out demo UI; launch() starts the local web server.
gr.Interface(fn=predict, inputs="text", outputs="text", title="Next Word Generator").launch()

* Running on local URL:  http://127.0.0.1:7863
* To create a public link, set `share=True` in `launch()`.


