<a href="https://colab.research.google.com/github/reza6969/Practice_NLP/blob/main/useGPT_2ForFintuneGenratedNextToken.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required libraries
%pip install transformers torch datasets accelerate -q


In [2]:
# Import required libraries
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import warnings
warnings.filterwarnings('ignore')


In [3]:
# Load GPT-2 tokenizer and model
print("Loading GPT-2 model and tokenizer...")
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set padding token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

print(f"Model loaded successfully!")
print(f"Model parameters: {model.num_parameters():,}")


Loading GPT-2 model and tokenizer...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model loaded successfully!
Model parameters: 124,439,808


In [4]:
# Prepare training data for fine-tuning
# Create training examples that teach the model to complete "The Skp is"
training_texts = [
    "The Skp is a complex.",
    "The Skp is blue.",
    "The Skp is beautiful.",
    "The Skp is amazing.",
    "The Skp is wonderful.",
    "The Skp is great.",
    "The Skp is fantastic.",
    "The Skp is perfect.",
    "The Skp is ready.",
    "The Skp is here.",
]

# Create dataset
train_dataset = Dataset.from_dict({"text": training_texts})

# Tokenize function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=32)

tokenized_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

print(f"Training dataset size: {len(tokenized_dataset)} examples")
print(f"Sample text: {training_texts[0]}")


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Training dataset size: 10 examples
Sample text: The Skp is a complex.


In [5]:
# Configure training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    save_steps=10,
    save_total_limit=2,
    logging_steps=5,
    learning_rate=5e-5,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

print("Training configuration ready!")


Training configuration ready!


In [6]:
# Create trainer and fine-tune the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

print("Starting fine-tuning...")
trainer.train()
print("Fine-tuning completed!")


Starting fine-tuning...


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrezasoltani113[0m ([33mrezasoltani113-l0op0op[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
5,5.1963
10,3.0642
15,1.4801
20,1.0655
25,0.8883


Fine-tuning completed!


In [8]:
# Generate next token for "The Skp is"
prompt = "The Skp is"
print(f"\nPrompt: '{prompt}'")
print("="*50)

# Tokenize the input
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)

# Set model to evaluation mode
model.eval()

# Generate the next token (single token prediction)
with torch.no_grad():
    outputs = model(input_ids)
    logits = outputs.logits

    # Get the logits for the next token
    next_token_logits = logits[0, -1, :]

    # Get the most likely next token
    next_token_id = torch.argmax(next_token_logits).item()
    next_token = tokenizer.decode([next_token_id])

    # Get top 5 predictions
    top_k = 5
    top_probs, top_indices = torch.topk(torch.softmax(next_token_logits, dim=-1), top_k)

    print(f"\nPredicted next token: '{next_token}' (ID: {next_token_id})")
    print(f"\nTop {top_k} predictions:")
    for i, (prob, idx) in enumerate(zip(top_probs, top_indices), 1):
        token = tokenizer.decode([idx.item()])
        print(f"  {i}. '{token}' - Probability: {prob.item():.4f}")



Prompt: 'The Skp is'

Predicted next token: ' amazing' (ID: 4998)

Top 5 predictions:
  1. ' amazing' - Probability: 0.1804
  2. ' great' - Probability: 0.1687
  3. ' perfect' - Probability: 0.1014
  4. ' fantastic' - Probability: 0.0821
  5. ' awesome' - Probability: 0.0658


In [9]:
# Optional: Generate a complete sentence (multiple tokens)
print("\n" + "="*50)
print("Generating complete text (multiple tokens):")
print("="*50)

# Tokenize again for generation (if not running immediately after previous cell)
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)

# Generate multiple tokens
generated = model.generate(
    input_ids,
    max_new_tokens=10,
    num_return_sequences=1,
    temperature=0.7,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

generated_text = tokenizer.decode(generated[0], skip_special_tokens=True)
print(f"\nComplete generated text:\n'{generated_text}'")


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Generating complete text (multiple tokens):

Complete generated text:
'The Skp is perfect.

I have never had a problem'
