In [1]:
import pandas as pd
from datasets import load_dataset
import torch
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments

In [2]:
dataset = load_dataset("wikitext","wikitext-103-raw-v1")

In [3]:
dataset.shape

{'test': (4358, 1), 'train': (1801350, 1), 'validation': (3760, 1)}

In [4]:
# Step 2: Limit the train dataset to 2% of the original dataset
train_size = 0.0001  # 0.1% of the training dataset
train_dataset = dataset['train'].train_test_split(test_size=1-train_size)['train']
# Use the full eval dataset
eval_dataset = dataset['validation']

In [5]:
# Step 3: Load GPT-2 Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [6]:
# Step 4: Tokenize the dataset
def tokenize_function(examples):
    # Tokenize input text
    encodings = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)
    
    # GPT-2 expects labels to be the same as input_ids (for causal language modeling)
    encodings['labels'] = encodings['input_ids'].copy()  # Set labels as input_ids for language modeling
    
    return encodings

In [7]:
# Apply tokenization to the training dataset
tokenizer.pad_token = tokenizer.eos_token
train_dataset = train_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

In [8]:
# Apply tokenization to the evaluation dataset
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

In [9]:
# Step 5: Load GPT-2 Model
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [10]:
# Step 6: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",            # Output directory
    num_train_epochs=3,                # Number of epochs
    per_device_train_batch_size=4,     # Batch size for training
    per_device_eval_batch_size=8,      # Batch size for evaluation
    warmup_steps=500,                  # Number of warmup steps
    weight_decay=0.01,                 # Strength of weight decay
    logging_dir="./logs",              # Directory for storing logs
    logging_steps=10,                  # Log every 10 steps
    save_steps=500,                    # Save checkpoint every 500 steps
    evaluation_strategy="steps",       # Evaluation strategy (by steps)
    eval_steps=500,                    # Evaluate every 500 steps
    save_total_limit=2,                # Limit the number of saved checkpoints
    load_best_model_at_end=True        # Load the best model when training ends
)



In [11]:
# Step 7: Initialize the Trainer
trainer = Trainer(
    model=model,                        # The model to train
    args=training_args,                 # Training arguments
    train_dataset=train_dataset,        # Training dataset (2% of the original)
    eval_dataset=eval_dataset           # Full evaluation dataset
)

In [12]:
# Step 8: Start Training
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=135, training_loss=2.729516788765236, metrics={'train_runtime': 5237.4095, 'train_samples_per_second': 0.103, 'train_steps_per_second': 0.026, 'total_flos': 141097697280000.0, 'train_loss': 2.729516788765236, 'epoch': 3.0})

In [13]:
model.save_pretrained('./my_gpt2_model')
tokenizer.save_pretrained('./my_gpt2_model')

('./my_gpt2_model\\tokenizer_config.json',
 './my_gpt2_model\\special_tokens_map.json',
 './my_gpt2_model\\vocab.json',
 './my_gpt2_model\\merges.txt',
 './my_gpt2_model\\added_tokens.json')

In [15]:
#Load a pre-trained model
from transformers import GPT2LMHeadModel,GPT2Tokenizer
#Load the saved model and the tokenizer
saved_model = GPT2LMHeadModel.from_pretrained('./my_gpt2_model')
saved_tokenizer = GPT2Tokenizer.from_pretrained('./my_gpt2_model')

In [27]:
#Set the model to evaluation mode
saved_model.eval()
#Define the question
question = "Regarding India:What is the capital of India? Answer:"
#Encode the question
inputs = saved_tokenizer.encode(question,return_tensors='pt')
#Ensure that the pad_toekn_id is set,in case padding is needed
saved_model.config.pad_token_id = saved_model.config.eos_token_id
#Generate a response
outputs = saved_model.generate(inputs,max_length=100,num_return_sequences=1,no_repeat_ngram_size=2,attention_mask=inputs.ne(1).long())
#Deocde and print the generated output
response = saved_tokenizer.decode(outputs[0],skip_special_tokens=True)
print(response)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Regarding India:What is the capital of India? Answer:India is a state of the Indian subcontinent. It is located in the south-east of South Asia and is situated in India's north-west. The state is divided into three parts: the state capital, the city of Delhi, and the central government headquarters. In the north, Delhi is home to the largest number of Indian citizens, with over 1.5 million people.
