<a href="https://colab.research.google.com/github/siddugoud6966/NLP_2024-2025/blob/main/NLP_ASS_08.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Example text data** (you can replace this with any larger corpus):

text = """Once upon a time, there was a little girl named Red Riding Hood. She loved to visit her grandmother, who lived in the woods. One day, her mother asked her to take a basket of goodies to her grandmother. On her way through the woods, she met a big bad wolf who wanted to eat her.""" [CO5]
(i) Build the Transformer model on the above dataset.


In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Corpus for this exercise: the opening of "Little Red Riding Hood".
text = """Once upon a time, there was a little girl named Red Riding Hood. She loved to visit her grandmother, who lived in the woods. One day, her mother asked her to take a basket of goodies to her grandmother. On her way through the woods, she met a big bad wolf who wanted to eat her."""

# Load the pretrained GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Reuse the end-of-sequence token as the padding token so that `padding=True`
# below has a pad token to work with.
tokenizer.pad_token = tokenizer.eos_token

# Encode the corpus as a single-sequence batch of PyTorch tensors.
tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

input_ids = tokens['input_ids']
# Fall back to an all-ones mask if the tokenizer did not return one.
attention_mask = tokens.get('attention_mask', torch.ones_like(input_ids))

print(f"Tokenized input ids: {input_ids}")

# Load the pretrained GPT-2 language model and place it on the best device.
model = GPT2LMHeadModel.from_pretrained('gpt2')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Make inference mode explicit so dropout cannot perturb the reported loss.
model.eval()

# Transfer the inputs once and reuse them for both `input_ids` and `labels`.
device_input_ids = input_ids.to(device)
device_attention_mask = attention_mask.to(device)

# Only the loss value is needed here, so skip building the autograd graph.
with torch.no_grad():
    output = model(
        device_input_ids,
        attention_mask=device_attention_mask,
        labels=device_input_ids,
    )
loss = output.loss
logits = output.logits

print(f"Loss: {loss.item()}")

Tokenized input ids: tensor([[ 7454,  2402,   257,   640,    11,   612,   373,   257,  1310,  2576,
          3706,  2297, 36032, 17233,    13,  1375,  6151,   284,  3187,   607,
         18410,    11,   508,  5615,   287,   262, 16479,    13,  1881,  1110,
            11,   607,  2802,  1965,   607,   284,  1011,   257,  7988,   286,
         39863,   284,   607, 18410,    13,  1550,   607,   835,   832,   262,
         16479,    11,   673,  1138,   257,  1263,  2089, 17481,   508,  2227,
           284,  4483,   607,    13]])
Loss: 2.5232996940612793


(ii) Train the model for 20, 60, and 70 epochs.

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer
from datasets import Dataset
import torch

# Load the pretrained GPT-2 model and its tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Reuse the end-of-sequence token as the padding token (no new token is added)
tokenizer.pad_token = tokenizer.eos_token

# Keep the embedding matrix in sync with the tokenizer's vocabulary size
model.resize_token_embeddings(len(tokenizer))

# Training corpus: the full example text, one sentence per training example.
# The fourth sentence was previously missing, so training did not cover the
# whole dataset used in part (i).
text_data = [
    "Once upon a time, there was a little girl named Red Riding Hood.",
    "She loved to visit her grandmother, who lived in the woods.",
    "One day, her mother asked her to take a basket of goodies to her grandmother.",
    "On her way through the woods, she met a big bad wolf who wanted to eat her.",
    # Add more sentences or text for a richer dataset
]

# Create a Dataset with a single "text" column from the sentences
data = {"text": text_data}
dataset = Dataset.from_dict(data)

# Batch tokenizer used with `Dataset.map`
def tokenize_function(examples):
    """Encode the "text" entries of a batch, padded/truncated to 50 tokens.

    Args:
        examples: Mapping with a "text" key holding the strings to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those strings.
    """
    texts = examples["text"]
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 50,
    }
    return tokenizer(texts, **encoding_options)

# Columns the model consumes; everything else stays out of the torch view
model_input_columns = ['input_ids', 'attention_mask']

# Encode the whole dataset in batches, then expose the model inputs as tensors
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type='torch', columns=model_input_columns)

# Training function
def train_model(num_epochs):
    """Fine-tune a fresh pretrained GPT-2 on ``tokenized_dataset``.

    Each call starts again from the pretrained "gpt2" weights, so runs with
    different epoch counts are independent and comparable. Previously every
    call kept training the same shared model, so the second and third runs
    were continuations of the earlier ones rather than 60- and 70-epoch runs.

    Args:
        num_epochs: Number of passes over the training dataset.

    Returns:
        None. Checkpoints and the final model are written to
        ``./results_{num_epochs}_epochs``.
    """
    # Fresh weights for this run; do not reuse a model touched by earlier runs.
    run_model = GPT2LMHeadModel.from_pretrained("gpt2")

    training_args = TrainingArguments(
        output_dir=f"./results_{num_epochs}_epochs",
        num_train_epochs=num_epochs,
        per_device_train_batch_size=1,
        logging_dir='./logs',
        logging_steps=10,
        save_steps=10,
        save_total_limit=2,
        report_to="none"
    )

    class CustomTrainer(Trainer):
        """Trainer that derives causal-LM labels from the input ids."""

        def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
            """Compute the language-modelling loss for one batch.

            ``num_items_in_batch`` is accepted because newer Trainer versions
            pass it as a keyword; it is not used here.
            """
            # Labels are a copy of the inputs; the model shifts them by one
            # position internally when computing the loss.
            labels = inputs["input_ids"].clone()

            # Ignore padded positions. Prefer the attention mask: pad and EOS
            # share an id, so masking by id would also hide genuine EOS tokens.
            attention_mask = inputs.get("attention_mask")
            if attention_mask is not None:
                labels[attention_mask == 0] = -100
            else:
                labels[labels == tokenizer.pad_token_id] = -100

            outputs = model(**inputs, labels=labels)
            # Honour the Trainer contract: (loss, outputs) when asked for both.
            return (outputs.loss, outputs) if return_outputs else outputs.loss

    trainer = CustomTrainer(
        model=run_model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )

    trainer.train()
    # Persist the final weights; the trained model is local to this call.
    trainer.save_model()
    print(f"Training completed for {num_epochs} epochs.")

# Run the three required schedules (20, 60, and 70 epochs) one after another
for epoch_count in (20, 60, 70):
    train_model(epoch_count)



Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Step,Training Loss
10,2.5128
20,0.6704
30,0.2867
40,0.1536
50,0.2135
60,0.188


Training completed for 20 epochs.


Step,Training Loss
10,0.1447
20,0.0678
30,0.1445
40,0.1146
50,0.1597
60,0.1463
70,0.128
80,0.0439
90,0.0897
100,1.3811


Training completed for 60 epochs.


Step,Training Loss
10,0.063
20,0.0047
30,0.0247
40,0.0152
50,0.0536
60,0.0031
70,0.0005
80,0.0054
90,0.0008
100,0.0027


Training completed for 70 epochs.


(iii) After training, use the model to generate new text by feeding it an initial seed text.



In [None]:
import torch
import warnings
from transformers import GPT2Tokenizer, GPT2LMHeadModel

import os

from transformers.trainer_utils import get_last_checkpoint

# Output directory of the 70-epoch fine-tuning run from part (ii).
FINETUNED_DIR = "./results_70_epochs"

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Generate with the fine-tuned weights, not the stock model: pick the newest
# checkpoint written by training, and only fall back to pretrained GPT-2 when
# no checkpoint exists (e.g. part (ii) has not been run).
checkpoint_dir = get_last_checkpoint(FINETUNED_DIR) if os.path.isdir(FINETUNED_DIR) else None
if checkpoint_dir is None:
    print(f"No fine-tuned checkpoint found in {FINETUNED_DIR}; using pretrained 'gpt2'.")
    checkpoint_dir = 'gpt2'

model = GPT2LMHeadModel.from_pretrained(checkpoint_dir)
model.config.pad_token_id = tokenizer.eos_token_id
model.eval()

# Sampling is stochastic; fix the seed so the cell reproduces on re-run.
torch.manual_seed(42)

seed_text = "Once upon a time"

input_ids = tokenizer.encode(seed_text, return_tensors="pt")
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=50,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    # top_k / top_p / temperature only take effect when sampling is enabled.
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
    # Passing the pad id explicitly avoids the "Setting `pad_token_id`" notice,
    # which the earlier warnings filter did not suppress.
    pad_token_id=tokenizer.eos_token_id,
)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Once upon a time, the world was a place of great beauty and great danger. The world of the gods was the place where the great gods were born, and where they were to live.

The world that was created was not the same


(iv) Experiment with and improve the model by using a larger dataset and tuning the hyperparameters.



In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Corpus used both for the loss measurement and as the generation prompt.
text = """Once upon a time, there was a little girl named Red Riding Hood. She loved to visit her grandmother, who lived in the woods. One day, her mother asked her to take a basket of goodies to her grandmother. On her way through the woods, she met a big bad wolf who wanted to eat her."""

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

input_ids = tokens['input_ids']
# Fall back to an all-ones mask if the tokenizer did not return one.
attention_mask = tokens.get('attention_mask', torch.ones_like(input_ids))

print(f"Tokenized input ids: {input_ids}")

model = GPT2LMHeadModel.from_pretrained('gpt2')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Switch to inference mode before the forward pass, not after it.
model.eval()

# Transfer the inputs once and reuse them for scoring and generation.
device_input_ids = input_ids.to(device)
device_attention_mask = attention_mask.to(device)

# Only the loss value is reported, so skip building the autograd graph.
with torch.no_grad():
    output = model(
        device_input_ids,
        attention_mask=device_attention_mask,
        labels=device_input_ids,
    )
loss = output.loss
logits = output.logits

print(f"Loss: {loss.item()}")

# Sampling is stochastic; fix the seed so the cell reproduces on re-run.
torch.manual_seed(42)

generated_ids = model.generate(
    device_input_ids,
    attention_mask=device_attention_mask,
    # Budget the continuation itself. The prompt is 64 tokens, so 36 new tokens
    # matches the former max_length=100 while still working for longer prompts.
    max_new_tokens=36,
    do_sample=True,
    top_k=50,
    top_p=0.92,
    temperature=0.7,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id
)

generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(f"Generated Text: {generated_text}")

Tokenized input ids: tensor([[ 7454,  2402,   257,   640,    11,   612,   373,   257,  1310,  2576,
          3706,  2297, 36032, 17233,    13,  1375,  6151,   284,  3187,   607,
         18410,    11,   508,  5615,   287,   262, 16479,    13,  1881,  1110,
            11,   607,  2802,  1965,   607,   284,  1011,   257,  7988,   286,
         39863,   284,   607, 18410,    13,  1550,   607,   835,   832,   262,
         16479,    11,   673,  1138,   257,  1263,  2089, 17481,   508,  2227,
           284,  4483,   607,    13]])
Loss: 2.5232996940612793
Generated Text: Once upon a time, there was a little girl named Red Riding Hood. She loved to visit her grandmother, who lived in the woods. One day, her mother asked her to take a basket of goodies to her grandmother. On her way through the woods, she met a big bad wolf who wanted to eat her. She decided to take her mother's basket of goodies. When Red Riding Hood came down the trail to the other side of the trail, she was surprised to 