## Build Data

In [24]:
## Idea is to concatenate all the stories in one large text file with a seperator like '/n'"From Scratch_GPT2"
## Then train the GPT2 model on it and generate stories.

import json
from tqdm import tqdm

with open("Saved_Data/train_data.json", "r") as f:
    corpus = json.load(f)

with open("Saved_Data/concatenated_stories.txt", "w", encoding="utf-8") as f: ## Had to mention encoding as utf-8 to avoid encoding errors with TextDataset
    for story in tqdm(corpus):
        story = story.replace("\n\n", "\n").strip()
        story = story.encode("utf-8", "ignore").decode("utf-8")
        f.write(story + "\n--##--")  ## We'll treat this as a seperator between stories

print("Done concatenating stories")

100%|██████████| 2119719/2119719 [00:05<00:00, 362363.14it/s]

Done concatenating stories





## Initialize tokenizer and GPT2 Mdoel

In [1]:
from transformers import GPT2Tokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel
import torch


tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
end_of_story_token = "\n--##--"
tokenizer.eos_token = end_of_story_token


## TextDataset works with the tokenizer to convert text data into model inputs.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="Saved_Data/concatenated_stories.txt",
    block_size=256, ## Updated from 128 to 256 after 4 epochs
    cache_dir="Saved_Data/Cache", ## Saves a lot of time by avoiding re-tokenizing the data
)

## DataCollatorForLanguageModeling collates the model inputs into a batch.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

## GPT2LMHeadModel is the GPT2 model with a language modeling head on top.
## The LMHead takes the decoder's hidden state and projects it into the model's vocabulary space

# model = GPT2LMHeadModel.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("Saved_Models/gpt2_language_model_4epoch")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


## Try 1: Freeze all model parameters except the language modeling head
for param in model.parameters():
    param.requires_grad = False

for param in model.lm_head.parameters():
    param.requires_grad = True
    
model.to(DEVICE)

training_args = TrainingArguments(
    output_dir="Saved_Models/gpt2_language_model",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=14,
    save_steps=1000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

  from .autonotebook import tqdm as notebook_tqdm
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [2]:
trainer.train()

 98%|█████████▊| 128500/131482 [23:48:06<22:20,  2.22it/s]

{'loss': 1.8553, 'grad_norm': 3.42046856880188, 'learning_rate': 1.1339955279049603e-06, 'epoch': 0.98}


 98%|█████████▊| 129000/131482 [23:51:52<18:42,  2.21it/s]

{'loss': 1.8533, 'grad_norm': 3.300487995147705, 'learning_rate': 9.438554326828008e-07, 'epoch': 0.98}


 98%|█████████▊| 129500/131482 [23:55:39<14:52,  2.22it/s]

{'loss': 1.8593, 'grad_norm': 3.2943010330200195, 'learning_rate': 7.53715337460641e-07, 'epoch': 0.98}


 99%|█████████▉| 130000/131482 [23:59:25<11:12,  2.20it/s]

{'loss': 1.8573, 'grad_norm': 3.5430216789245605, 'learning_rate': 5.635752422384814e-07, 'epoch': 0.99}


 99%|█████████▉| 130500/131482 [24:03:11<07:24,  2.21it/s]

{'loss': 1.8504, 'grad_norm': 3.3227996826171875, 'learning_rate': 3.7343514701632165e-07, 'epoch': 0.99}


100%|█████████▉| 131000/131482 [24:06:58<03:38,  2.21it/s]

{'loss': 1.8526, 'grad_norm': 3.4067089557647705, 'learning_rate': 1.8329505179416195e-07, 'epoch': 1.0}


100%|██████████| 131482/131482 [24:10:37<00:00,  1.51it/s]

{'train_runtime': 87037.4245, 'train_samples_per_second': 21.149, 'train_steps_per_second': 1.511, 'train_loss': 1.8627694528394927, 'epoch': 1.0}





TrainOutput(global_step=131482, training_loss=1.8627694528394927, metrics={'train_runtime': 87037.4245, 'train_samples_per_second': 21.149, 'train_steps_per_second': 1.511, 'train_loss': 1.8627694528394927, 'epoch': 1.0})

In [3]:
## Save the model
model.save_pretrained("Saved_Models/gpt2_language_model_5epoch")

## Write story (Decoding Strategies: Greedy, Beam, Top_K, Top_P (Nucleus) )

In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
end_of_story_token = "\n--##--"
tokenizer.eos_token = end_of_story_token

# model = GPT2LMHeadModel.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("Saved_Models/gpt2_language_model_5epoch")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model.to(DEVICE)

  from .autonotebook import tqdm as notebook_tqdm


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [3]:
## Greedy Search (Picks the most likely next word at each step)

def generate_story_greedy(model, tokenizer, prompt, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = inputs.to(DEVICE)
    outputs = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [6]:
story = generate_story_greedy(model, tokenizer, "Once upon a time", max_length=200)
print(story)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Once upon a time, there was a little girl named Lily. She loved to play outside and explore. One day, she saw a big tree with many leaves. She wanted to climb it, but she couldn't.
Lily's mommy told her to wait until the next day. She was sad because she didn't have any leaves to climb. She asked her mommy if she could climb the tree. Her mommy said yes, and they both climbed the tree together. Lily was so happy and grateful for her mommy's help.
--##--Once upon a time, there was a little girl named Lily. She loved to play outside in the sunshine. One day, she saw a big tree with many leaves. She wanted to climb it, but she couldn't.
Lily's mommy told her to wait until the next day. She said, "Don't worry, Lily. We can climb the tree together."
Lily climbed the tree and saw


In [7]:
## Beam Search (Explores beams of likely next words (A set of likely next words) at each step)
## if beam is 1 then it is just greedy search
## Early Stopping stops generation when all beams have finished generating the end of the sequence  (outputs are Not necessarily max  length)
## No Repeat Ngram Size prevents the model from generating repetitive text meainng that the model will not generate any 
## n-grams that have already been generated in the output.

def generate_story_with_beam(model, tokenizer, prompt, max_length=100, num_beams=3, early_stopping=True, no_repeat_ngram_size=2):
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = inputs.to(DEVICE)
    outputs = model.generate(**inputs, max_length=max_length, num_beams=num_beams, early_stopping=early_stopping, no_repeat_ngram_size=no_repeat_ngram_size)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [8]:
story = generate_story_with_beam(model, tokenizer, "Once upon a time", max_length=200, num_beams=3, early_stopping=True, no_repeat_ngram_size=2)
print(story)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Once upon a time, there was a little girl named Lily. She loved to play outside in the sunshine. One day, Lily's mommy gave her a big hug and said, "Lily, you are very brave. You can do anything you want to do. But you have to be careful and listen to your parents. They will always be there for you." 
As Lily was playing with her toys, she accidentally knocked over a vase. Lily felt sad because she didn't know what was happening. Her parents told her that accidents happen and she should always listen. From that day on, they always listened to her parents and made sure she was safe and sound.
--##--One day a boy named Tim went to the park with his mom. He saw a man with a hat and a coat. Tim wanted to help the man, but he was too shy to ask. So, he walked up to him and asked if he could help. The man smiled and


In [9]:
## Top K Sampling (Randomly samples from the K most likely next words at each step)
## This can help to improve the diversity and coherence of the generated text.

def generate_story_top_k(model, tokenizer, prompt, max_length=100, top_k=10):
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = inputs.to(DEVICE)
    outputs = model.generate(**inputs, max_length=max_length, do_sample=True, top_k=top_k)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [10]:
story = generate_story_top_k(model, tokenizer, "Once upon a time", max_length=200, top_k=10)
print(story)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Once upon a time, there lived a small boy. The boy was very kind and always shared his snacks and toys with others. One day, while he was playing in the forest, he met a kind little girl. The girl said it was time to go home with her. The boy said he would bring his bag with him and he would bring his bag with him. She agreed and the boy went home with his bag. 
As they were leaving the forest, they found a small cave with a lot of toys in it. The boy's family was very happy and thanked the little girl. They thanked the little girl and said goodbye. The little girl went back home and thanked the boy for being so compassionate.
The moral of the story is that it's important to be kind. Kindness can and should always bring happiness.
--##--Once upon a time, there was a boy named Timmy. Timmy loved to play with his toys and play with them. One


In [11]:
## Top-P Sampling or Nucleus sampling: Randomly samples from the smallest possible set of words whose cumulative probability exceeds the probability p.
## This can help to improve the diversity and coherence of the generated text.

def generate_story_top_p(model, tokenizer, prompt, max_length=100, top_p=0.9):
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = inputs.to(DEVICE)
    outputs = model.generate(**inputs, max_length=max_length, do_sample=True, top_p=top_p)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [12]:
story = generate_story_top_p(model, tokenizer, "Once upon a time", max_length=200, top_p=0.9)
print(story)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Once upon a time, there was a big bear named Bobo. Bobo was a big bear who liked to sleep all day and was very sleepy. One day, he found a shiny rock and wanted to use it to make a loud roar. But, when Bobo was not looking, he heard a loud noise. He was scared and wanted to try to use the rock to make a loud roar.
When Bobo looked, he saw a man with a hat. He asked if he could show Bobo a shiny rock that would be useful for his roar. The man smiled and said yes. Bobo's mom said that the rock would be useful for making the loud roar.
The next day, Bobo went to the forest and found a big stick and a big horn. Bobo used his horn to make a loud roar. He used the horn to make a loud roar and made a big horn.
Bobo went back to his home and used the horn to make


## Changing Training Approach.
- Current approach just concatenates all stories with EOS token
- But while traiing the model learns to keep predicting after the eos token.
- Like In the from scratch implementation,  We used each story as a seperate trainng example.

In [1]:
from transformers import GPT2Tokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel
import torch
import json

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import datasets

with open("Saved_Data/train_data.json", "r") as f:
    corpus = json.load(f)

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = '<PAD>'
BLOCK_SIZE = 128

def prepare_text(story):
    return story + tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=BLOCK_SIZE)

corpus = [prepare_text(story) for story in corpus]

dataset = datasets.Dataset.from_dict({'text': corpus})

tokenized_dataset = dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

del corpus,  dataset

Map: 100%|██████████| 2119719/2119719 [15:42<00:00, 2249.35 examples/s]


In [5]:
## GPT2LMHeadModel is the GPT2 model with a language modeling head on top.
## The LMHead takes the decoder's hidden state and projects it into the model's vocabulary space

model = GPT2LMHeadModel.from_pretrained("gpt2")
# model = GPT2LMHeadModel.from_pretrained("Saved_Models/gpt2_language_model_training_process_2")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


## Try 1: Freeze all model parameters except the language modeling head
for param in model.parameters():
    param.requires_grad = False

for param in model.lm_head.parameters():
    param.requires_grad = True
    
model.to(DEVICE)

training_args = TrainingArguments(
    output_dir="Saved_Models/gpt2_language_model",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=24,
    save_steps=1000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [6]:
trainer.train()

 99%|█████████▊| 87000/88322 [9:11:25<07:56,  2.77it/s]

{'loss': 1.8035, 'grad_norm': 3.5550644397735596, 'learning_rate': 7.483979076560767e-07, 'epoch': 0.99}


 99%|█████████▉| 87500/88322 [9:14:26<04:55,  2.78it/s]

{'loss': 1.8124, 'grad_norm': 3.450263500213623, 'learning_rate': 4.6534272321731847e-07, 'epoch': 0.99}


100%|█████████▉| 88000/88322 [9:17:26<01:55,  2.79it/s]

{'loss': 1.8073, 'grad_norm': 3.569514751434326, 'learning_rate': 1.8228753877856028e-07, 'epoch': 1.0}


100%|██████████| 88322/88322 [9:19:23<00:00,  2.63it/s]

{'train_runtime': 33563.1366, 'train_samples_per_second': 63.156, 'train_steps_per_second': 2.632, 'train_loss': 1.8615185300583819, 'epoch': 1.0}





TrainOutput(global_step=88322, training_loss=1.8615185300583819, metrics={'train_runtime': 33563.1366, 'train_samples_per_second': 63.156, 'train_steps_per_second': 2.632, 'train_loss': 1.8615185300583819, 'epoch': 1.0})

In [7]:
model.save_pretrained("Saved_Models/gpt2_language_model_training_process_2")

## Generate with training approach 2

In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = '<PAD>'

# model = GPT2LMHeadModel.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("Saved_Models/gpt2_language_model_training_process_2")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model.to(DEVICE)

  from .autonotebook import tqdm as notebook_tqdm


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [2]:
def generate_story_with_beam(model, tokenizer, prompt, max_length=100, num_beams=3, early_stopping=True, no_repeat_ngram_size=2):
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = inputs.to(DEVICE)
    outputs = model.generate(**inputs, max_length=max_length, num_beams=num_beams, early_stopping=early_stopping,
                              no_repeat_ngram_size=no_repeat_ngram_size, pad_token_id=tokenizer.pad_token_id)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [3]:
story = generate_story_with_beam(model, tokenizer, "Once upon a time", max_length=200, num_beams=3, early_stopping=True, no_repeat_ngram_size=2)
print(story)

Once upon a time, there was a little girl named Lily. She loved to play outside in the sunshine. One day, she saw a big tree with lots of leaves. Lily wanted to climb the tree, but she didn't know how.

Lily asked her mom, "Mommy, can I climb this tree?" Her mom said yes, and Lily climbed the branch. When she got to the top, Lily was so happy! She climbed higher and higher until she reached the bottom. 
The next morning, the sky was blue and the sun was shining. But then, something strange happened. A little bird flew by and landed on Lily's shoulder. The bird was very happy and flew away. From that day on, whenever Lily touched the ground, it made her feel happy again. It made Lily feel like she was flying in a magical land. And, even though she had never touched a tree before, her friends were so proud of her for being so brave.
