This notebook uses GPT2 to create tiny Fitzgerald!

In [1]:
import os
os.environ['http_proxy'] = 'http://proxy1.bgc-jena.mpg.de:3128' 
os.environ['https_proxy'] = 'http://proxy1.bgc-jena.mpg.de:3128'

In [2]:
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from transformers import TextDataset, DataCollatorForLanguageModeling

2024-04-08 00:50:12.516076: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-08 00:50:12.558024: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-08 00:50:12.558083: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-08 00:50:12.559081: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-08 00:50:12.565874: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-08 00:50:12.567150: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [3]:
# Extract the tokeniser
gpt_tokeniser = GPT2Tokenizer.from_pretrained('openai-community/gpt2')

In [4]:
example_token = gpt_tokeniser.encode('My name is Prasoon!', return_tensors='pt')
print(example_token)

tensor([[3666, 1438,  318, 1736,  292, 2049,    0]])


In [5]:
# Extract the # Load and read the text file
with open('/Net/Groups/BGI/scratch/ppandey/LLMs_Playground/The_Great_Gatsby.txt', 'r', encoding='utf-8') as file:
    txt_file = file.read() 

print(txt_file[:590])

In my younger and more vulnerable years my father gave me some advice
that I’ve been turning over in my mind ever since.

“Whenever you feel like criticizing anyone,” he told me, “just
remember that all the people in this world haven’t had the advantages
that you’ve had.”

He didn’t say any more, but we’ve always been unusually communicative
in a reserved way, and I understood that he meant a great deal more
than that. In consequence, I’m inclined to reserve all judgements, a
habit that has opened up many curious natures to me and also made me
the victim of not a few veteran bores. 


In [6]:
# Load the model
gpt_model = GPT2LMHeadModel.from_pretrained('openai-community/gpt2', pad_token_id=gpt_tokeniser.eos_token_id)
gpt_model = gpt_model.to('cuda')

In [7]:
# Testing the model 
context_str = 'I think Virat Kohli is a great batsman'
# Tokenise the string 
context_tkns = gpt_tokeniser.encode(context_str, return_tensors='pt').to('cuda')
print(context_tkns)

tensor([[   40,   892, 16310,   265, 24754,  4528,   318,   257,  1049, 19553,
           805]], device='cuda:0')


In [8]:
# Feed the context tokens to my model
output = gpt_model.generate(
    inputs=context_tkns,
    max_length=200,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True
)

In [9]:
gpt_tokeniser.decode(output[0], skip_special_tokens=True)

'I think Virat Kohli is a great batsman. I think he\'s a very good player. He\'s got a lot of experience in the game."\n\nKohli, who has been linked with a move to the West Indies, said: "I don\'t know if I\'m going to be playing for England or not, but I\'ve been playing cricket for a long time. It\'s been a good experience for me."'

In [10]:
# Save the generated text
text = gpt_tokeniser.decode(output[0], skip_special_tokens=True)
with open('/Net/Groups/BGI/scratch/ppandey/LLMs_Playground/virat_kohli.txt', 'w') as f:
    f.write(text)

#### Train my model on a custom dataset (The_Great_Gatsby)


In [11]:
# Create the train dataset
train_dataset = TextDataset(
    tokenizer=gpt_tokeniser,
    file_path='/Net/Groups/BGI/scratch/ppandey/LLMs_Playground/The_Great_Gatsby.txt',
    block_size=512,
)
print(train_dataset)

<transformers.data.datasets.language_modeling.TextDataset object at 0x7fbb7f4767a0>




In [12]:
# collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=gpt_tokeniser, 
    mlm=False,
)

In [17]:
# Define Trainer
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=200, # took about 600-700 epochs to get good results
    per_device_train_batch_size=8
)

trainer = Trainer(
    model=gpt_model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [19]:
# Train the model
trainer.train()

Step,Training Loss
500,0.0543




TrainOutput(global_step=600, training_loss=0.05140870491663615, metrics={'train_runtime': 567.3751, 'train_samples_per_second': 53.228, 'train_steps_per_second': 1.058, 'total_flos': 7891019366400000.0, 'train_loss': 0.05140870491663615, 'epoch': 200.0})

In [20]:
context_str = 'I enjoyed the counter-raid so thoroughly that I came back restless.'
context_tkns = gpt_tokeniser.encode(context_str, return_tensors='pt').to('cuda')

In [21]:
output = gpt_model.generate(
    inputs=context_tkns,
    max_length=200,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True
)

In [23]:
text = gpt_tokeniser.decode(output[0], skip_special_tokens=True)
with open('/Net/Groups/BGI/scratch/ppandey/LLMs_Playground/GPT/fitzgerald_output.txt', 'w') as f:
    f.write(text)