# Text Generation using GPT (Using Huggingface)

## Project Setup

In [1]:
!pip install -q transformers

In [2]:
import torch
import shutil
from torch.utils.data import Dataset, random_split
from transformers import Trainer, TrainingArguments, GPTNeoForCausalLM, GPT2Tokenizer


from google.colab import drive


ModuleNotFoundError: No module named 'torch'

## Data Preparation

In [None]:
# Load data into colab
!wget https://huggingface.co/datasets/vindhyamganti/Ted/resolve/main/transcripts.txt

--2023-06-20 15:41:03--  https://huggingface.co/datasets/vindhyamganti/Ted/resolve/main/transcripts.txt
Resolving huggingface.co (huggingface.co)... 108.138.64.127, 108.138.64.87, 108.138.64.67, ...
Connecting to huggingface.co (huggingface.co)|108.138.64.127|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/3e/22/3e2292ba59d5756686370a24a97902a7f9ec0acb947e8abec79a3ab6fc563d90/9fc6c11b5576f38d762d797d543f4ba0aea3ce5d0cda952838a3bf9852e33c3f?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27transcripts.txt%3B+filename%3D%22transcripts.txt%22%3B&response-content-type=text%2Fplain&Expires=1687534864&Policy=eyJTdGF0ZW1lbnQiOlt7IlJlc291cmNlIjoiaHR0cHM6Ly9jZG4tbGZzLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzNlLzIyLzNlMjI5MmJhNTlkNTc1NjY4NjM3MGEyNGE5NzkwMmE3ZjllYzBhY2I5NDdlOGFiZWM3OWEzYWI2ZmM1NjNkOTAvOWZjNmMxMWI1NTc2ZjM4ZDc2MmQ3OTdkNTQzZjRiYTBhZWEzY2U1ZDBjZGE5NTI4MzhhM2JmOTg1MmUzM2MzZj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPS

In [None]:
# Connects colab to google drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
shutil.copy("/content/transcripts.txt.2","drive/MyDrive/AICamp/nlp")


'drive/MyDrive/AICamp/nlp/transcripts.txt.2'

In [None]:
# Read the text file and returns list of lines in text
def read_file(file_path):
    with open(file_path) as f:
        lines = [line for line in f]
        # lines.remove("")
    return lines


In [None]:
file_path = "drive/MyDrive/AICamp/nlp/transcripts.txt"

texts = read_file(file_path)
print(texts[1])
# transcripts = []
# transcript = []
# for text in texts:
#   if len(text)>1:
#     transcript.append(text)
#   else:
#     transcripts.append(''.join(transcript))
#     transcript = []

# Remove unnecessary texts


"Good morning. How are you?It's been great, hasn't it? I've been blown away by the whole thing. In fact, I'm leaving.There have been three themes running through the conference which are relevant to what I want to talk about. One is the extraordinary evidence of human creativity in all of the presentations that we've had and in all of the people here. Just the variety of it and the range of it. The second is that it's put us in a place where we have no idea what's going to happen, in terms of the future. No idea how this may play out.I have an interest in education. Actually, what I find is everybody has an interest in education. Don't you? I find this very interesting. If you're at a dinner party, and you say you work in education - Actually, you're not often at dinner parties, frankly.If you work in education, you're not asked.And you're never asked back, curiously. That's strange to me. But if you are, and you say to somebody, you know, they say, ""What do you do?"" and you say you 

In [None]:
# Prepare sonnets
datas = texts[2:-1]
print(len(datas))
for data in datas:
  if len(data)<1:
    datas.remove(data)
print(len(datas))

4932
4932


In [None]:
# Custome dataset class to load dataset
class TranscriptsDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            # Encode the descriptions using the GPT-Neo tokenizer
            encodings_dict = tokenizer('<|startoftext|>'
                                        + txt +
                                        '<|endoftext|>',
                                        truncation=True,
                                        max_length=max_length,
                                            padding="max_length")
            input_ids = torch.tensor(encodings_dict['input_ids'])
            self.input_ids.append(input_ids)
            mask = torch.tensor(encodings_dict['attention_mask'])
            self.attn_masks.append(mask)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

## Initialize tokenizer, model

In [None]:
# Set the random seed to a fixed value to get reproducible results
torch.manual_seed(42)

# Download the pre-trained GPT-Neo model's tokenizer
# Add the custom tokens denoting the beginning and the end
# of the sequence and a special token for padding
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125M",
                            bos_token='<|startoftext|>',
                            eos_token='<|endoftext|>',
                            pad_token='<|pad|>')

# Download the pre-trained GPT-Neo model and transfer it to the GPU
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M").cuda()

# Resize the token embeddings because we've just added 3 new tokens
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 768)

## Train/Test Split data

In [None]:
# max_length = max([len(tokenizer.encode(sonnet)) for transcript in datas])

# Load dataset
dataset = TranscriptsDataset(texts, tokenizer, 2000)

# Split data into train/val
train_size = int(0.9 * len(dataset))

train_data, val_data = random_split(dataset, [train_size, len(dataset) - train_size])

# max_length

In [None]:
tokenizer.batch_decode(val_data[0])

['<|startoftext|> "Do you know that we have 1.4 million cellular radio masts deployed worldwide? And these are base stations. And we also have more than five billion of these devices here. These are cellular mobile phones. And with these mobile phones, we transmit more than 600 terabytes of data every month. This is a 6 with 14 zeroes - a very large number. And wireless communications has become a utility like electricity and water. We use it everyday. We use it in our everyday lives now - in our private lives, in our business lives. And we even have to be asked sometimes, very kindly, to switch off the mobile phone at events like this for good reasons. And it\'s this importance why I decided to look into the issues that this technology has, because it\'s so fundamental to our lives.And one of the issues is capacity. The way we transmit wireless data is by using electromagnetic waves - in particular, radio waves. And radio waves are limited. They are scarce; they are expensive; and we 

## Train Model

In [None]:
!pip install accelerate -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Here I will pass the output directory where
# the model predictions and checkpoints will be stored,
# batch sizes for the training and validation steps,
# and warmup_steps to gradually increase the learning rate
learning_rates = [5e-5, 3e-5, 1e-5]


for learning_rate in learning_rates:

    training_args = TrainingArguments(output_dir=f'./results_{learning_rate}',
                                      num_train_epochs=5,
                                      logging_steps=1000,
                                      save_steps=1000,
                                      evaluation_strategy='steps',
                                      eval_steps=1000,
                                      per_device_train_batch_size=2,
                                      per_device_eval_batch_size=2,
                                      warmup_steps=100,
                                      learning_rate=learning_rate,
                                      weight_decay=0.01,
                                      logging_dir=f'./logs_{learning_rate}')

    trainer = Trainer(model=model, args=training_args,
                      train_dataset=train_data,
                      eval_dataset=val_data,
                      # This custom collate function is necessary
                      # to built batches of data
                      data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                  'attention_mask': torch.stack([f[1] for f in data]),
                  'labels': torch.stack([f[0] for f in data])})

    # Start training process!
    print(f"Training result for learning rate: {learning_rate}")
    trainer.train()
    print("\n\n")

Training result for learning rate: 5e-05




Step,Training Loss,Validation Loss


BAsed on the results above, it looks like model trained with learning rate = 5e-5 is more promising than others.

In [None]:
training_args = TrainingArguments(output_dir=f'./results',
                                      num_train_epochs=5,
                                      logging_steps=1000,
                                      save_steps=5000,
                                      evaluation_strategy='steps',
                                      eval_steps=1000,
                                      per_device_train_batch_size=2,
                                      per_device_eval_batch_size=2,
                                      warmup_steps=100,
                                      learning_rate=5e-5,
                                      weight_decay=0.01,
                                      logging_dir=f'./logs')

trainer = Trainer(model=model, args=training_args,
                  train_dataset=train_data,
                  eval_dataset=val_data,
                  # This custom collate function is necessary
                  # to built batches of data
                  data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
              'attention_mask': torch.stack([f[1] for f in data]),
              'labels': torch.stack([f[0] for f in data])})

# Start training process!
trainer.train()


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


In [None]:
# Save model in the specified file path
trainer.save_model("drive/MyDrive/AICamp/nlp/")

In [None]:
tokenizer.save_pretrained("drive/MyDrive/AICamp/nlp/")

('drive/MyDrive/AICamp/nlp/tokenizer_config.json',
 'drive/MyDrive/AICamp/nlp/special_tokens_map.json',
 'drive/MyDrive/AICamp/nlp/vocab.json',
 'drive/MyDrive/AICamp/nlp/merges.txt',
 'drive/MyDrive/AICamp/nlp/added_tokens.json')

## Checking Model Output

In [None]:
generated = tokenizer("<|startoftext|>", return_tensors="pt").input_ids.cuda()
sample_outputs = model.generate(generated, do_sample=True, top_k=50,
                                # bos_token='<|startoftext|>',
                                # eos_token='<|endoftext|>', pad_token='<|pad|>',
                                max_length=300, top_p=0.95, temperature=1.9, num_return_sequences=20)
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: 
1: 
2: Where should that evil cease, or those things would return to their owners'?
To prove their good grace (That thou must find worthy still remains unhearest, thou must seek this end, and that this which thine (Nor if this be so thy worst must be my joy)?
Let me return not once nor hold still an unseeing eye
A pity where's this coming help bestow thee?
Then thine eye be thy back-ticks where it most hinds theego
Where reason guides both minds whence thou wost and me to seek!?


3: In the earth it holds but gentleclips and jewels,
Lest things that belong alone, should themselves boast,
The precious parts so grounded to nature'er line,   
Without having checked this abundance themselves bear,   
SIXth many miles in motion from the desert place.   
Nor are stars in all kinds of stars less grounded,
They dwell on me by giving account to my skill:
And though some are not more great or more than others,
More even or even all their compass height than others.
All this hath in the world

In [None]:
! transformers-cli env

2023-03-24 22:53:44.974369: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
2023-03-24 22:53:44.974575: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
2023-03-24 22:53:55.043416: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:42] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
  jax.tree_util.register_keypaths(

Copy-and-paste the text below in your GitHub issue and FILL OUT the two last points.

- `transformers` 

## Upload model to huggingface

In [None]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from huggingface_hub import HfApi

api = HfApi()

In [None]:
# Create your repo first to upload the model
api.create_repo(repo_id="gpt2-sonnet-generators")

RepoUrl('https://huggingface.co/niki-stha/gpt2-sonnet-generators', endpoint='https://huggingface.co', repo_type='model', repo_id='niki-stha/gpt2-sonnet-generators')

In [None]:
# Upload your model to huggingface. You can clone the repo anytime to use the model.
import os

model_pth = "drive/MyDrive/AICamp/nlp/models"

files = os.listdir(model_pth)
for fi in files:
    print(os.path.join(model_pth, fi))

    api.upload_file(
        path_or_fileobj=os.path.join(model_pth, fi),
        path_in_repo=fi,
        repo_id="niki-stha/gpt2-sonnet-generators",
        repo_type="model",
    )

drive/MyDrive/AICamp/nlp/models/config.json
drive/MyDrive/AICamp/nlp/models/generation_config.json
drive/MyDrive/AICamp/nlp/models/pytorch_model.bin


Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/551M [00:00<?, ?B/s]

drive/MyDrive/AICamp/nlp/models/training_args.bin


Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

drive/MyDrive/AICamp/nlp/models/tokenizer_config.json
drive/MyDrive/AICamp/nlp/models/special_tokens_map.json
drive/MyDrive/AICamp/nlp/models/added_tokens.json
drive/MyDrive/AICamp/nlp/models/vocab.json
drive/MyDrive/AICamp/nlp/models/merges.txt
