<a href="https://colab.research.google.com/github/shikha-aggarwal/wodehouse-generator/blob/main/gpt2_huggingface_GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Step 1. Installations

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

In [None]:
import os
os.chdir('/content/transformers')

!pip install urllib3==1.25.10

!git clone https://github.com/huggingface/transformers

# # Use language modeling version as of April 21st.
!git checkout b1ff0b2ae7d368b7db3a8a8472a29cc195d278d8

!pip install .
!pip install -r ./examples/requirements.txt

##### [Note: If you see errors during the above installation step, restarting the runtime might help.]

#### Step 2. Imports

In [None]:
# Work from the examples directory of the cloned repository; the two imports
# below are the example scripts that live there, not installed packages.
os.chdir('/content/transformers/examples/')

# from changed directory
import run_language_modeling  
import run_generation

# standard ML imports
import torch
import collections
import random
import numpy as np

from transformers import AutoConfig
from transformers import AutoTokenizer
from transformers import AutoModelWithLMHead

# Text processing
import json
from pathlib import Path
from glob import glob
import os
from concurrent.futures import ProcessPoolExecutor
from itertools import chain
import nltk
import re
nltk.download('punkt')

#### Step 3. Get data

In [None]:
# This notebook runs on Colab with its data stored in Google Drive, so mount
# the drive at /content/drive before using any of the Drive paths below.

from google.colab import drive
drive.mount('/content/drive')
import os

### Text Preprocessing

##### Using sliding window of 8 sentences

In [None]:
# The corpus is English prose, so use the English Punkt model. The Polish
# model was trained on a different language and is the wrong choice for
# sentence-splitting English text.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sent_tokenize = sentence_tokenizer.tokenize


def flatten(iterable):
    """Lazily yield the items of each inner iterable, one level deep.

    Returns an ``itertools.chain`` iterator, so the outer iterable is only
    consumed as the result is iterated.
    """
    flattened = chain.from_iterable(iterable)
    return flattened


def preprocess_book(book_txt):
    """Cut the body out of a Project Gutenberg text and normalise whitespace.

    The body is taken from 100 characters after the start of the
    "START OF THIS PROJECT GUTENBERG" marker up to 20 characters before the
    "END OF THIS PROJECT" marker. Every run of whitespace (including
    newlines) is then collapsed to a single space.

    Args:
        book_txt: Full text of the book file as a string.

    Returns:
        The trimmed, single-line body text.

    Raises:
        ValueError: If either marker is missing from ``book_txt``.
    """
    # NOTE(review): the +100 / -20 offsets are fixed heuristics, presumably to
    # clear the remainder of the marker lines -- confirm against real files.
    start_idx = book_txt.index("START OF THIS PROJECT GUTENBERG") + 100
    end_idx = book_txt.index("END OF THIS PROJECT") - 20
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    return re.sub(r"\s+", " ", book_txt[start_idx:end_idx])


def process_book(book_path, window_size=8, min_sentence_chars=16):
    """Turn one book file into overlapping multi-sentence passages.

    The file is read as UTF-8, trimmed with ``preprocess_book`` and split
    into sentences. Sentences shorter than ``min_sentence_chars`` are
    dropped. One passage is produced per remaining sentence: that sentence
    joined with the sentences that follow it, up to ``window_size`` in total
    (passages near the end of the book are therefore shorter).

    Args:
        book_path: Path of the book text file.
        window_size: Maximum number of sentences per passage.
        min_sentence_chars: Minimum length for a sentence to be kept.

    Returns:
        A list of passage strings, or an empty list if the book could not be
        read or parsed (a message is printed in that case).
    """
    try:
        txt = preprocess_book(Path(book_path).read_text("utf-8"))
        sentences = [s for s in sent_tokenize(txt) if len(s) >= min_sentence_chars]
        return [
            " ".join(sentences[start: start + window_size])
            for start in range(len(sentences))
        ]
    except Exception as err:
        # Best-effort by design: one bad book must not abort the whole run.
        # Catch Exception rather than everything, so Ctrl-C / SystemExit still
        # propagate, and report why the book was skipped.
        print(f"Could not parse \n{book_path}\n")
        print(f"  reason: {err!r}")
        return []

In [None]:
# Uncomment on first run ONLY. Once you have the training file, comment it out again.
# The code below writes the passages returned by process_book() for every file in
# train_data_directory to sliding_train_data, joined with newlines.
#
# NOTE(review): each flush writes "\n".join(buffer) with no trailing newline, so if
# more than BUFFER_SIZE passages are produced, the last passage of one flush and the
# first passage of the next land on the same line -- confirm whether that matters.

# train_data_directory = '/content/drive/My Drive/Colab Notebooks/wodehouse_generator/data/all_novels/'
# sliding_train_data = '/content/drive/My Drive/Colab Notebooks/wodehouse_generator/data/train_sliding.txt'

# books = []
# for filename in os.listdir(train_data_directory):
#   file_path = os.path.join(train_data_directory, filename)
#   books.append(file_path)

# buffer, BUFFER_SIZE = [], 100000
# with open(sliding_train_data, "w") as file:
#   for i, sentence in enumerate(flatten(process_book(f) for f in books)):
#     if len(buffer) >= BUFFER_SIZE:
#       file.write("\n".join(buffer))
#       buffer.clear()
#       print(i, end="\r")
#     buffer.append(sentence)
#   if len(buffer) > 0:
#     file.write("\n".join(buffer))
#     buffer.clear()

In [None]:
# Preview the beginning of the sliding-window training file.
!head /content/drive/My\ Drive/Colab\ Notebooks/wodehouse_generator/data/train_sliding.txt

In [None]:
# wc prints the number of lines, words, and bytes, in that order

!wc /content/drive/My\ Drive/Colab\ Notebooks/wodehouse_generator/data/train_sliding.txt

In [None]:
## Takes a LONG time. Do NOT run this cell once you have a trained model handy.

# !python run_language_modeling.py \
#     --output_dir='/content/drive/My Drive/finetuned_models/wodehouse' \
#     --model_type=gpt2 \
#     --model_name_or_path=gpt2-medium \
#     --save_total_limit=5 \
#     --num_train_epochs=1.0 \
#     --do_train \
#     --evaluate_during_training \
#     --logging_steps=500 \
#     --save_steps=1500 \
#     --train_data_file=/content/drive/My\ Drive/Colab\ Notebooks/wodehouse_generator/data/train_sliding.txt \
#     --do_eval \
#     --eval_data_file=/content/drive/My\ Drive/Colab\ Notebooks/wodehouse_generator/data/validate.txt \
#     --per_gpu_train_batch_size=2 \
#     --per_gpu_eval_batch_size=2 \
#     --block_size=128 \
#     --gradient_accumulation_steps=5 \
#     --overwrite_output_dir # too lazy to delete previous failed run

### Compute perplexity of a dataset.
This section shows how to compute perplexity of a dataset according to either the pre-trained or your fine-tuned language model. While this is possible to do by calling `run_language_modeling.py` on the command-line as above, we'll instead call the Python functions directly.

#### Look at what checkpoints are available
Run `ls` to look at what checkpoints have been saved. You'll want to set `CHECKPOINT_PATH` below to one of these in order to evaluate the model weights saved in that checkpoint.

In [None]:
# List the contents of the fine-tuning output directory.
!ls '/content/drive/My Drive/finetuned_models/wodehouse'

#### Helper functions

In [None]:
def load_model(args):
  """Build the LM-head model named by ``args.model_name_or_path``.

  The configuration and weights are both loaded from that name/path, the
  weights are treated as a TensorFlow checkpoint when the path contains
  ".ckpt", and the model is moved to ``args.device`` before being returned.
  """
  checkpoint = args.model_name_or_path
  is_tf_checkpoint = ".ckpt" in checkpoint

  model_config = AutoConfig.from_pretrained(checkpoint, cache_dir=None)
  lm_model = AutoModelWithLMHead.from_pretrained(
      checkpoint,
      from_tf=is_tf_checkpoint,
      config=model_config,
      cache_dir=None
  )

  lm_model.to(args.device)
  return lm_model

def set_seed(seed):
  """Seed Python's, NumPy's and PyTorch's random number generators.

  Only the ``seed`` argument is used. The function no longer reads a global
  ``args`` object, so it works regardless of which cells ran before it and
  always applies the seed it was given.

  Args:
    seed: Integer seed applied to every generator.
  """
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  # Seed all CUDA devices too when a GPU is present.
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

def do_perplexity_eval(args, model, data_file_path):
  """Evaluate ``model`` on the text file at ``data_file_path``.

  Reseeds the RNGs from ``args.seed``, points ``args.eval_data_file`` at the
  given file, caps ``args.block_size`` at the tokenizer's maximum length and
  returns whatever ``run_language_modeling.evaluate`` returns. Note that
  ``args`` is modified in place.
  """
  set_seed(args.seed)
  args.eval_data_file = data_file_path

  tok = AutoTokenizer.from_pretrained(args.model_name_or_path, cache_dir=None)

  # Never ask for blocks longer than the tokenizer supports.
  max_block = tok.max_len
  args.block_size = min(args.block_size, max_block)

  return run_language_modeling.evaluate(args, model, tok, prefix="")

#### How is the trained model doing?

In [None]:
class DictToObject(object):
    """Expose the keys of a dictionary as attributes of an object.

    Values that are themselves dictionaries are wrapped recursively, so
    ``DictToObject({'a': {'b': 1}}).a.b`` evaluates to ``1``. All other
    values are stored unchanged.
    """

    def __init__(self, dictionary):
        converted = {}
        for key, element in dictionary.items():
            if isinstance(element, dict):
                element = DictToObject(element)
            converted[key] = element
        vars(self).update(converted)

In [None]:
# Set this to the checkpoint you want to evaluate, or to "gpt2-medium" to
# evaluate the pre-trained model without finetuning.
CHECKPOINT_PATH = '/content/drive/My Drive/finetuned_models/wodehouse/checkpoint-15000'
OUTPUT_PATH = '/content/drive/My Drive/finetuned_models/wodehouse/output_checkpoint_15000'

# Set this to the list of text files you want to evaluate the perplexity of.
DATA_PATHS = ["/content/drive/My Drive/Colab Notebooks/wodehouse_generator/data/validate.txt",
              "/content/drive/My Drive/Colab Notebooks/wodehouse_generator/data/test.txt"]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print("Running on device: ", device)

# A plain dict is all that is needed here: no default factory was ever supplied,
# so defaultdict added nothing. DictToObject exposes the keys as attributes,
# which is how the helper functions read them.
args = DictToObject(dict(
  model_name_or_path=CHECKPOINT_PATH,
  output_dir=OUTPUT_PATH,
  block_size=128,
  local_rank=-1,
  eval_batch_size=2,
  per_gpu_eval_batch_size=2,
  n_gpu=n_gpu,
  mlm=False,
  device=device,
  line_by_line=False,
  overwrite_cache=None,
  model_type='gpt2',
  seed=42,
))

# Load the model once and reuse it for every evaluation file.
model = load_model(args)

for data_path in DATA_PATHS:
  eval_results = do_perplexity_eval(args, model, data_path)
  perplexity = eval_results['perplexity']
  print(f'{perplexity} is the perplexity of {data_path} according to {CHECKPOINT_PATH}')

### Generate samples
The following code generates text samples that are continuations of a provided prompt.

In [None]:
def generate_samples(args, model, prompt_text):
  """Sample continuations of ``prompt_text`` from ``model``.

  Args:
    args: Settings object. Reads ``seed``, ``model_name_or_path``, ``device``,
      ``length``, ``temperature``, ``k``, ``p``, ``repetition_penalty``,
      ``num_return_sequences`` and ``stop_token``.
    model: Model whose ``generate`` method produces the samples.
    prompt_text: Text that every sample starts with.

  Returns:
    A list of strings, one per generated sequence, each consisting of
    ``prompt_text`` followed by the generated continuation.
  """
  set_seed(args.seed)

  tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, cache_dir=None)

  encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
  encoded_prompt = encoded_prompt.to(args.device)

  output_sequences = model.generate(
      input_ids=encoded_prompt,
      # args.length counts NEW tokens, so add the prompt length on top.
      max_length=args.length + len(encoded_prompt[0]),
      temperature=args.temperature,
      top_k=args.k,
      top_p=args.p,
      repetition_penalty=args.repetition_penalty,
      do_sample=True,
      num_return_sequences=args.num_return_sequences,
  )

  # Remove the batch dimension when returning multiple sequences
  if len(output_sequences.shape) > 2:
    output_sequences.squeeze_()

  # The decoded prompt is the same for every sample, so measure it once.
  decoded_prompt_length = len(
      tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True))

  generated_sequences = []

  for generated_sequence in output_sequences:
    # Decode text
    text = tokenizer.decode(generated_sequence.tolist(), clean_up_tokenization_spaces=True)

    # Remove all text after the stop token. Only truncate when the token is
    # actually present: str.find returns -1 otherwise, and slicing with -1
    # would silently drop the last character of the sample.
    if args.stop_token:
      stop_index = text.find(args.stop_token)
      if stop_index >= 0:
        text = text[:stop_index]

    # Strip the decoded prompt, then put the original prompt text back in
    # front so the sample starts with exactly what the caller supplied.
    text = text[decoded_prompt_length:]
    generated_sequences.append(prompt_text + text)

  return generated_sequences

In [None]:
# Generation uses the checkpoint named by CHECKPOINT_PATH. Point that variable
# at another checkpoint, or at "gpt2-medium" to generate with the pre-trained
# model without finetuning.

def generate_wodehouse_samples(prompt, model=None):
  """Generate text samples that continue ``prompt``.

  You should try out other prompts as well as no prompt at all.

  Args:
    prompt: Text that every sample starts with.
    model: Optional already-loaded model to reuse. It must already be on the
      device selected below. When omitted, the model is loaded from
      CHECKPOINT_PATH on every call, which is slow for repeated use.

  Returns:
    The list of generated strings returned by ``generate_samples``.
  """
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  n_gpu = torch.cuda.device_count()
  print("Running on device: ", device)

  # A plain dict is enough here; no default factory was ever supplied to defaultdict.
  args = DictToObject(dict(
    model_name_or_path=CHECKPOINT_PATH,
    output_dir=OUTPUT_PATH,
    n_gpu=n_gpu,
    mlm=False,
    device=device,
    model_type='gpt2',
    seed=42,
    stop_token=None, # Set this if your dataset has a special word that indicates the end of a text.
    temperature=1.0,  # temperature sampling. Set this to temperature=1.0 to not use temperature.
    k=50,  # k for top-k sampling. Set this to k=0 to not use top-k.
    p=1.0,  # p for nucleus sampling. Set this to p=1.0 to not use nucleus sampling.
    repetition_penalty=None,
    length=900,  # Number of tokens to generate.
    num_return_sequences=3,  # Number of independently computed samples to generate.
  ))

  if model is None:
    model = load_model(args)

  return generate_samples(args, model, prompt)


In [None]:
# Sample continuations of an opening sentence and print each one under a banner.
sequences = generate_wodehouse_samples("Seated with his wife at breakfast on the veranda which overlooked the rolling lawns and leafy woods of his charming Sussex home, Geoffrey Windlebird, the great financier, was enjoying the morning sun to the full. ")

for idx, sequence in enumerate(sequences):
  print('\n====== GENERATION {} ======'.format(idx), sequence, sep='\n')

In [None]:
# Same demonstration with a second prompt; each sample is printed under a banner.
sequences = generate_wodehouse_samples("It was in Oxford Street at the hour when women come up from the suburbs to shop; and he was standing among the dogs and commissionaires outside Selfridge’s.")

for idx, sequence in enumerate(sequences):
  print('\n====== GENERATION {} ======'.format(idx), sequence, sep='\n')

10/23/2020 21:16:17 - INFO - transformers.configuration_utils -   loading configuration file /content/drive/My Drive/finetuned_models/wodehouse/checkpoint-15000/config.json
10/23/2020 21:16:17 - INFO - transformers.configuration_utils -   Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_layer": 24,
  "n_positions": 1024,
  "n_special": 0,
  "predict_special_tokens": true,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "vocab_size": 50257
}

10/23/2020 21:16:

Running on device:  cuda


10/23/2020 21:16:30 - INFO - transformers.configuration_utils -   loading configuration file /content/drive/My Drive/finetuned_models/wodehouse/checkpoint-15000/config.json
10/23/2020 21:16:30 - INFO - transformers.configuration_utils -   Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_layer": 24,
  "n_positions": 1024,
  "n_special": 0,
  "predict_special_tokens": true,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "vocab_size": 50257
}

10/23/2020 21:16:


It was in Oxford Street at the hour when women come up from the suburbs to shop; and he was standing among the dogs and commissionaires outside Selfridge’s. He was feeling that strange exhilaration, the thrill which comes to those who have done well at school, the triumphal glow which leads down the aisle at the Savoy and reaches even to the children’s section of department stores.
All he had to do was to hang about and wait, and soon the other fellow would get busy and do it at him, and it would be his triumph. As for me, I would merely sit there. He had shown me hitherto nothing but a bleak future. In fact, at the present moment I rather fancy that my existence in the neighbourhood would be a trifle dull. It seemed to me that, in addition to boredom, the spectacle of Baxter would increase the natural tenderness with which the thing happened to him. It was in Oxford Street at the hour when women come up from the suburbs to shop; and he was standing among the dogs and commissionaires 