In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
!wget https://github.com/sadia-sust/dataset-finetune-gpt2/raw/main/Shakespeare-Writings.zip
!unzip Shakespeare-Writings.zip
import nltk
nltk.download('punkt')

In [None]:
import re
import json
from sklearn.model_selection import train_test_split
from nltk.tokenize import sent_tokenize

# Pre-processing: each file is split into sentences, the whitespace inside every
# sentence is collapsed, and the sentences are re-joined into one string per file.

# Directory the archive is unpacked into (Colab layout).
DATA_DIR = "/content/Shakespeare-Writings/"

files = ['Macbeth.txt', 'THE-TRAGEDY-OF-TITUS.txt', 'THE-LIFE-AND-DEATH-OF-KING-RICHARD-THE SECOND.txt', 'romeo-juliet.txt', 'A-MIDSUMMER-NIGHT’S-DREAM.txt',
         'All-Well-That-Ends-Well.txt',
         'The-Tragedy-of-Hamlet.txt', 'The-Tragedy-of-Julius-Caesar.txt', 'The-Tragedy-of-King-Lear.txt', 'The-Tragedy-of-King-Richard.txt',
         'The-Tragedy-of-Romeo-and-Juliet.txt', 'Measure-for-Measure.txt', 'Much-Ado-about-Nothing.txt', 'OTHELLO-THE-MOOR-OF-VENICE.txt', 'THE-WINTER’S-TALE.txt',
         'The-Comedy-of-Errors.txt', 'The-Merchant-of-Venice.txt', 'The-Taming-of-the-Shrew.txt', 'The-Tempest.txt', 'Twelfth-Night.txt', 'The-Sonnets.txt']


def _clean_text(raw_text):
  """Return `raw_text` as whitespace-normalised sentences joined by one space.

  Every run of whitespace (including line breaks) inside a sentence becomes a
  single space. Each sentence is emitted exactly once; the previous version
  appended the raw sentence right after its cleaned copy, which duplicated the
  whole corpus and glued sentences together without a separator.
  """
  sentences = (re.sub(r'\s+', ' ', sentence.strip()) for sentence in sent_tokenize(raw_text))
  return ' '.join(sentence for sentence in sentences if sentence)


processed_data = []  # one cleaned string per input file, in `files` order
word_count = 0       # running total of whitespace-separated words
for file in files:
  file_path = DATA_DIR + file
  # Explicit encoding so the read does not depend on the platform default.
  # NOTE(review): assumes the corpus files are UTF-8 — confirm.
  with open(file_path, encoding="utf-8") as f:
    data = f.read()
  cleaned_data = _clean_text(data)
  processed_data.append(cleaned_data)
  single_file_wc = len(cleaned_data.split())
  print(f'File name: {file}, word count: {single_file_wc}')
  word_count += single_file_wc

print(len(processed_data))
print('Total word tokens: ' + str(word_count))
print("reading done")

#modified build text files method
def build_text_files(data_files, dest_path):
  """Write every text in `data_files` into one file at `dest_path`.

  Args:
    data_files: iterable of strings, one per document.
    dest_path: path of the text file to create; an existing file is overwritten.

  Documents are separated by a newline so the last word of one document does
  not fuse with the first word of the next. The number of characters written
  is printed, labelled with `dest_path` because this function is used for
  both the train and the test split.
  """
  format_data = "\n".join(data_files)
  print(f"{dest_path} length: {len(format_data)}")
  # `with` closes the handle even if the write raises.
  with open(dest_path, 'w', encoding='utf-8') as f:
    f.write(format_data)

# Hold out 15% of the files for evaluation. The split is made per file, so each
# entry of `processed_data` lands entirely in either the train or the test set.
# A fixed random_state makes the split, and the files written from it,
# reproducible across runs.
train, test = train_test_split(processed_data, test_size=0.15, random_state=42)

build_text_files(train,'train_dataset.txt')
build_text_files(test,'test_dataset.txt')

# These lengths are numbers of files, not characters.
print("Train dataset length: "+str(len(train)))
print("Test dataset length: "+ str(len(test)))


In [None]:
# Sanity check: line, word and byte counts of the two files written above.
!wc train_dataset.txt
!wc test_dataset.txt


In [None]:
from transformers import GPT2Tokenizer

# Text files holding the train / evaluation splits of the corpus.
train_path, test_path = 'train_dataset.txt', 'test_dataset.txt'

# Tokenizer belonging to the pretrained "gpt2" checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [None]:
#as is in tutorial
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build the train/eval datasets and the collator used for fine-tuning.

    Args:
        train_path: path of the training text file.
        test_path: path of the evaluation text file.
        tokenizer: tokenizer handed to both datasets and to the collator.
        block_size: block size passed to each TextDataset. Defaults to 128,
            the value that was previously hard-coded for both datasets.

    Returns:
        Tuple ``(train_dataset, test_dataset, data_collator)``.
    """
    # NOTE(review): recent transformers releases flag TextDataset as deprecated
    # in favour of the `datasets` library — confirm against the installed version.
    train_dataset, test_dataset = (
        TextDataset(tokenizer=tokenizer, file_path=path, block_size=block_size)
        for path in (train_path, test_path)
    )

    # mlm=False switches the collator's masked-language-modelling mode off.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    return train_dataset, test_dataset, data_collator

# Turn both text files into block datasets and create the matching batch collator.
train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub; the training arguments further down
# set push_to_hub=True and the last cell calls trainer.push_to_hub().
notebook_login()

In [None]:
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead

# AutoModelWithLMHead is deprecated in transformers; AutoModelForCausalLM is the
# task-specific auto class for decoder-only checkpoints such as GPT-2.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")

# Compared with the tutorial, save_strategy, evaluation_strategy, push_to_hub,
# the step values and logging_strategy were modified or added.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./gpt2-shakespeare", # the output directory
    save_strategy="epoch",           # write a checkpoint at the end of every epoch
    evaluation_strategy = "steps",   # evaluate every `eval_steps` update steps
    push_to_hub=True,
    overwrite_output_dir=True, # overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=64,  # batch size for evaluation
    eval_steps = 250, # number of update steps between two evaluations
    save_steps= 550, # only consulted when save_strategy="steps"; saving is per epoch here
    warmup_steps=350,# number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
    logging_strategy="steps"
    )

# Trainer wiring is unchanged from the tutorial.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,

)

In [None]:
# Run the fine-tuning loop, then save the resulting model through the trainer
# (the next cell loads it back from ./gpt2-shakespeare).
trainer.train()
trainer.save_model()

In [None]:
from transformers import pipeline

# Text-generation pipeline that loads the model from ./gpt2-shakespeare and the
# stock "gpt2" tokenizer; generation is limited via max_length = 300.
story = pipeline('text-generation',model='./gpt2-shakespeare', tokenizer='gpt2', max_length = 300)


In [None]:
# Sample a continuation for a play-title prompt.
story("romeo and juliet ")

In [None]:
# Sample a continuation for an Early Modern English phrase.
story("how art thou")

In [None]:
# Sample a continuation for a genre-style prompt.
story("a tragedy story ")

In [None]:
# Upload the trainer's model to the Hugging Face Hub (requires the login above).
trainer.push_to_hub()