In [1]:
!pip install transformers



In [1]:
# importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import math
from tqdm import tqdm
import torch
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, \
                        Trainer, TrainingArguments, GPT2LMHeadModel, pipeline
import warnings
warnings.filterwarnings('ignore')

### Dataset preparation

In [2]:
# preparing dataset for usage in model
# Load the raw corpus as a list with one stripped entry per non-blank line.
with open('data/gameofthrones.txt', encoding='utf-8') as f:
    # Filtering on the stripped text (instead of `line != '\n'`) also drops
    # whitespace-only lines, which would otherwise survive as empty strings.
    lines = [stripped for stripped in (line.strip() for line in f) if stripped]

# Hold out 20% of the lines for evaluation. Note that both results are plain
# lists of strings, not DataFrames. A fixed seed keeps the shuffled split
# identical across kernel restarts so downstream numbers are reproducible.
train_df, test_df = train_test_split(lines, test_size=0.2, random_state=42)

def clean(s):
    """Return ``s`` as a trimmed string with every whitespace character turned into a space.

    The value is converted with ``str`` first, so non-string inputs are accepted.
    Whitespace runs are not collapsed: each whitespace character becomes one space.
    """
    trimmed = str(s).strip()
    whitespace = re.compile(r"\s")
    return whitespace.sub(" ", trimmed)

def build_dataset(df, dest_path):
    """Write the cleaned entries of ``df`` to ``dest_path`` as one line of text.

    Args:
        df: Iterable of text entries; each one is passed through ``clean``.
        dest_path: Path of the UTF-8 text file to create or overwrite.

    Entries are joined with a single space. Writing them back-to-back fused the
    last word of one entry with the first word of the next (e.g. ``"end.Start"``).
    Entries that are empty after cleaning are skipped so they do not produce
    doubled separators.
    """
    cleaned = (clean(s) for s in df)
    with open(dest_path, 'w', encoding='utf-8') as f:
        f.write(" ".join(entry for entry in cleaned if entry))

# Materialise each split as its own text file under data/.
build_dataset(df=train_df, dest_path='data/train_dataset.txt')
build_dataset(df=test_df, dest_path='data/test_dataset.txt')

### Datasets, Tokenizer and Model initializations

In [3]:
# create tokenizer and dataset paths
# Locations of the train/test text files written during dataset preparation.
train_path = 'data/train_dataset.txt'
test_path = 'data/test_dataset.txt'

# Pretrained GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def load_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build the train/test ``TextDataset`` objects and the language-modeling collator.

    Args:
        train_path: Path of the training text file.
        test_path: Path of the evaluation text file.
        tokenizer: Tokenizer handed to both datasets and to the collator.
        block_size: ``block_size`` passed to each ``TextDataset``. Defaults to
            128, the value that was previously hard-coded.

    Returns:
        Tuple ``(train_dataset, test_dataset, data_collator)``.
    """
    # Both splits are built the same way; only the file differs.
    train_dataset, test_dataset = (
        TextDataset(tokenizer=tokenizer, file_path=path, block_size=block_size)
        for path in (train_path, test_path)
    )

    # mlm=False: no masked-language-model masking is applied by the collator.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    return train_dataset, test_dataset, data_collator

# Build both datasets and the collator from the prepared text files.
train_dataset, test_dataset, data_collator = load_dataset(
    train_path=train_path,
    test_path=test_path,
    tokenizer=tokenizer,
)

In [5]:
# create model and initialize training parameters
# Start fine-tuning from the base pretrained "gpt2" weights. "gpt2-got" is this
# notebook's own output directory (see `output_dir` below); it does not exist on
# a fresh run, so loading from it only works after an earlier training run.
model = GPT2LMHeadModel.from_pretrained("gpt2")

training_args = TrainingArguments(
    output_dir="./gpt2-got",          # checkpoints are written under this directory
    overwrite_output_dir=True,
    num_train_epochs=1,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    # NOTE(review): presumably `eval_steps` only matters when step-based
    # evaluation is enabled; the option name for that differs between
    # transformers versions -- confirm against the installed version.
    eval_steps=400,
    save_steps=800,
    warmup_steps=500)

In [6]:
# let's look at the model's architecture
# A bare expression as the last line of a cell displays its repr as the cell output.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

### Training

In [None]:
# create a trainer object
# Wire the model, training arguments, datasets and collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

In [None]:
# run training process
train_output = trainer.train()

# Persist the final fine-tuned weights to `training_args.output_dir`
# ("./gpt2-got"). The evaluation and inference cells load the model from that
# directory with `from_pretrained` / `pipeline`, so it must contain a saved
# model and not only intermediate checkpoint sub-folders.
trainer.save_model()

# Keep the training summary as the cell's displayed output.
train_output

### Evaluation

In [None]:
# evaluate test dataset
# Uses the `eval_dataset` passed to the Trainer (the test split); the returned
# value is displayed as the cell output.
trainer.evaluate()

In [None]:
# prepare dataset for calculating perplexity
with open('data/test_dataset.txt', encoding='utf-8') as f:
    # Read the whole file. `readlines()[0]` only returned the first line, which
    # silently dropped everything after the first newline and raised IndexError
    # on an empty file.
    test = f.read()

# Tokenize the full test text once; `calc_perplexity` slides a window over it.
encodings = tokenizer(test, return_tensors='pt')

# calculate perplexity
def calc_perplexity(model_path, device=None, stride=512):
    """Print the sliding-window perplexity of a GPT-2 model on the test text.

    The tokenized text is read from the module-level ``encodings`` variable.

    Args:
        model_path: Name or directory passed to ``GPT2LMHeadModel.from_pretrained``.
        device: Torch device to run on. Defaults to ``'cuda'`` when CUDA is
            available and ``'cpu'`` otherwise (previously hard-coded to ``'cuda'``,
            which failed on CPU-only machines).
        stride: Number of new tokens scored per window (previously hard-coded to 512).

    Raises:
        ValueError: If ``encodings`` contains no tokens.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = GPT2LMHeadModel.from_pretrained(model_path).to(device)
    max_length = model.config.n_positions

    all_ids = encodings.input_ids
    seq_len = all_ids.size(1)
    if seq_len == 0:
        raise ValueError('encodings is empty; there is nothing to score')

    nlls = []
    for i in tqdm(range(0, seq_len, stride)):
        # Each window ends `stride` tokens after `i` and reaches back up to
        # `max_length` tokens so the scored tokens have context.
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, seq_len)
        trg_len = end_loc - i    # may be different from stride on last loop
        input_ids = all_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        # Only the last `trg_len` tokens keep their labels; the context tokens
        # before them are set to -100.
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            # outputs[0] is the loss; scale it by the number of scored tokens.
            nlls.append(outputs[0] * trg_len)

    # `seq_len` equals the final `end_loc`, without relying on a loop variable
    # after the loop has finished.
    ppl = torch.exp(torch.stack(nlls).sum() / seq_len)
    print(f'Perplexity for test dataset = {ppl.item()}')

# Score the model stored under "gpt2-got", then the stock "gpt2" model for comparison.
calc_perplexity(model_path="gpt2-got")
calc_perplexity(model_path="gpt2")

### Inference

In [7]:
# Text-generation pipeline backed by the model in ./gpt2-got and the "gpt2" tokenizer.
# NOTE(review): `config` is given a plain dict here, and the sample generations
# recorded below are much shorter than 1000 tokens, so `max_length` may not be
# taking effect -- confirm, and consider passing `max_length` at call time instead.
got = pipeline('text-generation', model='./gpt2-got', tokenizer='gpt2', config={'max_length':1000})

In [8]:
# Prompt the pipeline with each character name and print the first generation.
characters = ['John Snow','Tyrion Lanister','Eddard Stark','Daenerys Targaryen']
for character in characters:
    results = got(character)
    generated_text = results[0]['generated_text']
    print(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


John Snow and the rest of the village laughed at her, at the sight of them. “What does life have to do with dead heads and life and dead bodies?”“He’s not dead, but he’


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Tyrion Lanister went to speak with Arya, and said, “Lord Stannis?”He had been a wildling, and he remembered seeing the direwolf that had run from Ned Stark a fortnight before. How brave,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Eddard Stark is strong and brave, and he will have none of it. Sansa would not have come to him with this fear if it were true—for the Lannisters were not the best friends Ser Barristan had known his whole
Daenerys Targaryen is only a boy of nine, but she must live to be crowned queen, though if she is as strong as some of the children in King’s Landing, it will almost surely be in her favor to wed
