In [1]:
import os
import time
import datetime 
import torch
from copy import deepcopy
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler
from transformers import GPT2Tokenizer, GPT2LMHeadModel, get_linear_schedule_with_warmup, AdamW


# Note: not only GPT2 
def format_time(elapsed):
    # print nicely formated elapsed time
    return str(datetime.timedelta(seconds=int(round((elapsed)))))


class GPT2Dataset(Dataset):
    """
    Pytorch Dataset wrapper that helps with training and batches of training data. Reads in texts of politicians
    :param txt_list: (Numpy) array of speeches.
    """
    def __init__(self, txt_list, tokenizer, max_length=768):
        self.tokenizer = tokenizer
        self.max_length=max_length
        self.input_ids = []
        self.attn_masks = []

        for txt in txt_list:
            encodings_dict = tokenizer(txt, truncation=True, max_length=self.max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx] 


In [10]:
class GetData():
    """
    Retrieve dataset from local or google colab. Helps generate pytorch dataloaders.
    :param data_path: path to president_speeches file
    :param token_length: Max token length to use in GPT2 model
    """
    def __init__(self, token_length=768):
        #self.data_path = data_path
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2', pad_token="<|endoftext|>")
         
        self.speeches = self.read_data()
        self.token_length = token_length

        # after run once, keep as batch
        self.dataset = None
        self.dataloader = None 

    def read_data(self):
        """
            Read data as pandas dataframe
        """
        file1 = open('quotes.txt', 'r')
        lines1 = [x[:-1] for x in file1.readlines()] 
        file1 = open('wikiquotes.txt', 'r')
        lines2 = [x[:-1] for x in file1.readlines() if x and  x != "\n"]
        return lines1+lines2

    def get_data(self):
        """
            Return the dataset
        """
        return self.speeches

    def get_dataset(self, speech_list: list = None):
        """
            Create a custom pytorch dataset specialised for this task. A customer speech list can be used as well.
        """
        if speech_list is None:
            speech_list = self.get_data()

        self.dataset = GPT2Dataset(
            txt_list=speech_list,
            tokenizer=self.tokenizer,
            max_length=self.token_length
        )

        return self.dataset

    def get_dataloader(self, dataset: Dataset = None, batch_size=2):
        """
            Create an iterable dataloader based on a GPT2Dataset and specified batch_size. Attention, a large batchsize quickly leads
            to memory overloads.
            :param dataset: Input GPT2Dataset object
            :param batch_size: Integer of desired batch_size. Smaller equal 2 recommended.
        """
        if dataset is None:
            if self.dataset is None:
                dataset = self.get_dataset()
            else:
                dataset = self.dataset

        dataloader = DataLoader(
            dataset,
            sampler=RandomSampler(dataset), # Select batches randomly
            batch_size=batch_size # Trains with this batch size.
        )
        self.dataloader = dataloader
        return dataloader

In [6]:

class HitchensQuoteModel(): 
    
    def __init__(self, model_input: GPT2LMHeadModel = None, device="cuda"):
        self.device = device
        self.device = torch.device(self.device)  # Sloppily use Cuda GPU. 
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2', pad_token="<|endoftext|>")  # extract the gpt2 tokenizer
        self.model = self.init_model(model_input=model_input)  # initiate model
        self.model.to(self.device)  # send model to device


    def init_model(self, model_input: GPT2LMHeadModel) -> GPT2LMHeadModel:
        """
            Load pretrained model or use input model
        """
        if model_input is None:
            return GPT2LMHeadModel.from_pretrained("gpt2",
                                                    pad_token_id=self.tokenizer.eos_token_id)
        else:
            return model_input
        

    def fine_tune(self,
                  data,
                  epochs: int = 1,
                  learning_rate: float = 5e-5,
                  epsilon: float = 1e-8,
                  warmup_steps: int = 100) -> None: 
        
        # check input
        assert type(data) == DataLoader, "Datatype for 'data' must be DataLoader"

        # define implicite variables
        batch_size = data.batch_size
        total_steps = len(data) * epochs
        sample_every = 20

        # define optimizer
        optimizer = AdamW(self.model.parameters(),
                            lr=learning_rate,
                            eps=epsilon)

        # define scheduler for learningrate strategy
        scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                    num_warmup_steps=warmup_steps, 
                                                    num_training_steps=total_steps)

        self.model.train()  # put model in training mode (for dropout etc.)
        
        for epoch_i in range(0, epochs):
            t0 = time.time()
            for step, batch in enumerate(data):
                # set batch values
                b_input_ids = batch[0].to(self.device)
                b_labels = batch[0].to(self.device)
                b_masks = batch[1].to(self.device) 
                self.model.zero_grad() # reset gradients to not accumulate! 

                # forward propagation
                outputs = self.model.forward( 
                    b_input_ids,
                    labels=b_labels, 
                    #attention_mask = b_masks,
                    token_type_ids=None)

                # update params
                loss = outputs[0]
                loss.backward()
                optimizer.step()
                scheduler.step()
                batch_loss = loss.item()

                # print in-between times
                if step % sample_every == 0 and not step == 0:
                    elapsed = format_time(time.time() - t0)
                    print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(data), round(batch_loss, 4), elapsed))


    def save_model(self, save_name): 
        self.model.save_pretrained(save_name)
        
      
     

In [11]:
import os
os.getcwd()

# load dataobject
dt = GetData()

mod = HitchensQuoteModel()

In [12]:

all_loader = dt.get_dataloader()
# train on  Hitchens quotes
params = dict(
    epochs=20,
    learning_rate=2e-4,
    epsilon = 1e-07
)

mod.fine_tune(all_loader, **params)

  Batch    20  of    207. Loss: 0.9424.   Elapsed: 0:00:05.
  Batch    40  of    207. Loss: 0.3891.   Elapsed: 0:00:09.
  Batch    60  of    207. Loss: 0.1038.   Elapsed: 0:00:14.
  Batch    80  of    207. Loss: 0.8286.   Elapsed: 0:00:18.
  Batch   100  of    207. Loss: 0.209.   Elapsed: 0:00:23.
  Batch   120  of    207. Loss: 0.1317.   Elapsed: 0:00:27.
  Batch   140  of    207. Loss: 0.4562.   Elapsed: 0:00:32.
  Batch   160  of    207. Loss: 0.4108.   Elapsed: 0:00:36.
  Batch   180  of    207. Loss: 0.1435.   Elapsed: 0:00:41.
  Batch   200  of    207. Loss: 0.4621.   Elapsed: 0:00:45.
  Batch    20  of    207. Loss: 0.1434.   Elapsed: 0:00:05.
  Batch    40  of    207. Loss: 0.2062.   Elapsed: 0:00:09.
  Batch    60  of    207. Loss: 0.6976.   Elapsed: 0:00:14.
  Batch    80  of    207. Loss: 0.0453.   Elapsed: 0:00:18.
  Batch   100  of    207. Loss: 0.3644.   Elapsed: 0:00:23.
  Batch   120  of    207. Loss: 0.187.   Elapsed: 0:00:27.
  Batch   140  of    207. Loss: 0.1433.   

In [None]:

mod.save_model("hitch_gpt2")