In [1]:
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from transformers import get_linear_schedule_with_warmup


2021-10-30 20:29:47.718553: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-30 20:29:47.718573: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
import os

# Read every regular file in the dataset folder into memory, one string per file.
data_list = []
folder = 'dataset/'
# os.listdir returns entries in arbitrary order; sorting keeps sample indices stable
# between runs, which matters because the train/val split is done by index.
for filename in sorted(os.listdir(folder)):
    file_path = os.path.join(folder, filename)
    # Skip sub-directories (e.g. checkpoint folders): open() would raise on them.
    if not os.path.isfile(file_path):
        continue
    # Decode explicitly instead of relying on the platform's locale default.
    # NOTE(review): assumes the corpus is UTF-8 encoded — adjust if it is not.
    with open(file_path, 'r', encoding='utf-8') as f:
        data_list.append(f.read())

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cuda')

In [4]:
# Used: https://towardsdatascience.com/fine-tuning-gpt2-for-text-generation-using-pytorch-2ee61a4f1ba7
# Custom sequence delimiters registered on top of the stock GPT-2 vocabulary.
special_tokens = {'bos_token': '<SOS>', 'eos_token': '<EOS>'}
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', **special_tokens)
# No dedicated padding token is registered; the end-of-sequence token is reused for padding.
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
max_length = 800  # tokens per sample: longer texts are truncated, shorter ones padded


class ComplDataset(Dataset):
    """Pre-tokenised text samples for language-model fine-tuning.

    Each text is wrapped in the '<SOS>' / '<EOS>' markers, tokenised, and truncated or
    padded to exactly ``length`` tokens when the dataset is built, so indexing is a
    plain list lookup.

    Args:
        compl: iterable of raw text strings, one per sample.
        tokenizer: callable invoked as
            ``tokenizer(text, truncation=True, max_length=length, padding='max_length')``
            that returns a mapping with 'input_ids' and 'attention_mask' entries.
        length: number of tokens every sample is truncated/padded to.
    """

    def __init__(self, compl, tokenizer, length):
        self.compl = compl
        self.ids = []
        self.masks = []

        # Tokenise the texts that were passed in. The previous version looped over the
        # notebook-level `data_list` and silently ignored `compl`, so the class only
        # worked when that global happened to exist and hold the same data.
        for sent in compl:
            encod_dic = tokenizer('<SOS> ' + sent + ' <EOS>', truncation=True, max_length=length,
                                  padding='max_length')
            self.ids.append(torch.tensor(encod_dic['input_ids']))
            self.masks.append(torch.tensor(encod_dic['attention_mask']))

    def __len__(self):
        """Return the number of tokenised samples."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensor pair for sample ``idx``."""
        return self.ids[idx], self.masks[idx]

In [6]:
# Tokenise the whole corpus once, up front; every sample is max_length tokens long.
dataset = ComplDataset(compl=data_list, tokenizer=tokenizer, length=max_length)

In [7]:
# used:https://discuss.pytorch.org/t/how-to-use-sklearns-train-test-split-on-pytorchs-dataset/31521
# used: https://pytorch.org/docs/stable/data.html
import numpy as np

# Split sample *indices* 80/20 rather than the dataset itself; the samplers below
# restrict each DataLoader to its share of the indices.
# random_state pins the shuffle so the same samples land in train/val on every run,
# which keeps validation losses comparable between runs.
train_ids, val_ids = train_test_split(
    np.arange(len(dataset)),
    test_size=0.2,
    shuffle=True,
    random_state=42)

# SubsetRandomSampler still draws its indices in a fresh random order on each pass.
train = torch.utils.data.SubsetRandomSampler(train_ids)
val = torch.utils.data.SubsetRandomSampler(val_ids)

In [8]:
# Number of samples (not batches) on each side of the split.
train_size, val_size = len(train_ids), len(val_ids)

# Both loaders wrap the same dataset; the sampler decides which samples each one serves.
train_dataloader = torch.utils.data.DataLoader(dataset, sampler=train, batch_size=4)
val_dataloader = torch.utils.data.DataLoader(dataset, sampler=val, batch_size=2)

In [9]:
from torch.optim import Adam

# Used documentation: https://huggingface.co/transformers/model_doc/gpt2.html

configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)
model = GPT2LMHeadModel.from_pretrained('gpt2', config=configuration)
# The tokenizer was extended with '<SOS>' / '<EOS>', so the embedding matrix has to be
# resized to the tokenizer's vocabulary before those ids can be fed to the model.
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)
model.train()

optimizer = Adam(model.parameters(), lr=5e-4)

# Size the schedule from the actual run length. The previous settings
# (num_warmup_steps=10000, num_training_steps=-1) kept the learning rate in warm-up for
# the whole run, and the linear decay after warm-up would have clamped it to zero.
num_epochs = 3  # must match the number of epochs used by the training loop
total_training_steps = len(train_dataloader) * num_epochs
# Warm up over the first ~10% of optimizer steps (at least one step).
warmup_steps = max(1, total_training_steps // 10)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_training_steps)

In [10]:
from tqdm import tqdm

import gc

# Make sure the checkpoint directory exists before the first torch.save below.
os.makedirs('models', exist_ok=True)

for epoch in range(3):
    # ---- training pass ----
    epoch_loss_train = 0
    model.train()
    for batch in tqdm(train_dataloader):
        input_ids = batch[0].to(device)
        masks = batch[1].to(device)

        # Labels are the inputs themselves (the model shifts them internally), but
        # padded positions are set to -100 so the loss ignores them. Padding reuses the
        # '<EOS>' id, so without this the loss would also be computed on padding.
        labels = input_ids.clone()
        labels[masks == 0] = -100

        # backward() adds to existing .grad buffers, so clear them before every step;
        # otherwise each update would use gradients summed over all previous batches.
        optimizer.zero_grad()

        outputs = model(input_ids, labels=labels, attention_mask=masks)
        loss = outputs[0]
        epoch_loss_train += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    # ---- validation pass ----
    model.eval()
    epoch_loss_val = 0

    # No gradients are needed for evaluation; disabling autograd avoids keeping
    # activations around for a backward pass that never happens.
    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            input_ids = batch[0].to(device)
            masks = batch[1].to(device)
            labels = input_ids.clone()
            labels[masks == 0] = -100

            outputs = model(input_ids, labels=labels, attention_mask=masks)
            epoch_loss_val += outputs[0].item()

    # Each accumulated value is already a per-batch mean, so average over the number
    # of batches (not the number of samples) to report a mean loss.
    print('Average train loss: {}'.format(epoch_loss_train / max(1, len(train_dataloader))))
    print('Average val loss: {}'.format(epoch_loss_val / max(1, len(val_dataloader))))
    # Overwrite the checkpoint after every epoch.
    torch.save(model.state_dict(), 'models/GPT2.h5')

  1%|          | 1/109 [00:00<01:00,  1.79it/s]


RuntimeError: CUDA out of memory. Tried to allocate 116.00 MiB (GPU 0; 3.95 GiB total capacity; 3.16 GiB already allocated; 64.25 MiB free; 3.21 GiB reserved in total by PyTorch)

In [None]:
# Restore the fine-tuned weights written by the training loop.
# map_location remaps the stored tensors onto the current device, so a checkpoint saved
# on a GPU can still be loaded when only a CPU is available.
state_dict = torch.load('models/GPT2.h5', map_location=device)
model.load_state_dict(state_dict)

In [None]:
model.eval()
# Seed generation with the same start-of-sequence marker that prefixed every training text.
generated = torch.tensor(tokenizer.encode('<SOS>')).unsqueeze(0)
generated = generated.to(device)

print(generated)

# Sample three continuations with top-k / nucleus sampling.
# The model config was loaded from the stock 'gpt2' checkpoint, so the special-token ids
# are passed explicitly from the tokenizer: this lets a sequence stop at the fine-tuned
# '<EOS>' token and pads finished sequences with the tokenizer's pad id.
sample_outputs = model.generate(
    generated,
    do_sample=True,
    top_k=50,
    max_length=800,
    top_p=0.95,
    num_return_sequences=3,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Print each sample on a single line, with special tokens stripped.
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True).replace('\n',' ')))