In [1]:
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.nn import functional as F

from sklearn.model_selection import train_test_split

from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from transformers import AdamW, get_linear_schedule_with_warmup
import nltk

2021-10-30 18:39:30.659744: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-30 18:39:30.659770: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
import os

data_list = []
folder = 'dataset/'
for filename in os.listdir(folder):
    file_path = os.path.join(folder, filename)
    with open(file_path, 'r') as f:
        text = f.read()
        data_list.append(text)

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<SOS>', eos_token='<EOS>')
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
max_length = 100

class ComplDataset(Dataset):
    def __init__(self, compl, tokenizer, max_length):

        self.compl = compl
        self.ids = []
        self.masks = []

        for sent in data_list:
            encod_dic = tokenizer('<SOS> ' + sent + ' <EOS>', truncation=True, max_length=max_length, padding='max_length')
            self.ids.append(torch.tensor(encod_dic['input_ids']))
            self.masks.append(torch.tensor(encod_dic['attention_mask']))

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        return self.ids[idx], self.masks[idx]

In [6]:
dataset = ComplDataset(data_list,tokenizer,max_length)

In [7]:
import numpy as np

train_ids, valid_ids = train_test_split(
    np.arange(len(dataset)),
    test_size=0.1,
    shuffle=True)

In [8]:
train_sampler = torch.utils.data.SubsetRandomSampler(train_ids)
valid_sampler = torch.utils.data.SubsetRandomSampler(valid_ids)

dataloaders = {'train': torch.utils.data.DataLoader(dataset, batch_size=2, sampler=train_sampler),
'val': torch.utils.data.DataLoader(dataset, batch_size=1,sampler=valid_sampler)}

dataset_sizes = {'train': len(train_ids), 'val': len(valid_ids)}

In [9]:
# Used documentation: https://huggingface.co/transformers/model_doc/gpt2.html
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)
model = GPT2LMHeadModel.from_pretrained('gpt2', config=configuration)
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)
model.train()

optimizer = AdamW(model.parameters(), lr=5e-4, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10000, num_training_steps = -1)

In [11]:
from tqdm import tqdm

import gc

for epoch in range(3):
    epoch_loss_train = 0
    model.train()
    for i, batch in enumerate(tqdm(dataloaders['train'])):
        input_ids = batch[0].to(device)
        labels = batch[0].to(device)
        masks = batch[1].to(device)

        outputs = model(input_ids, labels=labels, attention_mask=masks)

        loss = outputs[0]

        batch_loss = loss.item()
        epoch_loss_train += batch_loss

        loss.backward()
        optimizer.step()
        scheduler.step()

    model.eval()
    epoch_loss_val = 0

    for batch in tqdm(dataloaders['val']):
        input_ids = batch[0].to(device)
        labels = batch[0].to(device)
        masks = batch[1].to(device)


        outputs = model(input_ids, labels=labels, attention_mask=masks)
        loss = outputs[0]

        batch_loss = loss.item()
        epoch_loss_val += batch_loss

    print('Average train loss: {}'.format(epoch_loss_train/len(dataloaders['train'])))
    print('Average val loss: {}'.format(epoch_loss_val/len(dataloaders['val'])))
    torch.save(model.state_dict(), 'models/GPT2.h5')

100%|██████████| 123/123 [00:43<00:00,  2.84it/s]
100%|██████████| 28/28 [00:01<00:00, 25.58it/s]


Average train loss: 5.19830058454498
Average val loss: 4.810767412185669


100%|██████████| 123/123 [00:43<00:00,  2.84it/s]
100%|██████████| 28/28 [00:01<00:00, 24.89it/s]


Average train loss: 4.673725753295712
Average val loss: 4.397540824753897


100%|██████████| 123/123 [00:40<00:00,  3.04it/s]
100%|██████████| 28/28 [00:00<00:00, 29.63it/s]


Average train loss: 4.598887606364925
Average val loss: 4.66388002889497
