In [55]:
!pip install transformers



In [56]:
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, GPT2Model
from transformers import get_linear_schedule_with_warmup


In [3]:
import pandas as pd

jokes = pd.read_csv('shortjokes.csv')
data_list = jokes['Joke'][:1000]
data_list = data_list.tolist()

In [58]:
print(len(data_list))
count = 0.0
max_len = 0
for s in data_list:
  lena = len(s.split())
  count += lena
  if lena > max_len:
    max_len = lena
print(count / len(data_list))
print(max_len)

217
7.9953917050691246
19


In [57]:
data_list = set()
with open('compl.txt','r') as f:
    text = f.read()
    text  =text.split('\n')
    data_list.update(text)

In [59]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [73]:
max_length = 10

class ComplDataset(Dataset):
    def __init__(self, compl, tokenizer, length):
        self.compliments = []

        for sent in data_list:
            encod_dic = tokenizer('<SOS> ' + sent + ' <EOS>', truncation=True, max_length=length,
                                  padding='max_length')
            self.compliments.append(torch.tensor(encod_dic['input_ids']))

    def __len__(self):
        return len(self.compliments)

    def __getitem__(self, idx):
        return self.compliments[idx]

In [74]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<SOS>', eos_token='<EOS>', pad_token = '<EOS>')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [75]:
dataset = ComplDataset(data_list, tokenizer, max_length)

In [76]:
import numpy as np

train_ids, val_ids = train_test_split(
    np.arange(len(dataset)),
    test_size=0.1,
    shuffle=True)

train = torch.utils.data.SubsetRandomSampler(train_ids)
val = torch.utils.data.SubsetRandomSampler(val_ids)

train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, sampler=train)
val_dataloader =  torch.utils.data.DataLoader(dataset, batch_size=1, sampler=val)

train_size = len(train_ids)
val_size = len(val_ids)

In [100]:
from torch.optim import AdamW

# Used documentation: https://huggingface.co/transformers/model_doc/gpt2.html

configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)
model = GPT2LMHeadModel.from_pretrained('gpt2', config=configuration)
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)
model.train()

optimizer = AdamW(model.parameters(), lr=3e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=5000, num_training_steps = -1)

In [101]:
from tqdm import tqdm



for epoch in range(10):
    epoch_loss_train = 0
    model.train()
    for i, entity in enumerate(tqdm(train_dataloader)):
        input_ids = entity.to(device)

        outputs = model(input_ids, labels = input_ids)

        loss = outputs[0]

        batch_loss = loss.item()
        epoch_loss_train += batch_loss

        loss.backward()
        optimizer.step()
        scheduler.step()

    model.eval()
    epoch_loss_val = 0

    for entity in tqdm(val_dataloader):
        with torch.no_grad():
            input_ids = entity.to(device)


            outputs = model(input_ids, labels = input_ids)
            loss = outputs[0]

        batch_loss = loss.item()
        epoch_loss_val += batch_loss

    print('Average train loss: {}'.format(epoch_loss_train / train_size))
    print('Average val loss: {}'.format(epoch_loss_val / val_size))
    torch.save(model.state_dict(), 'models/GPT2.h5')

100%|██████████| 195/195 [00:27<00:00,  7.16it/s]
100%|██████████| 22/22 [00:00<00:00, 46.00it/s]


Average train loss: 57.7973605180398
Average val loss: 11.142436894503506


100%|██████████| 195/195 [00:27<00:00,  7.20it/s]
100%|██████████| 22/22 [00:00<00:00, 44.01it/s]


Average train loss: 8.991546473136315
Average val loss: 6.440600373528221


100%|██████████| 195/195 [00:27<00:00,  7.22it/s]
100%|██████████| 22/22 [00:00<00:00, 45.05it/s]


Average train loss: 6.169624839684902
Average val loss: 5.465569452805952


100%|██████████| 195/195 [00:27<00:00,  7.20it/s]
100%|██████████| 22/22 [00:00<00:00, 46.09it/s]


Average train loss: 5.303812132126246
Average val loss: 5.087062868204984


100%|██████████| 195/195 [00:27<00:00,  7.22it/s]
100%|██████████| 22/22 [00:00<00:00, 42.38it/s]


Average train loss: 4.6253371397654215
Average val loss: 5.048749349334023


100%|██████████| 195/195 [00:27<00:00,  7.20it/s]
100%|██████████| 22/22 [00:00<00:00, 44.74it/s]


Average train loss: 4.344366181202424
Average val loss: 5.038425028324127


100%|██████████| 195/195 [00:27<00:00,  7.19it/s]
100%|██████████| 22/22 [00:00<00:00, 46.75it/s]


Average train loss: 4.361417638032864
Average val loss: 5.047712136398662


100%|██████████| 195/195 [00:27<00:00,  7.19it/s]
100%|██████████| 22/22 [00:00<00:00, 43.79it/s]


Average train loss: 3.911240833844894
Average val loss: 5.009311107071963


100%|██████████| 195/195 [00:27<00:00,  7.18it/s]
100%|██████████| 22/22 [00:00<00:00, 44.42it/s]


Average train loss: 3.8499731589586306
Average val loss: 5.043284947221929


100%|██████████| 195/195 [00:27<00:00,  7.18it/s]
100%|██████████| 22/22 [00:00<00:00, 45.83it/s]


Average train loss: 3.7998245055858906
Average val loss: 5.0875240401788195


In [102]:
state_dict = torch.load('models/GPT2.h5')
model.load_state_dict(state_dict)

<All keys matched successfully>

In [112]:
model.eval()
generated = torch.tensor(tokenizer.encode('<SOS>')).unsqueeze(0)
generated = generated.to(device)

print(generated)

sample_outputs = model.generate(
                                generated, 
                                do_sample=True,   
                                top_k=50, 
                                max_length = 20,
                                top_p=0.95, 
                                bos_token = '<SOS>',
                                eos_token = '<EOS>',
                                num_return_sequences=5
                                )


for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True).replace('\n',' ')))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[50257]], device='cuda:0')
0:  You are incredible to a great. 


1:  You're always irresistible to. 


2:  You be always is better than a beautiful. 


3:  You would special the awesome. 


4:  You are really awesome. 


