In [1]:
# Preliminaries
import os
import pandas as pd
import numpy as np

#Pytorch
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset

#Transformers
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AdamW, WarmUp, get_linear_schedule_with_warmup

#Warning
import warnings
warnings.filterwarnings('ignore')

In [2]:
BATCH_SIZE = 16
EPOCHS = 4
LEARNING_RATE = 3e-5
MAX_LEN = 64
TRAIN_PATH = "../input/short-jokes/shortjokes.csv"
Tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [3]:
# Processing Data
def process_jokes(raw_fp):
    df = pd.read_csv(raw_fp)

    # Append token at the end of each joke to indicate the end of a joke

    what_jokes = df[df.Joke.str.lower().str.startswith("what")].Joke.str.split("?")
    how_jokes = df[df.Joke.str.lower().str.startswith("how")].Joke.str.split("?")
    why_jokes = df[df.Joke.str.lower().str.startswith("why")].Joke.str.split("?")
    when_jokes = df[df.Joke.str.lower().str.startswith("when")].Joke.str.split("?")
    where_jokes = df[df.Joke.str.lower().str.startswith("where")].Joke.str.split("?")

    jokes = []
    for joke_ in [what_jokes, how_jokes, why_jokes, when_jokes, where_jokes]:
        joke_df_ = pd.DataFrame(joke_.values.tolist()).iloc[:, :2].dropna()
        joke_df_.columns = ["questions", "answer"]
        jokes.append(joke_df_)

    jokes_df = pd.concat(jokes)
    jokes_df = (
        jokes_df[~(jokes_df.answer.isin([""]))].drop_duplicates().reset_index(drop=True)
    )

    riddle_jokes_list = (
        "<soq> " + jokes_df.questions + " <eoq> " + jokes_df.answer + " <|endoftext|>"
    ).values.tolist()
    riddle_jokes = "\n".join(riddle_jokes_list)

    return riddle_jokes_list

In [4]:
# Creating Custom DataSet

class Jokesdataset(Dataset):
  def __init__(self,data,tokenizer):
    self.data = data
    self.tokenizer = tokenizer

  def __len__(self):
    return len(self.data)

  def __getitem__(self,idx):
    joke = self.data[idx]
  
    inputs = self.tokenizer.encode_plus(
            joke,
            None,
            add_special_tokens = True,
            max_length = MAX_LEN,
            pad_to_max_length = True
        )

    ids = inputs["input_ids"]
    mask = inputs["attention_mask"]

    return {'ids':torch.tensor(ids,dtype=torch.long),
            'mask': torch.tensor(mask,dtype=torch.long),
            'target':torch.tensor(ids,dtype=torch.long)}

In [5]:
# Initializing Model and adding our special Tokens to model vocab

model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
special_tokens_dict = {'pad_token': '<PAD>','bos_token':'<soq>','sep_token':'<eoq>'}
num_added_toks = Tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')
model.resize_token_embeddings(len(Tokenizer))

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

We have added 3 tokens


Embedding(50260, 1024)

In [6]:
# Training Function

def train_fn(data_loader, model, optimizer, device, scheduler,epoch):
  model.train()
  
  for bi, d in enumerate(data_loader):
        ids = d["ids"]
        mask = d["mask"]
        labels = d['target']

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        labels = labels.to(device,dtype=torch.long)
          
        optimizer.zero_grad()
        outputs = model(
            input_ids =ids,
            attention_mask=mask,
            labels = labels
        )

        loss, logits = outputs[:2]                        
        loss.backward()

        optimizer.step()
        if scheduler is not None:
                scheduler.step()

        if (bi+1) % 100 == 0:
           print('Epoch [{}/{}], bi[{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, EPOCHS, bi+1,len(data_loader), loss.item()))

In [7]:
device= "cuda:0"

In [8]:
#ENGINE

def run():
  joke_list = process_jokes(TRAIN_PATH)
  
  jokes_dataset = Jokesdataset(joke_list,Tokenizer)
  jokes_dataloader = DataLoader(jokes_dataset,
                                batch_size=BATCH_SIZE,
                                shuffle=True,
                                num_workers=4)
  
  model.to(device)

  num_train_steps = int(len(jokes_dataloader) / BATCH_SIZE * EPOCHS)

  optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
  scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=num_train_steps)

  for epoch in range(EPOCHS):
        print(f"EPOCH {epoch+1} started" + '=' * 30)
        train_fn(jokes_dataloader, model, optimizer, device, scheduler,epoch=epoch)
        
#         models_folder = MODEL_FOLDER
#         if not os.path.exists(models_folder):
#           os.mkdir(models_folder)
        torch.save(model.state_dict(),f"gpt2_medium_joker_3.pt")

In [9]:
# Begin Training
run()



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

Epoch [1/4], bi[100/4166], Loss: 1.3645
Epoch [1/4], bi[200/4166], Loss: 1.3901
Epoch [1/4], bi[300/4166], Loss: 1.2468
Epoch [1/4], bi[400/4166], Loss: 1.4268
Epoch [1/4], bi[500/4166], Loss: 1.1035
Epoch [1/4], bi[600/4166], Loss: 1.1333
Epoch [1/4], bi[700/4166], Loss: 1.0665
Epoch [1/4], bi[800/4166], Loss: 1.2183
Epoch [1/4], bi[900/4166], Loss: 1.1353
Epoch [1/4], bi[1000/4166], Loss: 1.0100
Epoch [1/4], bi[1100/4166], Loss: 1.1698
Epoch [1/4], bi[1200/4166], Loss: 1.0658
Epoch [1/4], bi[1300/4166], Loss: 1.1142
Epoch [1/4], bi[1400/4166], Loss: 1.0410
Epoch [1/4], bi[1500/4166], Loss: 1.2294
Epoch [1/4], bi[1600/4166], Loss: 1.0015
Epoch [1/4], bi[1700/4166], Loss: 1.0723
Epoch [1/4], bi[1800/4166], Loss: 1.0134
Epoch [1/4], bi[1900/4166], Loss: 1.1615
Epoch [1/4], bi[2000/4166], Loss: 0.9847
Epoch [1/4], bi[2100/4166], Loss: 0.9887
Epoch [1/4], bi[2200/4166], Loss: 1.1134
Epoch [1/4], bi[2300/4166], Loss: 1.0142
Epoch [1/4], bi[2400/4166], Loss: 1.0762
Epoch [1/4], bi[2500/4166

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

Epoch [2/4], bi[100/4166], Loss: 1.1545
Epoch [2/4], bi[200/4166], Loss: 1.2173
Epoch [2/4], bi[300/4166], Loss: 1.0319
Epoch [2/4], bi[400/4166], Loss: 1.3387
Epoch [2/4], bi[500/4166], Loss: 1.0994
Epoch [2/4], bi[600/4166], Loss: 1.1587
Epoch [2/4], bi[700/4166], Loss: 0.9962
Epoch [2/4], bi[800/4166], Loss: 1.1295
Epoch [2/4], bi[900/4166], Loss: 1.0799
Epoch [2/4], bi[1000/4166], Loss: 1.2278
Epoch [2/4], bi[1100/4166], Loss: 1.2978
Epoch [2/4], bi[1200/4166], Loss: 1.0654
Epoch [2/4], bi[1300/4166], Loss: 1.1175
Epoch [2/4], bi[1400/4166], Loss: 1.1036
Epoch [2/4], bi[1500/4166], Loss: 1.2918
Epoch [2/4], bi[1600/4166], Loss: 1.1953
Epoch [2/4], bi[1700/4166], Loss: 1.2856
Epoch [2/4], bi[1800/4166], Loss: 0.9681
Epoch [2/4], bi[1900/4166], Loss: 1.0946
Epoch [2/4], bi[2000/4166], Loss: 1.0934
Epoch [2/4], bi[2100/4166], Loss: 1.1148
Epoch [2/4], bi[2200/4166], Loss: 1.0603
Epoch [2/4], bi[2300/4166], Loss: 1.2067
Epoch [2/4], bi[2400/4166], Loss: 1.0750
Epoch [2/4], bi[2500/4166

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

Epoch [3/4], bi[100/4166], Loss: 1.0390
Epoch [3/4], bi[200/4166], Loss: 1.1733
Epoch [3/4], bi[300/4166], Loss: 1.0273
Epoch [3/4], bi[400/4166], Loss: 1.1621
Epoch [3/4], bi[500/4166], Loss: 0.9894
Epoch [3/4], bi[600/4166], Loss: 1.1391
Epoch [3/4], bi[700/4166], Loss: 1.2240
Epoch [3/4], bi[800/4166], Loss: 1.1658
Epoch [3/4], bi[900/4166], Loss: 1.1368
Epoch [3/4], bi[1000/4166], Loss: 1.1087
Epoch [3/4], bi[1100/4166], Loss: 1.0392
Epoch [3/4], bi[1200/4166], Loss: 1.0067
Epoch [3/4], bi[1300/4166], Loss: 1.1517
Epoch [3/4], bi[1400/4166], Loss: 1.0328
Epoch [3/4], bi[1500/4166], Loss: 1.0996
Epoch [3/4], bi[1600/4166], Loss: 1.0351
Epoch [3/4], bi[1700/4166], Loss: 1.0323
Epoch [3/4], bi[1800/4166], Loss: 1.2628
Epoch [3/4], bi[1900/4166], Loss: 1.0666
Epoch [3/4], bi[2000/4166], Loss: 1.1218
Epoch [3/4], bi[2100/4166], Loss: 1.1874
Epoch [3/4], bi[2200/4166], Loss: 1.0293
Epoch [3/4], bi[2300/4166], Loss: 1.0084
Epoch [3/4], bi[2400/4166], Loss: 1.0431
Epoch [3/4], bi[2500/4166

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

Epoch [4/4], bi[100/4166], Loss: 0.9493
Epoch [4/4], bi[200/4166], Loss: 1.2157
Epoch [4/4], bi[300/4166], Loss: 1.0700
Epoch [4/4], bi[400/4166], Loss: 0.9790
Epoch [4/4], bi[500/4166], Loss: 0.9984
Epoch [4/4], bi[600/4166], Loss: 0.9402
Epoch [4/4], bi[700/4166], Loss: 0.9643
Epoch [4/4], bi[800/4166], Loss: 0.9737
Epoch [4/4], bi[900/4166], Loss: 1.0156
Epoch [4/4], bi[1000/4166], Loss: 1.0699
Epoch [4/4], bi[1100/4166], Loss: 1.2093
Epoch [4/4], bi[1200/4166], Loss: 1.1362
Epoch [4/4], bi[1300/4166], Loss: 1.0099
Epoch [4/4], bi[1400/4166], Loss: 1.0005
Epoch [4/4], bi[1500/4166], Loss: 1.2123
Epoch [4/4], bi[1600/4166], Loss: 0.9987
Epoch [4/4], bi[1700/4166], Loss: 1.1117
Epoch [4/4], bi[1800/4166], Loss: 1.2009
Epoch [4/4], bi[1900/4166], Loss: 1.0770
Epoch [4/4], bi[2000/4166], Loss: 0.8702
Epoch [4/4], bi[2100/4166], Loss: 1.0548
Epoch [4/4], bi[2200/4166], Loss: 1.1262
Epoch [4/4], bi[2300/4166], Loss: 1.0671
Epoch [4/4], bi[2400/4166], Loss: 1.1242
Epoch [4/4], bi[2500/4166