In [1]:
import pandas as pd
import os
import time
import datetime

In [3]:
%cd gdrive/MyDrive/RecipeGen/RecipeGeneratorNLP/

/content/gdrive/MyDrive/RecipeGen/RecipeGeneratorNLP


In [8]:
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config, GPT2LMHeadModel
import torch
from torch.utils.data import Dataset, DataLoader
# import pytorch_lighting as pl
from sklearn.model_selection import train_test_split
import numpy as np
import random
import textwrap
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2Model.from_pretrained('gpt2')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
tokenizer.add_special_tokens({
        'eos_token': '<EOR>',
        'additional_special_tokens': [
            '<SOI>',
            '<ISEP>',
            '<EOI>',
            '<SOR>']
})

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


5

In [9]:
tokenizer.special_tokens_map

{'bos_token': '<|startoftext|>',
 'eos_token': '<EOR>',
 'unk_token': '<|endoftext|>',
 'pad_token': '<|pad|>',
 'additional_special_tokens': ['<SOI>', '<ISEP>', '<EOI>', '<SOR>']}

In [10]:
test = df.iloc[30]
test

Unnamed: 0                                                   30
title                         Leek, Potato, and Bacon Casserole
text          leeks, white and light green | slices smoked b...
Name: 30, dtype: object

In [12]:
sample_ip = tokenizer(text)
input_id = sample_ip['input_ids']
sample_ip


{'input_ids': [3041, 5372, 502, 416, 597, 2420, 345, 1549, 588, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
tokenizer.decode(sample_ip['input_ids'])

"Replace me by any text you'd like."

In [13]:
preds = [tokenizer.decode(input_id, skip_special_token = True, clean_up_tokenization_spaces=True) for input_id in sample_ip['input_ids'] ]

In [16]:
" ".join(preds)

"Re place  me  by  any  text  you 'd  like ."

In [17]:
encoding_ip = tokenizer(
    text,
    return_attention_mask = True,
    return_tensors = "pt"
)

In [18]:
encoding_ip.keys()

dict_keys(['input_ids', 'attention_mask'])

In [19]:
tokenizer.special_tokens_map

{'bos_token': '<|startoftext|>',
 'eos_token': '<EOR>',
 'unk_token': '<|endoftext|>',
 'pad_token': '<|pad|>',
 'additional_special_tokens': ['<SOI>', '<ISEP>', '<EOI>', '<SOR>']}

In [20]:
tokenizer.bos_token_id, tokenizer.eos_token_id

(50257, 50259)

In [21]:
tokenizer.decode(encoding_ip['input_ids'].squeeze())

"Replace me by any text you'd like."

In [23]:
encoding_op = tokenizer(
    text,
    return_attention_mask = True,
    truncation = False,
    add_special_tokens = True,
    return_tensors = "pt"
)

In [24]:
len(tokenizer.decode(encoding_op['input_ids'].squeeze()))

34

In [37]:
class RecipeDataset(Dataset):
  def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):

    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []

    for txt in txt_list:

      encodings_dict = tokenizer('<|startoftext|>'+ txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")

      self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
      self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
    
  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.attn_masks[idx] 

In [38]:
sample_dataset = RecipeDataset(df, tokenizer)

In [39]:
df.iloc[8]['text']

', weight light fat free vanilla yogurt | fresh sliced strawberries | low-fat granola recipe:layer all ingredients in a serving dish.'

In [40]:
train_df, test_df = train_test_split(df, test_size = 0.05)

In [41]:
train_df.shape, test_df.shape

((978234, 3), (51486, 3))

In [42]:
train_dataset = RecipeDataset(train_df, tokenizer)
test_dataset = RecipeDataset(test_df, tokenizer)

In [43]:
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True, num_workers = 2)
test_dataloader = DataLoader(test_dataset, batch_size=2, shuffle=True, num_workers = 2)

Fine Tuning GPT

In [44]:
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

# Tell pytorch to run this model on the GPU.
device = torch.device("cuda")
model.cuda()

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val) 

RuntimeError: ignored

In [None]:
!nvidia-smi

In [None]:
epochs = 10
learning_rate = 0.01
warmup_steps = 1e2
epsilon = 1e-8

# this produces sample output every 100 steps
sample_every = 100

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon
                )

In [None]:
# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = total_steps)
total_steps

In [None]:
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

In [45]:
total_t0 = time.time()

training_stats = []

model = model.to(device)

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()        

        outputs = model(  b_input_ids,
                          labels=b_labels, 
                          attention_mask = b_masks,
                          token_type_ids=None
                        )

        loss = outputs[0]  

        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Get sample every x batches.
        if step % sample_every == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            model.eval()

            sample_outputs = model.generate(
                                    bos_token_id=random.randint(1,30000),
                                    do_sample=True,   
                                    top_k=50, 
                                    max_length = 200,
                                    top_p=0.95, 
                                    num_return_sequences=1
                                )
            for i, sample_output in enumerate(sample_outputs):
                  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
            
            model.train()

        loss.backward()

        optimizer.step()

        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)       
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in test_dataloader:
        
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)
        
        with torch.no_grad():        

            outputs  = model(b_input_ids, 
#                            token_type_ids=None, 
                             attention_mask = b_masks,
                            labels=b_labels)
          
            loss = outputs[0]  
            
        batch_loss = loss.item()
        total_eval_loss += batch_loss        

    avg_val_loss = total_eval_loss / len(test_dataloader)
    
    validation_time = format_time(time.time() - t0)    

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

RuntimeError: ignored

In [None]:
model.eval()

prompt = "<|startoftext|>"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

print(generated)

sample_outputs = model.generate(
                                generated, 
                                #bos_token_id=random.randint(1,30000),
                                do_sample=True,   
                                top_k=50, 
                                max_length = 300,
                                top_p=0.95, 
                                num_return_sequences=3
                                )

for i, sample_output in enumerate(sample_outputs):
  print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

In [None]:
torch.cuda.is_available()