In [5]:
import pandas as pd
from tqdm import tqdm
import numpy as np

import torch
import pickle

In [6]:
replace_dict = {'and': ['&', "'n"], '': ['%', ',', '.', '#', '[', ']', '!', '?']}
    
def clean_instruction(instruction):
    instruction = instruction.lower()
    for rep, char_list in replace_dict.items():
        for c_ in char_list:
            if c_ in instruction:
                instruction = instruction.replace(c_, rep)
        instruction = instruction.strip()
    
    # remove sentences starting with "1.", "2.", ... from the targets
    if len(instruction) > 0 and instruction[0].isdigit():
        instruction = ''
    return instruction


In [7]:
from transformers import GPT2Tokenizer

bos_token = "<BOS>"
eos_token = "<EOS>"
pad_token = "<PAD>"

ingrs_start = "<INGRS_START>"
ingrs_end = "<INGRS_END>"
ingrs_next = "<INGRS_NEXT>"
instr_start = "<INSTR_START>"
instr_end = "<INSTR_END>"
instr_next = "<INSTR_NEXT>"
title_start = "<TITLE_START>"
title_end = "<TITLE_END>"


special_tokens = [ingrs_start, ingrs_end, ingrs_next, instr_start, instr_end, instr_next, title_start, title_end]

tokenizer = GPT2Tokenizer.from_pretrained(
    "gpt2", 
    additional_special_tokens=special_tokens, 
    bos_token=bos_token,
    eos_token=eos_token,
    pad_token=pad_token,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
tokenizer.model_max_length

1024

In [9]:
# def get_seq(df):
#     seq = []
    
#     for i, row in tqdm(df.iterrows()):
#         seq.append(build_sequence(row))
    
#     return seq

In [10]:
test_df = pd.read_csv("./dataset/new_merged/test_merged.csv")
# train_df = pd.read_csv("./dataset/new_merged/train_merged.csv")
# val_df = pd.read_csv("./dataset/new_merged/val_merged.csv")

In [11]:
# train_seq = get_seq(train_df)
# val_seq = get_seq(val_df)
# test_seq = get_seq(test_df)

In [12]:
# pickle.dump(test_df, open('./test_seq.pkl', 'wb'))
# pickle.dump(train_df, open('./train_seq.pkl', 'wb'))
# pickle.dump(val_df, open('./val_seq.pkl', 'wb'))

In [13]:
# tokenizer.save_vocabulary("./", "recipe")

In [14]:
from transformers import GPT2Model, TrainingArguments

model = GPT2Model.from_pretrained("gpt2")
training_args = TrainingArguments(output_dir="./checkpoints")

2022-03-09 11:30:06.094975: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-09 11:30:06.095029: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [39]:
def tokenize_function(row):
    title = row["title"]
    ingrs = " ".join(row["base_ingrs"].split(";"))
    instructions = f"{instr_next}".join([clean_instruction(inst) for inst in row["instructions"].split(";")])
    
    seq = f"{bos_token}{ingrs_start}{ingrs}{ingrs_end}{instr_start}{instructions}{instr_end}{title_start}{title}{title_end}{eos_token}"
    tkns = tokenizer(seq, truncation=True)

    return tkns

In [40]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(test_df)
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    num_proc=4,
    remove_columns=list(test_df.columns)
)

# train_dataset = Dataset.from_pandas(val_df)
# tokenized_train_dataset = val_dataset.map(tokenize_function)

In [41]:
tokenized_train_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 8812
})

In [42]:
# block_size = tokenizer.model_max_length
block_size = 128

In [85]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    
    full_length = (total_length // block_size) * block_size
    
    pad_length = block_size - (total_length - full_length)
    
    padded_seq = {
        'input_ids': concatenated_examples['input_ids'] + ([tokenizer.convert_tokens_to_ids('<PAD>')] * pad_length),
        'attention_mask': concatenated_examples['attention_mask'] + ([0] * pad_length)
    }
    
#     print({
#         't': total_length,
#         'f': full_length,
#         'p': pad_length
#     })
    
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length + pad_length, block_size)]
        for k, t in padded_seq.items()
    }
    result["labels"] = result["input_ids"].copy()
    
    return result

In [84]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    print(len(result['input_ids']))
    return result

In [86]:
lm_datasets = tokenized_train_dataset.map(
    group_texts,
    batched=True,
    batch_size=1000,
#     num_proc=4,
)

  0%|          | 0/9 [00:00<?, ?ba/s]

*****171355*****
{'t': 171355, 'f': 171264, 'p': 37}
0
1339
*****168156*****
{'t': 168156, 'f': 168064, 'p': 36}
0
1314
*****176735*****
{'t': 176735, 'f': 176640, 'p': 33}
0
1381
*****177345*****
{'t': 177345, 'f': 177280, 'p': 63}
0
1386
*****167881*****
{'t': 167881, 'f': 167808, 'p': 55}
0
1312
*****175677*****
{'t': 175677, 'f': 175616, 'p': 67}
0
1373
*****175586*****
{'t': 175586, 'f': 175488, 'p': 30}
0
1372
*****179179*****
{'t': 179179, 'f': 179072, 'p': 21}
0
1400
*****143385*****
{'t': 143385, 'f': 143360, 'p': 103}
0
1121


In [78]:
lm_datasets[-1]

{'input_ids': [458,
  1436,
  50265,
  34975,
  19894,
  351,
  20720,
  5935,
  50265,
  11793,
  744,
  351,
  40377,
  24314,
  290,
  4691,
  50264,
  50266,
  31686,
  12,
  4933,
  13754,
  11084,
  50267,
  50258,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  50259,
  502

In [87]:
lm_datasets

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 11998
})

In [39]:
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

In [88]:
from transformers import DataCollatorForLanguageModeling

dataColl = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [89]:
from transformers import Trainer

model.resize_token_embeddings(len(tokenizer))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
#     eval_dataset=val_seq,
    tokenizer=tokenizer,
#     data_collator=dataColl
)

In [90]:
trainer.train()

***** Running training *****
  Num examples = 8812
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3306


KeyboardInterrupt: 