In [None]:
!pip install transformers datasets accelerate nvidia-ml-py3

Collecting accelerate
  Downloading accelerate-0.7.1-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 3.5 MB/s 
Installing collected packages: accelerate
Successfully installed accelerate-0.7.1


In [None]:
import json
import pickle
from pathlib import Path

import pandas as pd
import torch
from tqdm import tqdm

from datasets import Dataset
from transformers import Trainer, AutoTokenizer, AutoModelForCausalLM, TrainingArguments, default_data_collator

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
from pynvml import *


def print_gpu_utilization():
    """Print the memory currently in use on GPU 0, in MB.

    NVML is initialised for the duration of the query and shut down again
    afterwards, so repeated calls do not accumulate NVML initialisations.

    Raises:
        Whatever the NVML bindings raise when no NVIDIA driver/GPU is present.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(0)
        info = nvmlDeviceGetMemoryInfo(handle)
        # info.used is reported in bytes; integer-divide down to MB.
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Every nvmlInit() must be paired with an nvmlShutdown().
        nvmlShutdown()


def print_summary(result):
    """Report training runtime, throughput and current GPU memory usage.

    Args:
        result: object exposing a ``metrics`` mapping with the keys
            ``train_runtime`` and ``train_samples_per_second``.
    """
    metrics = result.metrics
    report_lines = (
        ("Time", 'train_runtime'),
        ("Samples/second", 'train_samples_per_second'),
    )
    for label, metric_key in report_lines:
        print(f"{label}: {metrics[metric_key]:.2f}")
    print_gpu_utilization()

In [None]:
# Output locations used by the later cells of this notebook.
checkpoint_dir = Path("./checkpoints/model-test")  # handed to TrainingArguments as output_dir
save_model_dir = Path("./models/model-test")  # destination of trainer.save_model
eval_result_dir = Path("./eval")  # directory that receives eval_result.json

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Show the selected device between two 100-character rulers.
print("="*100, f"device: {device}", "="*100, sep="\n")

device: cpu


In [None]:
# Maps a replacement string to the substrings it replaces:
# '&' and "'n" become 'and'; the listed punctuation characters are deleted.
replace_dict = {
    'and': ['&', "'n"],
    '': ['%', ',', '.', '#', '[', ']', '!', '?'],
}


def clean_instruction(instruction):
    """Normalise a single instruction string.

    The text is lower-cased, every substring listed in ``replace_dict`` is
    replaced by its key, and surrounding whitespace is stripped after each
    replacement group.

    Args:
        instruction: raw instruction text.

    Returns:
        The cleaned text, or ``''`` when the cleaned text starts with a digit
        (used to discard enumerated sentences such as "1. ...", "2. ...").
    """
    cleaned = instruction.lower()
    for replacement, targets in replace_dict.items():
        for target in targets:
            # str.replace is a no-op when the target is absent.
            cleaned = cleaned.replace(target, replacement)
        cleaned = cleaned.strip()

    # Slicing keeps the check safe for an empty string.
    if cleaned[:1].isdigit():
        return ''
    return cleaned

In [None]:
def tokenize_function(row):
    """Serialise one recipe row into a tagged sequence and tokenize it.

    The sequence is laid out as: BOS, the ingredients section, the
    instructions section, the title section, EOS. Ingredients and
    instructions are stored ';'-separated in the row and are re-joined with
    the ``ingrs_next`` / ``instr_next`` markers.

    Args:
        row: mapping with the string fields ``"title"``, ``"base_ingrs"`` and
            ``"instructions"``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry that is a copy
        of ``"input_ids"``.
    """
    title = row["title"]
    ingrs = ingrs_next.join(row["base_ingrs"].split(";"))

    # clean_instruction returns '' for steps it wants removed (and for blank
    # fragments); drop those so they do not leave empty segments between
    # consecutive separators in the training sequence.
    cleaned_steps = (clean_instruction(step) for step in row["instructions"].split(";"))
    instructions = instr_next.join(step for step in cleaned_steps if step)

    seq = f"{bos_token}{ingrs_start}{ingrs}{ingrs_end}{instr_start}{instructions}{instr_end}{title_start}{title}{title_end}{eos_token}"
    tkns = tokenizer(seq, truncation=True)
    tkns["labels"] = tkns["input_ids"].copy()

    return tkns

In [None]:
block_size = 128


def pad_group_texts(examples):
    """Concatenate a batch of tokenized examples and cut it into fixed-size blocks.

    All ``input_ids`` (and ``attention_mask``) lists of the batch are joined
    into one long sequence, padded on the right up to the next multiple of
    ``block_size`` and split into chunks of exactly ``block_size`` tokens.
    No padding is added when the length is already a multiple of
    ``block_size``, and an empty batch yields no blocks.

    Args:
        examples: batch mapping with ``"input_ids"`` and ``"attention_mask"``,
            each a list of per-example lists.

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``, each
        a list of ``block_size``-long lists. ``labels`` mirrors ``input_ids``
        except at padded positions, which are set to -100 so the loss ignores
        them.
    """
    # Flatten in a single pass (sum(list_of_lists, []) is quadratic).
    input_ids = [token for ids in examples["input_ids"] for token in ids]
    attention_mask = [flag for mask in examples["attention_mask"] for flag in mask]

    # Pad only up to the next block boundary; 0 when already aligned.
    pad_length = -len(input_ids) % block_size
    pad_id = tokenizer.convert_tokens_to_ids('<PAD>')
    input_ids += [pad_id] * pad_length
    attention_mask += [0] * pad_length

    def _split_into_blocks(values):
        """Cut a flat list into consecutive block_size-long chunks."""
        return [values[i : i + block_size] for i in range(0, len(values), block_size)]

    result = {
        'input_ids': _split_into_blocks(input_ids),
        'attention_mask': _split_into_blocks(attention_mask),
    }
    # Mask padded positions out of the loss: -100 is the ignore index.
    result["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(result["input_ids"], result["attention_mask"])
    ]

    return result

In [None]:
# Sequence-level tokens registered with the tokenizer as bos/eos/pad.
bos_token = "<BOS>"
eos_token = "<EOS>"
pad_token = "<PAD>"

# Section markers used by tokenize_function when serialising a recipe:
# *_start / *_end wrap a section, *_next separates items inside a section.
ingrs_start = "<INGRS_START>"
ingrs_end = "<INGRS_END>"
ingrs_next = "<INGRS_NEXT>"
instr_start = "<INSTR_START>"
instr_end = "<INSTR_END>"
instr_next = "<INSTR_NEXT>"
title_start = "<TITLE_START>"
title_end = "<TITLE_END>"

In [None]:
# Section markers that are registered as additional special tokens.
special_tokens = [ingrs_start, ingrs_end, ingrs_next, instr_start, instr_end, instr_next, title_start, title_end]

# Load the distilgpt2 tokenizer with the custom BOS/EOS/PAD tokens and section markers.
# NOTE(review): these tokens extend the pretrained vocabulary, so the model's
# embedding matrix has to be resized to len(tokenizer) before training.
tokenizer = AutoTokenizer.from_pretrained(
    "distilgpt2",
    additional_special_tokens=special_tokens,
    bos_token=bos_token,
    eos_token=eos_token,
    pad_token=pad_token,
)

In [None]:
def _load_split(csv_path):
    """Read one dataset split and verify it has the columns tokenize_function reads.

    Args:
        csv_path: path of the CSV file to load.

    Returns:
        The loaded DataFrame.

    Raises:
        KeyError: if "title", "base_ingrs" or "instructions" is missing.
    """
    frame = pd.read_csv(csv_path)
    missing = {"title", "base_ingrs", "instructions"} - set(frame.columns)
    if missing:
        raise KeyError(f"{csv_path} is missing required columns: {sorted(missing)}")
    return frame


# The test split is not needed for training; load it only when required.
# test_df = _load_split("../dataset/new_merged/test_merged.csv")
train_df = _load_split("../dataset/new_merged/train_merged.csv")
val_df = _load_split("../dataset/new_merged/val_merged.csv")

In [None]:
# Fixed seed so that the shuffled order is reproducible across runs.
SHUFFLE_SEED = 42


def _tokenize_split(frame):
    """Turn a DataFrame into a shuffled Dataset and tokenize every row.

    Args:
        frame: DataFrame holding one split; all of its columns are dropped
            from the tokenized result.

    Returns:
        Tuple ``(dataset, tokenized_dataset)``: the shuffled raw dataset and
        the dataset produced by mapping ``tokenize_function`` over it.
    """
    dataset = Dataset.from_pandas(frame).shuffle(seed=SHUFFLE_SEED)
    tokenized = dataset.map(tokenize_function, remove_columns=list(frame.columns), num_proc=8)
    return dataset, tokenized


train_dataset, tokenized_train_dataset = _tokenize_split(train_df)
val_dataset, tokenized_val_dataset = _tokenize_split(val_df)


In [None]:
def _group_and_cache(tokenized_dataset, cache_path):
    """Pack a tokenized dataset into fixed-size blocks and pickle the result.

    Args:
        tokenized_dataset: dataset produced by mapping ``tokenize_function``.
        cache_path: file the grouped dataset is pickled to.

    Returns:
        The grouped dataset, formatted to return PyTorch tensors.
    """
    grouped = tokenized_dataset.map(
        pad_group_texts,
        batched=True,
        batch_size=1000,
        num_proc=8,
    )
    grouped.set_format("pt")

    # Use a context manager so the file handle is flushed and closed.
    with open(cache_path, "wb") as cache_file:
        pickle.dump(grouped, cache_file)
    return grouped


lm_train_datasets = _group_and_cache(tokenized_train_dataset, "./lm_train_datasets.pkl")
lm_val_datasets = _group_and_cache(tokenized_val_dataset, "./lm_val_datasets.pkl")

In [None]:
# To reuse previously cached datasets instead of rebuilding them, uncomment
# the two lines below (only unpickle files you created yourself):
# lm_train_datasets = pickle.load(open("./lm_train_datasets.pkl", "rb"))
# lm_val_datasets = pickle.load(open("./lm_val_datasets.pkl", "rb"))

# Show a summary of both grouped splits, one per line.
print(lm_train_datasets, lm_val_datasets, sep="\n")

In [None]:
# Batches are assembled with the library's default collator.
data_collator = default_data_collator

# Load pretrained distilgpt2 and resize its token embeddings to the
# tokenizer's current vocabulary size, then move it to the selected device.
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
model.resize_token_embeddings(len(tokenizer))
model.to(device)

In [None]:
# Shared training hyper-parameters.
default_args = {
    # TrainingArguments documents output_dir as a string, so convert the Path.
    "output_dir": str(checkpoint_dir),
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "save_strategy": "epoch",  # write a checkpoint at the end of every epoch
    "logging_strategy": "steps",
    "logging_steps": 100,
    "num_train_epochs": 10,
}

# Effective train batch size per device: 4 samples x 32 accumulation steps = 128.
training_arguments = TrainingArguments(
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=32,
    **default_args,
)

In [None]:
# Wire the model, tokenizer, grouped datasets and training configuration together.
trainer = Trainer(
    args=training_arguments,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=lm_train_datasets,
    eval_dataset=lm_val_datasets,
)

In [None]:
# Run fine-tuning with the configuration held by `trainer`.
trainer.train()
# Alternative: continue an interrupted run from a saved checkpoint
# (presumably the latest one under the configured output_dir — confirm before use).
# trainer.train(resume_from_checkpoint=True)

In [None]:
# Persist the trained model to save_model_dir and report where it went.
trainer.save_model(save_model_dir)
print(f"model saved at {save_model_dir}")

In [None]:
# Evaluate on the validation set passed to the Trainer.
eval_results = trainer.evaluate()

# No earlier cell creates the output directory, so make sure it exists
# before writing; otherwise open() raises FileNotFoundError.
eval_result_dir.mkdir(parents=True, exist_ok=True)
result_path = eval_result_dir / "eval_result.json"
with open(result_path, "w") as result_file:
    json.dump(eval_results, result_file)

print(eval_results)

# Requires `import math` before it can be enabled:
# print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
# `drive` only exists when the Google Colab mount cell near the top of the
# notebook has been enabled; skip the unmount otherwise instead of raising NameError.
if "drive" in globals():
    drive.flush_and_unmount()