In [1]:
import pandas as pd
from utils import scripts_rework
import warnings

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_pickle("data/scripts_reworked.pkl")
df

Unnamed: 0,answer,question,context
0,"There’s no point, I just think it’s a good id...","Agreed, what’s your point?",
1,I think this is the place.,"If you have to ask, maybe you shouldn’t be here.","Hang on. One across is Aegean, eight down is..."
2,I think this is the place.,"If you have to ask, maybe you shouldn’t be here.","One across is Aegean, eight down is Nabakov, ..."
3,I think this is the place.,"If you have to ask, maybe you shouldn’t be here.","Can I help you? Yes. Um, is this the High IQ ..."
4,I think this is the place.,"If you have to ask, maybe you shouldn’t be here.","Yes. Um, is this the High IQ sperm bank?"
...,...,...,...
48859,"Well, that would raise a number of problems. ...",What if I were?,"Well, perfect. I made us sandwiches. How thou..."
48860,"Well, that would raise a number of problems. ...",What if I were?,"How thoughtful. Thank you. Mmm. No big deal, ..."
48861,"Well, that would raise a number of problems. ...",What if I were?,"Mmm. No big deal, I enjoy spending time with ..."
48862,"Well, that would raise a number of problems. ...",What if I were?,"And I with you. Question, are you seeking a r..."


In [3]:
df["combined"] = df[["question", "context"]].apply(
    lambda df: "context: " + df["context"] + "</s>" + 'question: '
+ df['question'], axis = 1)
df

Unnamed: 0,answer,question,context,combined
0,"There’s no point, I just think it’s a good id...","Agreed, what’s your point?",,"context: </s>question: Agreed, what’s your po..."
1,I think this is the place.,"If you have to ask, maybe you shouldn’t be here.","Hang on. One across is Aegean, eight down is...","context: Hang on. One across is Aegean, eigh..."
2,I think this is the place.,"If you have to ask, maybe you shouldn’t be here.","One across is Aegean, eight down is Nabakov, ...","context: One across is Aegean, eight down is ..."
3,I think this is the place.,"If you have to ask, maybe you shouldn’t be here.","Can I help you? Yes. Um, is this the High IQ ...","context: Can I help you? Yes. Um, is this the..."
4,I think this is the place.,"If you have to ask, maybe you shouldn’t be here.","Yes. Um, is this the High IQ sperm bank?","context: Yes. Um, is this the High IQ sperm b..."
...,...,...,...,...
48859,"Well, that would raise a number of problems. ...",What if I were?,"Well, perfect. I made us sandwiches. How thou...","context: Well, perfect. I made us sandwiches...."
48860,"Well, that would raise a number of problems. ...",What if I were?,"How thoughtful. Thank you. Mmm. No big deal, ...",context: How thoughtful. Thank you. Mmm. No b...
48861,"Well, that would raise a number of problems. ...",What if I were?,"Mmm. No big deal, I enjoy spending time with ...","context: Mmm. No big deal, I enjoy spending t..."
48862,"Well, that would raise a number of problems. ...",What if I were?,"And I with you. Question, are you seeking a r...","context: And I with you. Question, are you se..."


In [4]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.05, random_state=42)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
train_df.shape, val_df.shape

((46420, 4), (2444, 4))

In [5]:
from datasets import Dataset, DatasetDict

dataset = DatasetDict(
    {
        "train": Dataset.from_pandas(train_df.reset_index(drop=True)),
        "valid": Dataset.from_pandas(val_df.reset_index(drop=True)),
    }
)
dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'question', 'context', 'combined'],
        num_rows: 46420
    })
    valid: Dataset({
        features: ['answer', 'question', 'context', 'combined'],
        num_rows: 2444
    })
})

In [6]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [8]:
from datasets import concatenate_datasets

# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["valid"]]).map(
    lambda x: tokenizer(x["combined"], truncation=True),
    batched=True,
    remove_columns=["context", "question", "answer", "combined"],
)
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([dataset["train"], dataset["valid"]]).map(
    lambda x: tokenizer(x["answer"], truncation=True),
    batched=True,
    remove_columns=["context", "question", "answer", "combined"],
)
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

Map: 100%|██████████| 48864/48864 [00:01<00:00, 35370.81 examples/s]


Max source length: 332


Map: 100%|██████████| 48864/48864 [00:00<00:00, 62611.42 examples/s]


Max target length: 304


In [9]:
def preprocess_function(sample, padding="max_length"):

    model_inputs = tokenizer(
        sample['combined'], max_length=max_source_length, padding=padding, truncation=True
    )

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(
        text_target=sample["answer"],
        max_length=max_target_length,
        padding=padding,
        truncation=True,
    )

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label]
            for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_dataset = dataset.map(
    preprocess_function, batched=True, remove_columns=["context", "question", "answer", "combined"]
)
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Map: 100%|██████████| 46420/46420 [00:11<00:00, 3888.36 examples/s]
Map: 100%|██████████| 2444/2444 [00:00<00:00, 4017.64 examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']





In [10]:
from transformers import AutoModelForSeq2SeqLM

# huggingface hub model id
model_id = "google/flan-t5-base"

# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [11]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize

nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")


# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    return result

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kate\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8
)

In [13]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Hugging Face repository id
repository_id = f"{model_id.split('/')[1]}-sheldon-chat"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    fp16=False,  # Overflows with fp16
    learning_rate=5e-5,
    num_train_epochs=3,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=False,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    compute_metrics=compute_metrics,
)

In [14]:
trainer.train()

  0%|          | 1/8706 [00:38<92:09:10, 38.11s/it]

OutOfMemoryError: CUDA out of memory. Tried to allocate 68.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 20.73 GiB is allocated by PyTorch, and 975.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
trainer.evaluate()

In [None]:
# Save our tokenizer and create model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub()