In [None]:
# Environment setup: every command below installs into the kernel's environment via %pip.
# Bring pip itself up to date before installing anything else.
%pip install --upgrade pip
# Pinned PyTorch stack. --quiet trims pip's output and --disable-pip-version-check
# suppresses pip's own "newer version available" check.
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet
# NOTE(review): replacing IPython inside a running kernel presumably needs a kernel restart to take effect — confirm.
%pip install ipython==8.16.1
# NOTE(review): wandb and sentencepiece are not pinned, unlike the other packages,
# so a rerun at a later date may resolve different versions.
%pip install wandb --upgrade
%pip install sentencepiece
# Pinned Hugging Face stack (transformers, datasets, evaluate) plus the ROUGE scorer and LoRA/PEFT packages.
%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

In [None]:
# Import the necessary libraries
import nltk
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments

In [None]:
# Load the dataset and carve a 20% held-out "test" split out of its "train" split.
# The split is seeded so that every rerun (Restart & Run All) trains and evaluates
# on exactly the same examples; without a seed the rows are reshuffled each time
# and the reported metrics are not comparable between runs.
dataset = load_dataset("bsaurav/biography")["train"].train_test_split(test_size=0.2, seed=42)

# Display the resulting DatasetDict (split names, columns and row counts).
dataset

In [None]:
# The tokenizer and the model must come from the same checkpoint, so name it once.
BASE_MODEL_ID = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(BASE_MODEL_ID)
model = T5ForConditionalGeneration.from_pretrained(BASE_MODEL_ID)

# The collator receives both the tokenizer and the model and is later handed to the trainer.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Baseline check: ask the not-yet-fine-tuned model a question and print its answer.
# (Plain string: the prompt has no placeholders, so no f-prefix is needed.)
prompt = """

question: What is your name?
answer:
"""

# Put the inputs on whatever device the model currently lives on. On a fresh run that
# is the CPU (a no-op), but if this cell is re-run after the trainer has moved the model
# to an accelerator, CPU inputs would fail with a device-mismatch error.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

# num_beams=1 with up to 200 newly generated tokens.
original_model_outputs = model.generate(
    input_ids=input_ids,
    generation_config=GenerationConfig(max_new_tokens=200, num_beams=1),
)
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

print(f'ORIGINAL MODEL:\n{original_model_text_output}')

In [None]:
# Task prefix prepended to every question before it is tokenized
prefix = "answer the question: "


def preprocess_function(examples):
    """Convert a batch of question/answer records into tokenized model features.

    Args:
        examples: Batch mapping with a "question" column and an "answer" column.

    Returns:
        The tokenizer output for the prefixed questions (truncated to 128 tokens),
        extended with a "labels" entry holding the token ids of the answers
        (truncated to 512 tokens).
    """
    # Model inputs: each question with the task prefix in front of it
    features = tokenizer(
        [prefix + question for question in examples["question"]],
        max_length=128,
        truncation=True,
    )

    # Training targets: the answers, tokenized as target text
    features["labels"] = tokenizer(
        text_target=examples["answer"],
        max_length=512,
        truncation=True,
    )["input_ids"]
    return features

# Tokenize every split; batched=True hands preprocess_function whole batches of rows at a time
tokenized_dataset = dataset.map(
    function=preprocess_function,
    batched=True,
)

In [None]:
# Evaluation setup: the ROUGE scorer, plus the "punkt" data that nltk's
# sentence tokenizer relies on (downloaded quietly)
metric = evaluate.load("rouge")
nltk.download("punkt", quiet=True)

def compute_metrics(eval_preds):
    """Decode predicted and reference token ids and score them with ROUGE.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is itself a tuple, only its first element is used.

    Returns:
        The result of ``metric.compute`` for the decoded texts (stemming enabled).
    """
    preds, labels = eval_preds

    # Some setups hand back extra outputs alongside the generated ids; keep the ids only.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the "ignore" filler value. It is not a real token id, so it must be
    # swapped for the pad token in BOTH arrays before decoding; otherwise the
    # tokenizer is asked to decode a negative id.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    return metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

In [None]:
# Set up training arguments.
# No Hub token is passed here: credentials must never be written into the notebook,
# and with push_to_hub=False nothing is uploaded anyway. If uploading is enabled later,
# authenticate outside the notebook source (e.g. `huggingface-cli login`).
training_args = Seq2SeqTrainingArguments(
    output_dir="./results2",
    evaluation_strategy="epoch",      # evaluate once per epoch
    learning_rate=3e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,               # cap on the number of checkpoints kept in output_dir
    num_train_epochs=6,
    predict_with_generate=True,       # evaluation uses generate(), so metrics see generated token ids
    push_to_hub=False,
)

# Wire the model, the tokenized splits, the collator and the ROUGE callback into one trainer
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Run fine-tuning; the value returned by train() is shown as the cell output
trainer.train()

In [None]:
import shutil
# Look up the git-lfs executable on PATH: shutil.which returns its full path, or None when it is not found.
# NOTE(review): presumably a prerequisite check for the (currently commented-out) Hub upload — confirm.
shutil.which("git-lfs")

In [None]:
# Write the fine-tuned model to the local "results2" directory and generate a model card.
# Nothing is uploaded to the Hugging Face Hub here. To publish, install git-lfs,
# authenticate, and then call trainer.push_to_hub() (plus tokenizer.push_to_hub(<repo id>)
# if the tokenizer should be uploaded under its own repository id).
trainer.save_model(output_dir="results2")
trainer.create_model_card()