In [None]:
# Environment setup: install the pinned package versions used by this notebook.
%pip install -U datasets==2.17.0

# Upgrade pip itself before the remaining installs.
%pip install --upgrade pip
# Pinned torch / torchdata (pip's version check disabled, output suppressed with --quiet).
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

# Pinned transformers, evaluate, rouge_score, loralib and peft (output suppressed with --quiet).
%pip install \
    transformers==4.27.2 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

In [None]:
# NOTE(review): this unpinned upgrade runs after the pinned installs in the previous
# cell, so it presumably replaces the torch / transformers / datasets versions pinned
# there — confirm which set of versions the notebook is meant to run against.
%pip install --upgrade torch transformers datasets


In [None]:
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer


In [None]:
# NOTE(review): load_dataset is already imported in the imports cell above; this
# repeated import is redundant but harmless.
from datasets import load_dataset

# Load the dataset by its identifier; later cells index its 'train',
# 'validation' and 'test' splits.
dataset = load_dataset("kmyoo/cnn-dailymail-v1-tiny")

In [None]:
# Checkpoint identifier shared by the model and tokenizer loads below.
model_name = 'google/flan-t5-base'

# Seq2seq model, requested with bfloat16 weights.
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)
# Tokenizer loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

TESTING WITHOUT FINE-TUNING

In [None]:
# Pick one test-split example to use as the qualitative probe in later cells.
index = 50
report = dataset["test"][index]["article"]       # text the model will be asked to summarize
summary = dataset["test"][index]["highlights"]   # reference summary stored with the article

# Show the article, two blank lines, then the reference highlights.
print(report, end="\n\n\n")
print(summary)

In [None]:
# Baseline: ask the model (before the training cells run) to summarize the sample report.
prompt = f"""
summarize the following report

{report}


"""

# Tokenize, generate up to 100 new tokens, and decode the first (only) sequence.
inputs = tokenizer(prompt, return_tensors="pt")
generated_ids = original_model.generate(inputs["input_ids"], max_new_tokens=100)
outputs = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Separator of 99 dashes (identical to joining 100 empty strings with '-').
dash_line = '-' * 99

# One print call with newline separators emits the same text as six separate prints.
print(
    dash_line,
    f'INPUT PROMPT:\n{prompt}',
    dash_line,
    f'ORIGINAL SUMMARY:\n{summary}\n',
    dash_line,
    f'MODEL GENERATION :\n{outputs}',
    sep='\n',
)


In [None]:
def tokenize_function(example):
    """Tokenize one batch of articles and highlights for seq2seq fine-tuning.

    Written for ``dataset.map(tokenize_function, batched=True)``, so
    ``example["article"]`` and ``example["highlights"]`` are iterated as
    batches of strings.

    Each article is wrapped in an instruction prompt (a "Summarize the
    following report." header before it and a "Summary: " cue after it) and
    tokenized into ``input_ids``; the highlights are tokenized into
    ``labels``. Both are padded to the tokenizer's max length and truncated.

    Uses the module-level ``tokenizer``.

    Args:
        example: Batch mapping with ``"article"`` and ``"highlights"`` entries.

    Returns:
        The same mapping, with ``"input_ids"`` and ``"labels"`` added.
    """
    start_prompt = 'Summarize the following report.\n\n'
    end_prompt = '\n\nSummary: '
    # Build one prompt per article in the batch. The previous version
    # concatenated the notebook-global `report` here, so every training
    # example received the same (test-set) article as its input.
    prompt = [start_prompt + article + end_prompt for article in example["article"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    # NOTE(review): labels are padded with the tokenizer's pad token; presumably
    # the loss then also covers padding positions — consider masking them with
    # -100 if that is not intended.
    example['labels'] = tokenizer(example["highlights"], padding="max_length", truncation=True, return_tensors="pt").input_ids

    return example


# Tokenize every split in batches, then drop the raw text and id columns.
tokenized_datasets = dataset.map(tokenize_function, batched=True).remove_columns(
    ['article', 'highlights', 'id']
)

In [None]:
# Report the (rows, columns) shape of each tokenized split.
print("Shapes of the datasets:")
for label, split in (("Training", "train"), ("Validation", "validation"), ("Test", "test")):
    print(f"{label}: {tokenized_datasets[split].shape}")



In [None]:

# Directory handed to TrainingArguments as its output location.
output_dir = "/content/output"

# NOTE(review): both num_train_epochs and max_steps are set; per the Hugging Face
# TrainingArguments documentation a positive max_steps takes precedence — confirm
# that a 10-step run is what is intended.
training_args = TrainingArguments(
    output_dir=output_dir,
    max_steps=10,
    num_train_epochs=10,
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_steps=10,
)

# Wire the base model and the tokenized train/validation splits into a Trainer.
trainer = Trainer(
    args=training_args,
    model=original_model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [None]:
# Run fine-tuning with the Trainer configured above. As the last expression in
# the cell, the value returned by train() is displayed as the cell output.
trainer.train()

SAVE THE TRAINED MODEL FOR FUTURE USE

In [None]:
# Write the trained model and the tokenizer to the same directory so both can
# be reloaded from one path.
model_save_path = "/content/model"
trainer.save_model(model_save_path)
# Last expression in the cell: whatever save_pretrained returns is displayed.
tokenizer.save_pretrained(model_save_path)

In [None]:
# Reload the model from the directory the save cell wrote to, requesting bfloat16 weights.
instruct_model = AutoModelForSeq2SeqLM.from_pretrained(
    "/content/model",
    torch_dtype=torch.bfloat16,
)

In [None]:
# Same prompt text as the baseline cell, so the two generations can be compared directly.
prompt = f"""
summarize the following report

{report}


"""


# Tokenize here and feed *these* ids to the reloaded model. The previous version
# passed `inputs["input_ids"]` left over from the baseline cell, which made this
# cell depend on that cell's state and left `inputs_instruct` unused.
inputs_instruct = tokenizer(prompt, return_tensors = "pt")
outputs_instruct = tokenizer.decode(
    instruct_model.generate(
        inputs_instruct["input_ids"],
        max_new_tokens = 100,
        )[0],  # first (only) generated sequence
    skip_special_tokens=True
    )

# Separator line: joining 100 empty strings with '-' yields 99 dashes.
dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'ORIGINAL SUMMARY:\n{summary}\n')

print(dash_line)
print(f'FINETUNED MODEL GENERATION :\n{outputs_instruct}')