<a href="https://colab.research.google.com/github/pranalibose/LLM_Workshop/blob/main/LLM_Workshop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets evaluate rouge_score accelerate

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 

In [None]:
import transformers

# Display the version of transformers installed in this runtime.
transformers.__version__

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook
# (presumably required for the Hub push planned at the end of the notebook).
notebook_login()

In [None]:
# print a couple of datasets from HF

from huggingface_hub import list_datasets

# Only the listing call is made here; nothing is printed yet.
# TODO: display a few entries of datasets_list, as the comment at the top of this cell asks.
datasets_list = list_datasets()



In [None]:
from datasets import load_dataset

# Load configuration "3.0.0" of the "cnn_dailymail" dataset.
cnn_news_summary_ds = load_dataset("cnn_dailymail", "3.0.0")

# Display the loaded object.
cnn_news_summary_ds

In [None]:
# Load a subset of the dataset as we will be working with less data (1%)


In [2]:
# look at the shape and features of the dataset which we are going to work with


In [None]:
# Hold out 20% of the records as the test split; the fixed seed keeps the split reproducible.
# NOTE(review): the result is assigned back to the same name, so re-running this cell
# operates on the already-split object rather than the original data — presumably that
# fails or gives a different result; confirm, and consider a new variable name.
cnn_news_summary_ds = cnn_news_summary_ds.train_test_split(test_size = 0.2, seed=42)

cnn_news_summary_ds

In [None]:
def clean_text(example):
    """Strip formatting noise from the text fields of one dataset record.

    The following substrings are removed (replaced by the empty string), in
    this order, from both the 'article' and the 'highlights' field:
    newline, '--', backslash, '/', double quote, and '``'.

    Args:
        example: mapping with string values under the keys 'article' and
            'highlights'. It is modified in place.

    Returns:
        The same ``example`` object with both fields cleaned.
    """
    for txt in ['article', 'highlights']:
        text = example[txt]
        for token in ('\n', '--', '\\', '/', '"', '``'):
            text = text.replace(token, '')
        example[txt] = text
    # Return only after BOTH fields were processed. Returning inside the loop
    # would hand back the record after cleaning 'article' alone.
    return example

In [None]:
# call the clean_text function on each and every record of our dataset


In [None]:
# Index of the test-split record used as the running example.
TEXT_INDEX = 33

# Select the row first and then the column: this reads a single record instead of
# materializing the whole 'article' column just to pick one element from it.
example_text = cleaned_cnn_news_summary_ds['test'][TEXT_INDEX]['article']

example_text

# Using a Pre-trained Model to Generate Summaries

In [None]:
from transformers import pipeline

# set the model you want to use (it is reused below for fine-tuning as well)
MODEL_NAME = ''

# task prefix prepended to every text that is sent to the summarizer
prefix = 'summarize: '

# fill in the task; the model is taken from MODEL_NAME so the checkpoint
# only has to be set in one place
summarizer = pipeline(task='', model=MODEL_NAME)

# the raw pipeline output is kept as returned (a list of dicts with a
# 'summary_text' key, as used later in the notebook)
summary_text = summarizer(prefix + example_text)

summary_text

In [None]:
# Reference summary of the same record. Row-first lookup reads one record
# instead of loading the entire 'highlights' column.
ref_text = cleaned_cnn_news_summary_ds['test'][TEXT_INDEX]['highlights']

ref_text

# Evaluation of summarization using ROUGE

In [None]:
import evaluate

# Load the 'rouge' metric through the evaluate library; it is reused for all scoring below.
rouge = evaluate.load('rouge')

rouge

In [None]:
# add the predictions and references to compute the rouge scores

# predictions and references are parallel lists of strings (one entry per summary),
# as in the multi-article scoring further down; both are still empty here.
rouge_score = rouge.compute(predictions=[], references=[], use_stemmer=True)

rouge_score

# Generate summaries for multiple articles in the dataset

In [None]:
# Articles of the test split: the inputs to summarize.
article_texts = cleaned_cnn_news_summary_ds['test']['article']

# Highlights of the test split: used as the reference summaries.
article_summaries = cleaned_cnn_news_summary_ds['test']['highlights']


In [None]:
# Execution time ~ 1 sec

# tqdm.auto picks the notebook-friendly progress bar when one is available.
from tqdm.auto import tqdm

# Generated summaries, in the same order as article_texts[:10].
candidate_summaries = []

prefix = 'summarize: '

# Summarize the first 10 test articles one at a time.
for text in tqdm(article_texts[:10]):
    candidate = summarizer(prefix + text)
    # The summarizer returns a list; the text sits under 'summary_text' of its first item.
    candidate_summaries.append(candidate[0]['summary_text'])

In [None]:
# Score the generated summaries against the first 10 reference highlights.
result_agg = rouge.compute(predictions=candidate_summaries, references=article_summaries[:10], use_stemmer=True)

result_agg

In [None]:
# Same comparison, but with use_aggregator=False — presumably this yields one score
# per summary instead of a single aggregate; confirm in the evaluate ROUGE docs.
result_unagg = rouge.compute(predictions=candidate_summaries, references=article_summaries[:10], use_stemmer=True, use_aggregator=False)

result_unagg

In [None]:
import numpy as np

# test the best and worst scores of rouge 2
# The 'rouge2' entry of the unaggregated result is converted to a NumPy array here.
# TODO: inspect it (e.g. its largest and smallest entries) to locate the best and worst summaries.
result_unagg_rsum = np.array(result_unagg['rouge2'])


In [None]:
import pandas as pd

# Pair each generated summary with its reference (first 10 records), one pair per row.
summary_pairs = list(zip(candidate_summaries, article_summaries[:10]))
summary_columns = ['Predicted_summaries', 'Reference_summaries']

act_vs_pred_summaries_df = pd.DataFrame(summary_pairs, columns = summary_columns)

act_vs_pred_summaries_df.head()

In [None]:
# compare the best scored summaries

# Row 3 is hard-coded as the example to inspect.
# The "actual" text is the reference column and the "predicted" text is the
# model output column; .at is the public label-based scalar accessor.
print('Actual Summary')
print(act_vs_pred_summaries_df.at[3, 'Reference_summaries'])
print()
print('Predicted Summary')
print(act_vs_pred_summaries_df.at[3, 'Predicted_summaries'])

In [None]:
# compare the worst scored summaries


# Fine-Tuning T5 Model

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# Load the seq2seq model to fine-tune from the chosen checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# The tokenizer must exist before the data collator is built, otherwise a
# fresh-kernel run fails with a NameError.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Pass the model object itself (not the checkpoint name) so the collator can use it.
data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer, model = model)

In [None]:
prefix = 'summarize: '

def preprocess_function(examples):
    """Tokenize a batch of articles together with their reference highlights.

    Uses the notebook-level ``prefix`` and ``tokenizer``.

    Args:
        examples: batch with an 'article' and a 'highlights' entry, each a
            sequence of strings.

    Returns:
        The tokenizer output for the prefixed articles (truncated to 1024
        tokens) with an added 'labels' entry holding the input ids of the
        tokenized highlights (truncated to 128 tokens).
    """
    # Prepend the task prefix to every article in the batch.
    prefixed_articles = [prefix + article for article in examples['article']]

    encoded_batch = tokenizer(prefixed_articles, max_length = 1024, truncation = True)
    encoded_targets = tokenizer(text_target = examples['highlights'], max_length = 128, truncation = True)

    # The target token ids become the training labels.
    encoded_batch['labels'] = encoded_targets['input_ids']
    return encoded_batch

In [None]:
# call the preprocess function on all of the records of our cleaned_cnn_news_summary_ds, use batched = True


In [None]:
# function to compute the rouge scores and evaluate the model's performance

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Uses the notebook-level ``tokenizer``, ``rouge`` and ``np``.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        dict mapping each ROUGE key plus 'generated_length' to a value
        rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is the "ignore" filler value; it is not a real token id and cannot be
    # decoded, so swap it for the pad token in BOTH arrays before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens = True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens = True)

    result = rouge.compute(predictions = decoded_preds, references = decoded_labels, use_stemmer = True)

    # Length of each generated sequence = number of non-padding tokens.
    prediction_length = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result['generated_length'] = np.mean(prediction_length)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# fill in the missing parameters: output_dir, num_train_epochs, push_to_hub

# training_args = Seq2SeqTrainingArguments(
#     output_dir = '',
#     eval_strategy = 'epoch',
#     learning_rate = 2e-5,
#     per_device_train_batch_size = 16,
#     per_device_eval_batch_size = 16,
#     weight_decay = 0.01,
#     save_total_limit = 3,
#     num_train_epochs = ,
#     predict_with_generate = True,
#     fp16 = True,
#     push_to_hub =
# )

# Build the trainer from the pieces prepared in the previous cells.
# NOTE: training_args only exists once the commented-out template in this cell
# has been filled in and uncommented; until then this call raises a NameError.
trainer = Seq2SeqTrainer(
    model = model,
    args = training_args,
    # tokenized train / test splits of the dataset
    train_dataset = tokenized_cnn_news_summary_ds['train'],
    eval_dataset = tokenized_cnn_news_summary_ds['test'],
    tokenizer = tokenizer,
    data_collator = data_collator,
    # ROUGE-based evaluation function defined earlier in the notebook
    compute_metrics = compute_metrics
)

# train the trainer

In [None]:
# push the configurations of our fine tuned model to HF hub
