In [None]:
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets
!pip install -U bertviz
!pip install -U umap-learn
!pip install -U sentencepiece
!pip install --upgrade urllib3
!pip install py7zr  #use to compress the model for download

In [None]:
from datasets import load_dataset
dataset=load_dataset('ccdv/cnn_dailymail', '3.0.0')

In [None]:
dataset


In [None]:
dataset['train']

In [None]:
dataset['train']['article'][0][:200]

In [None]:
dataset['train']['highlights'][0]

In [None]:
from transformers import pipeline


# Let's first check the performance of the different summarization models without fine tuning.


In [None]:
# using gpt2-medium model due to gpu limitation.
pipe=pipeline("text-generation",model='gpt2-medium')

In [None]:
#text summarization
input=dataset['train'][1]['article'][:2000]

In [None]:
query=input+"\nTL;DR:\n"   #\nTL;DR:\n command is used to generate the summary in gpt2 model as mentioned in the paper
output=pipe(query,max_length=700,clean_up_tokenization_spaces=True)

In [None]:
output

In [None]:
output[0]['generated_text']

In [None]:
output[0]['generated_text'][len(query):]

In [None]:
summaries={}
summaries['gpt2-medium-380M']=output[0]['generated_text'][len(query):]

In [None]:
# T5 base model with 233M parameters: Transformer based

In [None]:
pipe=pipeline("summarization",model='t5-base')

In [None]:
output=pipe(input)

In [None]:
output[0]

In [None]:
output[0]['summary_text']

In [None]:
summaries['t5-base-233M']=output[0]['summary_text']

In [None]:
#BART :bart-large fine tuned over cnn data
#pipe=pipeline('summarization',model='facebook/bart-large-cnn')
#output=pipe(input)

In [None]:
#output[0]  # disabled like the Pegasus cell below: the BART run above is commented out, so `output` would still show the T5 result

In [None]:
#summaries['facebook/bart-large-cnn']=output[0]['summary_text']

In [None]:
#pegasus-cnn_daily
#pipe=pipeline('summarization',model='google/pegasus-cnn_dailymail')
#output=pipe(input)

In [None]:
#output[0]

In [None]:
#summaries['google/pegasus-cnn_dailymail-568M']=output[0]['summary_text']

In [None]:
# For every collected model: upper-cased label, its summary, then a spacer line.
for label, summary_text in summaries.items():
  print(label.upper(), summary_text, " ", sep="\n")

# Fine tuning the 'facebook/bart-large-cnn' model using samsum dataset

In [None]:
!pip install -U datasets

In [None]:
from datasets import load_dataset
from transformers import pipeline

from transformers import AutoModelForSeq2SeqLM,AutoTokenizer
import torch

In [None]:
# 'gpu' is not a device string torch understands; use CUDA when it is available
# and fall back to the CPU otherwise.
device='cuda' if torch.cuda.is_available() else 'cpu'
model_ckpt='facebook/bart-large-cnn' #for model checkpoint
tokenizer=AutoTokenizer.from_pretrained(model_ckpt)
# Place the model on the selected device so `device` is actually honoured.
model=AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

In [None]:
samsum=load_dataset('samsum')

In [None]:
samsum

In [None]:
samsum['train'][0]

In [None]:
dialogue_len=[len(x['dialogue'].split()) for x in samsum['train']]
summary_len=[len(x['summary'].split()) for x in samsum['train']]

In [None]:
import pandas as pd

df=pd.DataFrame([dialogue_len,summary_len]).T
df.columns=["dialogue_len","summary_len"]
df

In [None]:
df.hist(figsize=(10,5))

In [None]:
# Tokenization step applied to the dataset with `map` (batched).

def get_feature(batch):
  """Tokenize a batch of dialogues together with their target summaries.

  Args:
      batch: mapping with a 'dialogue' entry (model inputs) and a 'summary'
          entry (targets), as passed by a batched `map` call.

  Returns:
      dict holding only 'input_ids', 'attention_mask' and 'labels' taken from
      the tokenizer output; anything else the tokenizer returns is dropped.
  """
  # Inputs and targets are encoded in one call; both are truncated at 1024 tokens.
  tokenized = tokenizer(
      batch['dialogue'],
      text_target=batch['summary'],
      max_length=1024,
      truncation=True,
  )

  wanted_keys = ('input_ids', 'attention_mask', 'labels')
  return {key: tokenized[key] for key in wanted_keys}

In [None]:
samsum_pt=samsum.map(get_feature,batched=True) #pt for pytorch

In [None]:
samsum_pt

In [None]:
columns=['input_ids','attention_mask','labels']
samsum_pt.set_format(type='torch',columns=columns)

In [None]:
! pip install rouge_score

In [None]:
! pip install evaluate

In [None]:
import evaluate
rouge = evaluate.load('rouge')

In [None]:
# `datasets.load_metric` no longer exists in current `datasets` releases (the
# notebook installs the latest one), so importing it raises ImportError.
# ROUGE is loaded through the `evaluate` package instead.
import evaluate

rouge_metric = evaluate.load('rouge')

def compute_metrics(eval_pred):
    """Decode predictions and labels and score them with ROUGE.

    Args:
        eval_pred: a ``(predictions, labels)`` pair. ``predictions`` may be
            token ids, logits with a trailing vocabulary axis, or a tuple whose
            first item is one of those. ``labels`` are token ids in which -100
            marks positions to ignore. (Presumably supplied by ``Trainer`` —
            confirm against the caller.)

    Returns:
        The result of ``rouge_metric.compute`` on the decoded texts.
    """
    import numpy as np  # local import keeps the function usable on its own

    predictions, labels = eval_pred

    # Model outputs can arrive as a tuple (logits first, extra tensors after).
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Logits (batch, sequence, vocab) must be turned into token ids before decoding.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)
        # These ids line up position-by-position with the labels, so positions
        # the labels ignore are ignored in the predictions as well.
        if predictions.shape == labels.shape:
            predictions = np.where(labels != -100, predictions, -100)

    # -100 is not a real token id and cannot be decoded; swap it for the pad
    # token, which skip_special_tokens then removes from the text.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute the Rouge score using the Rouge metric
    rouge_output = rouge_metric.compute(
        predictions=decoded_predictions,
        references=decoded_labels,
        use_stemmer=True,
    )
    return rouge_output

In [None]:
from transformers import DataCollatorForSeq2Seq

# Batch collator for sequence-to-sequence training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [None]:
import os

from transformers import TrainingArguments, Trainer

# Never commit an access token in a notebook. Read it from the environment;
# when HF_TOKEN is unset, hub_token=None lets the library fall back to a cached
# login (e.g. `huggingface-cli login` / `notebook_login()`).
# NOTE(review): the token that used to be hard-coded here should be treated as
# leaked and revoked.
hub_token=os.environ.get("HF_TOKEN")

training_args=TrainingArguments(
    output_dir='finetunedN-facebook-bart-samsum',
    num_train_epochs=1,
    warmup_steps=500,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    logging_steps=10,
    eval_strategy='steps',  # current name of the former `evaluation_strategy` argument
    metric_for_best_model='eval_loss',
    eval_steps=500,
    gradient_accumulation_steps=16,
    push_to_hub=True,
    # "<organization>/<model id>" replaces the deprecated
    # push_to_hub_organization / push_to_hub_model_id pair.
    hub_model_id='codebasics/finetunedN-facebook-bart-samsum',
    hub_token=hub_token,  # replaces the deprecated push_to_hub_token
    report_to="tensorboard",  # or "wandb" for Weights & Biases
)

# Initialize the Trainer object with the updated TrainingArguments
trainer=Trainer(model=model,
                args=training_args,
                tokenizer=tokenizer,
                data_collator=data_collator,
                train_dataset=samsum_pt['train'],
                eval_dataset=samsum_pt['validation'],
                compute_metrics=compute_metrics
                )

In [None]:
trainer.train()

In [None]:
# Evaluating model performance on the tokenized validation dataset
#validation = trainer.evaluate(eval_dataset = samsum_pt['validation'])
#print(validation) # Couldn't do due to resource problems

# Push to hub from trainer directly

In [None]:
trainer.push_to_hub()

In [None]:
finetuned_model="codebasics/finetunedN-facebook-bart-samsum"
# When `model` is an object rather than a checkpoint name, pipeline() cannot
# infer which tokenizer to load and raises; pass the tokenizer explicitly.
pipe=pipeline("summarization",model=model,tokenizer=tokenizer)

In [None]:
#original_model='facebook/bart-large-cnn'
finetuned_model="codebasics/finetunedN-facebook-bart-samsum"
tokenizer=AutoTokenizer.from_pretrained(finetuned_model)
finetuned_model=AutoModelForSeq2SeqLM.from_pretrained(finetuned_model,torch_dtype=torch.bfloat16).to('cpu')

In [None]:
model_ckpt='facebook/bart-large-cnn' #for model checkpoint
tokenizer=AutoTokenizer.from_pretrained(model_ckpt)
original_model=AutoModelForSeq2SeqLM.from_pretrained(model_ckpt,torch_dtype=torch.bfloat16).to('cpu')

In [None]:
rouge = evaluate.load('rouge')

In [None]:
from transformers import  GenerationConfig

In [None]:
# Slice the test split once and read both fields from the same ten examples.
test_batch = samsum['test'][0:10]
dialogues = test_batch['dialogue']
human_baseline_summaries = test_batch['summary']

original_model_summaries = []
finetuned_model_summaries = []

# Built once: the same generation settings apply to every dialogue and to both models.
generation_config = GenerationConfig(max_new_tokens=200)

for dialogue in dialogues:
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """
    # Cap the prompt at 1024 tokens (the same limit the training features use)
    # so an unusually long dialogue cannot overflow the model input.
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).input_ids

    # Summary from the original checkpoint.
    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=generation_config)
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)

    # Summary from the fine-tuned checkpoint, for the same prompt.
    finetuned_model_outputs = finetuned_model.generate(input_ids=input_ids, generation_config=generation_config)
    finetuned_model_text_output = tokenizer.decode(finetuned_model_outputs[0], skip_special_tokens=True)
    finetuned_model_summaries.append(finetuned_model_text_output)

# One row per dialogue: human reference, original-model summary, fine-tuned summary.
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, finetuned_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'finetuned_model_summaries'])
df

In [None]:
# Score each model's summaries against the human references with identical
# ROUGE settings; references are cut to the number of predictions available.
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)
finetuned_model_results = rouge.compute(
    predictions=finetuned_model_summaries,
    references=human_baseline_summaries[:len(finetuned_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

# Heading line followed by the corresponding score dictionary, for each model.
print('ORIGINAL MODEL:', original_model_results, sep='\n')
print('FINETUNED MODEL:', finetuned_model_results, sep='\n')

# Let's check the summaries produced by the fine-tuned model

In [None]:
# Custom dialogue prediction


custom_dialogue="""
Sam: Hey hello rachel, how are you?
rachel: good. are you interested in joining this new data science course here?
Sam: what is special about this course?
rachel: prifessor ori is teaching this and he is legend in this area. If want to join just let me know.
Sam: that sounds great. But which topic he is going to cover?
rachel: i thisnk je will teach NLP and LLM.
Sam: Okay, i will join then. thank you for letting me know.
"""



In [None]:
output=pipe(custom_dialogue)


In [None]:
output

In [None]:
text="""Sam: Hey Harry, have you noticed how everything seems to be getting more expensive lately?

Harry: Yeah, it's crazy. I went grocery shopping yesterday and couldn't believe the prices.

Sam: Exactly! I heard on the news that it's because of inflation. Do you know what that means exactly?

Harry: I think it's when the prices of goods and services increase over time, right?

Sam: Yeah, that's it. It's happening because the value of money is decreasing, so it takes more money to buy the same things.

Harry: So, what can we do about it?

Sam: Well, we can try to budget more carefully and maybe look for cheaper alternatives when possible. But ultimately, it's a complex economic issue.

Harry: Yeah, I guess we'll just have to adapt and hope things get better soon.

Sam: Agreed. Let's keep an eye on it and stay informed."""

In [None]:
output=pipe(text)


In [None]:
output

In [None]:
text2="""Sam: Hey Harry, I just read an article about investing in stocks to combat inflation.

Harry: Really? How does that work?

Sam: Well, apparently, when inflation goes up, stock prices tend to rise too. So, investing in stocks could potentially help us keep up with the rising cost of living.

Harry: That sounds interesting, but isn't investing in stocks risky?

Sam: It can be, but if we do our research and diversify our investments, it could pay off in the long run.

Harry: Hmm, I'll have to look into it more. Thanks for the tip, Sam.

Sam: No problem, Harry. It's always good to explore different ways to manage our finances, especially with inflation on the rise."""

In [None]:
output=pipe(text2)

In [None]:
output

In [None]:
text3="""Sam: Hey Harry, have you seen the latest advancements in artificial intelligence?

Harry: Yeah, I heard about some pretty cool stuff. Like AI helping doctors diagnose diseases more accurately.

Sam: Exactly! It's amazing how AI is revolutionizing various industries, from healthcare to finance.

Harry: But I also read about concerns regarding AI taking over jobs. Do you think it's something we should worry about?

Sam: It's definitely a valid concern. AI has the potential to automate many tasks, but it also creates new opportunities for innovation and creativity.

Harry: True, but I wonder how AI will impact our daily lives in the future.

Sam: Well, one thing's for sure, AI is here to stay, and it's up to us to adapt and harness its potential for the greater good.

Harry: Absolutely, let's stay curious and keep learning about the exciting possibilities AI brings."""

In [None]:
output=pipe(text3)

In [None]:
output