In [71]:
import sys
import json
import glob
import tqdm
import pandas as pd
import torch
import evaluate
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
tqdm.tqdm.pandas()
sys.path.append('/home/verma.shi/LLM/LitArt/models')
from summarizer import TextSummaryModel
cache_dir="/work/LitArt/cache"

In [99]:
def load_model_details(path):

    with open(path+"run_config.json") as json_file:
        run_details = json.load(json_file)
    
    base_model_name = run_details["base_model_name"]
    tokenizer_name = run_details["tokenizer_name"]
    cache_dir = run_details["cache_dir"]

    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name,cache_dir=cache_dir).to(device)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name,cache_dir=cache_dir)

    checkpoint_location = path+"my_model/version_0/checkpoints/*.ckpt"
    best_checkpoint_location = glob.glob(checkpoint_location)[0]

    model = torch.load(f=best_checkpoint_location,map_location=device)
    keys_to_modify = list(model["state_dict"].keys())  # Create a copy of the keys
    for key in keys_to_modify:
        new_key = key[6:]
        model["state_dict"][new_key] = model["state_dict"][key]
        del model["state_dict"][key]

    summary_model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=base_model_name,state_dict=model["state_dict"])

    run_details["best_model_path"] = best_checkpoint_location
    
    return summary_model,base_model,tokenizer,run_details
    

In [100]:
checkpoints_path = "/work/LitArt/verma/google-pegasus-xsum-2024-03-14-23:19:43/"
summary_model,base_model,tokenizer,run_details = load_model_details(checkpoints_path)
device = "cuda" if torch.cuda.is_available() else "cpu"

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [101]:
run_details

{'train_path': '/work/LitArt/data/generated_summaries/train_dataset_with_summaries.csv',
 'test_path': '/work/LitArt/data/generated_summaries/test_dataset_with_summaries.csv',
 'val_path': '/work/LitArt/data/generated_summaries/validation_dataset_with_summaries.csv',
 'base_model_name': 'google/pegasus-xsum',
 'tokenizer_name': 'google/pegasus-xsum',
 'cache_dir': '/work/LitArt/cache',
 'batch_size': 32,
 'tokenizer_chapter_max_length': 512,
 'tokenizer_summary_max_length': 64,
 'epochs': 10,
 'log_path': '/work/LitArt/verma/',
 'best_model_path': '/work/LitArt/verma/google-pegasus-xsum-2024-03-14-23:19:43/my_model/version_0/checkpoints/epoch=6-val_loss=3.82.ckpt'}

In [106]:
def summarize(text,model,tokenizer,chapter_length,summary_length,temperature=1,repetition_penalty=1,device='cpu'):
    model = model.to(device)
    text = "Summarize the following : \n" + text
    inputs = tokenizer(text, 
                       max_length=chapter_length,
                       truncation=True,
                       padding="max_length",
                       add_special_tokens=True, 
                       return_tensors="pt").to(device)
    summarized_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"], 
            max_length= summary_length,
            temperature = temperature,
            do_sample = True,
            repetition_penalty = repetition_penalty).to(device)

    return " ".join([tokenizer.decode(token_ids, skip_special_tokens=True)
                    for token_ids in summarized_ids])

In [107]:
test_df = pd.read_csv(run_details["test_path"])
test_df = test_df.sample(n=100,random_state=42) 

In [108]:
rouge = evaluate.load('rouge')

In [109]:
test_df["model_summary"] = test_df["chapter"].progress_apply( lambda text: summarize(text,summary_model,tokenizer,chapter_length=run_details["tokenizer_chapter_max_length"],summary_length=run_details["tokenizer_summary_max_length"],temperature=1.5,repetition_penalty=1.5,device=device))

100%|██████████| 100/100 [07:31<00:00,  4.51s/it]


In [110]:
test_df["base_model_summary"] = test_df["chapter"].progress_apply( lambda text: summarize(text,base_model,tokenizer,chapter_length=run_details["tokenizer_chapter_max_length"],summary_length=run_details["tokenizer_summary_max_length"],temperature=1.5,repetition_penalty=1.5,device=device))

100%|██████████| 100/100 [06:22<00:00,  3.82s/it]


In [111]:
predictions = test_df["model_summary"].to_list()
references = test_df["summary_text"].to_list()
results_model = rouge.compute(predictions=predictions, references=references)
results_model

{'rouge1': 0.1060802331681305,
 'rouge2': 0.02020470364595954,
 'rougeL': 0.07256178889223228,
 'rougeLsum': 0.07310122951667639}

In [112]:
predictions = test_df["generated_summary"].to_list()
references = test_df["summary_text"].to_list()
results_gpt = rouge.compute(predictions=predictions, references=references)
results_gpt

{'rouge1': 0.06966278793405115,
 'rouge2': 0.013128496993345683,
 'rougeL': 0.051979927863158755,
 'rougeLsum': 0.05243213606150958}

In [113]:
predictions = test_df["base_model_summary"].to_list()
references = test_df["summary_text"].to_list()
results_base = rouge.compute(predictions=predictions, references=references)
results_base

{'rouge1': 0.0722585722639571,
 'rouge2': 0.01058775230274284,
 'rougeL': 0.05240994838149399,
 'rougeLsum': 0.05264777377620605}

In [114]:
def calculate_percentage_difference(dict1, dict2):
    percentage_difference = {}

    for metric in dict1.keys():
        difference = dict2[metric] - dict1[metric]
        percentage_diff = (difference / dict1[metric]) * 100
        percentage_difference[metric] = percentage_diff

    for metric, percentage_diff in percentage_difference.items():
        print(f"{metric}: {percentage_diff:.2f}%")

    return percentage_difference

In [115]:
percentage_change = calculate_percentage_difference(results_gpt,results_model)

rouge1: 52.28%
rouge2: 53.90%
rougeL: 39.60%
rougeLsum: 39.42%


In [116]:
percentage_change = calculate_percentage_difference(results_base,results_model)

rouge1: 46.81%
rouge2: 90.83%
rougeL: 38.45%
rougeLsum: 38.85%


In [117]:
from pprint import pprint

In [125]:
index = 0
chapter = test_df.iloc[index]["chapter"]
summary = test_df.iloc[index]["summary_text"]

In [126]:
base_model_summary = summarize(chapter,
                               base_model,
                               tokenizer,
                               chapter_length=run_details["tokenizer_chapter_max_length"],
                               summary_length=run_details["tokenizer_summary_max_length"],
                               temperature=1.5,
                               repetition_penalty=1.5,
                               device=device)
model_summary = summarize(chapter,
                               summary_model,
                               tokenizer,
                               chapter_length=run_details["tokenizer_chapter_max_length"],
                               summary_length=run_details["tokenizer_summary_max_length"],
                               temperature=1.5,
                               repetition_penalty=1.5,
                               device=device)
gpt_summary = test_df.iloc[index]["generated_summary"]

In [127]:
chapter

'says my idealistic friend what vulgar details what good is there in taking all these pains to give an exact likeness of old women and clowns what a low phase of life what clumsy ugly people but bless us things may be lovable that are not altogether handsome i hope i am not at all sure that the majority of the human race have not been ugly and even among those lords of their kind the british squat figures ill-shapen nostrils and dingy complexions are not startling exceptions yet there is a great deal of family love amongst us i have a friend or two whose class of features is such that the apollo curl on the summit of their brows would be decidedly trying yet to my certain knowledge tender hearts have beaten for them and their miniatures--flattering but still not lovely--are kissed in secret by motherly lips i have seen many an excellent matron who could have never in her best days have been handsome and yet she had a packet of yellow love-letters in a private drawer and sweet children 

In [129]:
print(f"Base Model Summary : \n {base_model_summary}")
print(f"Fine Tuned Model Summary : \n {model_summary}")
print(f"GPT Model Summary : \n {gpt_summary}")
print(f"Human Summary : \n {summary}")


Base Model Summary : 
 My dear friend what is it about the human race that makes us so ugly?
Fine Tuned Model Summary : 
 in the second act jane tells her idealistic friend that there are some people in the human race who have not been ugly and even among those lords of their kind the majority of the human race has not been ugly and even among those lords of their kind the apollo curl on the summit of their brows would
GPT Model Summary : 
 The passage emphasizes the importance of human connection over aesthetic beauty, highlighting the value of common, everyday people and experiences.
Human Summary : 
 book second chapter in which the story pauses a little this chapter is a time out from the plot as eliot steps in as the author to explain her style of storytelling she defends the realism of it and eschews being sentimental or using ideal character types she is writing from nature and fact as if she were in a courtroom she cannot lie or whitewash the details she reminds us that we must