##  Load Packages

In [1]:
import math
from collections import Counter

import torch.nn.functional as F
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

from evaluate import load
import bert_score


## Load Test Dataset

In [2]:
# Load XSum without a split argument (every available split).
# NOTE(review): `dataset` is not used by any other cell visible in this notebook — confirm before removing.
dataset = load_dataset("xsum")
# Percentage of the *test* split to keep (the "train" in the name is misleading).
use_percent_train = 6
# Split-slicing syntax: "test[:6%]" selects the first 6% of the test split.
dataset_test = load_dataset("xsum", split=f"test[:{use_percent_train}%]")


Found cached dataset xsum (/home/user/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset xsum (/home/user/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71)


In [3]:
# Inspect the first record of the test slice; the fields read below are
# "document", "summary" and "id".
example = dataset_test[0]

print("text:", example["document"])
print("Summary:", example["summary"])
print("id:", example["id"])

text: Prison Link Cymru had 1,099 referrals in 2015-16 and said some ex-offenders were living rough for up to a year before finding suitable accommodation.
Workers at the charity claim investment in housing would be cheaper than jailing homeless repeat offenders.
The Welsh Government said more people than ever were getting help to address housing problems.
Changes to the Housing Act in Wales, introduced in 2015, removed the right for prison leavers to be given priority for accommodation.
Prison Link Cymru, which helps people find accommodation after their release, said things were generally good for women because issues such as children or domestic violence were now considered.
However, the same could not be said for men, the charity said, because issues which often affect them, such as post traumatic stress disorder or drug dependency, were often viewed as less of a priority.
Andrew Stevens, who works in Welsh prisons trying to secure housing for prison leavers, said the need for acco

## Load Model

In [6]:
# Tokenizer and causal-LM weights are loaded from the same Hub checkpoint.
# NOTE(review): `summarize` wraps its prompt in `<bos>` / `<sep>` markers, so this
# tokenizer presumably defines them as added tokens — confirm against the checkpoint.
tokenizer = AutoTokenizer.from_pretrained("ZinebSN/GPT2_summarizer")
model = AutoModelForCausalLM.from_pretrained("ZinebSN/GPT2_summarizer")

Downloading (…)okenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/75.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/907 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

## Inference

In [7]:
def summarize(input_text, tokenizer, model, length, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'), max_context=1024):
    """Sample a summary for ``input_text`` one token at a time from a causal LM.

    The prompt is ``<bos> {input_text} <sep>``; tokens are then drawn from the
    softmax of the model's last-position logits (plain multinomial sampling, so
    the output is stochastic).

    Args:
        input_text: Article text to summarise.
        tokenizer: Tokenizer exposing ``encode_plus``, ``convert_ids_to_tokens``
            and ``convert_tokens_to_string``; ``eos_token_id`` is used when present.
        model: Causal LM; called as ``model(input_ids=...)`` and expected to
            return the logits as its first output.
        length: Maximum number of tokens to sample.
        device: Device the model and tensors are moved to.
        max_context: Maximum number of tokens fed to the model at each step
            (also used as the prompt truncation length).

    Returns:
        The decoded continuation (prompt excluded, special tokens skipped).
    """
    prompt_ids = tokenizer.encode_plus(
        f'<bos> {input_text} <sep>', truncation=True, max_length=max_context
    ).input_ids
    # NOTE(review): for articles longer than max_context the truncation above cuts
    # the end of the prompt, which is where `<sep>` sits — confirm whether the
    # model still summarises sensibly without the separator.
    context = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)

    # Stop early once the model emits end-of-sequence (if the tokenizer defines one).
    eos_id = getattr(tokenizer, 'eos_token_id', None)

    model = model.to(device)
    sampled_ids = []
    with torch.no_grad():
        for _ in range(length):
            outputs = model(input_ids=context)
            next_token_logits = outputs[0][0, -1, :]
            next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1)

            token_id = next_token.item()
            if eos_id is not None and token_id == eos_id:
                break

            # Keep the sampled ids in their own list: the context below is cropped
            # to a sliding window, so slicing it by the prompt length would lose
            # (or entirely drop) the continuation for long prompts.
            sampled_ids.append(token_id)
            context = torch.cat((context, next_token.unsqueeze(0)), dim=1)[:, -max_context:]

    tokens = tokenizer.convert_ids_to_tokens(sampled_ids, skip_special_tokens=True)
    return tokenizer.convert_tokens_to_string(tokens)


In [24]:
# Sample up to 50 tokens of summary for the first article in the test slice.
gpt2_summary=summarize(dataset_test[0]['document'], tokenizer, model, 50)

  for _ in tnrange(length):


  0%|          | 0/50 [00:00<?, ?it/s]

In [25]:
# Model-generated summary for the first test article.
print(gpt2_summary)

Almost two-thirds of people found a permanent new home in Cymru have received the help, according to a Welsh investigation.


In [26]:
# Reference (ground-truth) summary of the same article, for side-by-side comparison.
print(dataset_test[0]['summary'])

There is a "chronic" need for more housing for prison leavers in Wales, according to a charity.


In [26]:
# Pick one record of the test slice by index and generate a summary for it.
data_id = 9
test_document = dataset_test[data_id]['document']
test_gt_summary = dataset_test[data_id]['summary']
# Sampling is stochastic (multinomial), so re-running this cell can yield a different summary.
generated_summary=summarize(test_document, tokenizer, model, 50)

In [27]:
def append_results(article, ground_truth_summary, generated_summary, filename="results.txt"):
    """Append one article / reference / generated-summary record to a text file.

    Args:
        article: Source document text.
        ground_truth_summary: Reference summary for the article.
        generated_summary: Summary produced by the model.
        filename: Destination path; opened in append mode, so it is created
            when missing and earlier records are kept.
    """
    # Human-readable record, delimited by `---` rules above and below.
    record = f"\n\n---\n\nArticle:\n{article}\n\nGround Truth Summary:\n{ground_truth_summary}\n\nGenerated Summary:\n{generated_summary}\n\n---\n"

    with open(filename, mode="a", encoding="utf-8") as sink:
        sink.write(record)

    print(f'Results have been appended successfully to {filename}')


# Persist the sample chosen above (article, reference, generated summary) to the default results.txt.
append_results(test_document, test_gt_summary, generated_summary)

Results have been appended successfully to results.txt


## Evaluation

In [31]:
class Evaluator:
    """Score generated summaries (hypotheses) against reference summaries.

    Constructing an instance eagerly computes ROUGE-1/2/L/Lsum with the
    ``evaluate`` rouge metric and BERTScore with ``bert_score`` and stores them
    in ``self.metrics``.

    The ``*_evaluation`` methods and ``evaluate_loss`` are dependency-free,
    whitespace-token implementations. They accept either a single
    hypothesis/reference string or parallel lists of strings; for lists the
    per-pair scores are averaged.

    Attributes:
        hypothesis: Generated summary, or list of generated summaries.
        reference: Reference summary, or list of reference summaries.
        metrics: Dict with keys ``rouge1``, ``rouge2``, ``rougeL``,
            ``rougeLsum`` and ``bert``.
    """

    def __init__(self, hypothesis, reference):
        self.hypothesis = hypothesis
        self.reference = reference

        self.metrics = {
            'rouge1': self.rouge_N(1),
            'rouge2': self.rouge_N(2),
            'rougeL': self.rouge_L(),
            'rougeLsum': self.rouge_L_sum(),
            'bert': self.bert_Score(),
        }

    # ------------------------------------------------------------------
    # Library-backed metrics
    # ------------------------------------------------------------------
    def _hf_rouge(self, metric_type):
        """Compute one ROUGE variant with the ``evaluate`` rouge metric.

        The metric object is loaded on first use and reused afterwards instead
        of being re-loaded for every variant.
        """
        metric = getattr(self, "_rouge_metric", None)
        if metric is None:
            metric = load("rouge")
            self._rouge_metric = metric
        result = metric.compute(
            predictions=self.hypothesis,
            references=self.reference,
            rouge_types=[metric_type],
        )
        return result[metric_type]

    def rouge_L(self):
        """Return the ``rougeL`` score from the ``evaluate`` rouge metric."""
        rg_score = self._hf_rouge('rougeL')
        print('rougeL computed')
        return rg_score

    def rouge_N(self, n=1):
        """Return the ``rouge{n}`` score from the ``evaluate`` rouge metric."""
        rg_score = self._hf_rouge(f'rouge{n}')
        print('rougeN computed')
        return rg_score

    def rouge_L_sum(self):
        """Return the ``rougeLsum`` score from the ``evaluate`` rouge metric."""
        rg_score = self._hf_rouge('rougeLsum')
        print('rougeLsum computed')
        return rg_score

    def bert_Score(self):
        """Return mean BERTScore precision / recall / F1 over all pairs.

        Returns:
            Dict with float entries ``precision``, ``recall`` and ``f1``.
        """
        all_preds = bert_score.score(self.hypothesis, self.reference, lang='en')

        def mean_of(values):
            values = values.cpu()
            return (torch.sum(values) / values.numel()).item()

        score = {
            "precision": mean_of(all_preds[0]),
            "recall": mean_of(all_preds[1]),
            "f1": mean_of(all_preds[2]),
        }
        print('Bert computed')
        return score

    # ------------------------------------------------------------------
    # Helpers shared by the hand-rolled metrics
    # ------------------------------------------------------------------
    def _text_pairs(self):
        """Return ``(hypothesis, reference)`` string pairs for str or list inputs."""
        if isinstance(self.hypothesis, str):
            return [(self.hypothesis, self.reference)]
        return list(zip(self.hypothesis, self.reference))

    @staticmethod
    def _prf(overlap, hypothesis_total, reference_total):
        """Build a precision/recall/F1 dict; a zero denominator yields 0.0."""
        precision = overlap / hypothesis_total if hypothesis_total else 0.0
        recall = overlap / reference_total if reference_total else 0.0
        # The epsilon keeps F1 defined when precision and recall are both zero.
        f1_score = 2 * ((precision * recall) / (precision + recall + 1e-7))
        return {"precision": precision, "recall": recall, "f1": f1_score}

    @staticmethod
    def _mean_scores(per_pair):
        """Average a list of precision/recall/F1 dicts key by key."""
        keys = ("precision", "recall", "f1")
        if not per_pair:
            return {key: 0.0 for key in keys}
        return {key: sum(scores[key] for scores in per_pair) / len(per_pair) for key in keys}

    # ------------------------------------------------------------------
    # Hand-rolled metrics
    # ------------------------------------------------------------------
    def evaluate_loss(self, tok=None):
        """Return the token-entropy gap between hypothesis and reference.

        For each pair this is ``H(hypothesis tokens) - H(reference tokens)``,
        where ``H`` is the Shannon entropy (base 2) of the token-id frequency
        distribution; pairs are averaged.

        Args:
            tok: Object whose ``encode(text)`` returns a sequence of hashable
                token ids. Defaults to the notebook-level ``tokenizer``.
        """
        if tok is None:
            tok = tokenizer  # module-level tokenizer defined earlier in the notebook

        def entropy(token_ids):
            total = len(token_ids)
            if total == 0:
                return 0.0
            counts = Counter(token_ids).values()
            return -sum((c / total) * math.log2(c / total) for c in counts)

        gaps = [
            entropy(tok.encode(hyp)) - entropy(tok.encode(ref))
            for hyp, ref in self._text_pairs()
        ]
        return sum(gaps) / len(gaps) if gaps else 0.0

    def rouge_L_evaluation(self):
        """Return LCS-based precision / recall / F1 over whitespace tokens."""
        per_pair = []
        for hyp, ref in self._text_pairs():
            hypothesis_tokens = hyp.split()
            reference_tokens = ref.split()
            lcs = self.lcs_length(hypothesis_tokens, reference_tokens)
            per_pair.append(self._prf(lcs, len(hypothesis_tokens), len(reference_tokens)))
        return self._mean_scores(per_pair)

    def rouge_N_evaluation(self, n=1):
        """Return n-gram overlap precision / recall / F1 over whitespace tokens.

        Args:
            n: n-gram order.
        """
        def ngrams(tokens):
            return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

        per_pair = []
        for hyp, ref in self._text_pairs():
            hypothesis_ngrams = ngrams(hyp.split())
            reference_ngrams = ngrams(ref.split())
            # Counter intersection keeps the minimum count of each shared n-gram.
            shared_count = sum((hypothesis_ngrams & reference_ngrams).values())
            per_pair.append(self._prf(
                shared_count,
                sum(hypothesis_ngrams.values()),
                sum(reference_ngrams.values()),
            ))
        return self._mean_scores(per_pair)

    def rouge_L_sum_evaluation(self):
        """Return the same whole-text LCS score as ``rouge_L_evaluation``.

        No sentence splitting is performed here, so this is the identical
        computation under a second name.
        """
        return self.rouge_L_evaluation()

    @staticmethod
    def lcs_length(s1, s2):
        """Return the length of the longest common subsequence of two sequences."""
        m, n = len(s1), len(s2)
        # dp[i][j] = LCS length of s1[:i] and s2[:j]
        dp = [[0] * (n + 1) for _ in range(m + 1)]

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if s1[i - 1] == s2[j - 1]:
                    dp[i][j] = dp[i - 1][j - 1] + 1
                else:
                    dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

        return dp[m][n]
        
        
def evaluate_model(test_data, model, tokenizer, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'), summary_length=30):
    """Generate a summary for every document in ``test_data`` and score them.

    Args:
        test_data: Dataset-like object whose ``'document'`` and ``'summary'``
            columns are parallel sequences of strings.
        model: Causal LM passed through to ``summarize``.
        tokenizer: Tokenizer passed through to ``summarize``.
        device: Device used for generation.
        summary_length: Maximum number of tokens sampled per summary.

    Returns:
        The ``metrics`` dict of an ``Evaluator`` built from all generated and
        reference summaries.
    """
    reference = []
    candidate = []
    for input_text, summary in zip(test_data['document'], test_data['summary']):
        generated_summary = summarize(input_text, tokenizer, model, summary_length, device=device)

        reference.append(summary)
        candidate.append(generated_summary)

    return Evaluator(candidate, reference).metrics
    


## Evaluate Fine-tuned GPT2

In [None]:
# Generate a summary for every article in the test slice and score them in one pass.
# This runs token-by-token sampling per article, so it is the slow cell of the notebook.
scores= evaluate_model(dataset_test, model, tokenizer)
# 'bert' is a dict with precision / recall / f1; the ROUGE entries hold whatever
# the rouge metric returns for each requested type.
bert_scores=scores['bert']
rouge_1_score=scores['rouge1']
rouge_2_score=scores['rouge2']
rouge_L_score=scores['rougeL']
rouge_L_sum_score=scores['rougeLsum']

In [11]:
# Report the metrics unpacked in the previous cell.
print('Bert Scores: Precision ',bert_scores['precision'],'| Recall ',bert_scores['recall'],'| F1Score ',bert_scores['f1'])
print('Rouge-1 Score: ',rouge_1_score)
print('Rouge-2 Score: ',rouge_2_score)
print('Rouge-L Score: ',rouge_L_score)
print('Rouge-L-summ Score: ',rouge_L_sum_score)

Bert Scores: Precision  0.7621852159500122 | Recall  0.7650243639945984 | F1Score  0.7634808421134949
Rouge-1 Score:  0.1643108200516018
Rouge-2 Score:  0.027788670404758707
Rouge-L Score:  0.12639778248659123
Rouge-L-summ Score:  0.12639778248659123
