In [1]:
!pip install -qU transformers torch sentencepiece accelerate>=0.20.1 rouge

In [2]:
# NOTE(review): protobuf is pinned to an exact version here, presumably for
# compatibility with the tokenizer stack installed above — confirm the pin is
# still required.
!pip install protobuf==3.20.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Check how we can use a pretrained model

In [3]:
from transformers import pipeline
from rouge import Rouge
from nltk import word_tokenize
import glob
import numpy as np
import os
import nltk
# Recent NLTK releases load the tokenizer data for word_tokenize from the
# 'punkt_tab' resource, while older ones use 'punkt'. Fetch both so that
# tokenization works whichever NLTK version the runtime provides.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
text = "Personally, I thought today was fine. Talking about APIs again really solidified my understanding of what they were. As for our group project, I don't feel great about it. Some of my team members have been \"collecting data\" for about two days straight. I can understand that some of our data sets can be harder to collect than others, but I feel like there's not tons of effort being put in by some of my team members. Today we decided to implement a new kind of data set to improve our model (recycling symbols on objects) and from what I can see on our Roboflow, only one team member collected and labeled his recycling symbol data. I don't know if my other team members just don't understand the idea, or if they are just not doing it. I'm planning on working on all of those other recycling labels tonight, and hopefully a data health check in our Roboflow to see how balanced our data sets are. I just kinda wish more of my team members were interactive with ideas and input."
# Tokenize the sample text with NLTK and count the tokens; the count is used
# below to derive the minimum and maximum summary lengths.
words = word_tokenize(text)
words_length = len(words)

In [4]:
words_length

205

In [5]:
# Bound the summary length to 20%-50% of the input's token count
# (each product is truncated to an integer).
min_summary_length, max_summary_length = (
    int(words_length * fraction) for fraction in (0.20, 0.50)
)

In [6]:
# TODO: try other summarization models from
# https://huggingface.co/models?pipeline_tag=summarization&sort=downloads
model = 'facebook/bart-large-cnn'

In [7]:
# Build a Hugging Face summarization pipeline for the chosen checkpoint
# (the model and tokenizer files are downloaded on first use).
summarize_model = pipeline("summarization", model=model)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [8]:
# Summarize the sample text, constraining the output length with the bounds
# computed from the word count.
# NOTE(review): max_length/min_length are generation limits that are presumably
# measured in model tokens rather than NLTK words — confirm against the
# transformers documentation.
summary = summarize_model(text, max_length=max_summary_length, min_length=min_summary_length)
# The pipeline returns a list with one dict per input, keyed by 'summary_text'.
print(summary)

[{'summary_text': 'Some of my team members have been "collecting data" for about two days straight. Today we decided to implement a new kind of data set to improve our model. Only one team member collected and labeled his recycling symbol data. I don\'t know if my other team members just don\'t understand the idea, or if they are just not doing it.'}]


In [9]:
# Score the generated summary (first argument) against the original text
# (second argument). The result is a list with one dict per pair, holding
# 'rouge-1', 'rouge-2' and 'rouge-l', each with 'r', 'p' and 'f' values.
# Note that the comparison target is the source text itself, not a
# human-written reference summary.
rouge = Rouge()
scores = rouge.get_scores(summary[0]['summary_text'], text)
scores
# TODO: research the ROUGE score and how it can be used to evaluate a summarization task

[{'rouge-1': {'r': 0.43859649122807015,
   'p': 0.9803921568627451,
   'f': 0.6060606017895317},
  'rouge-2': {'r': 0.3333333333333333,
   'p': 0.9491525423728814,
   'f': 0.49339206663742746},
  'rouge-l': {'r': 0.43859649122807015,
   'p': 0.9803921568627451,
   'f': 0.6060606017895317}}]

## Use a pretrained model on your own dataset

In [4]:
# Mount Google Drive at /content/drive (Google Colab only) so files stored
# there can be read through the normal filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
def Summarize_models(path, model, max_words=500, min_ratio=0.20, max_ratio=0.50):
    """Summarize every ``.txt`` file in ``path`` with ``model`` and score the results.

    Each file is tokenized with ``word_tokenize``, cut to its first
    ``max_words`` tokens, re-joined with single spaces and passed to a
    ``pipeline("summarization", ...)`` instance. The summary is then compared
    with that (truncated) input text using ``Rouge.get_scores`` and the
    ROUGE-L F-score is collected. A short report is printed per file.

    Note that the score measures overlap with the *source text*, not with a
    human-written reference summary.

    Args:
        path: Directory containing the ``.txt`` files to summarize. Files with
            any other extension are ignored.
        model: Model name or path handed to the summarization pipeline.
        max_words: Maximum number of tokens kept from each file before
            summarizing (``None`` keeps the whole file).
        min_ratio: Fraction of the kept token count passed as ``min_length``.
        max_ratio: Fraction of the kept token count passed as ``max_length``.

    Returns:
        The mean ROUGE-L F-score over all summarized files, or NaN when the
        directory holds no non-empty ``.txt`` file.
    """
    summarize_model = pipeline("summarization", model=model)
    rouge = Rouge()  # created once instead of once per file
    rouge_score = []
    for filename in os.listdir(path):
        if not filename.endswith(".txt"):
            continue
        # Build the path from the `path` argument rather than the
        # notebook-level `data_folder` global, so the function reads from the
        # directory it was actually asked to process.
        file_path = os.path.join(path, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
        # The file is closed here, before the slow model call.
        words = word_tokenize(text)[:max_words]
        length = len(words)
        if length == 0:
            # Nothing to summarize or score in an empty file.
            continue
        text = ' '.join(words)
        # Keep the limits usable for very short inputs: max_length of at
        # least 1 and min_length never above max_length.
        # NOTE(review): these limits are derived from a word count but are
        # presumably interpreted by the pipeline in model tokens — confirm.
        max_length = max(int(length * max_ratio), 1)
        min_length = min(int(length * min_ratio), max_length)
        summary = summarize_model(text, max_length=max_length, min_length=min_length, do_sample=False)[0]["summary_text"]
        scores = rouge.get_scores(summary, text)
        score = scores[0]['rouge-l']['f']
        rouge_score.append(score)
        print(f" Summary of Filename - {filename}\n Summary - {summary} \n Model - {model} \n Score - {score}")
        print("-"*25)
    if not rouge_score:
        # Avoid numpy's "mean of empty slice" warning; the result is still NaN.
        return np.nan
    return np.mean(rouge_score)

In [7]:
# Folder of .txt documents to summarize, located on the mounted Google Drive.
data_folder = "/content/drive/MyDrive/NewsNebula/tests"

# Baseline run: prints a summary per file and returns the average ROUGE-L
# F-score for facebook/bart-large-cnn.
Summarize_models(data_folder,'facebook/bart-large-cnn')

 Summary of Filename - Copy of file_8.txt
 Summary - " mija" is the feature debut of director isabel castro. The film follows the lives of two artists who are the children of immigrants. Director castro wanted to tell a more nuanced story about immigration than just the act of immigrating. "mija" premieres friday on disney+ and is available on iTunes and Google Play. For more information on " mija," visit mija.org or go to www.mija.com. Back to the page you came from. 
 Model - facebook/bart-large-cnn 
 Score - 0.22875816682194883
-------------------------
 Summary of Filename - Copy of file_21.txt
 Summary - President joe biden says u.s. forces would defend taiwan if china tries to invade the self-ruled island claimed by beijing as part of its territory. Beijing criticizes official foreign contact with taipei as encouragement to make its de facto independence permanent. The mainland says such a move would lead to war. The united states has no formal relations with the island but maint

0.3137461703297374

In [8]:
# TODO: find the best 3 models
# Candidate summarization checkpoints to compare.
models = ['facebook/bart-large-cnn',
          'philschmid/bart-large-cnn-samsum',
          'sshleifer/distilbart-cnn-12-6',
          'moussaKam/barthez-orangesum-abstract',
          'google/pegasus-cnn_dailymail',
          'google/bigbird-pegasus-large-bigpatent',
          'csebuetnlp/mT5_multilingual_XLSum']

# Map each model name to the average ROUGE-L F-score returned by
# Summarize_models for the files in data_folder. Every iteration builds a new
# pipeline, so each model is downloaded and loaded in turn.
ans_dict = {}
for model in models:
  ans = Summarize_models(data_folder,model)
  ans_dict[model] = ans

 Summary of Filename - Copy of file_8.txt
 Summary - " mija" is the feature debut of director isabel castro. The film follows the lives of two artists who are the children of immigrants. Director castro wanted to tell a more nuanced story about immigration than just the act of immigrating. "mija" premieres friday on disney+ and is available on iTunes and Google Play. For more information on " mija," visit mija.org or go to www.mija.com. Back to the page you came from. 
 Model - facebook/bart-large-cnn 
 Score - 0.22875816682194883
-------------------------
 Summary of Filename - Copy of file_21.txt
 Summary - President joe biden says u.s. forces would defend taiwan if china tries to invade the self-ruled island claimed by beijing as part of its territory. Beijing criticizes official foreign contact with taipei as encouragement to make its de facto independence permanent. The mainland says such a move would lead to war. The united states has no formal relations with the island but maint

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.31G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/1.92M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/3.51M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

Attention type 'block_sparse' is not possible if sequence_length: 619 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


 Summary of Filename - Copy of file_8.txt
 Summary - “ mija” is a new documentary that follows the complicated emotions and the ripple effects that come along with immigrating to a new country, and the intersection of several creative goals for two of its subjects: a musician and music manager and two of her clients: a singer/actor/producer and a singer/actor/producer/producer. “ mija” is directed by isabel castro and is available for streaming on video-on-demand and over the internet at video-on-demand. 
 Model - google/bigbird-pegasus-large-bigpatent 
 Score - 0.22448979323221807
-------------------------
 Summary of Filename - Copy of file_21.txt
 Summary - In an interview with cbs news, biden said “ yes” when asked whether u.s. forces would be sent to defend taiwan in the event of a chinese invasion of the self-ruled island claimed by beijing as part of its territory, adding displays of official american support for the island’s constitution and its people&#39;s resilience to the a

Downloading (…)lve/main/config.json:   0%|          | 0.00/730 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]



 Summary of Filename - Copy of file_8.txt
 Summary - A new documentary about immigrants is being released on  Disney+ , which tells the story of a growing number of young people seeking to make it in the creative profession . The BBC’s weekly The Boss series profiles different writers from around the world and across the globe. This week we speak to isabel  Castro - the director and director of one of the most successful Mexican film festivals in recent years. 
 Model - csebuetnlp/mT5_multilingual_XLSum 
 Score - 0.15081966905111538
-------------------------
 Summary of Filename - Copy of file_21.txt
 Summary - US President Joe biden says u.s. forces would defend  Taiwan if China tries to invade the island , adding to displays of official support for its democracy .biden said he was willing to get involved militarily in the event of a chinese invasion. Taipei makes their own judgments about independence but  But  BBC Vietnamese has been asked to ask why. 
 Model - csebuetnlp/mT5_multil

In [9]:
ans_dict

{'facebook/bart-large-cnn': 0.3137461703297374,
 'philschmid/bart-large-cnn-samsum': 0.32228487566581077,
 'sshleifer/distilbart-cnn-12-6': 0.3845342322465741,
 'moussaKam/barthez-orangesum-abstract': 0.2139352063012742,
 'google/pegasus-cnn_dailymail': 0.32527762774428837,
 'google/bigbird-pegasus-large-bigpatent': 0.26758470085695274,
 'csebuetnlp/mT5_multilingual_XLSum': 0.2140236944260032}

In [10]:
# Rank the (model, score) pairs from highest to lowest score.
sorted_models = sorted(ans_dict.items(), key=lambda x: x[1], reverse=True)

# Keep the three best-scoring models (fewer if ans_dict has fewer entries).
top_three_models = sorted_models[:3]

# Report each selected model with its score.
for model, score in top_three_models:
    print(f"Model: {model}, Score: {score}")

Model: sshleifer/distilbart-cnn-12-6, Score: 0.3845342322465741
Model: google/pegasus-cnn_dailymail, Score: 0.32527762774428837
Model: philschmid/bart-large-cnn-samsum, Score: 0.32228487566581077
