#### Install libraries

In [1]:
!pip install rouge transformers gradio

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting gradio
  Downloading gradio-5.4.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.3-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.2 (from gradio)
  Downloading gradio_client-1.4.2-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.1-py3-none-any.whl.metadata (13 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting orjson~=3.0 (from grad

#### Import necessary libraries

In [2]:
import torch
from transformers import pipeline, BartTokenizer, MarianMTModel, MarianTokenizer
from rouge import Rouge
import gradio as gr

#### Use the GPU when available and load the summarization and translation models

###### BART model for summarization and Helsinki-NLP English-to-Finnish translation model

In [3]:
# Select the compute device: CUDA device 0 when a GPU is present, otherwise -1 (reported as CPU)
device = 0 if torch.cuda.is_available() else -1
print(f"Using device: {torch.cuda.get_device_name(0)}" if device >= 0 else "Using CPU")

# Summarization: BART pipeline on the selected device, plus a standalone BART
# tokenizer that the helper functions use to count tokens
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=device,
)
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Translation: Helsinki-NLP Marian model and tokenizer for English -> Finnish
translation_model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-fi')
translation_tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-fi')

# Shared ROUGE scorer used to evaluate summaries
rouge = Rouge()

Using device: Tesla T4


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/842k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]



#### Convert character lengths to token lengths

In [4]:
# Convert character-based length to token-based length
def char_to_token_length(text, char_length):
    """Scale a character budget into an approximate token budget for ``text``.

    The text's own tokens-per-character ratio, measured with the module-level
    ``tokenizer``, is applied to ``char_length`` and the result is truncated
    to an int.

    Args:
        text: The text whose token density is used for the conversion.
        char_length: Length in characters to convert.

    Returns:
        The estimated length in tokens, or 0 when ``text`` is empty.
    """
    n_chars = len(text)
    n_tokens = len(tokenizer.encode(text))
    if n_chars == 0:
        # Avoid dividing by zero for empty input
        return 0
    return int(n_tokens * (char_length / n_chars))

# Print the number of tokens in the input text
def print_token_count(text):
    """Print and return how many tokens the module-level ``tokenizer`` produces for ``text``."""
    count = len(tokenizer.encode(text))
    print(f"Input text token count: {count}")
    return count

## Translation and summarization functions

In [5]:
# Function to translate English text to Finnish
def translate_to_finnish(text):
    """Translate English ``text`` to Finnish with the module-level Marian model.

    Args:
        text: English text to translate.

    Returns:
        The decoded translation of the first (only) sequence, with special
        tokens removed.
    """
    # truncation=True caps the input at the tokenizer's maximum length instead of
    # handing an over-long sequence to the model.
    encoded = translation_tokenizer(
        text, return_tensors="pt", padding=True, truncation=True
    )
    # Keep the inputs on the same device as the model so this keeps working if
    # the model is ever moved to the GPU.
    encoded = encoded.to(translation_model.device)
    translated_tokens = translation_model.generate(**encoded)
    return translation_tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

# Summarization function
def summarize_text(text, min_length=500, max_length=2000, target_language="English"):
    """Summarize ``text`` with the BART pipeline and optionally translate the summary.

    Args:
        text: Input text to summarize.
        min_length: Minimum summary length, in characters.
        max_length: Maximum summary length, in characters.
        target_language: ``"Finnish"`` translates the summary to Finnish;
            any other value leaves it in English.

    Returns:
        A 4-tuple ``(summary, rouge_1, input_length_chars, rouge_L)``. On any
        failure the first element is a human-readable message and both scores
        are 0.0, so every code path has the same shape and order as the four
        output components the Gradio click handler is bound to.
        The ROUGE F1 scores (as percentages) are computed on the English
        summary against the input text, before any translation.
    """
    def _failure(message, input_chars):
        # Same order as the success return: text, ROUGE-1, char count, ROUGE-L
        return message, 0.0, input_chars, 0.0

    # `not text` also covers None, which has no .strip()
    if not text or not text.strip():
        return _failure("Please provide a non-empty text.", 0)

    input_length_chars = len(text)
    token_count = print_token_count(text)

    # Check if input text exceeds token limit
    max_tokens = 1000
    if token_count > max_tokens:
        return _failure(
            f"Input text ({token_count} tokens) exceeds the maximum token limit of {max_tokens}. Please reduce the text and try again.",
            input_length_chars,
        )

    # Convert character length to token length
    min_length_tokens = char_to_token_length(text, min_length)
    max_length_tokens = char_to_token_length(text, max_length)

    # Ensure max_length isn't shorter than min_length
    if max_length_tokens < min_length_tokens:
        return _failure(
            "Max length must be greater than or equal to the min length.",
            input_length_chars,
        )

    try:
        # Summarize text using the BART model
        combined_summary = summarizer(
            text,
            min_length=min_length_tokens,
            max_length=max_length_tokens,
            length_penalty=1.0,  # Basic value for length_penalty
            do_sample=False,  # Disable random samples
            num_beams=12,  # Beam Search for better result
            early_stopping=True  # Stop summarizing once the summarization is good enough
        )[0]['summary_text']
    except Exception as e:
        # Surface the failure in the UI rather than crashing the handler
        return _failure(f"An error occurred: {str(e)}", input_length_chars)

    if not combined_summary:
        return _failure("Error in summarizing the text.", input_length_chars)

    # Calculate Rouge score on the English summary: comparing a Finnish
    # translation against the English source would not be meaningful.
    rouge_scores = rouge.get_scores(combined_summary, text)
    rouge_1 = rouge_scores[0]['rouge-1']['f'] * 100  # F1 score
    rouge_L = rouge_scores[0]['rouge-l']['f'] * 100  # F1 score

    # Translate summary to Finnish if needed
    if target_language == "Finnish":
        combined_summary = translate_to_finnish(combined_summary)

    return combined_summary, rouge_1, input_length_chars, rouge_L


In [6]:
# Character count function
def get_character_count(text):
    """Return the number of characters in ``text``."""
    char_total = len(text)
    return char_total

#### Button logic for summary sizes

In [7]:
# Functions for preset summary size based on character count
def _summary_bounds(text, min_ratio, max_ratio, min_floor=100, max_floor=300):
    """Return ``(min_length, max_length)`` in characters as fractions of ``text``'s length.

    Each bound is the truncated fraction of the input length, but never below
    its floor. The default floors match the minimums of the length sliders in
    the UI. ``None`` is treated as empty text.
    """
    input_length_chars = len(text or "")
    min_length = max(min_floor, int(input_length_chars * min_ratio))
    max_length = max(max_floor, int(input_length_chars * max_ratio))
    return min_length, max_length

def set_small_summary(text):
    """Bounds for a small summary: 10%-20% of the input length."""
    return _summary_bounds(text, 0.1, 0.2)

def set_medium_summary(text):
    """Bounds for a medium summary: 20%-30% of the input length."""
    return _summary_bounds(text, 0.2, 0.3)

def set_large_summary(text):
    """Bounds for a large summary: 30%-40% of the input length."""
    return _summary_bounds(text, 0.3, 0.4)

## Gradio Interface

In [8]:
# Gradio interface
with gr.Blocks() as iface:
    gr.Markdown("""
    # Text Summarization with BART
    ### Enter the text to summarize and adjust the length settings.""")

    # Initial values of the hidden length sliders; also used when no size is selected
    DEFAULT_MIN_CHARS = 100
    DEFAULT_MAX_CHARS = 300

    # Textbox for input
    textbox = gr.Textbox(lines=5, label="Input Text", placeholder="Enter your text here...")
    char_count_output = gr.Number(label="Input Length (Characters)", value=0, interactive=False)

    # Not visible, more simple interface
    min_length_slider = gr.Slider(minimum=100, maximum=3000, value=DEFAULT_MIN_CHARS, label="Min Length (Characters)", visible=False)
    max_length_slider = gr.Slider(minimum=300, maximum=5000, value=DEFAULT_MAX_CHARS, label="Max Length (Characters)", visible=False)

    # Language selection radio buttons
    language_selection = gr.Radio(
        choices=["English", "Finnish"], label="Select output language", value="English"
    )

    summary_size = gr.Radio(
        choices=["Small", "Medium", "Large"],
        label="Select Summary Size",
    )

    summary_output = gr.Textbox(label="Summary", visible=True)

    # Can't be measured from ENG to FIN so ROUGE-scores are not visible
    rouge_1_output = gr.Number(label="ROUGE-1 Score", visible=False)
    rouge_L_output = gr.Number(label="ROUGE-L Score", visible=False)

    submit_btn = gr.Button("Summarize")

    # Define function to adjust sliders based on selected summary size
    def adjust_summary_size(text, selected_size):
        """Return ``(min_length, max_length)`` in characters for the chosen size preset.

        Falls back to the slider defaults when no (or an unknown) size is
        selected, so the handler always yields one value per output slider.
        """
        size_presets = {
            "Small": set_small_summary,
            "Medium": set_medium_summary,
            "Large": set_large_summary,
        }
        preset = size_presets.get(selected_size)
        if preset is None:
            return DEFAULT_MIN_CHARS, DEFAULT_MAX_CHARS
        return preset(text or "")

    # Bind functions to Gradio components
    submit_btn.click(
        summarize_text,
        inputs=[textbox, min_length_slider, max_length_slider, language_selection],
        outputs=[summary_output, rouge_1_output, char_count_output, rouge_L_output]
    )

    # Update character count output
    textbox.change(fn=get_character_count, inputs=textbox, outputs=char_count_output)

    # Recompute the hidden length bounds when the text changes, so a size picked
    # earlier tracks the current input rather than the text it was computed for
    textbox.change(
        fn=adjust_summary_size,
        inputs=[textbox, summary_size],
        outputs=[min_length_slider, max_length_slider]
    )

    # Update sliders when radio button changes
    summary_size.change(
        fn=adjust_summary_size,
        inputs=[textbox, summary_size],
        outputs=[min_length_slider, max_length_slider]
    )

iface.launch(share=True, debug=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://7afc77161261f77ec6.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Input text token count: 894
Input text token count: 894
Input text token count: 894
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://7afc77161261f77ec6.gradio.live


