<a href="https://colab.research.google.com/github/selvinsj/AI_article_summarizer/blob/main/bartaudio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party libraries needed by the Flask summarizer cell below.
# This is an IPython shell escape, so it only works inside a notebook runtime.
!pip install flask newspaper3k wikipedia-api beautifulsoup4 requests pytube transformers torch sentencepiece pyngrok lxml_html_clean pyttsx3 gtts


Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting wikipedia-api
  Downloading wikipedia_api-0.8.1.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl.metadata (5.0 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.5-py3-none-any.whl.metadata (8.9 kB)
Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.2-py3-none-any.whl.metadata (2.4 kB)
Collecting pyttsx3
  Downloading pyttsx3-2.98-py3-none-any.whl.metadata (3.8 kB)
Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.

In [None]:


# ------------------------------------------
# FLASK APP CODE (RUN IN SAME CELL BELOW)
# ------------------------------------------
from flask import Flask, request, render_template_string, send_file
from newspaper import Article
import wikipediaapi
from bs4 import BeautifulSoup
import requests
import urllib.parse
from pytube import YouTube
from transformers import pipeline
from pyngrok import ngrok
import time
from gtts import gTTS
import os

# Flask application instance; the route decorators further down register on it.
app = Flask(__name__)

# Load the BART summarization pipeline once at start-up. If loading fails the
# error is printed and `summarizer` is left as None, so the rest of the cell
# still runs and callers can detect the missing model.
summarizer = None
try:
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
except Exception as load_error:
    print(f"Error loading model: {str(load_error)}")

# ========================
# EMBEDDED HTML/CSS TEMPLATES
# ========================
# Landing page template: a single form that POSTs the `url` field to /process.
# Rendered through render_template_string; it takes no template variables.
INDEX_HTML = """
<!DOCTYPE html>
<html>
<head>
    <title>URL Summarizer</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 40px; background: #f0f2f5; }
        h1 { color: #1a73e8; text-align: center; }
        form { text-align: center; margin: 30px 0; }
        input[type="text"] { width: 60%; padding: 12px; font-size: 16px; border: 2px solid #ddd; border-radius: 4px; }
        button { padding: 12px 24px; background: #1a73e8; color: white; border: none; border-radius: 4px; cursor: pointer; }
        button:hover { background: #1557b0; }
    </style>
</head>
<body>
    <h1>URL Text Summarizer</h1>
    <form method="POST" action="/process">
        <input type="text" name="url" placeholder="Enter article/news/YouTube/Wikipedia URL" required>
        <button type="submit">Summarize</button>
    </form>
</body>
</html>
"""

# Result page template. Template variables:
#   text        - the summary, or the error message when `error` is truthy
#   error       - truthy to render `text` in the error style
#   audio_file  - optional file name served from /audio/<name>; when present an
#                 audio player is shown and the Enter key restarts playback
# The page tells the user to press Enter to replay, so the template ships the
# small keydown handler that actually implements that promise.
RESULT_HTML = """
<!DOCTYPE html>
<html>
<head>
    <title>Summary Result</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 40px; background: #f0f2f5; }
        h1 { color: #1a73e8; text-align: center; }
        .result { background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); margin: 20px auto; max-width: 800px; }
        .error { color: red; font-weight: bold; }
        a { display: block; text-align: center; margin-top: 20px; color: #1a73e8; text-decoration: none; }
        audio { width: 100%; margin-top: 20px; }
    </style>
</head>
<body>
    <h1>Summary Result</h1>
    <div class="result">
        {% if error %}
            <p class="error">{{ text }}</p>
        {% else %}
            <p>{{ text }}</p>
            {% if audio_file %}
                <audio id="summary-audio" controls autoplay>
                    <source src="/audio/{{ audio_file }}" type="audio/mpeg">
                    Your browser does not support the audio element.
                </audio>
                <p>Press <strong>Enter</strong> to replay the audio.</p>
                <script>
                    document.addEventListener("keydown", function (event) {
                        // Leave Enter on the "Back to Home" link alone so it still navigates.
                        if (event.key !== "Enter" || event.target.tagName === "A") {
                            return;
                        }
                        var player = document.getElementById("summary-audio");
                        if (player) {
                            player.currentTime = 0;
                            player.play();
                        }
                    });
                </script>
            {% endif %}
        {% endif %}
        <a href="/">Back to Home</a>
    </div>
</body>
</html>
"""

# ========================
# IMPROVED URL EXTRACTION
# ========================
def extract_from_url(url):
    """Route a URL to the matching extractor and return its result.

    Dispatch rules (checked in order, case-insensitively):
      * contains ``wikipedia.org/wiki/``            -> extract_wikipedia_page
      * contains ``youtube.com/`` or ``youtu.be/``  -> extract_from_youtube
      * anything else                               -> extract_news_article

    Args:
        url: The URL string submitted by the user.

    Returns:
        Whatever the selected extractor returns; on an unexpected exception,
        ``(None, "Extraction error: ...")``.
    """
    try:
        print(f"\n=== Extracting from: {url} ===")

        # Host names are case-insensitive, so match on a lowered copy while
        # handing the untouched URL to the extractor.
        lowered = url.lower()

        if "wikipedia.org/wiki/" in lowered:
            return extract_wikipedia_page(url)
        elif "youtube.com/" in lowered or "youtu.be/" in lowered:
            # youtu.be is YouTube's short-link host; without it such links
            # were treated as ordinary news articles.
            return extract_from_youtube(url)
        else:
            return extract_news_article(url)

    except Exception as e:
        print(f"Extraction failed: {str(e)}")
        return None, f"Extraction error: {str(e)}"

def extract_news_article(url):
    """Download and parse a generic web article via ``Article``.

    Args:
        url: Address of the article to fetch.

    Returns:
        ``(text, None)`` when non-blank body text was extracted, otherwise
        ``(None, message)`` describing why nothing could be returned.
    """
    try:
        print("Extracting news article...")
        started_at = time.time()

        parsed = Article(url)
        parsed.download()
        parsed.parse()

        body = parsed.text
        if body.strip():
            elapsed = time.time() - started_at
            print(f"Extracted {len(body)} characters in {elapsed:.2f}s")
            return body, None

        # Parsing succeeded but produced only whitespace.
        return None, "No text found in article"

    except Exception as exc:
        return None, f"Article extraction failed: {str(exc)}"

def extract_wikipedia_page(url):
    """Fetch the plain text of the English Wikipedia page a URL points at.

    The page title is taken from the URL *path* after ``/wiki/``, so a
    ``?query``, a ``#fragment`` or a trailing slash no longer ends up in the
    title, and titles that themselves contain ``/`` (e.g. ``AC/DC``) are kept
    whole. Percent-escapes are decoded and underscores become spaces.

    Args:
        url: A Wikipedia article URL.

    Returns:
        ``(text, None)`` on success, otherwise ``(None, message)``.
    """
    try:
        print("Extracting Wikipedia page...")
        start = time.time()

        # urlsplit separates query and fragment from the path for us.
        path = urllib.parse.urlsplit(url).path
        marker = "/wiki/"
        if marker in path:
            title = path.split(marker, 1)[1]
        else:
            # No /wiki/ segment: fall back to the last path component.
            title = path.rsplit("/", 1)[-1]
        title = title.rstrip("/")

        decoded_title = (
            urllib.parse.unquote(title).replace("_.html", "").replace("_", " ").strip()
        )
        if not decoded_title:
            return None, "Wikipedia page not found"

        wiki_wiki = wikipediaapi.Wikipedia(
            user_agent="MySummarizer/1.0",
            language='en',
            extract_format=wikipediaapi.ExtractFormat.WIKI
        )

        page = wiki_wiki.page(decoded_title)

        if not page.exists():
            return None, "Wikipedia page not found"

        text = page.text
        if not text.strip():
            return None, "Empty Wikipedia page"

        print(f"Extracted {len(text)} characters in {time.time()-start:.2f}s")
        return text, None

    except Exception as e:
        return None, f"Wikipedia extraction failed: {str(e)}"

def _srt_to_plain_text(srt_text):
    """Return only the spoken lines of an SRT document, joined by spaces.

    Cue index lines (a bare number directly followed by a timestamp line),
    timestamp lines (``HH:MM:SS,mmm --> HH:MM:SS,mmm``) and blank lines are
    dropped; every other line is kept in order.
    """
    import re  # local import: `re` is not in this file's import block

    timestamp = re.compile(
        r"^\d{2}:\d{2}:\d{2}[,.]\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[,.]\d{3}"
    )
    lines = [line.strip() for line in srt_text.splitlines()]

    spoken = []
    for position, line in enumerate(lines):
        if not line or timestamp.match(line):
            continue
        following = lines[position + 1] if position + 1 < len(lines) else ""
        # Only treat a number as a cue index when a timestamp follows it, so
        # captions that are themselves just a number are preserved.
        if line.isdigit() and timestamp.match(following):
            continue
        spoken.append(line)
    return " ".join(spoken)


def extract_from_youtube(url):
    """Extract the English caption text of a YouTube video.

    Manually authored English captions (``en``) are preferred; if there are
    none, auto-generated English captions (``a.en``) are tried. The SRT
    numbering and timestamps are stripped so only spoken words reach the
    summarizer instead of using up its input budget.

    Args:
        url: A YouTube video URL.

    Returns:
        ``(text, None)`` on success, otherwise ``(None, message)``.
    """
    try:
        print("Extracting YouTube captions...")
        start = time.time()

        yt = YouTube(url)
        captions = (
            yt.captions.get_by_language_code('en')
            or yt.captions.get_by_language_code('a.en')
        )

        if not captions:
            return None, "No English captions found"

        text = _srt_to_plain_text(captions.generate_srt_captions())
        print(f"Extracted {len(text)} characters in {time.time()-start:.2f}s")
        return text, None

    except Exception as e:
        return None, f"YouTube extraction failed: {str(e)}"

# ========================
# ROBUST SUMMARIZATION
# ========================
def summarize_text(text):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    Input that is empty or shorter than 100 characters after stripping is
    rejected. Otherwise only the first 512 whitespace-separated words are
    passed to the model.

    Args:
        text: The extracted source text.

    Returns:
        ``(summary, None)`` on success, otherwise ``(None, message)``.
    """
    try:
        too_short = (not text) or len(text.strip()) < 100
        if too_short:
            return None, "Text too short for summarization"

        print(f"Summarizing {len(text)} characters...")
        print(f"Input Text: {text[:500]}...")  # Debug: Print first 500 characters

        # Cap the model input at the first 512 words.
        words = text.split()
        truncated_text = " ".join(words[:512])

        started_at = time.time()
        outputs = summarizer(
            truncated_text,
            max_length=150,
            min_length=50,
            do_sample=False,
            truncation=True,
        )

        if not outputs:
            return None, "Failed to generate summary"

        summary_text = outputs[0].get('summary_text')
        if not summary_text:
            return None, "Failed to generate summary"
        if not summary_text.strip():
            # Non-empty but whitespace-only output.
            return None, "Empty summary generated"

        print(f"Summarization completed in {time.time()-started_at:.2f}s")
        return summary_text, None

    except Exception as exc:
        return None, f"Summarization error: {str(exc)}"

# ========================
# AUDIO GENERATION
# ========================
def generate_audio(text):
    """Render ``text`` to an MP3 under ``audio/`` and return its file name.

    Each call writes to a freshly named file. A single shared ``summary.mp3``
    would be overwritten whenever two requests overlap, so one user could be
    served another user's audio.

    Note: files are not cleaned up here; prune the ``audio`` directory
    periodically if the app runs for a long time.

    Args:
        text: The summary to speak.

    Returns:
        The bare file name (no directory) on success, or ``None`` if audio
        generation failed; the error is printed.
    """
    import uuid  # local import: `uuid` is not in this file's import block

    try:
        filename = f"summary_{uuid.uuid4().hex}.mp3"
        filepath = os.path.join("audio", filename)
        os.makedirs("audio", exist_ok=True)

        tts = gTTS(text=text, lang="en")
        tts.save(filepath)
        return filename
    except Exception as e:
        print(f"Audio generation error: {str(e)}")
        return None

# ========================
# FLASK ROUTES
# ========================
@app.route("/", methods=["GET"])
def index():
    """Serve the landing page with the URL submission form."""
    landing_page = render_template_string(INDEX_HTML)
    return landing_page

@app.route("/process", methods=["POST"])
def process():
    """Handle the form submission: extract, validate, summarize, then voice.

    Reads the ``url`` form field and renders RESULT_HTML either with the
    summary (plus an audio file name when audio generation succeeded) or with
    an error message from whichever step failed first.
    """
    def error_page(message, log=False):
        # Render the result template in its error state, optionally echoing
        # the message to the console first.
        if log:
            print(message)
        return render_template_string(RESULT_HTML, text=message, error=True)

    url = request.form["url"].strip()
    print(f"\nProcessing URL: {url}")

    # Step 1: pull the raw text out of the submitted URL.
    extracted_text, extraction_error = extract_from_url(url)
    if extraction_error:
        return error_page(extraction_error)

    # Step 2: make sure there is enough material to summarize.
    if not extracted_text or len(extracted_text) < 100:
        return error_page("Extracted text too short for summarization", log=True)

    # Step 3: summarize, provided the model was loaded.
    if not summarizer:
        return error_page("Summarization model not loaded", log=True)

    summary, summary_error = summarize_text(extracted_text)
    if summary_error:
        return error_page(summary_error)

    # Step 4: voice the summary and render the success page.
    spoken_file = generate_audio(summary)
    return render_template_string(RESULT_HTML, text=summary, audio_file=spoken_file)

@app.route("/audio/<filename>")
def get_audio(filename):
    """Serve a generated audio file from the ``audio`` directory.

    ``filename`` comes straight from the request URL, so it is served through
    ``send_from_directory``, which refuses names that would resolve outside
    the directory and answers 404 for missing files.

    Args:
        filename: Bare file name captured from the URL.

    Returns:
        The file response, sent as an attachment.
    """
    # Local import: only send_file is imported from flask at the top of the file.
    from flask import send_from_directory

    # generate_audio writes to "audio/" relative to the working directory,
    # so resolve the same location here.
    audio_dir = os.path.abspath("audio")
    return send_from_directory(audio_dir, filename, as_attachment=True)

# ========================
# START APPLICATION
# ========================
if __name__ == "__main__":
    # One port value shared by the tunnel and the Flask server so the two
    # cannot drift apart.
    port = 5000

    # Terminate any existing Ngrok tunnels
    ngrok.kill()

    # Read the Ngrok auth token from the environment instead of embedding a
    # secret in the notebook. Set it beforehand, e.g.
    #   os.environ["NGROK_AUTH_TOKEN"] = "<your token>"
    auth_token = os.environ.get("NGROK_AUTH_TOKEN")
    if auth_token:
        ngrok.set_auth_token(auth_token)
    else:
        print("NGROK_AUTH_TOKEN is not set; relying on any existing ngrok configuration.")

    # Start Ngrok tunnel
    public_url = ngrok.connect(port).public_url
    print(f"\n=== APP RUNNING AT: {public_url} ===")

    # Run Flask app
    app.run(port=port)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu



=== APP RUNNING AT: https://8f7f-34-16-190-181.ngrok-free.app ===
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [04/May/2025 17:19:49] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [04/May/2025 17:19:51] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -



Processing URL: https://en.wikipedia.org/wiki/Japanese_battleship_Tosa

=== Extracting from: https://en.wikipedia.org/wiki/Japanese_battleship_Tosa ===
Extracting Wikipedia page...


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Extracted 12392 characters in 0.27s
Summarizing 12392 characters...
Input Text: Tosa (Japanese: 土佐, named after the ancient Tosa Province) was a planned battleship of the Imperial Japanese Navy. Designed by Yuzuru Hiraga, Tosa was to be the first of two Tosa-class ships. Displacing 39,900 long tons (40,540 t) and armed with ten 410 mm (16.1 in) guns, these warships would have brought Japan closer to its goal of an "Eight-four" fleet (eight battleships and four battlecruisers). The ship was laid down in 1920, but all work was halted after the signing of the Washington Naval ...

Processing URL: https://en.wikipedia.org/wiki/Japanese_battleship_Tosa

=== Extracting from: https://en.wikipedia.org/wiki/Japanese_battleship_Tosa ===
Extracting Wikipedia page...
Extracted 12392 characters in 0.28s
Summarizing 12392 characters...
Input Text: Tosa (Japanese: 土佐, named after the ancient Tosa Province) was a planned battleship of the Imperial Japanese Navy. Designed by Yuzuru Hiraga, Tosa was to 

INFO:werkzeug:127.0.0.1 - - [04/May/2025 17:20:24] "POST /process HTTP/1.1" 200 -


Summarization completed in 28.79s


INFO:werkzeug:127.0.0.1 - - [04/May/2025 17:20:32] "POST /process HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [04/May/2025 17:20:34] "[35m[1mGET /audio/summary.mp3 HTTP/1.1[0m" 206 -
INFO:werkzeug:127.0.0.1 - - [04/May/2025 17:20:35] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
