In [7]:
import yfinance
from yfinance.ticker import Ticker
import requests
from bs4 import BeautifulSoup
import re
from pprint import pprint
from transformers import pipeline, SummarizationPipeline

In [113]:
class NewsSummarizer:
    """
    A utility class for retrieving, cleaning, and summarizing news articles 
    associated with a Yahoo Finance ticker symbol.

    This class uses a Hugging Face summarization pipeline to generate concise
    summaries for each article, and optionally produces a meta-summary 
    combining all individual summaries.
    """

    def __init__(self, summarizer: SummarizationPipeline, ticker: Ticker):
        """
        Initialize the NewsSummarizer.

        Args:
            summarizer (SummarizationPipeline): A Hugging Face summarization pipeline.
            ticker (Ticker): A Yahoo Finance Ticker object providing news data.
        """
        # Use comma-free assignment (previous version had a trailing comma bug)
        self.summarizer = summarizer
        self.ticker = ticker
        self.content_list = []
        self.content_summaries = []
        self.meta_summary = None

    def _yfinance_aticle_crawler(self, news_obj: dict) -> dict[str, str]:
        """
        Fetch the full HTML content of a Yahoo Finance news article.

        Args:
            news_obj (dict): A single news object from yfinance.ticker.news.

        Returns:
            dict[str, str]: A dictionary containing the article title, URL, and raw HTML text.
        """
        # Determine the correct article URL (click-through or preview)
        if news_obj["content"]["clickThroughUrl"] is not None:
            url = news_obj["content"]["clickThroughUrl"]["url"]
        else:
            url = news_obj["content"]["previewUrl"]

        # Define realistic browser headers to avoid being blocked by the site
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "Referer": "https://www.google.com/",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "DNT": "1",  # Do Not Track
        }

        # Request the article content
        session = requests.Session()
        response = session.get(url, headers=headers)
        session.close()

        return {
            "title": news_obj["content"]["title"],
            "url": url,
            "response_text": response.text
        }

    def _yfinance_article_cleaner(self, response_text: str) -> str:
        """
        Clean and extract relevant textual content from an article’s HTML.

        Args:
            response_text (str): Raw HTML text of the article.

        Returns:
            str: Cleaned plain text without irrelevant sections or HTML tags.
        """
        soup = BeautifulSoup(response_text, "html.parser")
        plain_text = soup.get_text(separator=" ", strip=True)

        # Remove leading promotional content (e.g., “In this article:”)
        pattern_leader = "^.*In this article:"
        relevant_text = re.sub(pattern=pattern_leader, repl="", string=plain_text)

        # Cut off at known end markers that often indicate unrelated sections
        for marker in [
            "Related articles",
            "Read more",
            "Continue Reading",
            "Story Continues",
            "View Comments",
        ]:
            idx = relevant_text.rfind(marker)
            if idx != -1:
                relevant_text = relevant_text[:idx]

        return relevant_text.strip()

    def _yfinance_pull_clean(self, news_obj: dict) -> dict[str, str]:
        """
        Retrieve and clean an article from Yahoo Finance.

        Args:
            news_obj (dict): A single Yahoo Finance news object.

        Returns:
            dict[str, str]: Dictionary with title, URL, raw HTML, and cleaned text.
        """
        response_obj = self._yfinance_aticle_crawler(news_obj)
        response_obj["clean_text"] = self._yfinance_article_cleaner(response_obj["response_text"])
        return response_obj

    def _content_summarizer(self) -> list[dict[str, str]]:
        """
        Summarize each cleaned article using the summarization model.

        Returns:
            list[dict[str, str]]: A list of summary dictionaries containing title, URL, and summary text.
        """
        summaries: list[dict[str, str]] = []

        for content in self.content_list:
            clean_text = content["clean_text"]
            r = None
            max_length = 200
            min_length = 30

            try:
                word_count = len(clean_text.split(" "))

                # Dynamically adjust summary lengths based on article size
                if word_count < 200:
                    max_length = int(word_count * 0.9)

                if word_count < min_length:
                    min_length = max_length

                # Run summarization pipeline
                r = self.summarizer(
                    clean_text, min_length=min_length, max_length=max_length
                )
            except Exception as e:
                print(f"Summarization failed: {e}")
                pass
            finally:
                if r is not None:
                    summary_obj = {
                        "title": content["title"],
                        "url": content["url"],
                        "summary_text": r[0]["summary_text"],
                    }
                    summaries.append(summary_obj)

        self.content_summaries = summaries

    def _meta_summary(self):
        """
        Generate a higher-level summary that summarizes all article summaries.
        """
        if self.content_summaries:
            # Combine all individual summaries into one string
            summary_texts = [t["summary_text"] for t in self.content_summaries]
            summary_str = " ".join(summary_texts)

            min_length = 100
            max_length = 1000
            word_count = len(summary_str.split(" "))

            # Dynamically adjust summary length parameters
            if word_count < 1000:
                max_length = int(word_count * 0.9)
            if word_count < min_length:
                min_length = max_length

            # Generate the meta-summary
            self.meta_summary = self.summarizer(
                summary_str, min_length=min_length, max_length=max_length
            )
        else:
            # If no summaries exist, generate them first
            self.get_summaries(include_meta=True)

    def get_summaries(self, include_meta: bool = False) -> dict:
        """
        Retrieve all news for the given ticker, clean and summarize each one.

        Args:
            include_meta (bool, optional): If True, also generate a meta-summary. Defaults to False.

        Returns:
            dict: A dictionary containing summaries and optionally the meta-summary.
        """
        news_list = self.ticker.news

        # Retrieve and clean each article
        self.content_list = [self._yfinance_pull_clean(n) for n in news_list]

        # Summarize the cleaned content
        self._content_summarizer()

        if include_meta:
            self._meta_summary()

    def get_meta_summary(self):
        """
        Retrieve or generate a meta-summary summarizing all article summaries.
        """
        if self.content_summaries:
            if self.meta_summary is None:
                self._meta_summary()
        else:
            self.get_summaries(include_meta=True)


# Test Case


In [105]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
ticker = yfinance.Ticker('GOOG')

Device set to use cpu


In [114]:
news_summarizer = NewsSummarizer(summarizer, ticker)

In [115]:
news_summarizer.get_summaries(include_meta=True)

Summarization failed: index out of range in self


In [116]:
pprint(news_summarizer.content_summaries)

[{'summary_text': "Google's Gemini Enterprise offering is meant to go "
                  "toe-to-toe with Microsoft's 365 Copilot. It will enable "
                  "customers to use Google's Gemini to analyze corporate data "
                  'and access AI agents in one place. Google said users will '
                  'be able to connect Gemini Enterprise to existing data '
                  'sources.',
  'title': 'Google launches Gemini Enterprise, taking aim at Microsoft, OpenAI',
  'url': 'https://finance.yahoo.com/news/google-launches-gemini-enterprise-taking-aim-at-microsoft-openai-120020256.html'},
 {'summary_text': 'BofA Securities reiterated a Buy rating on the stock with a '
                  'price target of $252. The rating affirmation follows '
                  'emerging reports that Google is testing a redesign of its '
                  'Gemini AI application. The move could help boost user '
                  'interaction and expand use cases for Gemini.',
  'titl

In [117]:
pprint(news_summarizer.meta_summary)

[{'summary_text': 'Broadcom Inc. (NASDAQ: AVGO ) is one of the Trending AI '
                  'Stocks on Wall Street’s Radar. BofA Securities reiterated a '
                  'Buy rating on the stock with a price target of $252. '
                  "Applied Digital beat Wall Street's revenue expectations in "
                  'Q3 CY2025, with sales up 5.8% year on year to $64.22 '
                  'million. UiPath stock surged more than 40% this week '
                  'following a new collaboration with ChatGPT maker OpenAI.'}]
