In [None]:
import yfinance
import requests
from bs4 import BeautifulSoup
import re
from pprint import pprint
from transformers import pipeline, SummarizationPipeline

In [None]:
def yfinance_aticle_crawler(news_obj: dict, timeout: float = 10.0) -> dict[str, str]:
    """
    Fetch the full article HTML response from a Yahoo Finance news object.

    The function extracts the article URL from the provided `news_obj` dictionary,
    sends a single GET request with browser-like headers, and returns the
    article's title, URL, and raw HTML text. The HTTP status code is not
    checked: whatever body the server returns is passed through.

    Parameters
    ----------
    news_obj : dict
        A Yahoo Finance news dictionary. `news_obj["content"]` must hold "title"
        and either a "clickThroughUrl" mapping with a "url" entry or a
        "previewUrl" value. "clickThroughUrl" is preferred; "previewUrl" is used
        when "clickThroughUrl" is missing or None.
    timeout : float, optional
        Timeout in seconds forwarded to `session.get` so that an unresponsive
        server cannot block the caller indefinitely. Defaults to 10.0.

    Returns
    -------
    dict[str, str]
        A dictionary with the keys:
        - "title": The article title.
        - "url": The resolved article URL.
        - "response_text": The raw HTML text of the article.

    Raises
    ------
    KeyError
        If "content" or "title" is missing, or if "previewUrl" is needed as a
        fallback but absent.
    """
    content = news_obj["content"]

    # `.get` (rather than indexing) so that a news item without the
    # "clickThroughUrl" key falls back to "previewUrl" instead of raising.
    click_through = content.get("clickThroughUrl")
    if click_through is not None:
        url = click_through["url"]
    else:
        url = content["previewUrl"]

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        # NOTE(review): advertising "br" presumably requires a brotli decoder to
        # be installed for the HTTP stack to decode the body - confirm.
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://www.google.com/",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "DNT": "1",  # Do Not Track
    }

    # The context manager closes the session even when the request raises.
    with requests.Session() as session:
        response = session.get(url, headers=headers, timeout=timeout)

    return {
        "title": content["title"],
        "url": url,
        "response_text": response.text,
    }


def yfinance_article_cleaner(response_text: str) -> str:
    """
    Clean and extract the main text from a Yahoo Finance article HTML string.

    The HTML is flattened to plain text, everything up to and including the
    last "In this article:" marker is dropped, and the text is then cut at the
    last occurrence of each trailing marker ("Related articles", "Read more",
    "Continue Reading", "Story Continues", "View Comments"), applied in that
    order on the progressively shortened text.

    Parameters
    ----------
    response_text : str
        The raw HTML of the article.

    Returns
    -------
    str
        The cleaned article text with unnecessary sections removed and
        surrounding whitespace stripped. If none of the markers are present,
        the full page text is returned.
    """
    soup = BeautifulSoup(response_text, "html.parser")
    plain_text = soup.get_text(separator=" ", strip=True)

    # Drop the page header. A plain string search is used instead of a
    # `^.*marker` regex because `.` does not match newlines by default, so the
    # regex silently kept the whole header whenever the extracted text had a
    # line break before the marker.
    leader_marker = "In this article:"
    leader_idx = plain_text.rfind(leader_marker)
    if leader_idx != -1:
        relevant_text = plain_text[leader_idx + len(leader_marker):]
    else:
        relevant_text = plain_text

    # Cut footer/promotional content at the last occurrence of each marker.
    trailing_markers = (
        "Related articles",
        "Read more",
        "Continue Reading",
        "Story Continues",
        "View Comments",
    )
    for marker in trailing_markers:
        idx = relevant_text.rfind(marker)
        if idx != -1:
            relevant_text = relevant_text[:idx]

    return relevant_text.strip()


def yfinance_pull_clean(news_obj: dict) -> dict[str, str]:
    """
    Download a Yahoo Finance news article and attach its cleaned body text.

    Two steps are chained: `yfinance_aticle_crawler` retrieves the article,
    then `yfinance_article_cleaner` is applied to the raw HTML it returned.
    The cleaned text is stored on the crawler's result dictionary, which is
    then returned.

    Parameters
    ----------
    news_obj : dict
        A Yahoo Finance news dictionary containing article metadata.

    Returns
    -------
    dict[str, str]
        The crawler's dictionary extended with one key:
        - "title": Article title
        - "url": Article URL
        - "response_text": Raw HTML
        - "clean_text": Cleaned article body text
    """
    article = yfinance_aticle_crawler(news_obj)
    raw_html = article["response_text"]
    article["clean_text"] = yfinance_article_cleaner(raw_html)
    return article


def content_summarizer(content_list: list[dict[str, str]],
                       summarizer_pipeline: SummarizationPipeline) -> list[dict[str, str]]:
    """
    Summarize a list of cleaned article texts using a Hugging Face summarization pipeline.

    Each article in the input list should contain at least "title", "url", and "clean_text".
    The `min_length` / `max_length` arguments passed to the pipeline are derived
    from the article's whitespace-separated word count:

    - `max_length` is 200 for articles of 200 words or more, otherwise 90% of
      the word count (at least 1).
    - `min_length` is 30, lowered to `max_length` whenever `max_length` is
      smaller, so that `min_length <= max_length` always holds.

    Articles are skipped (no entry in the result) when their text contains no
    words, or when the pipeline raises `IndexError`; in the latter case the
    error is printed. Any other exception propagates to the caller.

    Parameters
    ----------
    content_list : list of dict
        A list of article dictionaries with keys "title", "url", and "clean_text".
    summarizer_pipeline : SummarizationPipeline
        A Hugging Face summarization pipeline object. It is called as
        `summarizer_pipeline(text, min_length=..., max_length=...)` and its
        result is read as `result[0]["summary_text"]`.

    Returns
    -------
    list of dict
        A list of dictionaries, each containing:
        - "title": Article title
        - "url": Article URL
        - "summary_text": The generated summary
    """
    max_length_cap = 200
    min_length_floor = 30
    summaries: list[dict[str, str]] = []

    for content in content_list:
        clean_text = content["clean_text"]

        # `split()` (no argument) ignores runs of whitespace and yields an
        # empty list for blank text, unlike `split(" ")`.
        word_count = len(clean_text.split())
        if word_count == 0:
            # Nothing to summarize; a zero max_length would be invalid anyway.
            continue

        if word_count >= max_length_cap:
            max_length = max_length_cap
        else:
            max_length = max(1, int(word_count * 0.9))

        # Compare against max_length (not word_count): for 30-33 words the
        # 90% rule puts max_length below the 30 floor.
        min_length = min(min_length_floor, max_length)

        try:
            result = summarizer_pipeline(
                clean_text, min_length=min_length, max_length=max_length
            )
        except IndexError as e:
            # NOTE(review): presumably raised when the text exceeds the
            # model's input limit - confirm. The article is skipped.
            print(e)
            continue

        summaries.append({
            "title": content["title"],
            "url": content["url"],
            "summary_text": result[0]["summary_text"],
        })

    return summaries


def meta_summary(summary_list: list[dict[str, str]], 
                 summarizer_pipeline: SummarizationPipeline) -> list[dict[str, str]]:
    """
    Generate a higher-level meta-summary from a list of individual summaries.

    The individual summaries are stripped, joined with a single space into one
    string, and passed through the summarization pipeline again. The length
    bounds are derived from the whitespace-separated word count of the joined
    text:

    - `max_length` is 1000 for 1000 words or more, otherwise 90% of the word
      count (at least 1).
    - `min_length` is 100, lowered to `max_length` whenever `max_length` is
      smaller, so that `min_length <= max_length` always holds.

    Parameters
    ----------
    summary_list : list of dict
        A list of summary dictionaries, each containing at least a "summary_text" key.
    summarizer_pipeline : SummarizationPipeline
        A Hugging Face summarization pipeline object. It is called as
        `summarizer_pipeline(text, min_length=..., max_length=...)`.

    Returns
    -------
    list of dict
        The pipeline's return value for the joined text. When there is no text
        to summarize (empty list or only blank summaries) the pipeline is not
        called and `[{"summary_text": ""}]` is returned, so callers can still
        read `result[0]["summary_text"]`.
    """
    max_length_cap = 1000
    min_length_floor = 100

    # Join with a space: concatenating with "" fuses the last word of one
    # summary onto the first word of the next and undercounts the words.
    summary_texts = [t["summary_text"].strip() for t in summary_list]
    summary_str = " ".join(text for text in summary_texts if text)

    word_count = len(summary_str.split())
    if word_count == 0:
        return [{"summary_text": ""}]

    if word_count >= max_length_cap:
        max_length = max_length_cap
    else:
        max_length = max(1, int(word_count * 0.9))

    # Compare against max_length (not word_count): for 100-111 words the 90%
    # rule puts max_length below the 100 floor.
    min_length = min(min_length_floor, max_length)

    return summarizer_pipeline(summary_str, 
                               min_length=min_length, max_length=max_length)

# Test Case

In [80]:
# Build the summarization pipeline (facebook/bart-large-cnn) shared by the
# cells below, and select the ticker whose news feed is summarized.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
ticker = yfinance.Ticker('MSFT')

Device set to use cpu


In [81]:
# Fetch and clean every item in the ticker's news feed (one HTTP request per
# item); each entry carries "title", "url", "response_text" and "clean_text".
content_list = [yfinance_pull_clean(n) for n in ticker.news]

In [82]:
# Summarize each cleaned article. Articles for which the pipeline raises
# IndexError are left out, so this list can be shorter than `content_list`.
summary_list = content_summarizer(content_list, summarizer)
pprint(summary_list)

[{'summary_text': 'Nebius Group (NASDAQ: NBIS) has emerged as a potential '
                  "superstar in one of the world's biggest growth industries: "
                  'artificial intelligence (AI) The company sells access to '
                  'compute, and just recently it signed a multibillion-dollar '
                  'deal to provide AI infrastructure to tech giant Microsoft. '
                  'Revenue is already taking off at Nebius, as it surged more '
                  'than 600% in the recent quarter.',
  'title': 'Is Nebius Stock Your Ticket to Becoming a Millionaire?',
  'url': 'https://finance.yahoo.com/news/nebius-stock-ticket-becoming-millionaire-172000391.html'},
 {'summary_text': ' Maximor announced a $9 million seed round to expand its AI '
                  'platform. Co-founders Ramnandan Krishnamurthy and Ajay '
                  'Krishna Amudan at Microsoft witnessed organizations revert '
                  'to spreadsheets despite investments in enterpri

In [83]:
# Condense the per-article summaries into a single meta-summary; the pipeline
# result is a list, and the text of its first element is printed.
m_summary = meta_summary(summary_list, summarizer)
pprint(m_summary[0]["summary_text"])

('Nebius Group (NASDAQ: NBIS) has emerged as a potential superstar in one of '
 "the world's biggest growth industries: artificial intelligence. Revenue is "
 'already taking off at Nebius, as it surged more than 600% in the recent '
 "quarter. Nvidia and Microsoft made large investments in OpenAI. Microsoft's "
 "investment gave it privileged access to OpenAI's AI models. Amazon offers an "
 'opportunity to bet on the lucrative AI and Cloud market as well as the '
 'behemoth e-commerce market.')
