In [11]:
!pip install yahoo_fin feedparser transformers torch pandas requests beautifulsoup4 newspaper3k

Collecting newspaper3k
  Using cached newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting nltk>=3.2.1 (from newspaper3k)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
     ---------------------------------------- 0.0/7.4 MB ? eta -:--:--
     - -------------------------------------- 0.3/7.4 MB ? eta -:--:--
     -- ------------------------------------- 0.5/7.4 MB 1.7 MB/s eta 0:00:05
     ---- ----------------------------------- 0.8/7.4 MB 1.5 MB/s eta 0:00:05
     ----- ---------------------------------- 1.0/7.4 MB 1.4 MB/s eta 0:00:05
     ------- -------------------

In [13]:
pip install lxml_html_clean

Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.1-py3-none-any.whl.metadata (2.4 kB)
Downloading lxml_html_clean-0.4.1-py3-none-any.whl (14 kB)
Installing collected packages: lxml_html_clean
Successfully installed lxml_html_clean-0.4.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import feedparser
import requests
import logging
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
import pandas as pd
import os
from newspaper import Article
from concurrent.futures import ThreadPoolExecutor

In [3]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [4]:
# Load FinBERT model for financial sentiment analysis
MODEL_NAME = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to("cpu")

In [5]:
# Function to get sentiment using FinBERT
def get_finbert_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to("cpu")
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    sentiment_labels = ['Negative', 'Neutral', 'Positive']
    predicted_index = torch.argmax(predictions).item()
    return sentiment_labels[predicted_index] if predicted_index < len(sentiment_labels) else "Unknown"

In [7]:
# Function to process a single article
def process_article(entry, ticker):
    url = entry.link
    title = entry.title
    publication_date = entry.published if 'published' in entry else 'N/A'
    try:
        article = Article(url)
        article.download()
        article.parse()
        text = article.text
        
        words = text.split(' ')[:1000]
        ARTICLE = ' '.join(words) if len(words) >= 50 else text

        if ARTICLE:
            summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=-1)
            summary = summarizer(ARTICLE, max_length=150, min_length=50, do_sample=False)[0]['summary_text']

            final_sentiment = get_finbert_sentiment(ARTICLE)
            return [ticker, title, publication_date, summary, final_sentiment, url]
    except Exception as e:
        logging.error(f"Error processing article {url}: {e}")
    return None

In [9]:
# Function to fetch articles using RSS feed
def fetch_articles(ticker):
    rss_feed = feedparser.parse(f'https://feeds.finance.yahoo.com/rss/2.0/headline?s={ticker}&region=US&lang=en-US')
    return [(entry, ticker) for entry in rss_feed.entries[:10]] if rss_feed.entries else []

if __name__ == "__main__":
    ARTICLE_LIST = []
    unique_urls = set()

    # Default tickers
    TICKERS = ['SPY', 'AAPL', 'GOOGL', 'TSLA', 'BTC-USD', 'ETH-USD', 'AMZN', 'MSFT', 'DJI', 'IXIC', 'CRYPTO', 'FOREX']

    article_entries = sum(map(fetch_articles, TICKERS), [])
    
    with ThreadPoolExecutor(max_workers=5) as executor:
        results = list(executor.map(lambda args: process_article(*args), article_entries))
    
    for result in results:
        if result and result[5] not in unique_urls:
            ARTICLE_LIST.append(result)
            unique_urls.add(result[5])

    # Export to CSV
    if ARTICLE_LIST:
        output_file = 'financial_news_summary.csv'
        df = pd.DataFrame(ARTICLE_LIST, columns=['Ticker', 'Title', 'Publication Date', 'Summary', 'Sentiment', 'URL'])
        df.to_csv(output_file, index=False)
        logging.info(f"CSV file created: {output_file}")
    else:
        logging.info("No articles fetched.")


2025-03-13 13:21:43,640 - ERROR - Error processing article http://www.etf.com/sections/news/stock-etfs-waver-investors-weigh-cpi-data-tariffs?utm_source=yahoo-finance&utm_medium=rss&utm_campaign=yahoo-finance-rss&.tsrc=rss: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.etf.com/sections/news/stock-etfs-waver-investors-weigh-cpi-data-tariffs?utm_source=yahoo-finance&utm_medium=rss&utm_campaign=yahoo-finance-rss&.tsrc=rss on URL http://www.etf.com/sections/news/stock-etfs-waver-investors-weigh-cpi-data-tariffs?utm_source=yahoo-finance&utm_medium=rss&utm_campaign=yahoo-finance-rss&.tsrc=rss
Device set to use cpu
Device set to use cpu
Device set to use cpu
Your max_length is set to 150, but your input_length is only 65. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=32)
Device set to use cpu
Device set to use cpu
2025-03-13 