# Sentiment Analysis - `TextBlob`
- FMP API Documentation: https://site.financialmodelingprep.com/developer/docs/stable/stock-news

### **Step 1: Import Libraries + Functions**

In [1]:
from io import StringIO
from typing import Dict, List
from datetime import datetime, timedelta

# Data manipulation libraries
import polars as pl
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pandas.tseries.offsets import BDay

# API Requests
import requests
from requests.adapters import HTTPAdapter
from concurrent.futures import ThreadPoolExecutor
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Libraries for sentiment analysis
from textblob import TextBlob
import re
from collections import defaultdict

def create_session():
    """Create a requests session with retry configuration"""
    session = requests.Session()
    retries = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504]
    )
    session.mount('https://', HTTPAdapter(max_retries=retries, pool_maxsize=10))
    return session

def fetch_data(api_key: str, session: requests.Session, days_back: int = None, max_pages: int = None, records_per_page: int = None, request_timeout: int = None) -> Dict:
    """Fetch stock news with pagination"""
    # Use parameters or fall back to default values
    days_back = days_back if days_back is not None else 7
    max_pages = max_pages if max_pages is not None else 10
    records_per_page = records_per_page if records_per_page is not None else 1000
    request_timeout = request_timeout if request_timeout is not None else 10
    
    # API base URL
    API_BASE_URL = "https://financialmodelingprep.com/api/v3/stock_news"
    
    # Calculate date range
    today = datetime.now().date()
    week_ago = today - timedelta(days=days_back)
    
    all_data = []
    
    # Loop through max_pages with records_per_page records each
    for page in range(max_pages):
        url = API_BASE_URL
        params = {
            "apikey": api_key,
            "from": week_ago.strftime('%Y-%m-%d'),
            "to": today.strftime('%Y-%m-%d'),
            "limit": records_per_page,
            "page": page
        }
        
        try:
            print(f"Fetching page {page + 1}/{max_pages}...")
            response = session.get(url, params=params, timeout=request_timeout)
            response.raise_for_status()
            data = response.json()
            
            if not data:  # If no more data, break the loop
                print(f"No more data found at page {page + 1}. Stopping pagination.")
                break
                
            all_data.extend(data)
            print(f"Page {page + 1}: {len(data)} articles fetched")
            
        except Exception as e:
            print(f"Error fetching page {page + 1}: {e}")
            continue
    
    print(f"Total articles fetched: {len(all_data)}")
    return all_data

# ===============================================================================
# SENTIMENT ANALYSIS FUNCTIONS
# ===============================================================================

def extract_stock_symbols(text, all_tickers=None, excluded_symbols=None):
    """Extract valid stock symbols from text, excluding common words"""
    if all_tickers is None:
        all_tickers = set()
    if excluded_symbols is None:
        excluded_symbols = {'AI', 'S', 'A', 'U', 'E', 'US', 'ET', 'TSXV', 'CODI', 'C'}
    
    symbols = re.findall(r'\b[A-Z]{1,5}\b', text)
    return [symbol for symbol in symbols 
            if symbol in all_tickers and symbol not in excluded_symbols]

def analyze_sentiment(text):
    """Analyze sentiment using TextBlob and classify as bullish/bearish/neutral"""
    polarity = TextBlob(text).sentiment.polarity
    
    if polarity > 0.1:
        return 'bullish', polarity
    elif polarity < -0.1:
        return 'bearish', polarity
    else:
        return 'neutral', polarity

def calculate_stock_sentiment_metrics(df, all_tickers=None, excluded_symbols=None):
    """Calculate comprehensive sentiment metrics for each stock symbol"""
    if all_tickers is None:
        all_tickers = set()
    if excluded_symbols is None:
        excluded_symbols = {'AI', 'S', 'A', 'U', 'E', 'US', 'ET', 'TSXV', 'CODI', 'C'}
    
    stock_metrics = defaultdict(lambda: {
        'sentiment_scores': [],
        'bullish_count': 0,
        'bearish_count': 0,
        'neutral_count': 0,
        'total_articles': 0
    })
    
    # Process each news article
    for row in df.iter_rows(named=True):
        full_text = f"{row.get('title', '')} {row.get('text', '')}"
        mentioned_symbols = extract_stock_symbols(full_text, all_tickers, excluded_symbols)
        sentiment_type, sentiment_score = analyze_sentiment(full_text)
        
        # Update metrics for each mentioned symbol
        for symbol in mentioned_symbols:
            metrics = stock_metrics[symbol]
            metrics['sentiment_scores'].append(sentiment_score)
            metrics['total_articles'] += 1
            metrics[f'{sentiment_type}_count'] += 1
    
    # Calculate final metrics
    final_metrics = {}
    for symbol, data in stock_metrics.items():
        if data['total_articles'] > 0:
            total = data['total_articles']
            avg_sentiment = sum(data['sentiment_scores']) / len(data['sentiment_scores'])
            
            final_metrics[symbol] = {
                "articlesInLastWeek": total,
                "companyNewsScore": round((avg_sentiment + 1) / 2, 4),
                "sentiment": {
                    "bearishPercent": round(data['bearish_count'] / total, 4),
                    "bullishPercent": round(data['bullish_count'] / total, 4)
                },
                "averageSentimentScore": round(avg_sentiment, 4),
                "totalArticles": total
            }
    
    return final_metrics

# ===============================================================================
# SECTOR ANALYSIS & FUNDAMENTAL DATA INTEGRATION
# ===============================================================================

def calculate_sector_averages(sentiment_df, fundamentals_pandas):
    """Calculate sector-level sentiment averages"""
    sector_metrics = defaultdict(list)
    
    for row in sentiment_df.iter_rows(named=True):
        symbol = row['symbol']
        if symbol in fundamentals_pandas.index:
            sector = fundamentals_pandas.loc[symbol, 'Sector']
            sector_metrics[sector].append({
                'bullishPercent': row['bullishPercent'],
                'newsScore': row['companyNewsScore']
            })
    
    return {
        sector: {
            'sectorAverageBullishPercent': round(sum(m['bullishPercent'] for m in metrics) / len(metrics), 4),
            'sectorAverageNewsScore': round(sum(m['newsScore'] for m in metrics) / len(metrics), 4)
        }
        for sector, metrics in sector_metrics.items() if metrics
    }

def get_fundamental_value(symbol, column, default=0):
    """Safely get fundamental data value for a symbol"""
    # This function needs access to fundamentals_pandas from the calling scope
    # We'll modify this to accept it as a parameter
    import pandas as pd
    try:
        # Try to access the global fundamentals_pandas if it exists
        import __main__
        if hasattr(__main__, 'fundamentals_pandas'):
            fundamentals_pandas = __main__.fundamentals_pandas
            return fundamentals_pandas.loc[symbol, column] if symbol in fundamentals_pandas.index else default
        else:
            return default
    except:
        return default

### **Step 2: Extract News - FMP API Call**

In [None]:
# Get API key from environment variables
FMP_API_KEY = 'FMP_API_KEY'

# Configuration variables for fetch_data function
DAYS_BACK = 7                    # Number of days to look back for news
MAX_PAGES = 10                   # Maximum number of pages to fetch
RECORDS_PER_PAGE = 1000          # Number of records per page
REQUEST_TIMEOUT = 10             # Timeout for API requests in seconds
API_BASE_URL = "https://financialmodelingprep.com/api/v3/stock_news"
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"  # Date format for parsing publishedDate

# Create session and fetch data with all required parameters
session = create_session()
data = fetch_data(
    api_key=FMP_API_KEY,
    session=session,
    days_back=30,           # Custom: 30 days back
    max_pages=15,           # Custom: 15 pages
    records_per_page=1000,  
    request_timeout=10      
)

news_df = pl.DataFrame(data)
display(news_df.sort('publishedDate', descending=True).head())

Fetching page 1/15...
Page 1: 998 articles fetched
Fetching page 2/15...
Page 2: 1000 articles fetched
Fetching page 3/15...
Page 3: 1000 articles fetched
Fetching page 4/15...
Page 4: 1000 articles fetched
Fetching page 5/15...
Page 5: 999 articles fetched
Fetching page 6/15...
Page 6: 999 articles fetched
Fetching page 7/15...
Page 7: 997 articles fetched
Fetching page 8/15...
Page 8: 1000 articles fetched
Fetching page 9/15...
Page 9: 1000 articles fetched
Fetching page 10/15...
Page 10: 1000 articles fetched
Fetching page 11/15...
Page 11: 1000 articles fetched
Fetching page 12/15...
Page 12: 999 articles fetched
Fetching page 13/15...
Page 13: 998 articles fetched
Fetching page 14/15...
Page 14: 1000 articles fetched
Fetching page 15/15...
Page 15: 1000 articles fetched
Total articles fetched: 14990


symbol,publishedDate,title,image,site,text,url
str,str,str,str,str,str,str
"""MONRF""","""2025-06-02 11:17:33""","""Moncler: Growth, Low Debt, And…","""https://images.financialmodeli…","""seekingalpha.com""","""The DCF model implemented sugg…","""https://seekingalpha.com/artic…"
"""X""","""2025-06-02 11:15:19""","""Jobs Week Starts with More Tra…","""https://images.financialmodeli…","""zacks.com""","""Currently both the blue-chip D…","""https://www.zacks.com/stock/ne…"
"""CLCO""","""2025-06-02 11:15:16""","""4 Discretionary Stocks to Buy …","""https://images.financialmodeli…","""zacks.com""","""With inflation cooling and con…","""https://www.zacks.com/stock/ne…"
"""NIO""","""2025-06-02 11:15:13""","""NIO's May Deliveries Rise 13% …","""https://images.financialmodeli…","""zacks.com""","""NIO delivered over 23K EVs in …","""https://www.zacks.com/stock/ne…"
"""PLTR""","""2025-06-02 11:15:05""","""Trump administration expands p…","""https://images.financialmodeli…","""proactiveinvestors.com""","""The Trump administration has s…","""https://www.proactiveinvestors…"


### **Step 3: Sentiment Analysis**
- Use `TextBlob` for sentiment analysis on news headlines.


In [None]:
# from py.sentiment_analysis import calculate_stock_sentiment_metrics, get_fundamental_value, calculate_sector_averages

# Load fundamental data and prepare ticker lists
print("Loading fundamental data...")
fundamentals_df = pl.read_csv('data/fundamentals_stock.csv')
fundamentals_pandas = fundamentals_df.to_pandas().set_index('Ticker')
all_tickers = set(news_df['symbol'].to_list() + fundamentals_df['Ticker'].to_list())
EXCLUDED_SYMBOLS = {'AI', 'S', 'A', 'U', 'E', 'US', 'ET', 'TSXV', 'CODI', 'C'}

print(f"Loaded {len(fundamentals_df)} stocks, {len(all_tickers)} unique tickers")

# Execute sentiment analysis
print("Analyzing sentiment for stock symbols...")
sentiment_metrics = calculate_stock_sentiment_metrics(news_df, all_tickers, EXCLUDED_SYMBOLS)
sentiment_df = pl.DataFrame([{
    "symbol": symbol, "articlesInLastWeek": metrics["articlesInLastWeek"],
    "companyNewsScore": metrics["companyNewsScore"], 
    "bearishPercent": metrics["sentiment"]["bearishPercent"],
    "bullishPercent": metrics["sentiment"]["bullishPercent"],
    "averageSentimentScore": metrics["averageSentimentScore"],
    "totalArticles": metrics["totalArticles"]
} for symbol, metrics in sentiment_metrics.items()]).sort(["articlesInLastWeek", "companyNewsScore"], descending=[True, True])

# Add fundamental data and sector averages
sector_averages = calculate_sector_averages(sentiment_df, fundamentals_pandas)
sentiment_with_fundamentals = sentiment_df.with_columns([
    pl.col("symbol").map_elements(lambda x: sector_averages.get(get_fundamental_value(x, 'Sector', 'Unknown'), {}).get('sectorAverageBullishPercent', 0), return_dtype=pl.Float64).alias("sectorAverageBullishPercent"),
    pl.col("symbol").map_elements(lambda x: sector_averages.get(get_fundamental_value(x, 'Sector', 'Unknown'), {}).get('sectorAverageNewsScore', 0), return_dtype=pl.Float64).alias("sectorAverageNewsScore"),
    pl.col("symbol").map_elements(lambda x: get_fundamental_value(x, 'Sector', 'Unknown'), return_dtype=pl.Utf8).alias("sector"),
    pl.col("symbol").map_elements(lambda x: get_fundamental_value(x, 'Market Cap'), return_dtype=pl.Float64).alias("marketCap"),
    pl.col("symbol").map_elements(lambda x: get_fundamental_value(x, 'P/E (trailing)'), return_dtype=pl.Float64).alias("peRatio"),
    pl.col("symbol").map_elements(lambda x: get_fundamental_value(x, 'Price'), return_dtype=pl.Float64).alias("price")
])

# Screen stocks and analyze sectors
comprehensive_screened = sentiment_with_fundamentals.filter((pl.col("articlesInLastWeek") >= 3) & (pl.col("companyNewsScore") >= 0.45)).sort(["companyNewsScore", "articlesInLastWeek"], descending=[True, True])
sector_summary = sentiment_with_fundamentals.filter(pl.col("sector") != "Unknown").group_by("sector").agg([
    pl.count("symbol").alias("stock_count"), pl.mean("companyNewsScore").alias("avg_news_score"),
    pl.mean("bullishPercent").alias("avg_bullish_percent"), pl.mean("articlesInLastWeek").alias("avg_articles"),
    pl.mean("marketCap").alias("avg_market_cap"), pl.mean("peRatio").alias("avg_pe_ratio")
]).sort("avg_news_score", descending=True)

print(f"\nScreened {len(comprehensive_screened)} stocks, {len(sector_averages)} sectors")
display(comprehensive_screened.head())
display(sector_summary)

# Export results
# sentiment_with_fundamentals.write_csv("data/combined_sentiment_fundamentals.csv")
# comprehensive_screened.write_csv("data/screened_stocks.csv")
# print("Files saved: combined_sentiment_fundamentals.csv and screened_stocks.csv")

Loading fundamental data...
Loaded 504 stocks, 4273 unique tickers
Analyzing sentiment for stock symbols...

Screened 1697 stocks, 11 sectors


symbol,articlesInLastWeek,companyNewsScore,bearishPercent,bullishPercent,averageSentimentScore,totalArticles,sectorAverageBullishPercent,sectorAverageNewsScore,sector,marketCap,peRatio,price
str,i64,f64,f64,f64,f64,i64,f64,f64,str,f64,f64,f64
"""BGM""",4,0.9,0.0,1.0,0.8,4,0.0,0.0,"""Unknown""",0.0,0.0,0.0
"""LPCN""",4,0.8,0.0,1.0,0.6,4,0.0,0.0,"""Unknown""",0.0,0.0,0.0
"""CAE""",4,0.8,0.0,1.0,0.6,4,0.0,0.0,"""Unknown""",0.0,0.0,0.0
"""NSCIF""",3,0.7803,0.0,1.0,0.5606,3,0.0,0.0,"""Unknown""",0.0,0.0,0.0
"""HBFG""",5,0.7788,0.0,1.0,0.5576,5,0.0,0.0,"""Unknown""",0.0,0.0,0.0


sector,stock_count,avg_news_score,avg_bullish_percent,avg_articles,avg_market_cap,avg_pe_ratio
str,u32,f64,f64,f64,f64,f64
"""Utilities""",28,0.589464,0.582643,17.035714,4.2542e10,21.169643
"""Communication Services""",19,0.583947,0.591137,27.263158,3.9771e11,
"""Healthcare""",52,0.571731,0.530044,20.230769,8.9752e10,
"""Technology""",77,0.569574,0.52053,17.0,2.1666e11,
"""Consumer Cyclical""",51,0.566929,0.507137,12.45098,1.1478e11,
…,…,…,…,…,…,…
"""Industrials""",62,0.561069,0.471273,10.5,5.9014e10,
"""Energy""",21,0.559405,0.389214,6.333333,7.0332e10,
"""Real Estate""",28,0.559225,0.426325,8.5,3.6466e10,
"""Financial Services""",59,0.551597,0.390469,11.949153,1.0424e11,


### **Step 4: Select Top 100 stocks (by `averageSentimentScore`)**

In [4]:
selected_stocks = (comprehensive_screened
    .filter(pl.col("sector") != 'Unknown')
    .sort("averageSentimentScore", descending=True)
    .head(100))

screened_tickers = selected_stocks['symbol'].to_list()
print(f"\nSelected {len(screened_tickers)} stocks with highest sentiment scores")
display(selected_stocks.head())

# Sector distribution and summary statistics
sector_distribution = selected_stocks.group_by("sector").agg([
    pl.count("symbol").alias("stock_count"),
    pl.mean("averageSentimentScore").alias("avg_sentiment_score"),
    pl.mean("companyNewsScore").alias("avg_news_score"),
    pl.mean("bullishPercent").alias("avg_bullish_percent")
]).sort("stock_count", descending=True)

print(f"\nSector distribution:")
display(sector_distribution.head())

# Show summary statistics
print(f"\nSummary statistics for selected stocks:")
print(f"Average sentiment score: {selected_stocks['averageSentimentScore'].mean():.4f}")
print(f"Min sentiment score: {selected_stocks['averageSentimentScore'].min():.4f}")
print(f"Max sentiment score: {selected_stocks['averageSentimentScore'].max():.4f}")
print(f"Average company news score: {selected_stocks['companyNewsScore'].mean():.4f}")
print(f"Number of stocks: {selected_stocks['symbol'].count()}")
print(f"Number of unique sectors: {selected_stocks['sector'].n_unique()}")


Selected 100 stocks with highest sentiment scores


symbol,articlesInLastWeek,companyNewsScore,bearishPercent,bullishPercent,averageSentimentScore,totalArticles,sectorAverageBullishPercent,sectorAverageNewsScore,sector,marketCap,peRatio,price
str,i64,f64,f64,f64,f64,i64,f64,f64,str,f64,f64,f64
"""BBY""",24,0.7785,0.0,1.0,0.5571,24,0.5071,0.5669,"""Consumer Cyclical""",14031000000.0,16.17,66.28
"""CHTR""",5,0.71,0.0,0.8,0.42,5,0.5911,0.5839,"""Communication Services""",54738000000.0,11.06,396.27
"""NXPI""",4,0.7,0.0,0.5,0.4,4,0.5205,0.5696,"""Technology""",48285000000.0,20.84,191.13
"""ZTS""",5,0.6844,0.0,1.0,0.3689,5,0.53,0.5717,"""Healthcare""",75075000000.0,30.33,168.63
"""FI""",6,0.67,0.0,0.6667,0.3401,6,0.5205,0.5696,"""Technology""",90256000000.0,28.81,162.79



Sector distribution:


sector,stock_count,avg_sentiment_score,avg_news_score,avg_bullish_percent
str,u32,f64,f64,f64
"""Technology""",19,0.248995,0.624479,0.698784
"""Industrials""",17,0.231253,0.615629,0.742094
"""Consumer Cyclical""",13,0.256838,0.628408,0.717069
"""Utilities""",13,0.247046,0.623523,0.677785
"""Healthcare""",11,0.233018,0.6165,0.756909



Summary statistics for selected stocks:
Average sentiment score: 0.2430
Min sentiment score: 0.1737
Max sentiment score: 0.5571
Average company news score: 0.6215
Number of stocks: 100
Number of unique sectors: 11
