In [3]:
# Source: https://www.youtube.com/watch?v=UNCLrS7_MfA

In [1]:
import sqlite3
import pandas as pd

# Use a raw string (r"") or forward slashes ("/") for the file path
conn = sqlite3.connect(r"C:\Users\henry\Data Projects\Stock_Optimization\finviz_news.db")
try:
    # Read the whole finviz_news table into a DataFrame, newest day first
    df = pd.read_sql_query("SELECT * FROM finviz_news Order By day desc", conn)
finally:
    # Close the connection even when the query raises, so the handle is not
    # left open in the kernel
    conn.close()

print(df.head())

      id ticker         day      time  \
0  52901   AAPL  2025-10-23  16:32:00   
1  52902   AAPL  2025-10-23  08:04:00   
2  52903   AAPL  2025-10-23  15:27:00   
3  52904   AAPL  2025-10-23  15:07:00   
4  52905   AAPL  2025-10-23  14:50:00   

                                               title  \
0  Apple Earnings Quality and Margins Exceed Fore...   
1  Breakout Watch: Why Nvidia, Apple, Meta And Ot...   
2  Who's Paying for the White House Ballroom? Bil...   
3  Apple loses landmark £1.5bn lawsuit over App S...   
4  Apple's (AAPL) iPhone Strength and Services Gr...   

                      source  \
0              GuruFocus.com   
1  Investor's Business Daily   
2                Barrons.com   
3              The Telegraph   
4             Insider Monkey   

                                                link  
0  https://finance.yahoo.com/news/apple-earnings-...  
1  https://www.investors.com/research/ibd-stock-a...  
2  https://www.barrons.com/articles/white-house-b...  
3  

In [2]:
import re
import pandas as pd
import numpy as np

In [3]:
def clean_text(s: str) -> str:
    """Normalise a piece of text before it is scored.

    Args:
        s: The raw text. Missing values (anything ``pd.isna`` reports as
            missing, e.g. ``None`` or ``NaN``) are accepted.

    Returns:
        ``""`` for missing values; otherwise the text with URL-like tokens
        (starting with ``http`` or ``www.``) removed, every run of whitespace
        collapsed to a single space, and no leading/trailing whitespace.
    """
    if pd.isna(s):
        return ""
    without_urls = re.sub(r"http\S+|www\.\S+", "", s)
    collapsed = re.sub(r"\s+", " ", without_urls)
    # Strip last: removing a URL at either end of the text would otherwise
    # leave a dangling space behind.
    return collapsed.strip()

In [4]:
def compose_text(row, include_source=True):
    """Assemble the text that will be scored for one news row.

    The cleaned "title" is always used. When ``include_source`` is true and
    the cleaned "source" is non-empty, the result is "<title> - <source>",
    so any signal/bias carried by the source is part of the scored text.
    """
    headline = clean_text(row.get("title", ""))
    if not include_source:
        return headline

    publisher = clean_text(row.get("source", ""))
    if not publisher:
        # No usable source: fall back to the bare headline.
        return headline
    return f"{headline} - {publisher}"

In [5]:
def finbert_scores(texts, batch_size=32):
    """Score texts with the ProsusAI/finbert sequence-classification model.

    Args:
        texts: Iterable of strings to score.
        batch_size: Number of texts handed to the pipeline per call.

    Returns:
        A float ``np.ndarray`` with one value per text: the score of the
        "positive" label minus the score of the "negative" label (a label
        that is absent from the model output counts as 0.0).

    Raises:
        Whatever ``transformers``/``torch`` raise when they are missing or
        the model cannot be loaded; callers may use that to fall back to
        another scorer.
    """
    texts = list(texts)
    if not texts:
        # Nothing to score: skip the heavy imports and model load entirely.
        return np.array([], dtype=float)

    # Imported lazily so the rest of the notebook works without these packages.
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
    import torch  # imported up front so a missing PyTorch install fails here

    model_name = "ProsusAI/finbert"
    tok = AutoTokenizer.from_pretrained(model_name)
    mdl = AutoModelForSequenceClassification.from_pretrained(model_name)
    # top_k=None asks for the score of every label; it replaces the
    # deprecated return_all_scores=True.
    pipe = TextClassificationPipeline(model=mdl, tokenizer=tok, top_k=None)

    scores = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # truncation=True cuts over-length inputs down to what the model
        # accepts instead of raising on them.
        for dist in pipe(batch, truncation=True):
            by_label = {x["label"].lower(): x["score"] for x in dist}
            scores.append(float(by_label.get("positive", 0.0) - by_label.get("negative", 0.0)))
    return np.array(scores, dtype=float)

In [6]:
def vader_scores(texts):
    """Score texts with NLTK's VADER sentiment analyser.

    Args:
        texts: Iterable of strings to score.

    Returns:
        A float ``np.ndarray`` holding the VADER "compound" score of each
        text, in input order.
    """
    texts = list(texts)
    if not texts:
        # Nothing to score: avoid importing nltk and a possible lexicon
        # download for no work.
        return np.array([], dtype=float)

    import nltk
    from nltk.sentiment import SentimentIntensityAnalyzer

    # Download the VADER lexicon only when it is not available locally.
    try:
        nltk.data.find('sentiment/vader_lexicon.zip')
    except LookupError:
        nltk.download('vader_lexicon')

    sia = SentimentIntensityAnalyzer()
    return np.array([sia.polarity_scores(t)["compound"] for t in texts], dtype=float)

In [7]:
def add_sentiment(df:pd.DataFrame, include_source_in_text=True, source_weights:dict | None = None):
    """Return a copy of ``df`` with a ``sentiment_score`` column added.

    Each row is turned into text with ``compose_text`` and scored with
    ``finbert_scores``. If FinBERT scoring raises for any reason, a
    ``RuntimeWarning`` is emitted and ``vader_scores`` is used for all rows
    instead. The two scorers are different models, so their scores are not
    necessarily comparable.

    Args:
        df: Input frame; ``compose_text`` reads its "title" and "source"
            values. It is not modified.
        include_source_in_text: Forwarded to ``compose_text`` as
            ``include_source``.
        source_weights: Optional mapping of source name -> multiplier for
            the score. Sources missing from the mapping get a weight of 1.0.
            Requires a "source" column.

    Returns:
        A new DataFrame with the extra ``sentiment_score`` column.
    """
    import warnings

    out = df.copy()
    # Built row by row rather than with DataFrame.apply(axis=1).tolist():
    # on an empty frame apply returns a DataFrame, which has no tolist().
    texts = [compose_text(row, include_source=include_source_in_text) for _, row in out.iterrows()]

    try:
        scores = finbert_scores(texts)
    except Exception as exc:
        # Deliberate best-effort fallback, but make it visible to the reader.
        warnings.warn(
            f"FinBERT scoring failed ({exc!r}); falling back to VADER.",
            RuntimeWarning,
            stacklevel=2,
        )
        scores = vader_scores(texts)

    out["sentiment_score"] = scores

    if source_weights:
        # Unlisted sources keep their score unchanged (weight 1.0).
        w = out["source"].map(source_weights).fillna(1.0)
        out["sentiment_score"] = out["sentiment_score"] * w
    return out

In [8]:
# Alternative call that also passes per-source weights (kept for reference):
# df_scored = add_sentiment(df, include_source_in_text=True,
#                           source_weights={"CNBC TV": 1.05, "Insider Monkey": 0.95})

# Score every row of `df`; the source name is included in the scored text.
df_scored = add_sentiment(df, include_source_in_text=True)
print(df_scored.head())    # df_scored now has a `sentiment_score` column

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


      id ticker         day      time  \
0  52901   AAPL  2025-10-23  16:32:00   
1  52902   AAPL  2025-10-23  08:04:00   
2  52903   AAPL  2025-10-23  15:27:00   
3  52904   AAPL  2025-10-23  15:07:00   
4  52905   AAPL  2025-10-23  14:50:00   

                                               title  \
0  Apple Earnings Quality and Margins Exceed Fore...   
1  Breakout Watch: Why Nvidia, Apple, Meta And Ot...   
2  Who's Paying for the White House Ballroom? Bil...   
3  Apple loses landmark £1.5bn lawsuit over App S...   
4  Apple's (AAPL) iPhone Strength and Services Gr...   

                      source  \
0              GuruFocus.com   
1  Investor's Business Daily   
2                Barrons.com   
3              The Telegraph   
4             Insider Monkey   

                                                link  sentiment_score  
0  https://finance.yahoo.com/news/apple-earnings-...         0.004235  
1  https://www.investors.com/research/ibd-stock-a...        -0.002968  
2  http

In [17]:
def upsert_scored(df_scored, db, table="finviz_news_scored"):
    """Insert or update scored news rows in a SQLite table.

    The table is created if it does not exist, with ``id`` as the primary
    key. A row whose ``id`` is already present has all its other columns
    overwritten with the new values.

    Args:
        df_scored: DataFrame containing at least the columns id, ticker,
            day, time, title, source, link and sentiment_score.
        db: Path of the SQLite database file, passed to ``sqlite3.connect``.
        table: Target table name: a plain identifier, optionally prefixed
            with a schema name ("schema.table").

    Raises:
        ValueError: If ``table`` is not a plain (optionally
            schema-qualified) identifier.
        KeyError: If ``df_scored`` lacks one of the required columns.
    """
    # The table name is interpolated into the SQL text (identifiers cannot be
    # bound as parameters), so only plain identifiers are accepted.
    name_parts = table.split(".")
    if len(name_parts) > 2 or not all(part.isidentifier() for part in name_parts):
        raise ValueError(f"invalid table name: {table!r}")

    cols = ["id","ticker","day","time","title","source","link","sentiment_score"]
    rows = list(df_scored[cols].itertuples(index=False, name=None))

    conn = sqlite3.connect(db)
    try:
        # `with conn` commits on success and rolls back on error; it does
        # NOT close the connection, hence the surrounding try/finally.
        with conn:
            # 1) Ensure table exists with a primary key on id
            conn.execute(f"""
                CREATE TABLE IF NOT EXISTS {table} (
                    id INTEGER PRIMARY KEY,
                    ticker TEXT,
                    day TEXT,
                    time TEXT,
                    title TEXT,
                    source TEXT,
                    link TEXT,
                    sentiment_score REAL
                )
            """)

            # 2) Upsert rows
            conn.executemany(f"""
                INSERT INTO {table} (id, ticker, day, time, title, source, link, sentiment_score)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(id) DO UPDATE SET
                    ticker = excluded.ticker,
                    day = excluded.day,
                    time = excluded.time,
                    title = excluded.title,
                    source = excluded.source,
                    link = excluded.link,
                    sentiment_score = excluded.sentiment_score
            """, rows)
    finally:
        conn.close()
    

In [18]:
# NOTE(review): this is a relative path, while the load cell connects to an
# absolute path for finviz_news.db — confirm both resolve to the same file.
db = "finviz_news.db"
# Write the scored rows into the default `finviz_news_scored` table.
upsert_scored(df_scored, db)

In [19]:
# Print the scored frame; pandas abbreviates the middle rows with `...`.
print(df_scored)

          id ticker         day      time  \
0      52901   AAPL  2025-10-23  16:32:00   
1      52902   AAPL  2025-10-23  08:04:00   
2      52903   AAPL  2025-10-23  15:27:00   
3      52904   AAPL  2025-10-23  15:07:00   
4      52905   AAPL  2025-10-23  14:50:00   
...      ...    ...         ...       ...   
10111   2796    DIA  2025-04-25  08:40:00   
10112   2797    DIA  2025-04-25  06:15:00   
10113   2798    DIA  2025-04-24  11:42:00   
10114   2799    DIA  2025-04-24  04:30:00   
10115   2800    DIA  2025-04-23  13:00:00   

                                                   title  \
0      Apple Earnings Quality and Margins Exceed Fore...   
1      Breakout Watch: Why Nvidia, Apple, Meta And Ot...   
2      Who's Paying for the White House Ballroom? Bil...   
3      Apple loses landmark £1.5bn lawsuit over App S...   
4      Apple's (AAPL) iPhone Strength and Services Gr...   
...                                                  ...   
10111  Bull Market Indicated By Zweig B