# Full Hybrid Sentiment + LSTM Stock Prediction Notebook

**End-to-end notebook**: downloads FNSPID & price data (via Kaggle/HuggingFace), merges & normalizes, computes hybrid sentiment (VADER + TextBlob + FinBERT), engineers features, trains a hybrid LSTM model (price sequence + sentiment branch), evaluates and plots results.

## 1 — Install required packages

In [None]:
# Install dependencies (run in Colab or local environment with internet)
# In Colab, pip install is fine. If you run locally, ensure PyTorch is installed for your CUDA/CPU.
import sys
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install kaggle yfinance pandas numpy matplotlib seaborn scikit-learn torch torchvision torchaudio transformers nltk textblob vaderSentiment tweepy praw --quiet
print('Install finished; restart runtime if required.')

Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3
Install finished; restart runtime if required.


In [None]:
# NLTK downloads
import nltk

# nltk.download() reports a failed download through its boolean return value
# instead of raising, so check it explicitly; otherwise the success message
# below would be printed even when the VADER lexicon is missing.
if not nltk.download('vader_lexicon'):
    raise RuntimeError(
        "Could not download the NLTK resource 'vader_lexicon'; check network access and re-run this cell."
    )
print('NLTK resources downloaded')

NLTK resources downloaded


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


## 2 — Kaggle authentication and dataset download

In [None]:
# Kaggle auth & dataset download (Colab)
from google.colab import files
import os, sys, time

print("Please upload your kaggle.json (from your Kaggle account -> Account -> Create API token)")
uploaded = files.upload()

# move to ~/.kaggle
os.makedirs(os.path.expanduser("~/.kaggle"), exist_ok=True)
with open('kaggle.json','wb') as f:
    f.write(list(uploaded.values())[0])
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

print("Downloading FNSPID dataset from Kaggle (this may take a while)...")
!kaggle datasets download -d elsabetyemane/financial-news-and-stock-price-integration-dataset -q

print("Unzipping FNSPID...")
!unzip -q financial-news-and-stock-price-integration-dataset.zip -d fnspid_data

print("Downloading full price history (HuggingFace mirror)...")
!wget -q https://huggingface.co/datasets/Zihan1004/FNSPID/resolve/main/Stock_price/full_history.zip

print("Unzipping prices...")
!unzip -q full_history.zip -d stock_prices

print("Datasets downloaded and unzipped: fnspid_data/  stock_prices/")

Please upload your kaggle.json (from your Kaggle account -> Account -> Create API token)


Saving kaggle.json to kaggle.json
Downloading FNSPID dataset from Kaggle (this may take a while)...
Dataset URL: https://www.kaggle.com/datasets/elsabetyemane/financial-news-and-stock-price-integration-dataset
License(s): unknown
Unzipping FNSPID...
Downloading full price history (HuggingFace mirror)...
Unzipping prices...
Datasets downloaded and unzipped: fnspid_data/  stock_prices/


## 3 — Load, normalize & merge datasets
This cell reads the price CSVs and the FNSPID news CSV, normalizes dates and tickers, and inner-joins the two tables on date and ticker into `final_df`.

In [None]:
# Load and normalize data
import pandas as pd, glob, os
from datetime import datetime, timezone


def _parse_dates_to_utc_day(values):
    """Parse a Series of date-like values to tz-naive timestamps at midnight (UTC calendar day).

    Values are parsed with ``pd.to_datetime(..., utc=True, errors='coerce')``.
    When no explicit ``format`` is given, recent pandas versions infer a single
    format from the first value and coerce values written in any other layout
    to NaT. To avoid silently losing those rows, every value that failed is
    parsed again on its own subset (which lets a different layout be inferred)
    until a pass recovers nothing more.

    Parameters
    ----------
    values : pandas.Series
        Raw date strings / timestamps.

    Returns
    -------
    pandas.Series
        Same index as ``values``; tz-naive datetimes normalized to midnight,
        NaT where a value could not be parsed at all.
    """
    raw = values.reset_index(drop=True)  # unique index so fillna() can align safely
    parsed = pd.to_datetime(raw, utc=True, errors='coerce')
    while True:
        failed = parsed.isna() & raw.notna()
        if not failed.any():
            break
        retried = pd.to_datetime(raw[failed], utc=True, errors='coerce')
        if not retried.notna().any():
            break  # nothing recovered: the remaining values are genuinely unparseable
        parsed = parsed.fillna(retried)
    parsed = parsed.dt.tz_localize(None).dt.normalize()
    parsed.index = values.index
    return parsed


# Load all price CSVs from the unzipped folder
price_files = sorted(glob.glob('./stock_prices/full_history/*.csv'))
if not price_files:
    raise FileNotFoundError('No price files found in ./stock_prices/full_history/. Make sure the full_history.zip unzipped correctly.')

price_list = []
for f in price_files:
    df = pd.read_csv(f)
    # derive ticker from filename (some datasets name files with ticker)
    ticker = os.path.basename(f).replace('.csv','')
    df['stock'] = ticker
    price_list.append(df)

prices_df = pd.concat(price_list, ignore_index=True, sort=False)

# Load news CSV (path in FNSPID package)
news_path = 'fnspid_data/modularization-demo/data/raw_analyst_ratings.csv'
if not os.path.exists(news_path):
    # try to find any CSV inside fnspid_data; sorted so the pick is deterministic
    candidates = sorted(glob.glob('fnspid_data/**/*.csv', recursive=True))
    if candidates:
        news_path = candidates[0]
        print('WARNING: expected news CSV not found; falling back to', news_path)
    else:
        raise FileNotFoundError('No news CSV found in fnspid_data. Check the downloaded archive.')
news_df = pd.read_csv(news_path)

print('Loaded: prices rows =', len(prices_df), 'news rows =', len(news_df))

for frame_name, frame in (('prices_df', prices_df), ('news_df', news_df)):
    if 'date' not in frame.columns:
        raise KeyError(frame_name + ' does not contain a date column. Available columns: ' + ','.join(map(str, frame.columns)))

# Clean tickers/stock column names (some files use different column names)
if 'stock' not in news_df.columns:
    # try common alternatives
    for c in ['stock_symbol','ticker','symbol','stock_symbol_y']:
        if c in news_df.columns:
            news_df = news_df.rename(columns={c:'stock'})
            break
# Ensure stock column exists
if 'stock' not in news_df.columns:
    raise KeyError('news_df does not contain a stock column. Available columns: ' + ','.join(news_df.columns))

# Normalize dates: parse, coerce errors, convert to UTC then drop tz and normalize to midnight.
# NOTE(review): tz-aware news timestamps are bucketed by their UTC calendar day,
# which can differ from the local trading day for late-evening items — confirm this is intended.
prices_df['date'] = _parse_dates_to_utc_day(prices_df['date'])
news_df['date'] = _parse_dates_to_utc_day(news_df['date'])

# Report and drop rows that cannot take part in the join. pandas matches null
# keys against each other in a merge, so NaT dates must not reach pd.merge.
bad_news = news_df['date'].isna() | news_df['stock'].isna()
bad_prices = prices_df['date'].isna()
print('Rows dropped for unparseable date / missing ticker: news =', int(bad_news.sum()), 'prices =', int(bad_prices.sum()))
if bad_news.any():
    news_df = news_df.loc[~bad_news].reset_index(drop=True)
if bad_prices.any():
    prices_df = prices_df.loc[~bad_prices].reset_index(drop=True)
if news_df.empty or prices_df.empty:
    raise ValueError('No usable rows left after date/ticker cleaning — inspect the raw date and stock columns.')

prices_df['stock'] = prices_df['stock'].astype(str).str.strip()
news_df['stock'] = news_df['stock'].astype(str).str.strip()

# Quick debug print
print('Price sample dates/ticker:', prices_df['date'].iloc[0], prices_df['stock'].iloc[0])
print('News sample dates/ticker :', news_df['date'].iloc[0], news_df['stock'].iloc[0])

# Merge on date + stock
final_df = pd.merge(news_df, prices_df, on=['date','stock'], how='inner')
print('Merged final_df shape =', final_df.shape)
if final_df.empty:
    print('WARNING: Merged dataframe is empty — check date ranges and ticker naming.')
else:
    display(final_df.head())

Loaded: prices rows = 29677722 news rows = 1407328
Price sample dates/ticker: 2023-12-28 00:00:00 A
News sample dates/ticker : 2020-06-05 00:00:00 A
Merged final_df shape = (33898, 12)


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock,volume,open,high,low,close,adj close
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05,A,2844700.0,92.129997,93.040001,90.089996,90.379997,88.321449
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03,A,1382500.0,90.650002,91.139999,90.260002,90.489998,88.42894
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26,A,3173400.0,86.230003,86.790001,85.639999,86.129997,84.168251
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22,A,5063100.0,85.0,87.669998,84.199997,84.980003,83.044434
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22,A,5063100.0,85.0,87.669998,84.199997,84.980003,83.044434


## 4 — Sentiment extraction (VADER + TextBlob + FinBERT)
This cell defines functions to compute hybrid sentiment per headline. FinBERT will download model weights the first time it's run (internet required).

In [None]:
# Sentiment functions
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import numpy as np

# Rule-based scorer (VADER) from nltk.sentiment
vader = SentimentIntensityAnalyzer()

# Hugging Face hub id of the FinBERT checkpoint
finbert_name = "ProsusAI/finbert"

print('Loading FinBERT tokenizer and model (this may take a few minutes)...')
tokenizer = AutoTokenizer.from_pretrained(finbert_name)

# Pick the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Inference only: move the classifier to the chosen device and switch it to eval mode.
finbert = AutoModelForSequenceClassification.from_pretrained(finbert_name).to(device).eval()
print('FinBERT loaded on', device)

def finbert_sentiment(text, tokenizer=tokenizer, model=finbert, device=device):
    """Score one text with a sequence-classification model as P(positive) - P(negative).

    Parameters
    ----------
    text : str
        Text to score. Non-string or blank input yields 0.0.
    tokenizer, model, device
        Tokenizer, classifier and torch device to use; default to the
        module-level FinBERT objects.

    Returns
    -------
    float
        Difference of two softmax probabilities, hence within [-1, 1].
    """
    if not isinstance(text, str) or text.strip() == '':
        return 0.0
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=256).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.softmax(logits, dim=1).cpu().numpy()[0]

    # Resolve the class indices from the model's own label map instead of
    # assuming a fixed order, so a different `model` argument is scored
    # correctly. Falls back to the original hard-coded 0 / 1 when the config
    # does not name both a 'positive' and a 'negative' label.
    raw_label_map = getattr(getattr(model, 'config', None), 'label2id', None) or {}
    label_map = {str(label).lower(): int(idx) for label, idx in raw_label_map.items()}
    if 'positive' in label_map and 'negative' in label_map:
        pos_idx, neg_idx = label_map['positive'], label_map['negative']
    else:
        pos_idx, neg_idx = 0, 1

    # map to scalar: positive - negative
    return float(probs[pos_idx] - probs[neg_idx])

def hybrid_sentiment(text, weights=(0.2, 0.2, 0.6)):
    """Blend VADER, TextBlob and FinBERT scores into one weighted sentiment value.

    Parameters
    ----------
    text : str
        Headline / text to score. Non-string or blank input returns 0.0
        (previously a non-string such as NaN raised inside TextBlob, while
        the VADER and FinBERT paths already tolerated it).
    weights : tuple of three floats, optional
        Weights applied to (VADER compound, TextBlob polarity, FinBERT score).
        Defaults to the original 0.2 / 0.2 / 0.6 mix.

    Returns
    -------
    float
        ``w_vader * vader + w_textblob * textblob + w_finbert * finbert``.
    """
    if not isinstance(text, str) or not text.strip():
        return 0.0
    w_vader, w_textblob, w_finbert = weights
    # VADER is best-effort: any failure counts as a neutral score.
    try:
        v = vader.polarity_scores(text)['compound']
    except Exception:
        v = 0.0
    t = TextBlob(text).sentiment.polarity
    f = finbert_sentiment(text)
    return w_vader*v + w_textblob*t + w_finbert*f

# Smoke test on a single headline (comment out for large runs)
print(
    'Demo hybrid sentiment:',
    hybrid_sentiment('Company reports record profits and beats expectations'),
)

Loading FinBERT tokenizer and model (this may take a few minutes)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

FinBERT loaded on cpu
Demo hybrid sentiment: 0.630218421535492


## 5 — Aggregate sentiments per day, engineer features, and prepare dataset
This cell computes a sentiment score per headline, aggregates the scores per stock and day, merges them with the OHLCV data, calculates moving averages and volatility, and adds the next-day close as the prediction target.

In [None]:
# Compute sentiment per headline (this may take time on large datasets)
import numpy as np
import tqdm
import tqdm.auto  # notebook-aware progress bar; avoids flooding the cell output with one line per update

if 'final_df' not in globals():
    raise RuntimeError('final_df not found — run previous merge cell first.')

# Only keep necessary columns to save memory; adjust column names as needed
text_col = next(
    (c for c in ['headline','title','news_headline','text','body'] if c in final_df.columns),
    None,
)
if text_col is None:
    raise KeyError('No headline/title column found in merged data. Columns: ' + ','.join(final_df.columns))

print('Using text column:', text_col)

# To speed up, you may sample or limit the dataset during testing
sample_frac = 1.0  # set to 0.1 for quick debug
df_text = final_df[['date','stock', text_col]].copy()
df_text = df_text.rename(columns={text_col:'headline'})
if sample_frac < 1.0:
    df_text = df_text.sample(frac=sample_frac, random_state=42)

# Score each distinct headline once and map the result back onto the rows.
# Identical headlines may appear on several rows; re-running the model for
# each copy would give the same score at many times the cost.
headlines = df_text['headline'].astype(str)
unique_headlines = headlines.unique().tolist()
score_by_headline = {
    h: hybrid_sentiment(h)
    for h in tqdm.auto.tqdm(unique_headlines, desc='Sentiment')
}
df_text['sentiment'] = headlines.map(score_by_headline)

# Aggregate to daily per stock
agg = df_text.groupby(['date','stock']).agg(
    sent_mean=('sentiment','mean'),
    sent_sum=('sentiment','sum'),
    sent_count=('sentiment','count')
).reset_index()

# Prepare price features (use OHLCV columns — adapt names if different)
prices = prices_df.copy()
# Ensure date normalization (in case)
prices['date'] = pd.to_datetime(prices['date']).dt.normalize()

# We'll compute daily features per stock and then merge; days without news get neutral sentiment
merged = pd.merge(prices, agg, on=['date','stock'], how='left')
merged['sent_mean'] = merged['sent_mean'].fillna(0.0)
merged['sent_sum'] = merged['sent_sum'].fillna(0.0)
merged['sent_count'] = merged['sent_count'].fillna(0)

# Sort and compute indicators per stock
merged = merged.sort_values(['stock','date']).reset_index(drop=True)
close_by_stock = merged.groupby('stock')['close']
# A previous close of 0 makes pct_change produce +/-inf; treat those as missing
# so they are dropped below instead of leaking into the rolling volatility.
merged['return'] = close_by_stock.pct_change().replace([np.inf, -np.inf], np.nan)
merged['ma5'] = close_by_stock.transform(lambda x: x.rolling(5).mean())
merged['ma10'] = close_by_stock.transform(lambda x: x.rolling(10).mean())
merged['volatility'] = merged.groupby('stock')['return'].transform(lambda x: x.rolling(5).std())

# Target = next row's close within the same stock. Computed BEFORE rows are
# dropped so that removing an incomplete row cannot make the target skip ahead.
merged['target'] = close_by_stock.shift(-1)

# Drop rows that are incomplete in a column the model actually uses. A blanket
# dropna() would also discard rows over NaNs in unrelated columns.
required_cols = [
    c for c in [
        'date','stock','open','high','low','close','volume',
        'sent_mean','sent_sum','sent_count',
        'return','ma5','ma10','volatility','target',
    ]
    if c in merged.columns
]
merged = merged.dropna(subset=required_cols).reset_index(drop=True)

print('Prepared merged feature dataframe shape:', merged.shape)
display(merged.head())

Using text column: headline


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Sentiment:  80%|████████  | 27285/33898 [1:20:57<20:40,  5.33it/s][A
Sentiment:  80%|████████  | 27286/33898 [1:20:57<22:20,  4.93it/s][A
Sentiment:  80%|████████  | 27287/33898 [1:20:58<24:38,  4.47it/s][A
Sentiment:  81%|████████  | 27288/33898 [1:20:58<25:15,  4.36it/s][A
Sentiment:  81%|████████  | 27289/33898 [1:20:58<23:27,  4.70it/s][A
Sentiment:  81%|████████  | 27290/33898 [1:20:59<41:07,  2.68it/s][A
Sentiment:  81%|████████  | 27291/33898 [1:20:59<34:06,  3.23it/s][A
Sentiment:  81%|████████  | 27292/33898 [1:20:59<29:59,  3.67it/s][A
Sentiment:  81%|████████  | 27293/33898 [1:20:59<26:40,  4.13it/s][A
Sentiment:  81%|████████  | 27294/33898 [1:20:59<23:19,  4.72it/s][A
Sentiment:  81%|████████  | 27295/33898 [1:21:00<30:31,  3.61it/s][A
Sentiment:  81%|████████  | 27296/33898 [1:21:00<32:00,  3.44it/s][A
Sentiment:  81%|████████  | 27297/33898 [1:21:00<31:37,  3.48it/s][A
Sentiment:  81%|████████ 

In [None]:
# Persist the engineered feature table so a Colab runtime reset does not force recomputing it
merged.to_csv(path_or_buf="merged_features_backup.csv", index=False)
print("💾 Saved merged_features_backup.csv")

## 6 — Create sequences, scale features, and split into train/val/test per stock
This builds PyTorch datasets for training. You can adjust sequence length and batch size.

In [None]:
# ---------------------------------------------
# STEP 6 — restore 'merged' if needed, scale, build windows, split per stock
# ---------------------------------------------

import os
import numpy as np
import pandas as pd
import torch
from numpy.lib.stride_tricks import sliding_window_view
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset

# Try restore merged if runtime lost variables
if 'merged' not in globals():
    print("❌ merged is not in memory — Colab probably reset RAM.")

    if os.path.exists("merged_features_backup.csv"):
        print("🔄 Loading merged_features_backup.csv ...")
        merged = pd.read_csv("merged_features_backup.csv")

        # Convert 'date' back to datetime and normalize again
        if 'date' in merged.columns:
            merged['date'] = pd.to_datetime(merged['date'], errors='coerce').dt.normalize()

        print("✅ merged successfully restored from backup CSV!")
    else:
        raise RuntimeError(
            "❌ merged is not in memory AND merged_features_backup.csv not found.\n"
            "→ RE-RUN Step 5 BEFORE running Step 6."
        )

print("Step 6 started — merged dataframe exists.")
print("merged shape:", merged.shape)
print("merged columns:", list(merged.columns))

# ---------------------------------------------
# Configuration
# ---------------------------------------------
seq_len = 30          # number of consecutive rows per input window
batch_size = 64
train_frac = 0.8      # first 80% of each stock's history -> train
val_end_frac = 0.9    # next 10% -> validation, final 10% -> test

# ---------------------------------------------
# Feature columns
# ---------------------------------------------
feature_cols = [
    'open','high','low','close','volume',
    'sent_mean','sent_sum','sent_count',
    'return','ma5','ma10','volatility'
]
scaled_cols = feature_cols + ['target']

# Column validation
missing = [c for c in feature_cols if c not in merged.columns]
if missing:
    raise KeyError(
        f"❌ Missing feature columns: {missing}\n"
        f"Available columns: {list(merged.columns)}"
    )
missing_keys = [c for c in ['target', 'stock', 'date'] if c not in merged.columns]
if missing_keys:
    raise KeyError(
        f"❌ Missing required columns: {missing_keys}\n"
        f"Available columns: {list(merged.columns)}"
    )

print("All required feature columns found ✓")

# ---------------------------------------------
# SCALE FEATURES (fit on training rows only)
# ---------------------------------------------
# Rows are ordered chronologically within each stock so that "first 80%" means
# "oldest 80%". Rows without a ticker cannot be assigned to a series.
merged_scaled = (
    merged.dropna(subset=['stock'])
    .sort_values(['stock', 'date'], kind='mergesort')
    .reset_index(drop=True)
)
rows_per_stock = merged_scaled.groupby('stock')['stock'].transform('size')
row_in_stock = merged_scaled.groupby('stock').cumcount()
is_train_row = row_in_stock < (rows_per_stock * train_frac).astype(int)
if not is_train_row.any():
    raise RuntimeError("No training rows available — every stock has too few rows to split.")

# Fitting the scaler on the training portion only keeps validation/test value
# ranges from leaking into the model's inputs. Column order (feature_cols, then
# 'target') is what later code relies on when un-scaling.
scaler = MinMaxScaler()
scaler.fit(merged_scaled.loc[is_train_row, scaled_cols])
merged_scaled[scaled_cols] = scaler.transform(merged_scaled[scaled_cols])

print("Scaling complete ✓")

# ---------------------------------------------
# CREATE SEQUENCES + PER-STOCK CHRONOLOGICAL SPLIT
# ---------------------------------------------
# 'target' on row t already holds the NEXT row's close, so the label for the
# window covering rows [j, j + seq_len - 1] is target[j + seq_len - 1].
# (Indexing target[j + seq_len] would predict two rows ahead.)
# NOTE: every window is materialized, i.e. roughly seq_len * len(feature_cols)
# float32 values per input row; restrict `merged` to fewer stocks if memory is tight.
X_parts = {'train': [], 'val': [], 'test': []}
y_parts = {'train': [], 'val': [], 'test': []}

# groupby visits each stock once (the rows of a group keep their sorted order),
# instead of re-scanning the whole frame with a boolean mask per ticker.
for stock, df_s in merged_scaled.groupby('stock', sort=False):
    n_rows = len(df_s)
    if n_rows < seq_len:
        continue  # not enough history for a single window

    X_vals = df_s[feature_cols].to_numpy(dtype='float32')
    y_vals = df_s['target'].to_numpy(dtype='float32')

    # (n_windows, n_features, seq_len) view -> (n_windows, seq_len, n_features)
    windows = sliding_window_view(X_vals, seq_len, axis=0).transpose(0, 2, 1)
    targets = y_vals[seq_len - 1:]
    n_windows = n_rows - seq_len + 1

    # A window belongs to the split that contains its LAST row (the row whose
    # target it predicts). Window j ends on row j + seq_len - 1.
    stock_train_end = int(n_rows * train_frac)
    stock_val_end = int(n_rows * val_end_frac)
    k_train = min(max(stock_train_end - seq_len + 1, 0), n_windows)
    k_val = min(max(stock_val_end - seq_len + 1, k_train), n_windows)

    bounds = {'train': (0, k_train), 'val': (k_train, k_val), 'test': (k_val, n_windows)}
    for split_name, (lo, hi) in bounds.items():
        if hi > lo:
            X_parts[split_name].append(windows[lo:hi])
            y_parts[split_name].append(targets[lo:hi])

n_train = sum(len(p) for p in y_parts['train'])
n_val = sum(len(p) for p in y_parts['val'])
n_test = sum(len(p) for p in y_parts['test'])
if n_train == 0:
    raise RuntimeError(
        f"No training sequences could be built — need stocks with well over seq_len={seq_len} rows."
    )
if n_val == 0 or n_test == 0:
    print(f"WARNING: empty split (val={n_val}, test={n_test}); stocks may have too little history.")

# One concatenation in [train | val | test] order; the split arrays below are
# views into X / y, so no second copy of the data is made.
X = np.concatenate(X_parts['train'] + X_parts['val'] + X_parts['test'])
y = np.concatenate(y_parts['train'] + y_parts['val'] + y_parts['test'])

print("Sequence creation complete ✓")
print("X shape:", X.shape, "  y shape:", y.shape)

# ---------------------------------------------
# TRAIN / VAL / TEST SPLIT
# ---------------------------------------------
n = len(X)
train_end = n_train
val_end = n_train + n_val

X_train, y_train = X[:train_end], y[:train_end]
X_val, y_val = X[train_end:val_end], y[train_end:val_end]
X_test, y_test = X[val_end:], y[val_end:]
print("train/val/test sequences:", len(X_train), len(X_val), len(X_test))

# Training batches are shuffled (with a fixed seed for reproducibility) so that
# a batch is not made of consecutive windows from a single stock; evaluation
# loaders keep their order.
loader_rng = torch.Generator().manual_seed(42)
train_loader = DataLoader(TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train)),
                          batch_size=batch_size, shuffle=True, generator=loader_rng)
val_loader = DataLoader(TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val)),
                        batch_size=batch_size, shuffle=False)
test_loader = DataLoader(TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test)),
                         batch_size=batch_size, shuffle=False)

print("Data loaders ready ✓")

## 7 — Hybrid LSTM model (price sequence + sentiment branch) and training loop

In [None]:
import torch.nn as nn
import torch.optim as optim
import math

class HybridModel(nn.Module):
    """Two-branch regressor over a feature sequence.

    One branch runs an LSTM over the whole window and keeps its output at the
    final time step; the other pushes the final time step's raw feature vector
    through a small MLP. Both are concatenated and mapped to one scalar per sample.

    Parameters
    ----------
    num_features : int
        Size of the feature vector at each time step.
    lstm_hidden : int, optional
        Hidden size of the LSTM (default 64).
    """

    def __init__(self, num_features, lstm_hidden=64):
        super().__init__()
        branch_width = 32   # output width of the last-step MLP
        head_width = 64     # hidden width of the regression head

        self.lstm = nn.LSTM(
            input_size=num_features,
            hidden_size=lstm_hidden,
            batch_first=True,
        )
        # "sentiment" branch: receives the full feature vector of the last time step
        self.sent_fc = nn.Sequential(
            nn.Linear(num_features, branch_width),
            nn.ReLU(),
            nn.Dropout(0.2),
        )
        self.final = nn.Sequential(
            nn.Linear(lstm_hidden + branch_width, head_width),
            nn.ReLU(),
            nn.Linear(head_width, 1),
        )

    def forward(self, x):
        """Return one prediction per sample for ``x`` indexed as (batch, time, feature)."""
        final_step = x[:, -1, :]
        sequence_summary = self.lstm(x)[0][:, -1, :]
        branch_out = self.sent_fc(final_step)
        fused = torch.cat((sequence_summary, branch_out), dim=1)
        return self.final(fused).squeeze(1)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = HybridModel(num_features=len(feature_cols), lstm_hidden=64).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

# Fail early with a clear message instead of a ZeroDivisionError when averaging losses.
if len(train_loader) == 0 or len(val_loader) == 0:
    raise RuntimeError('train_loader / val_loader is empty — re-run Step 6 and check the split sizes.')

# Training loop with early stopping
best_val = float('inf')
patience = 0
max_patience = 5
n_epochs = 25
checkpoint_path = 'best_hybrid_model.pth'
checkpoint_saved = False  # True once THIS run has written a checkpoint

for epoch in range(n_epochs):
    model.train()
    train_sq_err, train_count = 0.0, 0
    for xb, yb in train_loader:
        xb = xb.to(device).float()
        yb = yb.to(device).float()
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # criterion returns the batch mean; weight it by batch size so a short
        # final batch does not count as much as a full one.
        train_sq_err += loss.item() * len(yb)
        train_count += len(yb)
    # Validation
    model.eval()
    val_sq_err, val_count = 0.0, 0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(device).float()
            yb = yb.to(device).float()
            preds = model(xb)
            loss = criterion(preds, yb)
            val_sq_err += loss.item() * len(yb)
            val_count += len(yb)
    train_loss = train_sq_err / train_count
    val_loss = val_sq_err / val_count
    val_rmse = math.sqrt(val_loss)
    print(f'Epoch {epoch+1}/{n_epochs} Train MSE: {train_loss:.6f} Val RMSE: {val_rmse:.6f}')
    # early stopping (a NaN val_rmse never compares as smaller, so it counts as "no improvement")
    if val_rmse < best_val:
        best_val = val_rmse
        torch.save(model.state_dict(), checkpoint_path)
        checkpoint_saved = True
        patience = 0
    else:
        patience += 1
        if patience >= max_patience:
            print('Early stopping triggered.')
            break

# Load best model — only if this run produced one; otherwise a stale file from
# an earlier run (or no file at all) would be picked up.
if checkpoint_saved:
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
else:
    print('WARNING: validation loss never improved (NaN losses?); keeping last-epoch weights.')

## 8 — Evaluate on test set and plot predictions vs actuals

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import joblib

model.eval()
close_idx = feature_cols.index('close')
true_batches, pred_batches, last_close_batches = [], [], []
with torch.no_grad():
    for xb, yb in test_loader:
        pred_batches.append(model(xb.to(device).float()).cpu().numpy())
        true_batches.append(yb.numpy())
        # scaled close on the final time step of each window, used as the
        # reference point for the direction metric below
        last_close_batches.append(xb[:, -1, close_idx].numpy())

if not true_batches:
    raise RuntimeError('Test set is empty — re-run Step 6 and check the split sizes.')

y_true = np.concatenate(true_batches).astype('float64')
y_pred = np.concatenate(pred_batches).astype('float64')
last_close = np.concatenate(last_close_batches).astype('float64')

# Errors in scaled units (what the model was trained on)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)


def _unscale(scaled_values, col_idx):
    """Invert MinMaxScaler for one column (forward transform is X * scale_ + min_)."""
    return (scaled_values - scaler.min_[col_idx]) / scaler.scale_[col_idx]


# The scaler was fit on feature_cols + ['target'], so 'target' is the last column.
target_idx = len(feature_cols)
true_price = _unscale(y_true, target_idx)
pred_price = _unscale(y_pred, target_idx)
last_price = _unscale(last_close, close_idx)
rmse_price = np.sqrt(mean_squared_error(true_price, pred_price))
mae_price = mean_absolute_error(true_price, pred_price)

# Directional accuracy per sample: did the model predict the right sign of the
# move relative to the last close in its own input window? Unlike differencing
# consecutive test samples, this does not depend on sample order and never
# compares values that belong to different stocks.
direction = (np.sign(pred_price - last_price) == np.sign(true_price - last_price)).mean() * 100
print(f'Test RMSE: {rmse:.4f}, MAE: {mae:.4f}, Directional Acc: {direction:.2f}%')
print(f'Test RMSE (price units): {rmse_price:.4f}, MAE (price units): {mae_price:.4f}')

# Plot a subset
fig, ax = plt.subplots(figsize=(12,5))
ax.plot(y_true[:500], label='Actual')
ax.plot(y_pred[:500], label='Predicted')
ax.set_xlabel('Test sample')
ax.set_ylabel('Scaled target')
ax.legend()
ax.set_title('Predicted vs Actual (test subset)')
plt.show()

# Save model and scaler
torch.save(model.state_dict(), 'hybrid_stock_model.pth')
joblib.dump(scaler, 'scaler.save')
print('Saved: hybrid_stock_model.pth, scaler.save')