In [18]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import re
import string
from sentence_transformers import SentenceTransformer
import nltk



In [19]:
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(f"Using device: {device}")


Using device: mps


In [24]:
filepath_num = 'data/stock_data.csv'
filepath_sent = 'data/annotated_sp500_news_2024_AAPL.csv'

data_num = pd.read_csv(filepath_num)
data_sent = pd.read_csv(filepath_sent)

In [25]:
data_sent.head()

Unnamed: 0,date,title,content,sentiment_score
0,06/01/2024,Barron's Weekend Stock Picks: Abercrombie & Fi...,Benzinga reviews this weekend's top stories co...,0.01
1,06/01/2024,"What That Famous Investing Quote About Bulls, ...",Motley Fool co-founder David Gardner responds ...,-0.42
2,07/01/2024,"Apple's Busy Week: Tech Standoff, Legal Battle...","It was an eventful week for Apple Inc. AAPL, w...",0.02
3,07/01/2024,Stocks Decline And Fed Rate Cuts Imperiled To ...,"After stellar gains of 107% in 2023, the Magni...",-0.43
4,07/01/2024,"The Last Time Apple Spent This Much Money, It ...",Spending on research and development has nearl...,0.37


In [26]:
data_num.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,Dec 30 2024,252.23,253.5,250.75,252.2,251.59,35557500
1,Dec 27 2024,257.83,258.7,253.06,255.59,254.97,42355300
2,Dec 26 2024,258.19,260.1,257.63,259.02,258.4,27237100
3,Dec 24 2024,255.49,258.21,255.29,258.2,257.58,23234700
4,Dec 23 2024,254.77,255.65,253.45,255.27,254.66,40858800


In [27]:
# Check if 'Unnamed: 0' exists in the columns and drop it
if 'Unnamed: 0' in data_sent.columns:
    data_sent = data_sent.drop(columns=['Unnamed: 0'])
data_sent.rename(columns={'Time': 'Date'}, inplace=True)
data_sent.head()

Unnamed: 0,date,title,content,sentiment_score
0,06/01/2024,Barron's Weekend Stock Picks: Abercrombie & Fi...,Benzinga reviews this weekend's top stories co...,0.01
1,06/01/2024,"What That Famous Investing Quote About Bulls, ...",Motley Fool co-founder David Gardner responds ...,-0.42
2,07/01/2024,"Apple's Busy Week: Tech Standoff, Legal Battle...","It was an eventful week for Apple Inc. AAPL, w...",0.02
3,07/01/2024,Stocks Decline And Fed Rate Cuts Imperiled To ...,"After stellar gains of 107% in 2023, the Magni...",-0.43
4,07/01/2024,"The Last Time Apple Spent This Much Money, It ...",Spending on research and development has nearl...,0.37


In [29]:
if 'date' in data_num.columns:
    data_num.rename(columns={'date':'Date'}, inplace=True)
if 'date' in data_sent.columns:
    data_sent.rename(columns={'date':'Date'}, inplace=True)


In [31]:
data_num['Date'] = pd.to_datetime(data_num['Date'])  
# sentiment dates are in dd/mm/YYYY format, so force dayfirst
data_sent['Date']  = pd.to_datetime(data_sent['Date'], format='%d/%m/%Y', dayfirst=True)


In [32]:
# 3) Aggregate sentiment per date
agg_sent = (
    data_sent
    .groupby('Date', as_index=False)['sentiment_score']
    .sum()
    .rename(columns={'sentiment_score':'aggregate_sentiment_score'})
)

# 4) Merge on Date (inner join will only keep dates present in both)
merged = pd.merge(
    data_num,
    agg_sent,
    on='Date',
    how='inner'
)

In [33]:
# 5) Select the columns you need
final_data = merged[['Date','Open','Close','High','Volume','aggregate_sentiment_score']]

# 6) Quick sanity‐check
print(final_data.shape)
print(final_data.head())

(242, 6)
        Date    Open   Close    High    Volume  aggregate_sentiment_score
0 2024-12-30  252.23  252.20  253.50  35557500                       1.68
1 2024-12-27  257.83  255.59  258.70  42355300                       3.10
2 2024-12-26  258.19  259.02  260.10  27237100                       1.97
3 2024-12-24  255.49  258.20  258.21  23234700                       3.55
4 2024-12-23  254.77  255.27  255.65  40858800                       1.80


In [35]:
# Calculate next-day return
final_data['Movement'] = ((final_data['Open'].shift(-1) - final_data['Close']) / data_num['Close'])

# Shift Movement to align with current day
final_data['Movement'] = final_data['Movement'].shift(1)

# Drop the last row because there's no next day to compare
final_data = final_data.dropna(subset=['Movement'])

# Convert Movement into binary (1 for up if Movement > 0, else 0)
final_data['Movement'] = (final_data['Movement'] > 0).astype(int)

# Calculate daily returns (in percentage)
final_data['Daily_Return'] = final_data['Close'].pct_change() * 100

# Choose a rolling window (e.g., 10 days) for volatility
window_size = 5
final_data['Volatility'] = final_data['Daily_Return'].rolling(window=window_size).std()

# Drop the initial NaNs from rolling calculation
final_data = final_data.dropna(subset=['Volatility'])

# Define a rolling window size for sentiment volatility
sentiment_window_size = 5  # You can adjust the window size as needed

# Calculate rolling standard deviation of the aggregate sentiment score
final_data['sentiment_volatility'] = final_data['aggregate_sentiment_score'].rolling(window=sentiment_window_size).std()

# Create lagged sentiment volatility to avoid data leakage
final_data['sentiment_volatility_lag1'] = final_data['sentiment_volatility'].shift(1)

# Create lagged sentiment volatility to avoid data leakage
final_data['aggregate_sentiment_score_lag1'] = final_data['aggregate_sentiment_score'].shift(1)

# Drop rows with NaN values introduced by rolling and shifting
final_data = final_data.dropna(subset=['sentiment_volatility_lag1', 'aggregate_sentiment_score_lag1'])

# Create lagged features to avoid data leakage
final_data['Close_lag1'] = final_data['Close'].shift(1)
final_data['High_lag1'] = final_data['High'].shift(1)
final_data['Volume_lag1'] = final_data['Volume'].shift(1)
final_data['Daily_Return_lag1'] = final_data['Daily_Return'].shift(1)
final_data['Volatility_lag1'] = final_data['Volatility'].shift(1)

# Drop rows with NaN values introduced by shifting
final_data = final_data.dropna(subset=['Close_lag1', 'High_lag1', 'Volume_lag1', 'Daily_Return_lag1', 'Volatility_lag1', 'sentiment_volatility_lag1'])


final_data.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['Movement'] = ((final_data['Open'].shift(-1) - final_data['Close']) / data_num['Close'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['Movement'] = final_data['Movement'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['Movement'] = (final_data['Movement']

Unnamed: 0,Date,Open,Close,High,Volume,aggregate_sentiment_score,Movement,Daily_Return,Volatility,sentiment_volatility,sentiment_volatility_lag1,aggregate_sentiment_score_lag1,Close_lag1,High_lag1,Volume_lag1,Daily_Return_lag1,Volatility_lag1
12,2024-12-11,247.96,246.49,250.8,45205500,3.4,0,-0.592838,1.355352,0.873453,0.855032,3.35,247.96,248.74,32777700.0,-0.068512,1.365181
13,2024-12-10,246.89,247.77,248.21,36914800,1.16,1,0.519291,0.683943,0.921754,0.873453,3.4,246.49,250.8,45205500.0,-0.592838,1.355352
14,2024-12-09,241.83,246.75,247.24,44649200,2.13,0,-0.411672,0.62293,0.939058,0.921754,1.16,247.77,248.21,36914800.0,0.519291,0.683943
15,2024-12-06,242.91,242.84,244.63,36870000,1.63,0,-1.5846,0.773264,1.01046,0.939058,2.13,246.75,247.24,44649200.0,-0.411672,0.62293
16,2024-12-05,243.99,243.04,244.54,40033900,0.17,1,0.082359,0.793463,1.195019,1.01046,1.63,242.84,244.63,36870000.0,-1.5846,0.773264
17,2024-12-04,242.87,243.01,244.11,44383900,1.55,0,-0.012344,0.800175,0.733635,1.195019,0.17,243.04,244.54,40033900.0,0.082359,0.793463
18,2024-12-03,239.81,242.65,242.76,38861000,2.54,0,-0.148142,0.679832,0.896203,0.733635,1.55,243.01,244.11,44383900.0,-0.012344,0.800175
19,2024-12-02,237.27,239.59,240.79,48137100,2.35,0,-1.261076,0.777886,0.933124,0.896203,2.54,242.65,242.76,38861000.0,-0.148142,0.679832
20,2024-11-29,234.81,237.33,237.81,28481400,0.46,0,-0.943278,0.605604,1.074723,0.933124,2.35,239.59,240.79,48137100.0,-1.261076,0.777886
21,2024-11-27,234.47,234.93,235.69,33498400,0.5,0,-1.01125,0.557946,0.985672,1.074723,0.46,237.33,237.81,28481400.0,-0.943278,0.605604


In [36]:
final_data.to_csv('data/merged_AAPL.csv')

In [38]:
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(f"Using device: {device}")


Using device: mps


In [39]:
filepath_num = 'data/stock_data.csv'
filepath_sent = 'data/annotated_sp500_news_2024_AAPL.csv'

data_num = pd.read_csv(filepath_num)
data_sent = pd.read_csv(filepath_sent)

In [40]:
data_sent.head()

Unnamed: 0,date,title,content,sentiment_score
0,06/01/2024,Barron's Weekend Stock Picks: Abercrombie & Fi...,Benzinga reviews this weekend's top stories co...,0.01
1,06/01/2024,"What That Famous Investing Quote About Bulls, ...",Motley Fool co-founder David Gardner responds ...,-0.42
2,07/01/2024,"Apple's Busy Week: Tech Standoff, Legal Battle...","It was an eventful week for Apple Inc. AAPL, w...",0.02
3,07/01/2024,Stocks Decline And Fed Rate Cuts Imperiled To ...,"After stellar gains of 107% in 2023, the Magni...",-0.43
4,07/01/2024,"The Last Time Apple Spent This Much Money, It ...",Spending on research and development has nearl...,0.37


In [42]:
import re
import nltk

# Download the sentence tokenizer only once
nltk.download("punkt_tab")  

from nltk.tokenize import sent_tokenize

# ----------------------------------------
# 1. Minimal cleaning function (unchanged)
# ----------------------------------------
def minimal_clean_text(text: str, to_lowercase: bool = False) -> str:
    """
    Minimal cleaning for transformer-based models:
    1. Remove HTML tags.
    2. Remove/replacing weird characters if needed.
    3. (Optional) Convert to lowercase if you're using an uncased model.
    """
    # 1. Remove HTML tags:
    text = re.sub(r'<.*?>', '', text)

    # 2. Remove or replace unwanted characters (non-ASCII -> space here):
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # 3. (Optional) Lowercasing:
    if to_lowercase:
        text = text.lower()

    # Remove excessive whitespace
    text = " ".join(text.split())

    return text

# --------------------------------------------
# 2. Helper function: remove duplicate sentences
# --------------------------------------------
def remove_duplicate_sentences(text: str) -> str:
    """
    Splits the text into sentences (using NLTK), 
    then removes exact duplicate sentences. 
    Joins back with a space.
    """
    sentences = sent_tokenize(text)
    seen = set()
    unique_sentences = []
    
    for s in sentences:
        s = s.strip()
        if s not in seen:
            unique_sentences.append(s)
            seen.add(s)
            
    return " ".join(unique_sentences)

# ---------------------------------------------
# 3. Clean the content only (exclude the Title)
# ---------------------------------------------
data_sent['cleaned_content'] = data_sent['content'].apply(
    lambda x: minimal_clean_text(x, to_lowercase=False)
)

# ---------------------------------------------
# 4. Remove duplicate sentences from each entry
# ---------------------------------------------
data_sent['cleaned_content'] = data_sent['cleaned_content'].apply(remove_duplicate_sentences)

# (The group-by and merging part has been removed)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/skhanna/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [43]:
data_sent = data_sent.rename(columns={'date': 'Date'})

In [44]:
# Step 1: Parse day-first
data_sent['Date'] = pd.to_datetime(data_sent['Date'], dayfirst=True)
# 2. Sort by date in ascending order
data_sent.sort_values(by='Date', inplace=True)
data_sent.head()


Unnamed: 0,Date,title,content,sentiment_score,cleaned_content
0,2024-01-06,Barron's Weekend Stock Picks: Abercrombie & Fi...,Benzinga reviews this weekend's top stories co...,0.01,Benzinga reviews this weekend's top stories co...
1,2024-01-06,"What That Famous Investing Quote About Bulls, ...",Motley Fool co-founder David Gardner responds ...,-0.42,Motley Fool co-founder David Gardner responds ...
2,2024-01-07,"Apple's Busy Week: Tech Standoff, Legal Battle...","It was an eventful week for Apple Inc. AAPL, w...",0.02,"It was an eventful week for Apple Inc. AAPL, w..."
3,2024-01-07,Stocks Decline And Fed Rate Cuts Imperiled To ...,"After stellar gains of 107% in 2023, the Magni...",-0.43,"After stellar gains of 107% in 2023, the Magni..."
4,2024-01-07,"The Last Time Apple Spent This Much Money, It ...",Spending on research and development has nearl...,0.37,Spending on research and development has nearl...


In [45]:
if 'Unnamed: 0' in data_sent.columns:
    data_sent = data_sent.drop(columns=['Unnamed: 0'])
data_sent.head()

Unnamed: 0,Date,title,content,sentiment_score,cleaned_content
0,2024-01-06,Barron's Weekend Stock Picks: Abercrombie & Fi...,Benzinga reviews this weekend's top stories co...,0.01,Benzinga reviews this weekend's top stories co...
1,2024-01-06,"What That Famous Investing Quote About Bulls, ...",Motley Fool co-founder David Gardner responds ...,-0.42,Motley Fool co-founder David Gardner responds ...
2,2024-01-07,"Apple's Busy Week: Tech Standoff, Legal Battle...","It was an eventful week for Apple Inc. AAPL, w...",0.02,"It was an eventful week for Apple Inc. AAPL, w..."
3,2024-01-07,Stocks Decline And Fed Rate Cuts Imperiled To ...,"After stellar gains of 107% in 2023, the Magni...",-0.43,"After stellar gains of 107% in 2023, the Magni..."
4,2024-01-07,"The Last Time Apple Spent This Much Money, It ...",Spending on research and development has nearl...,0.37,Spending on research and development has nearl...


In [46]:
if 'Unnamed: 0' in data_num.columns:
    data_num = data_num.drop(columns=['Unnamed: 0'])
data_num.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,Dec 30 2024,252.23,253.5,250.75,252.2,251.59,35557500
1,Dec 27 2024,257.83,258.7,253.06,255.59,254.97,42355300
2,Dec 26 2024,258.19,260.1,257.63,259.02,258.4,27237100
3,Dec 24 2024,255.49,258.21,255.29,258.2,257.58,23234700
4,Dec 23 2024,254.77,255.65,253.45,255.27,254.66,40858800


In [47]:
# Merge on the 'date' column, keeping only dates present in both dataframes
data_num['Date'] = pd.to_datetime(data_num['Date'])
merged_df = pd.merge(data_num, data_sent, on='Date', how='inner')

merged_df.head()


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,title,content,sentiment_score,cleaned_content
0,2024-12-30,252.23,253.5,250.75,252.2,251.59,35557500,"Dow Jones 2024 Scoreboard: Top 5 Winners, Lose...",The Dow Jones Industrial Average hit an all-ti...,-0.03,The Dow Jones Industrial Average hit an all-ti...
1,2024-12-30,252.23,253.5,250.75,252.2,251.59,35557500,Wall Street Stories of 2024 Likely to Stay Hot...,"Here, we discuss some top Wall Street events o...",0.07,"Here, we discuss some top Wall Street events o..."
2,2024-12-30,252.23,253.5,250.75,252.2,251.59,35557500,Here's Why You Should Stay Away From RCI Stock...,Roger Communications is suffering due to inten...,-0.03,Roger Communications is suffering due to inten...
3,2024-12-30,252.23,253.5,250.75,252.2,251.59,35557500,Industry Comparison: Evaluating Apple Against ...,In the fast-paced and highly competitive busin...,0.01,In the fast-paced and highly competitive busin...
4,2024-12-30,252.23,253.5,250.75,252.2,251.59,35557500,Apple's Chinese Rival Huawei Slashes Prices Fo...,Huawei's market share gains jumped by 42% in Q...,-0.39,Huawei's market share gains jumped by 42% in Q...


In [48]:
merged_df.value_counts

<bound method DataFrame.value_counts of            Date    Open   High     Low   Close  Adj Close    Volume  \
0    2024-12-30  252.23  253.5  250.75  252.20     251.59  35557500   
1    2024-12-30  252.23  253.5  250.75  252.20     251.59  35557500   
2    2024-12-30  252.23  253.5  250.75  252.20     251.59  35557500   
3    2024-12-30  252.23  253.5  250.75  252.20     251.59  35557500   
4    2024-12-30  252.23  253.5  250.75  252.20     251.59  35557500   
...         ...     ...    ...     ...     ...        ...       ...   
6046 2024-01-08  182.09  185.6  181.50  185.56     184.21  59144500   
6047 2024-01-08  182.09  185.6  181.50  185.56     184.21  59144500   
6048 2024-01-08  182.09  185.6  181.50  185.56     184.21  59144500   
6049 2024-01-08  182.09  185.6  181.50  185.56     184.21  59144500   
6050 2024-01-08  182.09  185.6  181.50  185.56     184.21  59144500   

                                                  title  \
0     Dow Jones 2024 Scoreboard: Top 5 Winners, 

In [49]:
merged_df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'title',
       'content', 'sentiment_score', 'cleaned_content'],
      dtype='object')

In [50]:
merged_df.to_csv('data/multimodal_AAPL_all.csv')