#### Load the Crypto News Dataset

In [1]:
import pandas as pd

# Read the headline CSV (expected columns: date, headline) and have pandas
# parse the 'date' column into datetimes at load time.
news_df = pd.read_csv(
    "../data/bitcoin_news_headlines.csv",
    parse_dates=["date"],
)

# Preview the first rows as a quick sanity check of the load.
news_df.head()


Unnamed: 0,date,headline
0,2025-04-29,Ethereum’s ‘capitulation’ suggests ETH price i...
1,2025-04-28,Arizona state approves first ever US Bitcoin r...
2,2025-04-27,Bitcoin Continues To Flow Out Of Major Exchang...
3,2025-04-26,Swiss National Bank Rejects Bitcoin Reserve Pr...
4,2025-04-26,XRP price prediction as world’s first spot ETF...


#### Load the FinBERT Sentiment Pipeline

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Hugging Face checkpoint used for both the classifier weights and its tokenizer.
model_name = "yiyanghkust/finbert-tone"

# Load the sequence-classification model and the matching tokenizer from the
# same checkpoint so that token ids line up with the model's vocabulary.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap both in a sentiment-analysis pipeline that accepts raw strings.
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)


  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps:0


#### Classify Sentiment for Each Headline

In [3]:
# Classify every headline in one pipeline call; each result is read below
# through its 'label' key.
headlines = news_df['headline'].tolist()
results = sentiment_pipeline(headlines)

# Numeric encoding of the (lower-cased) FinBERT labels.
label_to_score = dict(positive=1, neutral=0, negative=-1)

# Translate each predicted label into its score. Labels are lower-cased before
# the lookup; an unexpected label raises KeyError rather than being skipped.
sentiment_scores = []
for prediction in results:
    normalized_label = prediction['label'].lower()
    sentiment_scores.append(label_to_score[normalized_label])

# Store the per-headline score alongside the headline itself.
news_df['sentiment'] = sentiment_scores


#### Aggregate Sentiment per Day

In [4]:
# Average the per-headline scores for each calendar date, naming the result
# column 'daily_sentiment' and turning the date index back into a column.
daily_sentiment_df = (
    news_df
    .groupby('date')['sentiment']
    .mean()
    .rename('daily_sentiment')
    .reset_index()
)

# Persist the daily series (without the positional index) for later cells.
daily_sentiment_df.to_csv("../data/daily_sentiment.csv", index=False)

# Preview the aggregated result.
daily_sentiment_df.head()


Unnamed: 0,date,daily_sentiment
0,2024-12-31,0.2
1,2025-01-01,0.5
2,2025-01-03,0.0
3,2025-01-04,0.0
4,2025-01-05,0.0


#### Load Both Datasets and Merge Them on Date

In [5]:
# Load the BTC feature table and the daily sentiment series, parsing each
# file's date column ('Date' and 'date' respectively) into datetimes.
features_df = pd.read_csv("../data/features_btc.csv", parse_dates=['Date'])
sentiment_df = pd.read_csv("../data/daily_sentiment.csv", parse_dates=['date'])

# Merge sentiment into features dataset.
# validate="many_to_one" makes pandas raise MergeError if the sentiment file
# contains the same date more than once; without it, such duplicates would
# silently duplicate feature rows in the output.
features_with_sentiment = features_df.merge(
    sentiment_df,
    left_on='Date',
    right_on='date',
    how='left',  # Keeps all rows from features_df, even if no sentiment
    validate='many_to_one',
)

# Drop redundant 'date' column (the join key is already present as 'Date').
features_with_sentiment = features_with_sentiment.drop(columns=['date'])

# Days with no matching sentiment row come out of the left join as NaN; fill
# them with 0. NOTE(review): after this step "no headlines that day" cannot be
# told apart from a genuinely neutral average of 0 — confirm that this is
# acceptable for downstream use, or leave NaN instead.
features_with_sentiment['daily_sentiment'] = features_with_sentiment['daily_sentiment'].fillna(0)

# Save the enriched feature table and preview its last rows.
features_with_sentiment.to_csv("../data/features_btc_with_sentiment.csv", index=False)
features_with_sentiment.tail()


Unnamed: 0,Date,Open,High,Low,Close,Volume,hash-rate,difficulty,output-volume,volatility,ma_7,ma_30,close_lag_1,close_lag_3,hashrate_lag_1,daily_sentiment
1789,2025-04-03,82487.476562,83909.296875,81282.101562,83102.828125,36852112080,859547300.0,113757500000000.0,669511.875273,2627.195312,83227.41183,84484.832031,82485.710938,82548.914062,972645600.0,-0.666667
1790,2025-04-04,83100.25,84696.148438,81670.75,83843.804688,45157640207,1017885000.0,113757500000000.0,585001.785377,3025.398438,83154.648438,84258.840104,83102.828125,85169.171875,859547300.0,0.0
1791,2025-04-05,83844.703125,84207.015625,82377.734375,83504.796875,14380803631,948981300.0,116403900000000.0,361328.652989,1829.28125,83284.25,84043.609115,83843.804688,82485.710938,1017885000.0,1.0
1792,2025-04-06,83504.507812,83704.71875,77097.742188,78214.484375,36294853736,875826800.0,121507800000000.0,439315.961134,6606.976562,82695.672991,83759.336198,83504.796875,83102.828125,948981300.0,0.0
1793,2025-04-07,78221.335938,81119.0625,74436.679688,79235.335938,91262424987,875826800.0,121507800000000.0,792091.204219,6682.382812,82222.304688,83528.694271,78214.484375,83843.804688,875826800.0,0.0
