# Sentiment Analysis for BTC/USD News

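This notebook fetches BTC-related news and uses VADER sentiment scores to determine which articles require additional LLM (GPT-style) analysis. The selected articles are summarized by an instruction-tuned LLM, every text field is then scored with FinBERT, and the scores are aggregated into an hourly time series.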

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never commit an access token to the notebook: read it from the HF_TOKEN
# environment variable, and fall back to an interactive (non-echoing) prompt.
ID = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(ID)

# Fetch Dataset (From 2021 to 2023)



In [None]:
import kagglehub
import pandas as pd

# Download the Kaggle dataset and read its CSV into a pandas DataFrame
path = kagglehub.dataset_download("oliviervha/crypto-news")
csv_path = f"{path}/cryptonews.csv"
df = pd.read_csv(csv_path)

# Filter down to the columns used by the rest of the notebook and preview them
columns_to_keep = ['date', 'title', 'text']
df = df[columns_to_keep]
df.head()

Downloading from https://www.kaggle.com/api/v1/datasets/download/oliviervha/crypto-news?dataset_version_number=10...


100%|██████████| 3.99M/3.99M [00:00<00:00, 5.98MB/s]

Extracting files...





Unnamed: 0,date,title,text
0,2023-12-19 06:40:41,Grayscale CEO Calls for Simultaneous Approval ...,Grayscale CEO Michael Sonnenshein believes the...
1,2023-12-19 06:03:24,Indian Government is Actively Collaborating Wi...,"In an exclusive interview with CryptoNews, Man..."
2,2023-12-19 05:55:14,Judge Approves Settlement: Binance to Pay $1.5...,According to the Federal Court ruling on Decem...
3,2023-12-19 05:35:26,Why a gold rush for inscriptions has broken ha...,Some suggest EVM inscriptions are the latest w...
4,2023-12-19 05:31:08,‘Concerning precedent’ — bloXroute Labs' MEV r...,A decision by bloXroute Labs to start censorin...


# VADER Sentiment Analyzer

In [None]:
!pip install nltk
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer



[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
vader = SentimentIntensityAnalyzer()

# Score the headline and the body separately, keeping VADER's 'compound' value
# in a '<column>_vader' column next to each source column.
for source_col in ('title', 'text'):
    df[f'{source_col}_vader'] = df[source_col].apply(
        lambda doc: vader.polarity_scores(doc)['compound']
    )

# Select Articles for LLM Analysis

In [None]:
import numpy as np

# Define condition to select which article to analyze
def require_analysis(score, mode = "hybrid", threshold = 0.1):
  """Decide whether a sentiment score marks an article for further analysis.

  Args:
    score: Sentiment score (scalar, NumPy array, or pandas Series). Only its
      absolute value is used.
    mode: Selection rule.
      "unclear" - select when |score| < threshold.
      "extreme" - select when |score| > threshold.
      "hybrid"  - select when |score| < threshold/2 or |score| > 1 - threshold/2.
      "all"     - always select.
    threshold: Cut-off used by the rule above.

  Returns:
    A boolean (element-wise for array-like input). "all" returns the plain
    scalar True regardless of the input shape.

  Raises:
    ValueError: If ``mode`` is not one of the supported names. Previously an
      unknown mode fell through and returned None, which silently behaved
      like "never select".
  """
  magnitude = np.abs(score)

  if mode == "unclear":
    return magnitude < threshold

  if mode == "extreme":
    return magnitude > threshold

  if mode == "hybrid":
    half = threshold / 2
    # `|` instead of `or` so array-like scores are handled element-wise;
    # for scalars the result is the same boolean as before.
    return (magnitude < half) | (magnitude > 1 - half)

  if mode == "all":
    return True

  raise ValueError(
    f"Unknown mode {mode!r}; expected 'unclear', 'extreme', 'hybrid' or 'all'"
  )

# Flag an article when either its headline score or its body score
# satisfies the selection rule.
title_flag = df["title_vader"].apply(require_analysis)
text_flag = df["text_vader"].apply(require_analysis)
df["require_analysis"] = title_flag | text_flag

In [None]:
# Report how many articles were flagged, relative to the size of the dataset
total_articles = len(df)
flagged_articles = df["require_analysis"].sum()

print(f"Number of entries requiring analysis: {flagged_articles} out of {total_articles}")

Number of entries requiring analysis: 18338 out of 31037


# LLM Analysis for Hidden Implications

In [None]:
!pip install -q transformers accelerate bitsandbytes
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from tqdm import tqdm

# Prepare Model
from transformers import BitsAndBytesConfig

model_name = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights 4-bit quantized. The bare `load_in_4bit=True` keyword is
# deprecated in transformers; the supported form is a BitsAndBytesConfig passed
# via `quantization_config`.
# `trust_remote_code=True` is intentionally not passed: it permits executing
# code shipped in the model repository and is not needed for an architecture
# that transformers implements itself.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

# Text-generation pipeline reused by the summarization cell below
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Define prompt template
def generate_prompt(title, text):
    """Build the LLM prompt for one article.

    The prompt consists of a [TITLE] section, a [TEXT] section and a closing
    instruction, separated by blank lines.

    Args:
        title: Article headline, interpolated as-is.
        text: Article body, interpolated as-is.

    Returns:
        The assembled prompt string.
    """
    instruction = (
        "Summarize the potential short-term impact of this news on the Bitcoin/USD market."
    )
    sections = [f"[TITLE]\n{title}", f"[TEXT]\n{text}", instruction]
    return "\n\n".join(sections)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
n = 100

# Filter and limit articles that require analysis
to_analyze = df[df["require_analysis"]].head(n).copy()

# Generate prompt (plain iteration also works when no article was selected,
# unlike a row-wise DataFrame.apply on an empty frame)
to_analyze["prompt"] = [
    generate_prompt(title, text)
    for title, text in zip(to_analyze["title"], to_analyze["text"])
]

# Summarize using LLM model
batch_size = 8
prompts = to_analyze["prompt"].tolist()
summaries = []

for i in tqdm(range(0, len(prompts), batch_size), desc="Generating summaries"):
    batch_prompts = prompts[i:i + batch_size]
    responses = pipe(
        batch_prompts,
        max_new_tokens=200,
        # Greedy decoding; `temperature` only applies when sampling, so it is
        # not passed here.
        do_sample=False,
        # Ask the pipeline for the continuation only. Splitting the full text on
        # the instruction's last words ("Bitcoin/USD market.") cut summaries
        # short whenever the model repeated that phrase in its answer.
        return_full_text=False,
        # Set explicitly to avoid the per-call "Setting `pad_token_id`" notice.
        pad_token_id=tokenizer.eos_token_id,
    )

    for response in responses:
        # Each prompt yields either a dict or a one-element list of dicts
        if isinstance(response, list):
            response = response[0]
        summaries.append(response["generated_text"].strip())

# Assign summary back
to_analyze["btc_summary"] = summaries

# Integrate into original dataframe; rows that were not analyzed stay None
df["btc_summary"] = None  # initialize
df.loc[to_analyze.index, "btc_summary"] = to_analyze["btc_summary"].values

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating summaries:   8%|▊         | 1/13 [01:44<20:51, 104.26s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for 

In [None]:
# Persist the current frame (scores, selection flag and LLM summaries) as CSV,
# without writing the index as a column.
df.to_csv("processed_news.csv", index=False)

# Colab-specific helper used to hand the saved file to the browser for download.
from google.colab import files
files.download("processed_news.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Preview the first rows, now including the btc_summary column
df.head()

Unnamed: 0,date,title,text,title_vader,text_vader,require_analysis,btc_summary
0,2023-12-19 06:40:41,Grayscale CEO Calls for Simultaneous Approval ...,Grayscale CEO Michael Sonnenshein believes the...,0.4767,0.0,True,[ANALYSIS]\nThe approval of spot Bitcoin ETFs ...
1,2023-12-19 06:03:24,Indian Government is Actively Collaborating Wi...,"In an exclusive interview with CryptoNews, Man...",0.3182,0.4019,False,
2,2023-12-19 05:55:14,Judge Approves Settlement: Binance to Pay $1.5...,According to the Federal Court ruling on Decem...,0.4019,-0.2023,False,
3,2023-12-19 05:35:26,Why a gold rush for inscriptions has broken ha...,Some suggest EVM inscriptions are the latest w...,-0.4767,-0.128,False,
4,2023-12-19 05:31:08,‘Concerning precedent’ — bloXroute Labs' MEV r...,A decision by bloXroute Labs to start censorin...,-0.5574,-0.4939,False,


# Final Sentiment Analysis: FinBERT

In [None]:
# Reload the CSV written by the LLM stage so this stage starts from the saved file.
# NOTE(review): rows without a summary appear to come back as NaN after the
# round-trip (empty btc_summary cells in the preview) - confirm.
df = pd.read_csv("processed_news.csv")
df.head()

Unnamed: 0,date,title,text,title_vader,text_vader,require_analysis,btc_summary
0,2023-12-19 06:40:41,Grayscale CEO Calls for Simultaneous Approval ...,Grayscale CEO Michael Sonnenshein believes the...,0.4767,0.0,True,[ANALYSIS]\nThe approval of spot Bitcoin ETFs ...
1,2023-12-19 06:03:24,Indian Government is Actively Collaborating Wi...,"In an exclusive interview with CryptoNews, Man...",0.3182,0.4019,False,
2,2023-12-19 05:55:14,Judge Approves Settlement: Binance to Pay $1.5...,According to the Federal Court ruling on Decem...,0.4019,-0.2023,False,
3,2023-12-19 05:35:26,Why a gold rush for inscriptions has broken ha...,Some suggest EVM inscriptions are the latest w...,-0.4767,-0.128,False,
4,2023-12-19 05:31:08,‘Concerning precedent’ — bloXroute Labs' MEV r...,A decision by bloXroute Labs to start censorin...,-0.5574,-0.4939,False,


In [None]:
# Label-to-score mapping: sign applied to the classifier confidence
bert_score_map = dict(positive=1, neutral=0, negative=-1)

# Load FinBERT model as a sentiment-analysis pipeline
finbert = pipeline(task="sentiment-analysis", model="ProsusAI/finbert")

# Define a reusable function to get weighted scores in batch
def batch_weighted_scores(texts, fill_value=0.0, batch_size=16):
    """Score a Series of texts with FinBERT as signed confidences.

    Each non-missing entry is classified by the module-level ``finbert``
    pipeline; the label is mapped to a sign through ``bert_score_map`` and
    multiplied by the reported confidence score.

    Args:
        texts: pandas Series of texts; missing entries (None/NaN) are skipped.
        fill_value: Score assigned to missing entries. Defaults to 0.0, which
            is the same value a neutral prediction produces; pass
            ``float('nan')`` to keep missing entries distinguishable.
        batch_size: Batch size forwarded to the pipeline.

    Returns:
        A float Series with the same index as ``texts``.
    """
    # Start from the fill value so missing rows need no second pass
    full_result = pd.Series(fill_value, index=texts.index, dtype=float)

    # Drop NaNs
    mask = texts.notnull()
    if not mask.any():
        # Nothing to classify (e.g. no summaries generated): skip the model
        # call entirely instead of invoking the pipeline on an empty batch.
        return full_result

    # Cast to str so stray non-string values cannot reach the tokenizer
    clean_texts = texts[mask].astype(str).tolist()

    # Run FinBERT in batch
    results = finbert(clean_texts, batch_size=batch_size, truncation=True)

    # Extract weighted scores and write them back by position, so the
    # assignment does not depend on the index labels being unique
    full_result.loc[mask.to_numpy()] = [
        bert_score_map[r['label'].lower()] * r['score'] for r in results
    ]

    return full_result

# Score every text field with FinBERT and store the result in its own column
for source_col, score_col in (
    ('title', 'title_bert_weighted'),
    ('text', 'text_bert_weighted'),
    ('btc_summary', 'summary_bert_weighted'),
):
    df[score_col] = batch_weighted_scores(df[source_col])

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# Save the per-article scores (index not written) and hand the file to the
# browser; `files` is the google.colab helper imported in an earlier cell.
df.to_csv("article_score.csv", index=False)
files.download("article_score.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Converting into Time Series Data for Downstream Analysis

In [None]:
import pandas as pd

# Convert 'date' to datetime, drop unused text columns.
# The work is done on a derived frame so `df` itself is left untouched and this
# cell can be re-run (re-indexing `df` in place made a second run fail because
# the 'date' column was gone).
scores_by_time = (
    df.assign(date=pd.to_datetime(df['date'], format='mixed', errors='coerce'))
      # errors='coerce' turns unparseable dates into NaT; such rows cannot be
      # placed in an hourly bin, so drop them explicitly
      .dropna(subset=['date'])
      .set_index('date')
      .drop(columns=['title', 'text', 'btc_summary'], errors='ignore')  # in case still present
)

# Convert into hourly index ('h' replaces the deprecated 'H' alias)
hourly_df = scores_by_time.resample('h').mean()

# Convert index into column; hours without any article become 0
hourly_df = hourly_df.reset_index().fillna(0)

hourly_df.head()

  hourly_df = df.resample('H').mean()


Unnamed: 0,date,title_vader,text_vader,require_analysis,title_bert_weighted,text_bert_weighted,summary_bert_weighted
0,2021-10-12 20:00:00,0.0,0.0,1.0,0.0,0.869037,0.0
1,2021-10-12 21:00:00,0.0,0.0,0.0,0.0,0.0,0.0
2,2021-10-12 22:00:00,0.0,0.0,0.0,0.0,0.0,0.0
3,2021-10-12 23:00:00,0.0,0.0,0.0,0.0,0.0,0.0
4,2021-10-13 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Save the hourly aggregate (index not written) and hand the file to the
# browser; `files` is the google.colab helper imported in an earlier cell.
hourly_df.to_csv("hourly_sentiment.csv", index=False)
files.download("hourly_sentiment.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>