In [1]:
import pandas as pd

# Load tweets
df = pd.read_csv("datasets/Bitcoin_tweets.csv", parse_dates=['date'])

df['date'] = pd.to_datetime(df['date'], errors='coerce')
print(df['date'].dtype)  # should be datetime64[ns]

print(df['date'].isna().sum())
print(df['date'].min(), df['date'].max())

datetime64[ns]
0
2021-02-05 10:52:04 2021-03-12 23:59:14


In [2]:
# Keep only the specified columns if they exist in the dataframe
df_filtered = df.copy()      
columns_to_keep = ['user_created', 'user_followers', 'date', 'text']
df_filtered = df_filtered[columns_to_keep]
filters = {
    'user_created': lambda df: df['user_created'] < df['date'] - pd.Timedelta(days=30),
    'user_followers': lambda df: df['user_followers'] > 1000
}

# Apply filters
for col, condition in filters.items():
    df_filtered = df_filtered[condition(df_filtered)]
    
df.to_csv("datasets/filtered_tweets.csv", index=False)
df_filtered

Unnamed: 0,user_created,user_followers,date,text
0,2009-04-26 20:05:09,8534,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after #b...
1,2019-10-17 20:12:10,6769,2021-02-10 23:58:48,"😎 Today, that's this #Thursday, we will do a ""..."
4,2016-02-03 13:15:55,1249,2021-02-10 23:54:06,This network is secured by 9 508 nodes as of t...
7,2018-08-03 21:30:08,4052,2021-02-10 23:52:42,🔄 Prices update in $EUR (1 hour):\n\n$BTC - ...
9,2009-04-26 20:05:09,8534,2021-02-10 23:52:08,.@Tesla’s #bitcoin investment is revolutionary...
...,...,...,...,...
48559,2018-05-25 21:11:08,3255,2021-03-11 22:38:25,🔄 Prices update in $USD (1 hour):\n\n$BTC - 57...
48566,2017-01-01 03:36:19,1891,2021-03-11 22:35:29,#Bitcoin reversed yesterday's weakness to fini...
48571,2018-05-13 04:06:51,2415,2021-03-11 22:32:42,Goldman Sachs Evaluates #Bitcoin as Client Dem...
48575,2012-11-03 22:31:45,12014,2021-03-11 22:31:36,#Bitcoin Big Double pay again!!! Once it hit $...


In [3]:
import re
df_clean = df_filtered.copy()

def clean_tweet(text):
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"@\w+", "", text)     # remove mentions
    text = re.sub(r"#", "", text)        # remove hashtag symbol
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text

df_clean['clean_text'] = df_clean['text'].astype(str).apply(clean_tweet)
df_clean.drop(columns=['text'], inplace=True)  # drop original text column
df_clean.to_csv("datasets/cleaned_tweets.csv", index=False)
df_clean

Unnamed: 0,user_created,user_followers,date,clean_text
0,2009-04-26 20:05:09,8534,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after bi...
1,2019-10-17 20:12:10,6769,2021-02-10 23:58:48,"😎 Today, that's this Thursday, we will do a ""🎬..."
4,2016-02-03 13:15:55,1249,2021-02-10 23:54:06,This network is secured by 9 508 nodes as of t...
7,2018-08-03 21:30:08,4052,2021-02-10 23:52:42,🔄 Prices update in $EUR (1 hour): $BTC - 37082...
9,2009-04-26 20:05:09,8534,2021-02-10 23:52:08,.’s bitcoin investment is revolutionary for cr...
...,...,...,...,...
48559,2018-05-25 21:11:08,3255,2021-03-11 22:38:25,🔄 Prices update in $USD (1 hour): $BTC - 57800...
48566,2017-01-01 03:36:19,1891,2021-03-11 22:35:29,Bitcoin reversed yesterday's weakness to finis...
48571,2018-05-13 04:06:51,2415,2021-03-11 22:32:42,Goldman Sachs Evaluates Bitcoin as Client Dema...
48575,2012-11-03 22:31:45,12014,2021-03-11 22:31:36,Bitcoin Big Double pay again!!! Once it hit $5...


In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

# Load FinBERT
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Run on a sample
df_clean['sentiment'] = df_clean['clean_text'].apply(lambda x: finbert(x[:512])[0]['label'])  # truncate to 512 tokens

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


In [5]:
df_sentiment = df_clean.copy()
df_sentiment.to_csv("datasets/sentiment_tweets.csv", index=False)
df_sentiment

Unnamed: 0,user_created,user_followers,date,clean_text,sentiment
0,2009-04-26 20:05:09,8534,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after bi...,Negative
1,2019-10-17 20:12:10,6769,2021-02-10 23:58:48,"😎 Today, that's this Thursday, we will do a ""🎬...",Neutral
4,2016-02-03 13:15:55,1249,2021-02-10 23:54:06,This network is secured by 9 508 nodes as of t...,Negative
7,2018-08-03 21:30:08,4052,2021-02-10 23:52:42,🔄 Prices update in $EUR (1 hour): $BTC - 37082...,Neutral
9,2009-04-26 20:05:09,8534,2021-02-10 23:52:08,.’s bitcoin investment is revolutionary for cr...,Neutral
...,...,...,...,...,...
48559,2018-05-25 21:11:08,3255,2021-03-11 22:38:25,🔄 Prices update in $USD (1 hour): $BTC - 57800...,Neutral
48566,2017-01-01 03:36:19,1891,2021-03-11 22:35:29,Bitcoin reversed yesterday's weakness to finis...,Positive
48571,2018-05-13 04:06:51,2415,2021-03-11 22:32:42,Goldman Sachs Evaluates Bitcoin as Client Dema...,Positive
48575,2012-11-03 22:31:45,12014,2021-03-11 22:31:36,Bitcoin Big Double pay again!!! Once it hit $5...,Positive


In [17]:
df_sentiment = pd.read_csv("datasets/sentiment_tweets.csv", parse_dates=['date'])
sentiment_score = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
df_sentiment['sentiment_score'] = df_sentiment['sentiment'].map(sentiment_score)

# Aggregate sentiment per day
daily_sentiment = df_sentiment.groupby(df_sentiment['date'].dt.floor('d'))['sentiment_score'].mean().reset_index()
daily_sentiment.columns = ['date', 'avg_sentiment']

daily_sentiment

Unnamed: 0,date,avg_sentiment
0,2021-02-05,0.072808
1,2021-02-06,0.08121
2,2021-02-07,0.084731
3,2021-02-08,0.098643
4,2021-02-09,0.110154
5,2021-02-10,0.088905
6,2021-02-13,0.059687
7,2021-02-14,0.129588
8,2021-02-15,0.1639
9,2021-02-18,0.107724


In [47]:
import yfinance as yf

# Load BTC data from yfinance
btc = yf.download("BTC-USD", start="2021-02-05", end="2021-03-12")
btc = btc[['Open', 'Close']].reset_index()
btc.columns = ['date', 'btc_open', 'btc_close']
# Merge sentiment with price
btc['date'] = pd.to_datetime(btc['date'])  # Convert 'date' column to datetime
merged = pd.merge(daily_sentiment, btc, on='date', how='inner')

# Calculate daily return
merged['btc_return'] = merged['btc_close'].pct_change()

[*********************100%***********************]  1 of 1 completed


In [48]:
merged

Unnamed: 0,date,avg_sentiment,btc_open,btc_close,btc_return
0,2021-02-05,0.072808,36931.546875,38144.308594,
1,2021-02-06,0.08121,38138.386719,39266.011719,0.029407
2,2021-02-07,0.084731,39250.191406,38903.441406,-0.009234
3,2021-02-08,0.098643,38886.828125,46196.464844,0.187465
4,2021-02-09,0.110154,46184.992188,46481.105469,0.006162
5,2021-02-10,0.088905,46469.761719,44918.183594,-0.033625
6,2021-02-13,0.059687,47491.203125,47105.515625,0.048696
7,2021-02-14,0.129588,47114.507812,48717.289062,0.034216
8,2021-02-15,0.1639,48696.535156,47945.058594,-0.015851
9,2021-02-18,0.107724,52140.972656,51679.796875,0.077896


In [54]:
import yfinance as yf
import pandas as pd
import plotly.graph_objects as go


# Create the candlestick chart
fig = go.Figure(data=[go.Candlestick(
    x=merged['date'],
    open=merged['btc_open'],
    close=merged['btc_close'],
    name='BTC Price',
    increasing_line_color='green', decreasing_line_color='red',
)])

# Add sentiment line (you can adjust this for how you want sentiment displayed)
fig.add_trace(go.Scatter(
    x=merged['date'],
    y=merged['avg_sentiment'].apply(lambda x: 1 if x == 'POSITIVE' else (-1 if x == 'NEGATIVE' else 0)),  # Map sentiment to numerical values for plotting
    mode='markers',
    name='Sentiment',
    marker=dict(color='blue', size=5)
))

# Layout updates for better visibility
fig.update_layout(
    title="Bitcoin Price and Sentiment Analysis",
    xaxis_title="Date",
    yaxis_title="Price (USD)",
    xaxis_rangeslider_visible=False,
    template="plotly_dark",
)

fig.show()

In [26]:
correlation = merged[['avg_sentiment', 'btc_return']].corr().iloc[0,1]
print(f"Correlation between sentiment and return: {correlation:.4f}")

KeyError: "['avg_sentiment'] not in index"

In [None]:
# Keep only the specified columns if they exist in the dataframe
df_filtered = df.copy()      
columns_to_keep = ['user_created', 'user_followers', 'date', 'text']
df_filtered = df_filtered[columns_to_keep]
filters = {
    'user_created': lambda df: df['user_created'] < df['date'] - pd.Timedelta(days=30),
    'user_followers': lambda df: df['user_followers'] > 1000
}

# Apply filters
for col, condition in filters.items():
    df_filtered = df_filtered[condition(df_filtered)]
    
df.to_csv("datasets/filtered_tweets.csv", index=False)
df_filtered

In [None]:
import re
df_clean = df_filtered.copy()

def clean_tweet(text):
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"@\w+", "", text)     # remove mentions
    text = re.sub(r"#", "", text)        # remove hashtag symbol
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text

df_clean['clean_text'] = df_clean['text'].astype(str).apply(clean_tweet)
df_clean.drop(columns=['text'], inplace=True)  # drop original text column
df_clean.to_csv("datasets/cleaned_tweets.csv", index=False)
df_clean

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

# Load FinBERT
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Run on a sample
df_clean['sentiment'] = df_clean['clean_text'].apply(lambda x: finbert(x[:512])[0]['label'])  # truncate to 512 tokens

In [None]:
df_sentiment = df_clean.copy()
df_sentiment.to_csv("datasets/sentiment_tweets.csv", index=False)

In [None]:
sentiment_score = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
df_clean['sentiment_score'] = df_clean['sentiment'].map(sentiment_score)

# Aggregate sentiment per day
hourly_sentiment = df_clean.groupby(df_clean['date'].dt.floor('h'))['sentiment_score'].mean().reset_index()
hourly_sentiment.columns = ['date', 'avg_sentiment']

hourly_sentiment

In [None]:
import yfinance as yf

# Load BTC data from yfinance
btc = yf.download("BTC-USD", start="2021-01-01", end="2021-12-31")
btc = btc[['Close']].reset_index()
btc.columns = ['date', 'btc_close']

# Merge sentiment with price
btc['date'] = pd.to_datetime(btc['date'])  # Convert 'date' column to datetime
merged = pd.merge(hourly_sentiment, btc, on='date', how='inner')

# Calculate daily return
merged['btc_return'] = merged['btc_close'].pct_change()

In [None]:
import yfinance as yf
import pandas as pd
import plotly.graph_objects as go

# Load BTC data from yfinance
btc = yf.download("BTC-USD", start="2021-01-01", end="2021-12-31")
btc = btc[['Open', 'High', 'Low', 'Close']].reset_index()
btc.columns = ['date', 'btc_open', 'btc_high', 'btc_low', 'btc_close']

# Merge sentiment with price using df_clean to include the 'sentiment' column
btc['date'] = pd.to_datetime(btc['date'])  # Convert 'date' column to datetime
merged = pd.merge(df_clean[['date', 'sentiment']], btc, on='date', how='inner')

# Calculate daily return
merged['btc_return'] = merged['btc_close'].pct_change()

# Create the candlestick chart
fig = go.Figure(data=[go.Candlestick(
    x=merged['date'],
    open=merged['btc_open'],
    high=merged['btc_high'],
    low=merged['btc_low'],
    close=merged['btc_close'],
    name='BTC Price',
    increasing_line_color='green', decreasing_line_color='red',
)])

# Add sentiment line (you can adjust this for how you want sentiment displayed)
fig.add_trace(go.Scatter(
    x=merged['date'],
    y=merged['sentiment'].apply(lambda x: 1 if x == 'POSITIVE' else (-1 if x == 'NEGATIVE' else 0)),  # Map sentiment to numerical values for plotting
    mode='markers',
    name='Sentiment',
    marker=dict(color='blue', size=5)
))

# Layout updates for better visibility
fig.update_layout(
    title="Bitcoin Price and Sentiment Analysis",
    xaxis_title="Date",
    yaxis_title="Price (USD)",
    xaxis_rangeslider_visible=False,
    template="plotly_dark",
)

fig.show()

In [None]:
correlation = merged[['avg_sentiment', 'btc_return']].corr().iloc[0,1]
print(f"Correlation between sentiment and return: {correlation:.4f}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def sharpe_ratio(returns):
    return np.mean(returns) / np.std(returns) * np.sqrt(252)

def geometric_mean_return(returns):
    return (np.prod(1 + returns) ** (1 / len(returns))) - 1

def max_drawdown(returns):
    cum_returns = (1 + returns).cumprod()
    peak = cum_returns.cummax()
    drawdown = (cum_returns - peak) / peak
    return drawdown.min()

def evaluate_performance(data):
    print("Sharpe Ratio:", sharpe_ratio(data["Strategy_Return"].dropna()))
    print("Geometric Mean Return:", geometric_mean_return(data["Strategy_Return"].dropna()))
    print("Max Drawdown:", max_drawdown(data["Strategy_Return"].dropna()))
    data[["Return", "Strategy_Return"]].cumsum().plot(title="Cumulative Returns")
    plt.show()

evaluate_performance(btc)


In [None]:
# Keep only the specified columns if they exist in the dataframe
df_filtered = df.copy()      
columns_to_keep = ['user_created', 'user_followers', 'date', 'text']
df_filtered = df_filtered[columns_to_keep]
filters = {
    'user_created': lambda df: df['user_created'] < df['date'] - pd.Timedelta(days=30),
    'user_followers': lambda df: df['user_followers'] > 1000
}

# Apply filters
for col, condition in filters.items():
    df_filtered = df_filtered[condition(df_filtered)]
    
df.to_csv("datasets/filtered_tweets.csv", index=False)
df_filtered

In [None]:
import re
df_clean = df_filtered.copy()

def clean_tweet(text):
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"@\w+", "", text)     # remove mentions
    text = re.sub(r"#", "", text)        # remove hashtag symbol
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text

df_clean['clean_text'] = df_clean['text'].astype(str).apply(clean_tweet)
df_clean.drop(columns=['text'], inplace=True)  # drop original text column
df_clean.to_csv("datasets/cleaned_tweets.csv", index=False)
df_clean

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

# Load FinBERT
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Run on a sample
df_clean['sentiment'] = df_clean['clean_text'].apply(lambda x: finbert(x[:512])[0]['label'])  # truncate to 512 tokens

In [None]:
df_sentiment = df_clean.copy()
df_sentiment.to_csv("datasets/sentiment_tweets.csv", index=False)

In [None]:
sentiment_score = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
df_clean['sentiment_score'] = df_clean['sentiment'].map(sentiment_score)

# Aggregate sentiment per day
hourly_sentiment = df_clean.groupby(df_clean['date'].dt.floor('h'))['sentiment_score'].mean().reset_index()
hourly_sentiment.columns = ['date', 'avg_sentiment']

hourly_sentiment

In [None]:
import yfinance as yf

# Load BTC data from yfinance
btc = yf.download("BTC-USD", start="2021-01-01", end="2021-12-31")
btc = btc[['Close']].reset_index()
btc.columns = ['date', 'btc_close']

# Merge sentiment with price
btc['date'] = pd.to_datetime(btc['date'])  # Convert 'date' column to datetime
merged = pd.merge(hourly_sentiment, btc, on='date', how='inner')

# Calculate daily return
merged['btc_return'] = merged['btc_close'].pct_change()

In [None]:
import yfinance as yf
import pandas as pd
import plotly.graph_objects as go

# Load BTC data from yfinance
btc = yf.download("BTC-USD", start="2021-01-01", end="2021-12-31")
btc = btc[['Open', 'High', 'Low', 'Close']].reset_index()
btc.columns = ['date', 'btc_open', 'btc_high', 'btc_low', 'btc_close']

# Merge sentiment with price using df_clean to include the 'sentiment' column
btc['date'] = pd.to_datetime(btc['date'])  # Convert 'date' column to datetime
merged = pd.merge(df_clean[['date', 'sentiment']], btc, on='date', how='inner')

# Calculate daily return
merged['btc_return'] = merged['btc_close'].pct_change()

# Create the candlestick chart
fig = go.Figure(data=[go.Candlestick(
    x=merged['date'],
    open=merged['btc_open'],
    high=merged['btc_high'],
    low=merged['btc_low'],
    close=merged['btc_close'],
    name='BTC Price',
    increasing_line_color='green', decreasing_line_color='red',
)])

# Add sentiment line (you can adjust this for how you want sentiment displayed)
fig.add_trace(go.Scatter(
    x=merged['date'],
    y=merged['sentiment'].apply(lambda x: 1 if x == 'POSITIVE' else (-1 if x == 'NEGATIVE' else 0)),  # Map sentiment to numerical values for plotting
    mode='markers',
    name='Sentiment',
    marker=dict(color='blue', size=5)
))

# Layout updates for better visibility
fig.update_layout(
    title="Bitcoin Price and Sentiment Analysis",
    xaxis_title="Date",
    yaxis_title="Price (USD)",
    xaxis_rangeslider_visible=False,
    template="plotly_dark",
)

fig.show()

In [None]:
merged

In [None]:
correlation = merged[['avg_sentiment', 'btc_return']].corr().iloc[0,1]
print(f"Correlation between sentiment and return: {correlation:.4f}")

In [None]:
# Keep only the specified columns if they exist in the dataframe
df_filtered = df.copy()      
columns_to_keep = ['user_created', 'user_followers', 'date', 'text']
df_filtered = df_filtered[columns_to_keep]
filters = {
    'user_created': lambda df: df['user_created'] < df['date'] - pd.Timedelta(days=30),
    'user_followers': lambda df: df['user_followers'] > 1000
}

# Apply filters
for col, condition in filters.items():
    df_filtered = df_filtered[condition(df_filtered)]
    
df.to_csv("datasets/filtered_tweets.csv", index=False)
df_filtered

In [None]:
import re
df_clean = df_filtered.copy()

def clean_tweet(text):
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"@\w+", "", text)     # remove mentions
    text = re.sub(r"#", "", text)        # remove hashtag symbol
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text

df_clean['clean_text'] = df_clean['text'].astype(str).apply(clean_tweet)
df_clean.drop(columns=['text'], inplace=True)  # drop original text column
df_clean.to_csv("datasets/cleaned_tweets.csv", index=False)
df_clean

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

# Load FinBERT
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Run on a sample
df_clean['sentiment'] = df_clean['clean_text'].apply(lambda x: finbert(x[:512])[0]['label'])  # truncate to 512 tokens

In [None]:
df_sentiment = df_clean.copy()
df_sentiment.to_csv("datasets/sentiment_tweets.csv", index=False)

In [None]:
sentiment_score = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
df_clean['sentiment_score'] = df_clean['sentiment'].map(sentiment_score)

# Aggregate sentiment per day
hourly_sentiment = df_clean.groupby(df_clean['date'].dt.floor('h'))['sentiment_score'].mean().reset_index()
hourly_sentiment.columns = ['date', 'avg_sentiment']

hourly_sentiment

In [None]:
import yfinance as yf

# Load BTC data from yfinance
btc = yf.download("BTC-USD", start="2021-01-01", end="2021-12-31")
btc = btc[['Close']].reset_index()
btc.columns = ['date', 'btc_close']

# Merge sentiment with price
btc['date'] = pd.to_datetime(btc['date'])  # Convert 'date' column to datetime
merged = pd.merge(hourly_sentiment, btc, on='date', how='inner')

# Calculate daily return
merged['btc_return'] = merged['btc_close'].pct_change()

In [None]:
import yfinance as yf
import pandas as pd
import plotly.graph_objects as go

# Load BTC data from yfinance
btc = yf.download("BTC-USD", start="2021-01-01", end="2021-12-31")
btc = btc[['Open', 'High', 'Low', 'Close']].reset_index()
btc.columns = ['date', 'btc_open', 'btc_high', 'btc_low', 'btc_close']

# Merge sentiment with price using df_clean to include the 'sentiment' column
btc['date'] = pd.to_datetime(btc['date'])  # Convert 'date' column to datetime
merged = pd.merge(df_clean[['date', 'sentiment']], btc, on='date', how='inner')

# Calculate daily return
merged['btc_return'] = merged['btc_close'].pct_change()

# Create the candlestick chart
fig = go.Figure(data=[go.Candlestick(
    x=merged['date'],
    open=merged['btc_open'],
    high=merged['btc_high'],
    low=merged['btc_low'],
    close=merged['btc_close'],
    name='BTC Price',
    increasing_line_color='green', decreasing_line_color='red',
)])

# Add sentiment line (you can adjust this for how you want sentiment displayed)
fig.add_trace(go.Scatter(
    x=merged['date'],
    y=merged['sentiment'].apply(lambda x: 1 if x == 'POSITIVE' else (-1 if x == 'NEGATIVE' else 0)),  # Map sentiment to numerical values for plotting
    mode='markers',
    name='Sentiment',
    marker=dict(color='blue', size=5)
))

# Layout updates for better visibility
fig.update_layout(
    title="Bitcoin Price and Sentiment Analysis",
    xaxis_title="Date",
    yaxis_title="Price (USD)",
    xaxis_rangeslider_visible=False,
    template="plotly_dark",
)

fig.show()

In [None]:
merged

In [None]:
correlation = merged[['avg_sentiment', 'btc_return']].corr().iloc[0,1]
print(f"Correlation between sentiment and return: {correlation:.4f}")

In [None]:
# Keep only the specified columns if they exist in the dataframe
df_filtered = df.copy()      
columns_to_keep = ['user_created', 'user_followers', 'date', 'text']
df_filtered = df_filtered[columns_to_keep]
filters = {
    'user_created': lambda df: df['user_created'] < df['date'] - pd.Timedelta(days=30),
    'user_followers': lambda df: df['user_followers'] > 1000
}

# Apply filters
for col, condition in filters.items():
    df_filtered = df_filtered[condition(df_filtered)]
    
df.to_csv("datasets/filtered_tweets.csv", index=False)
df_filtered

In [None]:
import re
df_clean = df_filtered.copy()

def clean_tweet(text):
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"@\w+", "", text)     # remove mentions
    text = re.sub(r"#", "", text)        # remove hashtag symbol
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text

df_clean['clean_text'] = df_clean['text'].astype(str).apply(clean_tweet)
df_clean.drop(columns=['text'], inplace=True)  # drop original text column
df_clean.to_csv("datasets/cleaned_tweets.csv", index=False)
df_clean

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

# Load FinBERT
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Run on a sample
df_clean['sentiment'] = df_clean['clean_text'].apply(lambda x: finbert(x[:512])[0]['label'])  # truncate to 512 tokens

In [None]:
df_sentiment = df_clean.copy()
df_sentiment.to_csv("datasets/sentiment_tweets.csv", index=False)

In [None]:
sentiment_score = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
df_clean['sentiment_score'] = df_clean['sentiment'].map(sentiment_score)

# Aggregate sentiment per day
hourly_sentiment = df_clean.groupby(df_clean['date'].dt.floor('h'))['sentiment_score'].mean().reset_index()
hourly_sentiment.columns = ['date', 'avg_sentiment']

hourly_sentiment

In [None]:
import yfinance as yf

# Load BTC data from yfinance
btc = yf.download("BTC-USD", start="2021-01-01", end="2021-12-31")
btc = btc[['Close']].reset_index()
btc.columns = ['date', 'btc_close']

# Merge sentiment with price
btc['date'] = pd.to_datetime(btc['date'])  # Convert 'date' column to datetime
merged = pd.merge(hourly_sentiment, btc, on='date', how='inner')

# Calculate daily return
merged['btc_return'] = merged['btc_close'].pct_change()

In [None]:
import yfinance as yf
import pandas as pd
import plotly.graph_objects as go

# Load BTC data from yfinance
btc = yf.download("BTC-USD", start="2021-01-01", end="2021-12-31")
btc = btc[['Open', 'High', 'Low', 'Close']].reset_index()
btc.columns = ['date', 'btc_open', 'btc_high', 'btc_low', 'btc_close']

# Merge sentiment with price using df_clean to include the 'sentiment' column
btc['date'] = pd.to_datetime(btc['date'])  # Convert 'date' column to datetime
merged = pd.merge(df_clean[['date', 'sentiment']], btc, on='date', how='inner')

# Calculate daily return
merged['btc_return'] = merged['btc_close'].pct_change()

# Create the candlestick chart
fig = go.Figure(data=[go.Candlestick(
    x=merged['date'],
    open=merged['btc_open'],
    high=merged['btc_high'],
    low=merged['btc_low'],
    close=merged['btc_close'],
    name='BTC Price',
    increasing_line_color='green', decreasing_line_color='red',
)])

# Add sentiment line (you can adjust this for how you want sentiment displayed)
fig.add_trace(go.Scatter(
    x=merged['date'],
    y=merged['sentiment'].apply(lambda x: 1 if x == 'POSITIVE' else (-1 if x == 'NEGATIVE' else 0)),  # Map sentiment to numerical values for plotting
    mode='markers',
    name='Sentiment',
    marker=dict(color='blue', size=5)
))

# Layout updates for better visibility
fig.update_layout(
    title="Bitcoin Price and Sentiment Analysis",
    xaxis_title="Date",
    yaxis_title="Price (USD)",
    xaxis_rangeslider_visible=False,
    template="plotly_dark",
)

fig.show()

In [None]:
merged

In [None]:
correlation = merged[['avg_sentiment', 'btc_return']].corr().iloc[0,1]
print(f"Correlation between sentiment and return: {correlation:.4f}")

In [None]:
# Keep only the specified columns if they exist in the dataframe
df_filtered = df.copy()      
columns_to_keep = ['user_created', 'user_followers', 'date', 'text']
df_filtered = df_filtered[columns_to_keep]
filters = {
    'user_created': lambda df: df['user_created'] < df['date'] - pd.Timedelta(days=30),
    'user_followers': lambda df: df['user_followers'] > 1000
}

# Apply filters
for col, condition in filters.items():
    df_filtered = df_filtered[condition(df_filtered)]
    
df.to_csv("datasets/filtered_tweets.csv", index=False)
df_filtered

In [None]:
import re
df_clean = df_filtered.copy()

def clean_tweet(text):
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"@\w+", "", text)     # remove mentions
    text = re.sub(r"#", "", text)        # remove hashtag symbol
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text

df_clean['clean_text'] = df_clean['text'].astype(str).apply(clean_tweet)
df_clean.drop(columns=['text'], inplace=True)  # drop original text column
df_clean.to_csv("datasets/cleaned_tweets.csv", index=False)
df_clean

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

# Load FinBERT
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Run on a sample
df_clean['sentiment'] = df_clean['clean_text'].apply(lambda x: finbert(x[:512])[0]['label'])  # truncate to 512 tokens

In [None]:
df_sentiment = df_clean.copy()
df_sentiment.to_csv("datasets/sentiment_tweets.csv", index=False)

In [None]:
sentiment_score = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
df_clean['sentiment_score'] = df_clean['sentiment'].map(sentiment_score)

# Aggregate sentiment per day
hourly_sentiment = df_clean.groupby(df_clean['date'].dt.floor('h'))['sentiment_score'].mean().reset_index()
hourly_sentiment.columns = ['date', 'avg_sentiment']

hourly_sentiment

In [None]:
import yfinance as yf

# Load BTC data from yfinance
btc = yf.download("BTC-USD", start="2021-01-01", end="2021-12-31")
btc = btc[['Close']].reset_index()
btc.columns = ['date', 'btc_close']

# Merge sentiment with price
btc['date'] = pd.to_datetime(btc['date'])  # Convert 'date' column to datetime
merged = pd.merge(hourly_sentiment, btc, on='date', how='inner')

# Calculate daily return
merged['btc_return'] = merged['btc_close'].pct_change()

In [None]:
import yfinance as yf
import pandas as pd
import plotly.graph_objects as go

# Load BTC data from yfinance
btc = yf.download("BTC-USD", start="2021-01-01", end="2021-12-31")
btc = btc[['Open', 'High', 'Low', 'Close']].reset_index()
btc.columns = ['date', 'btc_open', 'btc_high', 'btc_low', 'btc_close']

# Merge sentiment with price using df_clean to include the 'sentiment' column
btc['date'] = pd.to_datetime(btc['date'])  # Convert 'date' column to datetime
merged = pd.merge(df_clean[['date', 'sentiment']], btc, on='date', how='inner')

# Calculate daily return
merged['btc_return'] = merged['btc_close'].pct_change()

# Create the candlestick chart
fig = go.Figure(data=[go.Candlestick(
    x=merged['date'],
    open=merged['btc_open'],
    high=merged['btc_high'],
    low=merged['btc_low'],
    close=merged['btc_close'],
    name='BTC Price',
    increasing_line_color='green', decreasing_line_color='red',
)])

# Add sentiment line (you can adjust this for how you want sentiment displayed)
fig.add_trace(go.Scatter(
    x=merged['date'],
    y=merged['sentiment'].apply(lambda x: 1 if x == 'POSITIVE' else (-1 if x == 'NEGATIVE' else 0)),  # Map sentiment to numerical values for plotting
    mode='markers',
    name='Sentiment',
    marker=dict(color='blue', size=5)
))

# Layout updates for better visibility
fig.update_layout(
    title="Bitcoin Price and Sentiment Analysis",
    xaxis_title="Date",
    yaxis_title="Price (USD)",
    xaxis_rangeslider_visible=False,
    template="plotly_dark",
)

fig.show()

In [None]:
merged

In [None]:
correlation = merged[['avg_sentiment', 'btc_return']].corr().iloc[0,1]
print(f"Correlation between sentiment and return: {correlation:.4f}")

In [None]:
# Keep only the specified columns if they exist in the dataframe
df_filtered = df.copy()      
columns_to_keep = ['user_created', 'user_followers', 'date', 'text']
df_filtered = df_filtered[columns_to_keep]
filters = {
    'user_created': lambda df: df['user_created'] < df['date'] - pd.Timedelta(days=30),
    'user_followers': lambda df: df['user_followers'] > 1000
}

# Apply filters
for col, condition in filters.items():
    df_filtered = df_filtered[condition(df_filtered)]
    
df.to_csv("datasets/filtered_tweets.csv", index=False)
df_filtered

Unnamed: 0,user_created,user_followers,date,text
0,2009-04-26 20:05:09,8534,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after #b...
1,2019-10-17 20:12:10,6769,2021-02-10 23:58:48,"😎 Today, that's this #Thursday, we will do a ""..."
4,2016-02-03 13:15:55,1249,2021-02-10 23:54:06,This network is secured by 9 508 nodes as of t...
7,2018-08-03 21:30:08,4052,2021-02-10 23:52:42,🔄 Prices update in $EUR (1 hour):\n\n$BTC - ...
9,2009-04-26 20:05:09,8534,2021-02-10 23:52:08,.@Tesla’s #bitcoin investment is revolutionary...
...,...,...,...,...
48559,2018-05-25 21:11:08,3255,2021-03-11 22:38:25,🔄 Prices update in $USD (1 hour):\n\n$BTC - 57...
48566,2017-01-01 03:36:19,1891,2021-03-11 22:35:29,#Bitcoin reversed yesterday's weakness to fini...
48571,2018-05-13 04:06:51,2415,2021-03-11 22:32:42,Goldman Sachs Evaluates #Bitcoin as Client Dem...
48575,2012-11-03 22:31:45,12014,2021-03-11 22:31:36,#Bitcoin Big Double pay again!!! Once it hit $...


In [None]:
import re
df_clean = df_filtered.copy()

def clean_tweet(text):
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"@\w+", "", text)     # remove mentions
    text = re.sub(r"#", "", text)        # remove hashtag symbol
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text

df_clean['clean_text'] = df_clean['text'].astype(str).apply(clean_tweet)
df_clean.drop(columns=['text'], inplace=True)  # drop original text column
df_clean.to_csv("datasets/cleaned_tweets.csv", index=False)
df_clean

Unnamed: 0,user_created,user_followers,date,clean_text
0,2009-04-26 20:05:09,8534,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after bi...
1,2019-10-17 20:12:10,6769,2021-02-10 23:58:48,"😎 Today, that's this Thursday, we will do a ""🎬..."
4,2016-02-03 13:15:55,1249,2021-02-10 23:54:06,This network is secured by 9 508 nodes as of t...
7,2018-08-03 21:30:08,4052,2021-02-10 23:52:42,🔄 Prices update in $EUR (1 hour): $BTC - 37082...
9,2009-04-26 20:05:09,8534,2021-02-10 23:52:08,.’s bitcoin investment is revolutionary for cr...
...,...,...,...,...
48559,2018-05-25 21:11:08,3255,2021-03-11 22:38:25,🔄 Prices update in $USD (1 hour): $BTC - 57800...
48566,2017-01-01 03:36:19,1891,2021-03-11 22:35:29,Bitcoin reversed yesterday's weakness to finis...
48571,2018-05-13 04:06:51,2415,2021-03-11 22:32:42,Goldman Sachs Evaluates Bitcoin as Client Dema...
48575,2012-11-03 22:31:45,12014,2021-03-11 22:31:36,Bitcoin Big Double pay again!!! Once it hit $5...


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

# Load FinBERT
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Run on a sample
df_clean['sentiment'] = df_clean['clean_text'].apply(lambda x: finbert(x[:512])[0]['label'])  # truncate to 512 tokens

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


In [None]:
df_sentiment = df_clean.copy()
df_sentiment.to_csv("datasets/sentiment_tweets.csv", index=False)

In [None]:
sentiment_score = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
df_clean['sentiment_score'] = df_clean['sentiment'].map(sentiment_score)

# Aggregate sentiment per day
hourly_sentiment = df_clean.groupby(df_clean['date'].dt.floor('h'))['sentiment_score'].mean().reset_index()
hourly_sentiment.columns = ['date', 'avg_sentiment']

hourly_sentiment

Unnamed: 0,date,avg_sentiment
0,2021-02-05 10:00:00,0.000000
1,2021-02-05 11:00:00,-0.025641
2,2021-02-05 12:00:00,0.125000
3,2021-02-05 13:00:00,0.132075
4,2021-02-05 14:00:00,0.016949
...,...,...
287,2021-03-12 19:00:00,0.080645
288,2021-03-12 20:00:00,0.080000
289,2021-03-12 21:00:00,0.020833
290,2021-03-12 22:00:00,0.000000


In [None]:
import yfinance as yf

# Load BTC data from yfinance
btc = yf.download("BTC-USD", start="2021-01-01", end="2021-12-31")
btc = btc[['Close']].reset_index()
btc.columns = ['date', 'btc_close']

# Merge sentiment with price
btc['date'] = pd.to_datetime(btc['date'])  # Convert 'date' column to datetime
merged = pd.merge(hourly_sentiment, btc, on='date', how='inner')

# Calculate daily return
merged['btc_return'] = merged['btc_close'].pct_change()

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


In [None]:
import yfinance as yf
import pandas as pd
import plotly.graph_objects as go

# Load BTC data from yfinance
btc = yf.download("BTC-USD", start="2021-01-01", end="2021-12-31")
btc = btc[['Open', 'High', 'Low', 'Close']].reset_index()
btc.columns = ['date', 'btc_open', 'btc_high', 'btc_low', 'btc_close']

# Merge sentiment with price using df_clean to include the 'sentiment' column
btc['date'] = pd.to_datetime(btc['date'])  # Convert 'date' column to datetime
merged = pd.merge(df_clean[['date', 'sentiment']], btc, on='date', how='inner')

# Calculate daily return
merged['btc_return'] = merged['btc_close'].pct_change()

# Create the candlestick chart
fig = go.Figure(data=[go.Candlestick(
    x=merged['date'],
    open=merged['btc_open'],
    high=merged['btc_high'],
    low=merged['btc_low'],
    close=merged['btc_close'],
    name='BTC Price',
    increasing_line_color='green', decreasing_line_color='red',
)])

# Add sentiment line (you can adjust this for how you want sentiment displayed)
fig.add_trace(go.Scatter(
    x=merged['date'],
    y=merged['sentiment'].apply(lambda x: 1 if x == 'POSITIVE' else (-1 if x == 'NEGATIVE' else 0)),  # Map sentiment to numerical values for plotting
    mode='markers',
    name='Sentiment',
    marker=dict(color='blue', size=5)
))

# Layout updates for better visibility
fig.update_layout(
    title="Bitcoin Price and Sentiment Analysis",
    xaxis_title="Date",
    yaxis_title="Price (USD)",
    xaxis_rangeslider_visible=False,
    template="plotly_dark",
)

fig.show()

[*********************100%***********************]  1 of 1 completed


In [None]:
merged

Unnamed: 0,date,sentiment,btc_open,btc_high,btc_low,btc_close,btc_return


In [None]:
correlation = merged[['avg_sentiment', 'btc_return']].corr().iloc[0,1]
print(f"Correlation between sentiment and return: {correlation:.4f}")

KeyError: "['avg_sentiment'] not in index"