# Imports

In [None]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


**Loading the Kaggle CSV 'Bitcoin_tweets.csv', which contains every post from 05.02.2021 to 12.03.2021**

In [6]:
# Read the raw tweet export; pandas is asked to parse 'date' while loading.
df = pd.read_csv("datasets/Bitcoin_tweets.csv", parse_dates=['date'])

# Second, defensive conversion pass: anything that still is not a valid
# timestamp becomes NaT instead of raising.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Sanity checks on the parsed column, printed in this order:
#   1. its dtype,
#   2. how many entries ended up as NaT,
#   3. the earliest and latest timestamp present.
tweet_dates = df['date']
print(tweet_dates.dtype)
print(tweet_dates.isna().sum())
print(tweet_dates.min(), tweet_dates.max())


datetime64[ns]
0
2021-02-05 10:52:04 2021-03-12 23:59:14


**Dropping unneeded columns and filtering out accounts with 5,000 or fewer followers, as well as accounts created less than 30 days before the post they wrote (bot detection)**

In [7]:
# Work on a copy so the raw dataframe `df` stays untouched
df_filtered = df.copy()

# Columns needed by the later stages; everything else is dropped.
# Indexing with this list raises a KeyError if any of them is missing from df.
columns_to_keep = ['user_name', 'user_created', 'user_followers', 'date', 'text']
df_filtered = df_filtered[columns_to_keep]

# Row filters, keyed by the column each one is primarily about.
# Every entry takes a dataframe and returns a boolean mask of the rows to KEEP.
# (The lambda argument is named `frame` so it does not shadow the global `df`.)
filters = {
    # Keep tweets whose author account is strictly more than 30 days older than the tweet
    'user_created': lambda frame: frame['user_created'] < frame['date'] - pd.Timedelta(days=30),

    # Keep tweets whose author has more than 5,000 followers
    'user_followers': lambda frame: frame['user_followers'] > 5000,
}

# Apply the filters one after another; each mask is computed on the
# already-reduced dataframe, so mask and frame always share the same index.
for col, condition in filters.items():
    df_filtered = df_filtered[condition(df_filtered)]

# Persist the FILTERED dataframe. The previous version wrote the unfiltered
# `df` here, so "filtered_tweets.csv" did not contain the filtered data.
df_filtered.to_csv("datasets/filtered_tweets.csv", index=False)

# Display the filtered dataframe as the cell output
df_filtered


Unnamed: 0,user_name,user_created,user_followers,date,text
0,DeSota Wilson,2009-04-26 20:05:09,8534,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after #b...
1,CryptoND,2019-10-17 20:12:10,6769,2021-02-10 23:58:48,"😎 Today, that's this #Thursday, we will do a ""..."
9,DeSota Wilson,2009-04-26 20:05:09,8534,2021-02-10 23:52:08,.@Tesla’s #bitcoin investment is revolutionary...
12,CPUcoin,2018-08-27 15:42:00,5097,2021-02-10 23:50:59,Join our first virtual crypto meetup of 2021 -...
16,Mr. Anderson,2018-01-01 22:16:16,72542,2021-02-10 23:48:37,@naval #BTC is unconfiscatable \n\nAll roads l...
...,...,...,...,...,...
48534,OKCoin,2014-04-15 12:45:40,109660,2021-03-11 22:53:46,⚡️⚡️⚡️If you missed the live discussion and AM...
48539,DoopieCash®,2018-04-13 09:54:09,8876,2021-03-11 22:52:42,Blast it or go home...😴\n\n$BTC #BTC #Bitcoin ...
48544,Stage Analysis,2013-02-25 16:55:34,8693,2021-03-11 22:50:23,#Bitcoin approaching the all time high. \n\n#c...
48551,Brian Harrington,2009-08-19 20:21:01,8398,2021-03-11 22:43:45,Tweet about #Bitcoin or GTFO @RealSaavedra \n\...


**Data cleaning**

In [8]:
# Start the cleaning stage from an independent deep copy, leaving df_filtered as it is
df_clean = df_filtered.copy(deep=True)

# Text normalisation helper for a single tweet
def clean_tweet(text):
    """Return `text` with URLs, @mentions and '#' characters stripped out.

    The substitutions run in the order listed below; afterwards every run of
    whitespace has been collapsed to one space and the ends are trimmed.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet text.
    """
    substitutions = (
        (r"http\S+", ""),   # URLs (e.g., http://example.com)
        (r"@\w+", ""),      # mentions (e.g., @user123)
        (r"#", ""),         # hashtag symbol only; the tagged word stays
        (r"\s+", " "),      # any whitespace run becomes a single space
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Build the cleaned frame in one chain (no in-place mutation):
#   1. drop rows whose 'text' is missing -- otherwise astype(str) would turn
#      the missing value into the literal string "nan", which would then be
#      saved and later scored as if it were a real tweet;
#   2. add 'clean_text' by running clean_tweet over every tweet
#      (astype(str) guards against non-string values in the column);
#   3. drop the raw 'text' column, which 'clean_text' now replaces.
df_clean = (
    df_clean
    .dropna(subset=['text'])
    .assign(clean_text=lambda frame: frame['text'].astype(str).map(clean_tweet))
    .drop(columns=['text'])
)

# Save the cleaned DataFrame to a new CSV file
df_clean.to_csv("datasets/cleaned_tweets.csv", index=False)

# Display the cleaned DataFrame as the cell output
df_clean


Unnamed: 0,user_name,user_created,user_followers,date,clean_text
0,DeSota Wilson,2009-04-26 20:05:09,8534,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after bi...
1,CryptoND,2019-10-17 20:12:10,6769,2021-02-10 23:58:48,"😎 Today, that's this Thursday, we will do a ""🎬..."
9,DeSota Wilson,2009-04-26 20:05:09,8534,2021-02-10 23:52:08,.’s bitcoin investment is revolutionary for cr...
12,CPUcoin,2018-08-27 15:42:00,5097,2021-02-10 23:50:59,Join our first virtual crypto meetup of 2021 -...
16,Mr. Anderson,2018-01-01 22:16:16,72542,2021-02-10 23:48:37,BTC is unconfiscatable All roads lead to Bitcoin
...,...,...,...,...,...
48534,OKCoin,2014-04-15 12:45:40,109660,2021-03-11 22:53:46,⚡️⚡️⚡️If you missed the live discussion and AM...
48539,DoopieCash®,2018-04-13 09:54:09,8876,2021-03-11 22:52:42,Blast it or go home...😴 $BTC BTC Bitcoin crypto
48544,Stage Analysis,2013-02-25 16:55:34,8693,2021-03-11 22:50:23,Bitcoin approaching the all time high. cryptoc...
48551,Brian Harrington,2009-08-19 20:21:01,8398,2021-03-11 22:43:45,Tweet about Bitcoin or GTFO The solution is st...


**Using the pretrained FinBERT model for financial sentiment analysis**

In [9]:
# Load the pre-trained FinBERT checkpoint (financial-tone sentiment classifier)
# together with its matching tokenizer
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")

# Wrap model and tokenizer in a sentiment-analysis pipeline
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Score every cleaned tweet.
# The 512 limit is counted in TOKENS, not characters, so the tokenizer does
# the truncation (truncation=True, max_length=512) instead of slicing the
# string to 512 characters as before.
# NOTE(review): labels could differ from the old character slice for texts
# longer than 512 characters -- confirm this is acceptable.
# All texts go through one pipeline call; it returns one dict per input,
# e.g. {'label': 'Neutral', 'score': 0.99}, in input order, and only the
# label is kept.
tweet_texts = df_clean['clean_text'].tolist()
# Guard the empty case so the pipeline is never called with no inputs
predictions = finbert(tweet_texts, truncation=True, max_length=512) if tweet_texts else []
df_clean['sentiment'] = [prediction['label'] for prediction in predictions]


Device set to use cpu


**Save the resulting CSV as sentiment_tweets.csv. This file is used extensively in the other notebooks for calculating the correlation and for the Bitcoin price prediction**

In [None]:
# Snapshot the cleaned frame (which now carries the sentiment labels) under its own name
df_sentiment = df_clean.copy(deep=True)

# Write the sentiment-annotated tweets to disk, without the index column
df_sentiment.to_csv("datasets/sentiment_tweets.csv", index=False)

# Show the annotated dataframe as the cell output
df_sentiment

Unnamed: 0,user_name,user_created,user_followers,date,clean_text,sentiment
0,DeSota Wilson,2009-04-26 20:05:09,8534,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after bi...,Negative
1,CryptoND,2019-10-17 20:12:10,6769,2021-02-10 23:58:48,"😎 Today, that's this Thursday, we will do a ""🎬...",Neutral
9,DeSota Wilson,2009-04-26 20:05:09,8534,2021-02-10 23:52:08,.’s bitcoin investment is revolutionary for cr...,Neutral
12,CPUcoin,2018-08-27 15:42:00,5097,2021-02-10 23:50:59,Join our first virtual crypto meetup of 2021 -...,Neutral
16,Mr. Anderson,2018-01-01 22:16:16,72542,2021-02-10 23:48:37,BTC is unconfiscatable All roads lead to Bitcoin,Neutral
...,...,...,...,...,...,...
48534,OKCoin,2014-04-15 12:45:40,109660,2021-03-11 22:53:46,⚡️⚡️⚡️If you missed the live discussion and AM...,Neutral
48539,DoopieCash®,2018-04-13 09:54:09,8876,2021-03-11 22:52:42,Blast it or go home...😴 $BTC BTC Bitcoin crypto,Neutral
48544,Stage Analysis,2013-02-25 16:55:34,8693,2021-03-11 22:50:23,Bitcoin approaching the all time high. cryptoc...,Neutral
48551,Brian Harrington,2009-08-19 20:21:01,8398,2021-03-11 22:43:45,Tweet about Bitcoin or GTFO The solution is st...,Neutral
