In [12]:
import pandas as pd

# Read the raw tweet export and ask the CSV reader to parse `date` up front.
df = pd.read_csv("datasets/Bitcoin_tweets.csv", parse_dates=['date'])

# Re-coerce the column explicitly: any value that cannot be parsed becomes NaT
# (errors='coerce') rather than raising.
df = df.assign(date=pd.to_datetime(df['date'], errors='coerce'))

# Sanity checks: column dtype, number of unparseable dates, and the covered range.
print(df['date'].dtype)  # should be datetime64[ns]
print(df['date'].isna().sum())
print(df['date'].min(), df['date'].max())



datetime64[ns]
0
2021-02-05 10:52:04 2021-03-12 23:59:14


In [13]:
# Restrict the frame to the columns used downstream. Selecting with a list
# raises KeyError if any of these columns is missing from `df`.
columns_to_keep = ['user_name', 'user_created', 'user_followers', 'date', 'text']
df_filtered = df[columns_to_keep].copy()

# The load cell only parses `date`; make sure `user_created` is a datetime too
# so the account-age comparison below is a real timestamp comparison.
# Unparseable values become NaT and therefore fail the age filter.
df_filtered['user_created'] = pd.to_datetime(df_filtered['user_created'], errors='coerce')

# Row-level filters, keyed by the column they constrain. Each callable takes
# the current frame and returns a boolean mask of rows to keep.
filters = {
    # Account must have existed for more than 30 days when the tweet was posted.
    'user_created': lambda frame: frame['user_created'] < frame['date'] - pd.Timedelta(days=30),
    # Account must have more than 5000 followers.
    'user_followers': lambda frame: frame['user_followers'] > 5000
}

# Apply the filters one after another; each mask is computed on the
# already-reduced frame so the indexes always line up.
for condition in filters.values():
    df_filtered = df_filtered[condition(df_filtered)]

# Persist the *filtered* frame (the original cell wrote the unfiltered `df`).
df_filtered.to_csv("datasets/filtered_tweets.csv", index=False)
df_filtered

Unnamed: 0,user_name,user_created,user_followers,date,text
0,DeSota Wilson,2009-04-26 20:05:09,8534,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after #b...
1,CryptoND,2019-10-17 20:12:10,6769,2021-02-10 23:58:48,"üòé Today, that's this #Thursday, we will do a ""..."
9,DeSota Wilson,2009-04-26 20:05:09,8534,2021-02-10 23:52:08,.@Tesla‚Äôs #bitcoin investment is revolutionary...
12,CPUcoin,2018-08-27 15:42:00,5097,2021-02-10 23:50:59,Join our first virtual crypto meetup of 2021 -...
16,Mr. Anderson,2018-01-01 22:16:16,72542,2021-02-10 23:48:37,@naval #BTC is unconfiscatable \n\nAll roads l...
...,...,...,...,...,...
48534,OKCoin,2014-04-15 12:45:40,109660,2021-03-11 22:53:46,‚ö°Ô∏è‚ö°Ô∏è‚ö°Ô∏èIf you missed the live discussion and AM...
48539,DoopieCash¬Æ,2018-04-13 09:54:09,8876,2021-03-11 22:52:42,Blast it or go home...üò¥\n\n$BTC #BTC #Bitcoin ...
48544,Stage Analysis,2013-02-25 16:55:34,8693,2021-03-11 22:50:23,#Bitcoin approaching the all time high. \n\n#c...
48551,Brian Harrington,2009-08-19 20:21:01,8398,2021-03-11 22:43:45,Tweet about #Bitcoin or GTFO @RealSaavedra \n\...


In [14]:
import re
# Start the cleaning stage from an independent copy so `df_filtered` stays untouched.
df_clean = df_filtered.copy(deep=True)

def clean_tweet(text):
    """Strip URLs, @mentions and '#' symbols from a tweet and tidy whitespace.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The text with every ``http...`` run removed, every ``@name`` removed,
        every ``#`` character removed (the hashtag word itself is kept), runs
        of whitespace collapsed to a single space, and both ends trimmed.
    """
    # Applied in order: (regex, replacement).
    substitutions = (
        (r"http\S+", ""),  # URLs
        (r"@\w+", ""),     # mentions
        (r"#", ""),        # hashtag symbol only
        (r"\s+", " "),     # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Add the cleaned text as `clean_text` and drop the raw `text` column in one chain.
df_clean = (
    df_clean
    .assign(clean_text=df_clean['text'].astype(str).apply(clean_tweet))
    .drop(columns=['text'])  # drop original text column
)
df_clean.to_csv("datasets/cleaned_tweets.csv", index=False)
df_clean

Unnamed: 0,user_name,user_created,user_followers,date,clean_text
0,DeSota Wilson,2009-04-26 20:05:09,8534,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after bi...
1,CryptoND,2019-10-17 20:12:10,6769,2021-02-10 23:58:48,"üòé Today, that's this Thursday, we will do a ""üé¨..."
9,DeSota Wilson,2009-04-26 20:05:09,8534,2021-02-10 23:52:08,.‚Äôs bitcoin investment is revolutionary for cr...
12,CPUcoin,2018-08-27 15:42:00,5097,2021-02-10 23:50:59,Join our first virtual crypto meetup of 2021 -...
16,Mr. Anderson,2018-01-01 22:16:16,72542,2021-02-10 23:48:37,BTC is unconfiscatable All roads lead to Bitcoin
...,...,...,...,...,...
48534,OKCoin,2014-04-15 12:45:40,109660,2021-03-11 22:53:46,‚ö°Ô∏è‚ö°Ô∏è‚ö°Ô∏èIf you missed the live discussion and AM...
48539,DoopieCash¬Æ,2018-04-13 09:54:09,8876,2021-03-11 22:52:42,Blast it or go home...üò¥ $BTC BTC Bitcoin crypto
48544,Stage Analysis,2013-02-25 16:55:34,8693,2021-03-11 22:50:23,Bitcoin approaching the all time high. cryptoc...
48551,Brian Harrington,2009-08-19 20:21:01,8398,2021-03-11 22:43:45,Tweet about Bitcoin or GTFO The solution is st...


In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

# Load the FinBERT tone model and its tokenizer, then wrap both in a
# sentiment-analysis pipeline.
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Score every cleaned tweet (one pipeline call per row) and keep only the
# predicted label. Truncation is delegated to the tokenizer so the limit is
# 512 *tokens*; slicing the string (`x[:512]`) would cut at 512 characters,
# which is not the same thing.
df_clean['sentiment'] = df_clean['clean_text'].apply(
    lambda x: finbert(x, truncation=True, max_length=512)[0]['label']
)

Device set to use cpu


In [16]:
# Snapshot the scored tweets under a stage-specific name, write them to disk
# without the index, and display the result.
df_sentiment = df_clean.copy(deep=True)
df_sentiment.to_csv(path_or_buf="datasets/sentiment_tweets.csv", index=False)
df_sentiment

Unnamed: 0,user_name,user_created,user_followers,date,clean_text,sentiment
0,DeSota Wilson,2009-04-26 20:05:09,8534,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after bi...,Negative
1,CryptoND,2019-10-17 20:12:10,6769,2021-02-10 23:58:48,"üòé Today, that's this Thursday, we will do a ""üé¨...",Neutral
9,DeSota Wilson,2009-04-26 20:05:09,8534,2021-02-10 23:52:08,.‚Äôs bitcoin investment is revolutionary for cr...,Neutral
12,CPUcoin,2018-08-27 15:42:00,5097,2021-02-10 23:50:59,Join our first virtual crypto meetup of 2021 -...,Neutral
16,Mr. Anderson,2018-01-01 22:16:16,72542,2021-02-10 23:48:37,BTC is unconfiscatable All roads lead to Bitcoin,Neutral
...,...,...,...,...,...,...
48534,OKCoin,2014-04-15 12:45:40,109660,2021-03-11 22:53:46,‚ö°Ô∏è‚ö°Ô∏è‚ö°Ô∏èIf you missed the live discussion and AM...,Neutral
48539,DoopieCash¬Æ,2018-04-13 09:54:09,8876,2021-03-11 22:52:42,Blast it or go home...üò¥ $BTC BTC Bitcoin crypto,Neutral
48544,Stage Analysis,2013-02-25 16:55:34,8693,2021-03-11 22:50:23,Bitcoin approaching the all time high. cryptoc...,Neutral
48551,Brian Harrington,2009-08-19 20:21:01,8398,2021-03-11 22:43:45,Tweet about Bitcoin or GTFO The solution is st...,Neutral
