# 1. Data acquisition

## 1.1 Install the libraries and import them

In [None]:
# Install libraries (uncomment on a fresh environment).
# %pip installs into the environment of the running kernel.
# %pip install textblob
# %pip install tweepy
# %pip install flair

In [None]:
# Import the libraries
import os
import re
from datetime import timedelta

import nltk
import pandas as pd
import tweepy
from flair.data import Sentence
from flair.models import TextClassifier
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob

# Fetch the NLTK resources used below (stop-word list and VADER lexicon)
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!




## 1.2 Set keys and secrets

In [None]:
# Keys and secrets
# api_key = 
# api_key_secret = 

# access_token = 
# access_token_secret = 

## 1.3 Define the search criteria and authenticate the API

In [None]:
# Terms for the query
query = '(stocks OR bitcoin) lang:en'

# Maximum number of tweets per query
max_n_tweets = 20


def to_tweepy_date(date_mdy: str) -> str:
    """Convert a 'mm/dd/yyyy' date into the 'yyyyMMddHHmm' string used by the search.

    The time part is always midnight ('0000').

    Args:
        date_mdy: Date written as mm/dd/yyyy, e.g. '04/24/2021'.

    Returns:
        The same date formatted as yyyyMMddHHmm, e.g. '202104240000'.

    Raises:
        ValueError: If date_mdy is not a valid mm/dd/yyyy date. Parsing the
            date (instead of slicing the string) makes a malformed value fail
            here rather than produce a wrong search window.
    """
    return pd.to_datetime(date_mdy, format='%m/%d/%Y').strftime('%Y%m%d%H%M')


# Search window: written as mm/dd/yyyy, stored in the Tweepy format
start_date = to_tweepy_date('04/24/2021')
end_date = to_tweepy_date('05/24/2021')

In [None]:
# Authentication handler built from the API key pair
auth = tweepy.OAuthHandler(
    consumer_key=api_key,
    consumer_secret=api_key_secret,
)

# Attach the access token pair to the handler
auth.set_access_token(access_token, access_token_secret)

# API client used by the search cell below
api = tweepy.API(auth)

## 1.4 Retrieve the tweets and set the dataframe

In [None]:
# Find the tweets: gather the search arguments first, then run the query
search_params = {
    'environment_name': 'aml2404',
    'query': query,
    'fromDate': start_date,
    'toDate': end_date,
    'maxResults': max_n_tweets,
}
tweets = api.search_full_archive(**search_params)

In [None]:
# Show the first 5 results
tweets[:5]

[Status(_api=<tweepy.api.API object at 0x7f9ad9753f10>, _json={'created_at': 'Sun May 23 23:59:59 +0000 2021', 'id': 1396616935630991361, 'id_str': '1396616935630991361', 'text': 'RT @flurbnb: $120 to one person in 24 hours\n\nRetweet &amp; \n1. Go to https://t.co/xkXkTKZVgT\n2. Search - CHAD (CHAD)\n3. Tap the star ‚≠ê to add t‚Ä¶', 'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1395646069971640324, 'id_str': '1395646069971640324', 'name': '‚ú® 6Ô∏è‚É£5Ô∏è‚É£üéπ', 'screen_name': 'pamxlaaaa_', 'location': "can't access @pamxlaaa_ ", 'url': 'http://paypal.me/pamxlaaa', 'description': '#LegitCutieBabeGem #ILYAngeLQueeN 6Ô∏è‚É£5Ô∏è‚É£üéπ ‚ú® living by faith ‚ú® Claiming: #IwonHanabi #InezWinners #LegitFAM #LiT  #AP8testi #tigergang #SobatGarangan', 'transla

## 1.5 Passing the Tweepy object to a DataFrame

In [None]:
# Collect the raw JSON payload of every status returned by the search
json_data = [status._json for status in tweets]

# Flatten the nested JSON into one row per tweet (nested keys become dotted columns)
df = pd.json_normalize(json_data)
df.head(3)

Unnamed: 0,created_at,id,id_str,text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,in_reply_to_screen_name,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,possibly_sensitive,filter_level,lang,matching_rules,user.id,user.id_str,user.name,user.screen_name,user.location,user.url,user.description,user.translator_type,user.protected,user.verified,user.followers_count,user.friends_count,user.listed_count,user.favourites_count,...,retweeted_status.coordinates,retweeted_status.place,retweeted_status.contributors,retweeted_status.is_quote_status,retweeted_status.extended_tweet.full_text,retweeted_status.extended_tweet.display_text_range,retweeted_status.extended_tweet.entities.hashtags,retweeted_status.extended_tweet.entities.urls,retweeted_status.extended_tweet.entities.user_mentions,retweeted_status.extended_tweet.entities.symbols,retweeted_status.extended_tweet.entities.media,retweeted_status.extended_tweet.extended_entities.media,retweeted_status.quote_count,retweeted_status.reply_count,retweeted_status.retweet_count,retweeted_status.favorite_count,retweeted_status.entities.hashtags,retweeted_status.entities.urls,retweeted_status.entities.user_mentions,retweeted_status.entities.symbols,retweeted_status.favorited,retweeted_status.retweeted,retweeted_status.possibly_sensitive,retweeted_status.filter_level,retweeted_status.lang,entities.hashtags,entities.urls,entities.user_mentions,entities.symbols,retweeted_status.entities.media,retweeted_status.extended_entities.media,display_text_range,entities.media,extended_entities.media,extended_tweet.full_text,extended_tweet.display_text_range,extended_tweet.entities.hashtags,extended_tweet.entities.urls,extended_tweet.entities.user_mentions,extended_tweet.entities.symbols
0,Sun May 23 23:59:59 +0000 2021,1396616935630991361,1396616935630991361,RT @flurbnb: $120 to one person in 24 hours\n\...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",False,,,,,,,,,,False,0,0,0,0,False,False,False,low,en,[{'tag': None}],1395646069971640324,1395646069971640324,‚ú® 6Ô∏è‚É£5Ô∏è‚É£üéπ,pamxlaaaa_,can't access @pamxlaaa_,http://paypal.me/pamxlaaa,#LegitCutieBabeGem #ILYAngeLQueeN 6Ô∏è‚É£5Ô∏è‚É£üéπ ‚ú® li...,none,False,False,182,1324,0,2675,...,,,,False,$120 to one person in 24 hours\n\nRetweet &amp...,"[0, 275]","[{'text': 'cryptocurrency', 'indices': [168, 1...","[{'url': 'https://t.co/xkXkTKZVgT', 'expanded_...",[],[],"[{'id': 1396169137005547522, 'id_str': '139616...","[{'id': 1396169137005547522, 'id_str': '139616...",4.0,1488.0,1373.0,716.0,[],"[{'url': 'https://t.co/xkXkTKZVgT', 'expanded_...",[],[],False,False,False,low,en,[],"[{'url': 'https://t.co/xkXkTKZVgT', 'expanded_...","[{'screen_name': 'flurbnb', 'name': 'Flur ü§´ | ...",[],,,,,,,,,,,
1,Sun May 23 23:59:59 +0000 2021,1396616934834200577,1396616934834200577,RT @BTC_Archive: #Bitcoin chart - 4 hour\nMACD...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",False,,,,,,,,,,False,0,0,0,0,False,False,,low,en,[{'tag': None}],483064211,483064211,Alex Morgan,AlexMorgan1984,,,Find your own wise quotes.\n\n#Bitcoin #Ethere...,none,False,False,405,729,9,38649,...,,,,False,,,,,,,,,20.0,268.0,359.0,2634.0,"[{'text': 'Bitcoin', 'indices': [0, 8]}]",[],[],[],False,False,False,low,en,"[{'text': 'Bitcoin', 'indices': [17, 25]}]",[],"[{'screen_name': 'BTC_Archive', 'name': 'Bitco...",[],"[{'id': 1396458143878680577, 'id_str': '139645...","[{'id': 1396458143878680577, 'id_str': '139645...",,,,,,,,,
2,Sun May 23 23:59:59 +0000 2021,1396616931709313025,1396616931709313025,RT @airdropinspect: New airdrop: Bullcrypto (C...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,,,,,,False,0,0,0,0,False,False,,low,en,[{'tag': None}],1393921526777274368,1393921526777274368,ball,ball50868202,,,,none,False,False,0,72,0,56,...,,,,False,"New airdrop: Bullcrypto (CZB)\nReward: 160,000...","[0, 275]","[{'text': 'Airdrop', 'indices': [212, 220]}, {...","[{'url': 'https://t.co/f7HxC9oRUJ', 'expanded_...",[],[],,,816.0,631.0,6161.0,5502.0,[],"[{'url': 'https://t.co/HLfsfAESHX', 'expanded_...",[],[],False,False,False,low,en,[],[],"[{'screen_name': 'airdropinspect', 'name': 'Ai...",[],,,,,,,,,,,


In [None]:
# Columns of interest
columns = [
    'created_at', 'text', 'retweet_count', 'favorite_count',
    'user.screen_name', 'user.followers_count',
]

# Reduced dataframe with the columns of interest.
# .copy() makes it an independent frame, so the column assignments done in the
# pre-processing cells do not raise SettingWithCopyWarning.
df = df[columns].copy()

df.head()

Unnamed: 0,created_at,text,retweet_count,favorite_count,user.screen_name,user.followers_count
0,Sun May 23 23:59:59 +0000 2021,RT @flurbnb: $120 to one person in 24 hours\n\...,0,0,pamxlaaaa_,182
1,Sun May 23 23:59:59 +0000 2021,RT @BTC_Archive: #Bitcoin chart - 4 hour\nMACD...,0,0,AlexMorgan1984,405
2,Sun May 23 23:59:59 +0000 2021,RT @airdropinspect: New airdrop: Bullcrypto (C...,0,0,ball50868202,0
3,Sun May 23 23:59:58 +0000 2021,RT @100trillionUSD: In the chart you see at wh...,0,0,CryptoEscapades,206
4,Sun May 23 23:59:58 +0000 2021,"RT @TheStalwart: Also this time around, Bitcio...",0,0,tslaqpodcast,4082


In [None]:
# General info: column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   created_at            20 non-null     object
 1   text                  20 non-null     object
 2   retweet_count         20 non-null     int64 
 3   favorite_count        20 non-null     int64 
 4   user.screen_name      20 non-null     object
 5   user.followers_count  20 non-null     int64 
dtypes: int64(3), object(3)
memory usage: 1.1+ KB


In [None]:
# Count the missing values in each column
df.isna().sum()

created_at              0
text                    0
retweet_count           0
favorite_count          0
user.screen_name        0
user.followers_count    0
dtype: int64

# 2. Pre-processing

## 2.1 Adjust the dataframe content

### 2.1.1 Parse 'created_at'

In [None]:
# Parse Twitter's timestamp, e.g. 'Sun May 23 23:59:59 +0000 2021'.
# %z consumes the UTC offset ('+0000'); the previous '+%f' read those digits
# as microseconds, which only worked because the offset is all zeros.
# https://www.programiz.com/python-programming/datetime/strftime
created_at_utc = pd.to_datetime(df['created_at'],
                                format='%a %b %d %H:%M:%S %z %Y',
                                utc=True)

# Remove the hour information: keep the (UTC) calendar day as a timezone-naive
# datetime64 column, without a round trip through strings.
df['created_at'] = created_at_utc.dt.tz_localize(None).dt.normalize()

df['created_at'].head()

0   2021-05-23
1   2021-05-23
2   2021-05-23
3   2021-05-23
4   2021-05-23
Name: created_at, dtype: datetime64[ns]

In [None]:
# Preview the dataframe: 'created_at' no longer carries the hour information
df.head()

Unnamed: 0,created_at,text,retweet_count,favorite_count,user.screen_name,user.followers_count
0,2021-05-23,RT @flurbnb: $120 to one person in 24 hours\n\...,0,0,pamxlaaaa_,182
1,2021-05-23,RT @BTC_Archive: #Bitcoin chart - 4 hour\nMACD...,0,0,AlexMorgan1984,405
2,2021-05-23,RT @airdropinspect: New airdrop: Bullcrypto (C...,0,0,ball50868202,0
3,2021-05-23,RT @100trillionUSD: In the chart you see at wh...,0,0,CryptoEscapades,206
4,2021-05-23,"RT @TheStalwart: Also this time around, Bitcio...",0,0,tslaqpodcast,4082


### 2.1.2 Create shifted dates columns
Create three extra columns holding the dates 1, 3, and 5 days after each tweet was posted. <br>
These can be used to better evaluate the impact of the tweets on the bitcoin price.

In [None]:
# Create the columns of shifted date (1, 3 and 5 days after 'created_at'),
# placed right after 'created_at' at positions 1, 2 and 3
for position, n_days in enumerate((1, 3, 5), start=1):
    column = 'created_at+{}d'.format(n_days)

    # Drop a column left over from a previous run so the cell can be executed
    # again: DataFrame.insert raises ValueError if the column already exists.
    if column in df.columns:
        df = df.drop(columns=column)

    # Adding a timedelta to the whole column is vectorised (no row-wise apply)
    df.insert(loc=position, column=column,
              value=df['created_at'] + timedelta(days=n_days))

df.head(3)

Unnamed: 0,created_at,created_at+1d,created_at+3d,created_at+5d,text,retweet_count,favorite_count,user.screen_name,user.followers_count
0,2021-05-23,2021-05-24,2021-05-26,2021-05-28,RT @flurbnb: $120 to one person in 24 hours\n\...,0,0,pamxlaaaa_,182
1,2021-05-23,2021-05-24,2021-05-26,2021-05-28,RT @BTC_Archive: #Bitcoin chart - 4 hour\nMACD...,0,0,AlexMorgan1984,405
2,2021-05-23,2021-05-24,2021-05-26,2021-05-28,RT @airdropinspect: New airdrop: Bullcrypto (C...,0,0,ball50868202,0


### 2.1.3 Create the 'super_user' feature

In [None]:
# Set of 'super users' -> accounts whose tweets may have more influence
super_users_set = {
    'pamxlaaaa_',      # Test
    'AlexMorgan1984',  # Test
    'elonmusk',
}

In [None]:
# Create the feature: 1 when the tweet's author is in super_users_set, else 0.
# isin() tests the whole column at once instead of calling a lambda per row.
df['super_user'] = df['user.screen_name'].isin(super_users_set).astype('int64')
df.head(3)

Unnamed: 0,created_at,created_at+1d,created_at+3d,created_at+5d,text,retweet_count,favorite_count,user.screen_name,user.followers_count,super_user
0,2021-05-23,2021-05-24,2021-05-26,2021-05-28,RT @flurbnb: $120 to one person in 24 hours\n\...,0,0,pamxlaaaa_,182,1
1,2021-05-23,2021-05-24,2021-05-26,2021-05-28,RT @BTC_Archive: #Bitcoin chart - 4 hour\nMACD...,0,0,AlexMorgan1984,405,1
2,2021-05-23,2021-05-24,2021-05-26,2021-05-28,RT @airdropinspect: New airdrop: Bullcrypto (C...,0,0,ball50868202,0,0


### 2.1.4 Clear the text



In [None]:
# Raw text of the first 15 tweets, before any cleaning
df['text'].head(15)

0     RT @flurbnb: $120 to one person in 24 hours\n\...
1     RT @BTC_Archive: #Bitcoin chart - 4 hour\nMACD...
2     RT @airdropinspect: New airdrop: Bullcrypto (C...
3     RT @100trillionUSD: In the chart you see at wh...
4     RT @TheStalwart: Also this time around, Bitcio...
5     RT @PolaroidPunks_: Check out when Josh the De...
6     RT @La__Cuen: Want to read about how people us...
7     So this just happened:\n@TheBitcoinConf  #Bitc...
8     RT @rektcapital: Just because everybody else i...
9     RT @LEXUS444444: #bitcoin #StellarLumens #XLM ...
10              @leppert Bitcoin is a religion, so yeah
11    @PeterSchiff You watching bitcoin on the 1 min...
12    RT @flurbnb: $1000 to one person in 7 days\n\n...
13    RT @glassnodealerts: üìâ #Bitcoin $BTC Number of...
14    #Bitcoin: BTCUSD ( $BTCUSD) DOWN 6.9696%! Last...
Name: text, dtype: object

In [None]:
# Raw text of the tweet with index label 13, which contains emojis
df.loc[13, 'text']

'RT @glassnodealerts: üìâ #Bitcoin $BTC Number of UTXOs in Profit (7d MA) just reached a 7-month low of 108,074,244.250\n\nView metric:\nhttps://‚Ä¶'

In [None]:
# List of English stop words from the NLTK corpus; clear_text() drops these words
stop_words = stopwords.words('english')

def clear_text(text: str, stopword_list=None, min_word_length: int = 4) -> str:
    """Normalise a raw tweet into a lowercase, space-separated string of words.

    Steps, in order: drop non-ASCII characters (emojis), the 'RT' retweet
    marker, links, line breaks, @usernames, #hashtags and digits; collapse
    punctuation and blanks; lowercase; drop stop words and short words.

    Args:
        text: Raw tweet text.
        stopword_list: Iterable of lowercase words to discard. When None, the
            module-level ``stop_words`` list is used.
        min_word_length: Minimum number of characters a word needs to be kept.
            The default (4) drops words with 3 or fewer letters.

    Returns:
        The cleaned text, or an empty string when no word survives.
    """
    if stopword_list is None:
        stopword_list = stop_words
    ignored_words = set(stopword_list)

    # Remove emojis (and any other non-ASCII character)
    text = text.encode('ascii', errors='ignore').decode('ascii')

    # Remove the 'RT's; \b stops the pattern from matching inside a word that
    # merely ends in 'RT' (e.g. 'SMART money')
    text = re.sub(r'\bRT\s', '', text)

    # Remove the links
    text = re.sub(r'http\S*|www\S*', '', text)

    # Replace line breaks with a space, so the words on both sides are not
    # glued together (e.g. 'hours\n\nRetweet' used to become 'hoursretweet')
    text = re.sub(r'\n', ' ', text)

    # Remove usernames
    text = re.sub(r'@\S*', '', text)

    # Remove hashtags
    text = re.sub(r'#\S*', '', text)

    # Remove numbers
    text = re.sub(r'[0-9]+', '', text)

    # Replace punctuation and extra blank spaces with a single space
    text = re.sub(r'\W+', ' ', text)

    # Lowercase BEFORE filtering: the stop words are expected in lowercase, so
    # capitalised ones ('Just', 'These', 'DOWN') used to slip through.
    kept_words = [word for word in text.lower().split()
                  if len(word) >= min_word_length and word not in ignored_words]

    return ' '.join(kept_words)

# REF:
# https://jakevdp.github.io/WhirlwindTourOfPython/14-strings-and-regular-expressions.html

In [None]:
# Replace every tweet's text with its cleaned version
df['text'] = df['text'].map(clear_text)

In [None]:
# Sample of the text after the cleaning.
# head(15) yields at most 15 rows, so the cell also works when fewer tweets
# were retrieved (range(15) with label lookups raised KeyError in that case).
for cleaned_text in df['text'].head(15):
    print(cleaned_text)

person hoursretweet search chad chad star
chart hourmacd looks forming bottom strong rebound cards
airdrop bullcrypto reward rate news bondlyfinance thundercoredistribution
chart price level total last moved happened weak
also time around bitcion peaked coin whereas bitcoin peaked within
check josh fuzzy interviewed impromptu live streaming
want read people iran venezuela japan these books sale
happened
just everybody else fearful mean bered colour chartand price
manifests harder than stellar lumens will literally worth share
bitcoin religion yeah
watching bitcoin chart dont
person daysretweet follow
number utxos profit reached month view metric
btcusd btcusd down last


In [None]:
# Same sample as before the cleaning (index 13): the emojis are now removed
df['text'][13]

'number utxos profit reached month view metric'

# 3. Sentiment Analysis

## 3.1 - Sentiment Analysis with TextBlob



In [None]:
# Sentiment Analysis using TextBlob: polarity of each text, rounded to 3 decimals
df['sentiment textblob'] = df['text'].apply(
    lambda tx: round(TextBlob(tx).polarity, 3))

In [None]:
# Preview the dataframe with the new 'sentiment textblob' column
df.head()

Unnamed: 0,created_at,created_at+1d,created_at+3d,created_at+5d,text,retweet_count,favorite_count,user.screen_name,user.followers_count,super_user,sentiment textblob
0,2021-05-23,2021-05-24,2021-05-26,2021-05-28,person hoursretweet search chad chad star,0,0,pamxlaaaa_,182,1,0.0
1,2021-05-23,2021-05-24,2021-05-26,2021-05-28,chart hourmacd looks forming bottom strong reb...,0,0,AlexMorgan1984,405,1,0.433
2,2021-05-23,2021-05-24,2021-05-26,2021-05-28,airdrop bullcrypto reward rate news bondlyfina...,0,0,ball50868202,0,0,0.0
3,2021-05-23,2021-05-24,2021-05-26,2021-05-28,chart price level total last moved happened weak,0,0,CryptoEscapades,206,0,-0.125
4,2021-05-23,2021-05-24,2021-05-26,2021-05-28,also time around bitcion peaked coin whereas b...,0,0,tslaqpodcast,4082,0,0.0


In [None]:
# Mean TextBlob sentiment per day
df.groupby('created_at')[['sentiment textblob']].mean()

Unnamed: 0_level_0,sentiment textblob
created_at,Unnamed: 1_level_1
2021-05-23,0.0024


## 3.2 - Sentiment Analysis with Vader



In [None]:
# Build the VADER analyzer once and reuse it for every tweet
sentiment_analyzer_vader = SentimentIntensityAnalyzer()

# Each cell holds the dict returned by polarity_scores for that text
df['sentiment vader'] = df['text'].apply(
    sentiment_analyzer_vader.polarity_scores)

comparison_columns = ['text', 'sentiment textblob', 'sentiment vader']
df[comparison_columns].head()

Unnamed: 0,text,sentiment textblob,sentiment vader
0,person hoursretweet search chad chad star,0.0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
1,chart hourmacd looks forming bottom strong reb...,0.433,"{'neg': 0.0, 'neu': 0.68, 'pos': 0.32, 'compou..."
2,airdrop bullcrypto reward rate news bondlyfina...,0.0,"{'neg': 0.0, 'neu': 0.619, 'pos': 0.381, 'comp..."
3,chart price level total last moved happened weak,-0.125,"{'neg': 0.293, 'neu': 0.707, 'pos': 0.0, 'comp..."
4,also time around bitcion peaked coin whereas b...,0.0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."


## 3.3 - Sentiment Analysis with Flair

In [None]:
# Define the classifier: load Flair's 'en-sentiment' model once, for reuse on every tweet
flair_classifier = TextClassifier.load('en-sentiment')

def my_flair(text: str):
    """Classify the sentiment of ``text`` with the module-level Flair classifier.

    Args:
        text: The (already cleaned) text to classify.

    Returns:
        The ``labels`` of the Flair Sentence built from ``text``, read after
        the classifier has processed it.
    """
    tagged_sentence = Sentence(text)
    # predict()'s return value is not used: the labels are read from the
    # sentence object itself afterwards
    flair_classifier.predict(tagged_sentence)
    return tagged_sentence.labels

# Label every text with Flair
df['sentiment flair'] = df['text'].map(my_flair)

# Show the three sentiment results side by side
sentiment_columns = ['text',
                     'sentiment textblob',
                     'sentiment vader',
                     'sentiment flair']
df[sentiment_columns].head()

2021-06-07 20:11:52,670 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert_4.pt not found in cache, downloading to /tmp/tmp7_lzyowd


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 265512723/265512723 [00:06<00:00, 38090509.86B/s]

2021-06-07 20:11:59,715 copying /tmp/tmp7_lzyowd to cache at /root/.flair/models/sentiment-en-mix-distillbert_4.pt





2021-06-07 20:12:00,697 removing temp file /tmp/tmp7_lzyowd
2021-06-07 20:12:01,609 loading file /root/.flair/models/sentiment-en-mix-distillbert_4.pt


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti‚Ä¶




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti‚Ä¶




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w‚Ä¶




Unnamed: 0,text,sentiment textblob,sentiment vader,sentiment flair
0,person hoursretweet search chad chad star,0.0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",[NEGATIVE (0.9913)]
1,chart hourmacd looks forming bottom strong reb...,0.433,"{'neg': 0.0, 'neu': 0.68, 'pos': 0.32, 'compou...",[NEGATIVE (0.9773)]
2,airdrop bullcrypto reward rate news bondlyfina...,0.0,"{'neg': 0.0, 'neu': 0.619, 'pos': 0.381, 'comp...",[POSITIVE (0.997)]
3,chart price level total last moved happened weak,-0.125,"{'neg': 0.293, 'neu': 0.707, 'pos': 0.0, 'comp...",[NEGATIVE (0.9999)]
4,also time around bitcion peaked coin whereas b...,0.0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",[NEGATIVE (0.9922)]
