#### Install dependencies

In [30]:
# !pip install snscrape
# !pip install textblob
# !pip install pandas
# !pip install vaderSentiment
# !pip install nltk
# !pip install transformers
# !pip install torch
# !pip install openai

----
#### Needed imports

In [31]:
import matplotlib.pyplot as plt
import pandas as pd
import regex as re
import snscrape.modules.twitter as sntwitter
import torch
from nltk.tokenize import sent_tokenize, word_tokenize
from scipy.special import softmax
from textblob import TextBlob
from tqdm import tnrange, tqdm_notebook, tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

----
#### Start Mining Tweets

In [32]:
# query = "(crash, OR crashing, OR cair, OR queda, OR subir, OR subida, OR bullish, OR bearish, OR explode, OR exploding) -#BTC -#SafeBlast -#bitcoin -#SOL -#solana -#ADA -#XRP -#SHIB -#BNB -giveaway -congrats -congratulations -giving -link (#eth) until:2022-09-15 since:2022-08-15"
# # query = "(crash, OR crashing, OR cair, OR queda, OR subir, OR subida, OR bullish, OR bearish, OR explode, OR exploding) -#BTC -#SafeBlast -#bitcoin -#SOL -#solana -#ADA -#XRP -#SHIB -#BNB -giveaway -giveaways -congrats -congratulations -winner -giving -link -https -telegram (#eth) until:2022-09-15 since:2022-08-15"
# tweets = []
# limit = 1000

# for tweet in sntwitter.TwitterHashtagScraper(query).get_items():
    
#     if len(tweets) == limit:
#         break
#     else:
#         tweets.append([tweet.date, tweet.url, tweet.user.username, tweet.sourceLabel, tweet.user.location, tweet.content, tweet.likeCount, tweet.retweetCount,  tweet.quoteCount, tweet.replyCount])
        
# df = pd.DataFrame(tweets, columns=['Date', 'TweetURL','User', 'Source', 'Location', 'Tweet', 'Likes_Count','Retweet_Count', 'Quote_Count', 'Reply_Count'])

# df.to_csv('../data/bullishTweets.csv')

# print("Shape: ", df.shape)

In [33]:
# Load the tweets from the CSV cache (the path the commented-out scraping cell writes to).
# NOTE(review): if that file was written by a plain `df.to_csv(...)` (index included),
# the saved index is read back here as an extra "Unnamed: 0" column — confirm against
# the file and consider `index_col=0`.
df = pd.read_csv('../data/bullishTweets.csv')

----

----
#### Sentiment Analysis with VADER

In [34]:
# Score every tweet with VADER and keep the "compound" entry of each result.
analyzer = SentimentIntensityAnalyzer()
compound = [analyzer.polarity_scores(tweet)["compound"] for tweet in tqdm(df['Tweet'])]
df["compoundVader"] = compound
df.head(2)

# Persist the frame with the new VADER column.
df.to_csv('../data/compoundAnalysis.csv')

100%|██████████| 1000/1000 [00:00<00:00, 5512.13it/s]


#### Sentiment Analysis with TextBlob

In [35]:
# Score every tweet with TextBlob. `TextBlob(...).sentiment` is a
# (polarity, subjectivity) pair, so each component goes into its own numeric
# column. Storing the whole pair in one cell would serialise to a string in the
# CSV and could not be compared with `compoundVader`.
sentiments = [TextBlob(tweet).sentiment for tweet in tqdm(df['Tweet'])]
compound = [sentiment.polarity for sentiment in sentiments]
df["compoundTextBlob"] = compound
df["subjectivityTextBlob"] = [sentiment.subjectivity for sentiment in sentiments]

# Same output path as the VADER cell: the file is rewritten with the TextBlob columns added.
df.to_csv('../data/compoundAnalysis.csv')

100%|██████████| 1000/1000 [00:00<00:00, 2002.70it/s]


#### Sort tweets by VADER compound value in descending order

In [36]:
# Order the tweets from highest to lowest VADER compound score and save the ranking.
df2 = df.sort_values('compoundVader', ascending=False)
df2.to_csv('../data/orderedAnalysis.csv')

#### Calculate mean compound value (TODO: find a better way to assess this; good enough for now)

In [37]:
# Average VADER compound score over all tweets. Uses the built-in Series mean
# instead of a hand-rolled accumulation loop; an empty frame now yields NaN
# rather than raising ZeroDivisionError.
mean = df2['compoundVader'].mean()
print("Mean: ", mean)

Mean:  0.15064550000000168


---
### Text Cleaning using regex

In [38]:
def cleantxt(text):
    """Normalise a raw tweet for sentiment analysis.

    Steps, in order: strip links, @mentions, '#' symbols and retweet markers;
    drop carriage returns and turn newlines into spaces; lower-case the text;
    remove every non-ASCII character.

    Args:
        text: Raw tweet text (str).

    Returns:
        The cleaned, lower-cased, ASCII-only string.
    """
    # Links go first so '#' / '@' inside a URL cannot leave fragments behind.
    # `\S+` consumes the whole URL, including query strings ('?', '=', '_', '%'),
    # and `\w+://` covers http(s) as well as other schemes (e.g. telegram links).
    text = re.sub(r'\w+://\S+', '', text)
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # @mentions; handles may contain '_'
    text = re.sub(r'#', '', text)  # keep the hashtag word, drop the '#' symbol
    text = re.sub(r'\bRT\s+', '', text)  # retweet marker only, not words ending in "RT"
    text = text.replace('\r', '').replace('\n', ' ').lower()  # flatten line breaks, lower-case
    text = re.sub(r'[^\x00-\x7f]', '', text)  # strip non-ASCII characters (emoji, accents, ...)
    return text


# Overwrite the "Tweet" column with its cleaned version, then persist the frame.
df2["Tweet"] = df2["Tweet"].map(cleantxt)

df2.to_csv('../data/cleanedTweets.csv')

----
#### Sentiment Analysis using roBERTa Pretrained Model

The model returns its scores as an array: [`negative_Value`, `neutral_Value`, `positive_Value`].

The compound is calculated as `positive_Value` - `negative_Value`.

In [39]:
# Checkpoint identifier; both the tokenizer and the classifier are loaded from it.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)


def sentiment_score(text):
    """Return the roBERTa class probabilities for one piece of text.

    Args:
        text: Text to classify; it is coerced with ``str()`` before tokenising.

    Returns:
        1-D numpy array with the softmax of the model's output logits. The
        notebook reads index 0 as negative and index 2 as positive.
    """
    text = str(text)
    # Truncate explicitly so an unusually long input cannot overflow the model's
    # position embeddings. 512 is the usual RoBERTa-base limit — confirm against
    # `model.config.max_position_embeddings` if the checkpoint changes.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    # Inference only: skip autograd bookkeeping to save time and memory.
    with torch.no_grad():
        output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    return scores


# Compound roBERTa score per tweet: probability at index 2 minus probability at index 0.
scores = [
    probabilities[2] - probabilities[0]
    for probabilities in map(sentiment_score, tqdm(df2['Tweet']))
]

df2["compoundRoBERTa"] = scores
df2.head(2)

# Persist the frame with the new roBERTa column.
df2.to_csv('../data/roBERTaAnalysis.csv')

100%|██████████| 1000/1000 [02:04<00:00,  8.05it/s]


----
#### NLTK pre-processing

In [40]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string

# English stop-word list, stored as a set for membership tests in `preprocessamento`.
# NOTE: this rebinds the name `stopwords`, shadowing the `nltk.corpus.stopwords`
# object imported just above; running this line again without re-running that
# import fails because a set has no `.words` attribute.
stopwords = set(stopwords.words('english'))

# English Snowball stemmer used by `preprocessamento`.
stemmer = SnowballStemmer('english')

def preprocessamento(tweet):
    """Tokenise, filter and stem a tweet.

    Args:
        tweet: Tweet text (str).

    Returns:
        A single string with the stemmed tokens joined by spaces, with stop
        words and punctuation-only tokens removed.
    """
    # Tokenisation
    tokens = word_tokenize(tweet.lower())
    # Remove stop words and special characters. `string.punctuation` is a str, so
    # `token in string.punctuation` is a substring test that lets multi-character
    # tokens such as "..." or "``" through; checking per character drops every
    # token made up solely of punctuation.
    punctuation = set(string.punctuation)
    tokens = [
        token for token in tokens
        if token not in stopwords and not all(char in punctuation for char in token)
    ]
    # Stemming
    tokens_stem = [stemmer.stem(token) for token in tokens]
    return " ".join(tokens_stem)

# Store the tokenised/stemmed version of each cleaned tweet in its own column.
df2['Tweet_NLTK'] = df2['Tweet'].map(preprocessamento)

----
#### Classifying as Positive or Negative

In [42]:
# Binary label from the VADER compound score; a score of exactly 0 counts as "positive".
is_positive = df2["compoundVader"].ge(0)
df2["Sentiment"] = is_positive.map({True: "positive", False: "negative"})

df2.to_csv('../data/preprocessedTweets.csv')

In [43]:
from datetime import datetime

# Date-extraction helper
def extrair_data(data_str):
    """Return the calendar date of a timestamp string.

    Args:
        data_str: Timestamp formatted as ``"%Y-%m-%d %H:%M:%S%z"``.

    Returns:
        ``datetime.date`` taken from the parsed timestamp, in the timestamp's
        own UTC offset.
    """
    timestamp = datetime.strptime(data_str, "%Y-%m-%d %H:%M:%S%z")
    return timestamp.date()

# Apply the helper to the DataFrame's "Date" column and show the distinct days it contains.
tweet_dates = df2["Date"].apply(extrair_data)
tweet_dates.unique()


array([datetime.date(2022, 9, 9), datetime.date(2022, 9, 11),
       datetime.date(2022, 9, 13), datetime.date(2022, 9, 12),
       datetime.date(2022, 9, 14), datetime.date(2022, 9, 10),
       datetime.date(2022, 9, 7), datetime.date(2022, 9, 8),
       datetime.date(2022, 9, 6)], dtype=object)

#### Weighted classification for each Tweet considering num of interactions (likes and retweets)

In [44]:
# scores = []
# for i, s in tqdm(df2.iterrows(), total=df2.shape[0]):
#     scores.append(s["compoundVader"] * ((s["Likes_Count"]+1))* ((s["Retweet_Count"]+1)))
# df2["score"] = scores
# df2.head(2)

----
#### **Teste de Named Entity Recognition**

In [45]:
# Two phrases that differ only in "crash" vs "go bad", scored with TextBlob for comparison.
phrases = [
    "#BTC looks like it's going to crash again, so I'm just going to wait and see what happens.",
    "#BTC looks like it's going to go bad again, so I'm just going to wait and see what happens.",
]
for number, phrase in enumerate(phrases, start=1):
    sentiment = TextBlob(phrase).sentiment
    print(f"Sentiment {number}: ", sentiment)

Sentiment 1:  Sentiment(polarity=0.0, subjectivity=0.0)
Sentiment 2:  Sentiment(polarity=-0.6999999999999998, subjectivity=0.6666666666666666)


In [46]:
# Same two phrases ("crash" vs "go bad"), scored with the VADER analyzer for comparison.
phrases = [
    "#BTC looks like it's going to crash again, so I'm just going to wait and see what happens.",
    "#BTC looks like it's going to go bad again, so I'm just going to wait and see what happens.",
]
for number, phrase in enumerate(phrases, start=1):
    sentiment = analyzer.polarity_scores(phrase)
    print(f"Sentiment {number}: ", sentiment["compound"])

Sentiment 1:  -0.0516
Sentiment 2:  -0.25
