#### Install dependencies

In [None]:
!pip install snscrape
!pip install textblob
!pip install pandas
!pip install vaderSentiment
!pip install tqdm
!pip install nltk

----
#### Needed imports

In [1]:
import pandas as pd
import snscrape.modules.twitter as sntwitter
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tnrange, tqdm_notebook, tqdm
from nltk.tokenize import sent_tokenize, word_tokenize

----
#### Start Mining Tweets

In [48]:
query = "(crash, OR crashing, OR cair, OR queda, OR subir, OR subida, OR bullish, OR bearish, OR explode, OR exploding) -#BTC -#SafeBlast -#bitcoin -#SOL -#solana -#ADA -#XRP -#SHIB -#BNB -giveaway -congrats -congratulations -giving -link (#eth) until:2022-09-15 since:2022-08-15"
# query = "(crash, OR crashing, OR cair, OR queda, OR subir, OR subida, OR bullish, OR bearish, OR explode, OR exploding) -#BTC -#SafeBlast -#bitcoin -#SOL -#solana -#ADA -#XRP -#SHIB -#BNB -giveaway -giveaways -congrats -congratulations -winner -giving -link -https -telegram (#eth) until:2022-09-15 since:2022-08-15"
tweets = []
limit = 1000

#TODO: meter aqui a barra de progresso ( https://github.com/tqdm/tqdm )

for tweet in sntwitter.TwitterHashtagScraper(query).get_items():
    
    if len(tweets) == limit:
        break
    else:
        tweets.append([tweet.date, tweet.url, tweet.user.username, tweet.sourceLabel, tweet.user.location, tweet.content, tweet.likeCount, tweet.retweetCount,  tweet.quoteCount, tweet.replyCount])
        
df = pd.DataFrame(tweets, columns=['Date', 'TweetURL','User', 'Source', 'Location', 'Tweet', 'Likes_Count','Retweet_Count', 'Quote_Count', 'Reply_Count'])

df.to_csv('../data/bullishTweets.csv')

print("Shape: ", df.shape)

  tweets.append([tweet.date, tweet.url, tweet.user.username, tweet.sourceLabel, tweet.user.location, tweet.content, tweet.likeCount, tweet.retweetCount,  tweet.quoteCount, tweet.replyCount])


Shape:  (1000, 10)


#### Sentiment Analysis with VADER

In [49]:
analyzer = SentimentIntensityAnalyzer()
compound = []
for i,s in enumerate(tqdm(df['Tweet'])):
    vs = analyzer.polarity_scores(s)
    compound.append(vs["compound"])
df["compoundVader"] = compound
df.head(2)

df.to_csv('../data/compoundAnalysis.csv')

100%|██████████| 1000/1000 [00:00<00:00, 3366.57it/s]


#### Sentiment Analysis with TextBlob

In [50]:
compound = []
for i,s in enumerate(tqdm(df['Tweet'])):
    vs = TextBlob(s).sentiment
    compound.append(vs)
df["compoundTextBlob"] = compound
df.head(2)

df.to_csv('../data/compoundAnalysis.csv')

100%|██████████| 1000/1000 [00:00<00:00, 1779.64it/s]


#### Sort vader compound values by descending order

In [51]:
df2 = df.sort_values(by=['compoundVader'], ascending=False)
df2.to_csv('../data/orderedAnalysis.csv')

#### Calculate mean compound value (pensar numa maneira melhor de ver isto, mas para já faz o serviço)

In [52]:
i = 0
for x in df2['compoundVader']:
    i += x

mean = i/len(df2['compoundVader'])
print("Mean: ", mean)

Mean:  0.15064550000000168


---
### Aplicar o NLTK

----
#### **Teste de Named Entity Recognition**

In [6]:
phrase = "#BTC looks like it's going to crash again, so I'm just going to wait and see what happens."
sentiment = TextBlob(phrase).sentiment
print("Sentiment 1: ", sentiment)

phrase = "#BTC looks like it's going to go bad again, so I'm just going to wait and see what happens."
sentiment = TextBlob(phrase).sentiment
print("Sentiment 2: ", sentiment)

Sentiment 1:  Sentiment(polarity=0.0, subjectivity=0.0)
Sentiment 2:  Sentiment(polarity=-0.6999999999999998, subjectivity=0.6666666666666666)


In [13]:
phrase = "#BTC looks like it's going to crash again, so I'm just going to wait and see what happens."
sentiment = analyzer.polarity_scores(phrase)
print("Sentiment 1: ", sentiment["compound"])

phrase = "#BTC looks like it's going to go bad again, so I'm just going to wait and see what happens."
sentiment = analyzer.polarity_scores(phrase)
print("Sentiment 2: ", sentiment["compound"])

Sentiment 1:  -0.0516
Sentiment 2:  -0.25


----
### TODO:

- Ordenar por sentimento e verificar se corresponde

- Utilizar uma palavra (tipo "money") para substituir pelo BTC, #BTC, Bitcoin, etc.. para verificar se o Vader e o TextBlob conseguem extrair conhecimento com isso, já que é uma palavra que ele deve conhecer o significado e ver se melhora os resultados - Pesquisar sobre Named Entity Recognition

- Fazer os resultados manualmente para 5 ou 10 tweets (que sejam explicitos sobre o seu sentimento) e comparar com os valores previstos pelo Vader e o TextBlob para ver se as falhas nos resultados são deles ou dos Tweets que não dizem merda nenhuma de jeito. Aproveitar para justificar isso no relatório

- Instalar NLTK (nltk.corpus, nltk.tokenize, nltk.probability, word_tokenize) [Ver este link](https://www.analyticsvidhya.com/blog/2021/06/vader-for-sentiment-analysis/)

- Verficar tweets nulos, sem conteudo, etc...

- Verificar a quantidade de interações

- Meter o tqsm a funcionar no scapping dos tweets