In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF

In [2]:
df_tweets = pd.read_csv('./data/btc_tweets_cleaned.csv')

In [3]:
corpus = df_tweets['tweet'].fillna(value='')

In [4]:
for tweet in df_tweets['tweet'].sample(10).values:
    print(tweet)
    print()

square remix cool thing input file transparent transparent tons possibilities like witht logo vritual guts kinda

blockchain ayuda combatir las estafas fraude brasil dice experto

year shaping special

btc breaking structure test

targets

royal defi airdrop liveclick link participate amazing rewards rld value referral

gold going loser break downtrending stock market going higher higher people investing head checked

follow let hunt bitcoins exciting experience gaming platform dividends username bitcoinbre

dctbuy btc sell btc ratio

 lees mal hace aos que existe los aos que quedan



In [5]:
stop_words = ENGLISH_STOP_WORDS.union(['bitcoin'])

In [6]:
tfidf = TfidfVectorizer(stop_words=stop_words,min_df=0.0002)

tweet_word_matrix = tfidf.fit_transform(corpus)

vocab = tfidf.get_feature_names()

## NMF

In [13]:
nmf = NMF(n_components=4)
nmf.fit(tweet_word_matrix)

NMF(n_components=4)

### Tweet/Topic Matrix

In [14]:
tweet_topic_matrix = nmf.transform(tweet_word_matrix)

In [15]:
tweet_topic_matrix_df = pd.DataFrame(tweet_topic_matrix).add_prefix('topic_')

tweet_topic_matrix_df[['tweet', 'tokens']] = df_tweets[['tweet', 'tokens']]
tweet_topic_matrix_df.head()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,tweet,tokens
0,0.008427,7e-06,0.0,0.175026,yworld finance airdrop liveclick link particip...,"['yworld', 'finance', 'airdrop', 'liveclick', ..."
1,0.0,0.000649,0.001778,3.9e-05,makes new ath,"['makes', 'new', 'ath']"
2,0.0,0.0,0.128482,0.0,btc usd btc usd,"['btc', 'usd', 'btc', 'usd']"
3,0.150875,0.0,0.0,0.0,royal defi airdrop liveclick link participate ...,"['royal', 'defi', 'airdrop', 'liveclick', 'lin..."
4,0.0,0.0,0.0,0.0,,[]


### Word/Topic Matrix

In [16]:
word_topic_matrix_df = pd.DataFrame(nmf.components_, columns=vocab).T.add_prefix('topic_')
word_topic_matrix_df

Unnamed: 0,topic_0,topic_1,topic_2,topic_3
aan,0.000000,0.000111,0.000417,0.000000e+00
aapl,0.000000,0.000022,0.000288,1.503593e-07
aave,0.000049,0.000000,0.003653,1.378159e-04
abajo,0.000000,0.000000,0.000419,0.000000e+00
abd,0.000000,0.000000,0.001129,0.000000e+00
...,...,...,...,...
zrx,0.000065,0.000010,0.002475,2.056288e-04
zuckerberg,0.000000,0.000090,0.000203,2.211700e-06
zukunft,0.000000,0.001072,0.000082,0.000000e+00
zum,0.000000,0.003091,0.001185,1.818430e-06


## Topic Interpretation

In [17]:
for tweet in tweet_topic_matrix_df.sort_values(by='topic_0', ascending=False).head(10)['tweet'].values:
    print(tweet)
    print()

royal defi airdrop liveclick link participate amazing rewards rld value referral

royal defi airdrop liveclick link participate amazing rewards rld value referral

royal defi airdrop liveclick link participate amazing rewards rld value referral

royal defi airdrop liveclick link participate amazing rewards rld value referral

royal defi airdrop liveclick link participate amazing rewards rld value referral

royal defi airdrop liveclick link participate amazing rewards rld value referral

royal defi airdrop liveclick link participate amazing rewards rld value referral

royal defi airdrop liveclick link participate amazing rewards rld value referral

royal defi airdrop liveclick link participate amazing rewards rld value referral

royal defi airdrop liveclick link participate amazing rewards rld value referral



In [18]:
word_topic_matrix_df.sort_values(by='topic_0', ascending=False).head(10)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3
rld,2.240378,0.0,0.0,0.0
royal,2.233455,0.0,0.0,0.0
defi,2.219397,0.0,0.001085,0.0
liveclick,1.925286,0.0,0.0,1.179759
rewards,1.918888,0.0,0.0,1.180568
participate,1.916131,0.0,0.0,1.187234
airdrop,1.909415,0.0,0.0,1.185873
amazing,1.908919,0.006455,0.0,1.18324
referral,1.902039,0.0,0.0,1.177803
value,1.896742,2e-05,0.000574,1.166125


Topic 0: Hype Topics. To get people into buying the coin

In [19]:
def top_tweets(tweet_topic_matrix_df, topic, n_tweets):
    return (tweet_topic_matrix_df
            .sort_values(by=topic, ascending=False)
            .head(n_tweets)['tweet']
            .values)

def top_words(word_topic_matrix_df, topic, n_words):
    return (word_topic_matrix_df
            .sort_values(by=topic, ascending=False)
            .head(n_words))[topic]

In [20]:
for tweet in top_tweets(tweet_topic_matrix_df, 'topic_1', 5):
    print(tweet)
    print()

happy birthday

happy birthday

happy birthday

happy birthday

happy birthday



In [21]:
top_words(word_topic_matrix_df, 'topic_1', 10)

happy          3.492126
birthday       3.102443
whitepaper     0.395779
day            0.378784
paper          0.354763
white          0.351575
halloween      0.282809
anniversary    0.280153
years          0.214211
today          0.176119
Name: topic_1, dtype: float64

Topic 1 is celebrating Bitcoin and when the white paper was initially released

In [22]:
for tweet in top_tweets(tweet_topic_matrix_df, 'topic_2', 5):
    print(tweet)
    print()

nexobuy btc sell btc ratio buy btc sell btc ratio

apmbuy btc sell btc ratio buy btc sell btc ratio

spndbuy btc sell btc ratio buy btc sell btc ratio

dmtbuy btc sell btc ratio buy btc sell btc ratio

lbabuy btc sell btc ratio buy btc sell btc ratio



In [23]:
top_words(word_topic_matrix_df, 'topic_2', 10)

btc       2.818284
ratio     1.462735
sell      1.448496
usdt      0.940785
buy       0.514024
price     0.344469
eth       0.327537
usd       0.263669
update    0.181514
yen       0.150736
Name: topic_2, dtype: float64

Topic 3: Seems to be talking about the update on Bitcoins price and what will happen to it 

In [24]:
for tweet in top_tweets(tweet_topic_matrix_df, 'topic_3', 5):
    print(tweet)
    print()

hugeswap airdrop liveclick link participate amazing rewards huswap value referral

hugeswap airdrop liveclick link participate amazing rewards huswap value referral

hugeswap airdrop liveclick link participate amazing rewards huswap value referral

hugeswap airdrop liveclick link participate amazing rewards huswap value referral

hugeswap airdrop liveclick link participate amazing rewards huswap value referral



In [25]:
top_words(word_topic_matrix_df, 'topic_2', 10)

btc       2.818284
ratio     1.462735
sell      1.448496
usdt      0.940785
buy       0.514024
price     0.344469
eth       0.327537
usd       0.263669
update    0.181514
yen       0.150736
Name: topic_2, dtype: float64