In [7]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF

In [2]:
# Load the tweet dataset from a path relative to the notebook.
# NOTE(review): the file name suggests the tweets were already cleaned upstream — confirm.
df_tweets = pd.read_csv(filepath_or_buffer='./data/btc_tweets_cleaned.csv')

In [4]:
# Replace missing tweets with empty strings so every document handed to the
# vectorizer is a string.
corpus = df_tweets.loc[:, 'tweet'].fillna('')

In [5]:
# Eyeball ten randomly drawn raw tweets, separated by blank lines.
# The draw is unseeded, so a different sample is shown on every run.
sampled_tweets = df_tweets['tweet'].sample(10)
for text in sampled_tweets.values:
    print(text, end='\n\n')

narrative working cash exchange value social contract like cash cash fast primary feature try deposit cash shop bank tell fast

black swap airdrop liveclick link participate amazing rewards bls value referral

royal defi airdrop liveclick link participate amazing rewards rld value referral

yworld finance airdrop liveclick link participate amazing rewards ywld value referral

nan

speed finance airdrop liveclick link participate amazing rewards spfi value referral

vote trump buy

royal defi airdrop liveclick link participate amazing rewards rld value referral

amazing artwork

nan



In [8]:
# scikit-learn's built-in English stop words plus the corpus-specific term 'bitcoin'.
# NOTE(review): presumably 'bitcoin' is dropped because it appears in nearly every tweet — confirm.
stop_words = ENGLISH_STOP_WORDS | frozenset({'bitcoin'})

In [18]:
# Build the TF-IDF tweet/word matrix.
# * stop_words is handed over as a list: recent scikit-learn releases validate
#   this parameter and only accept 'english', a list, or None (a frozenset is
#   rejected).
# * min_df is a float, i.e. a proportion of documents: terms appearing in fewer
#   than 0.02% of the tweets are dropped from the vocabulary.
tfidf = TfidfVectorizer(stop_words=sorted(stop_words), min_df=0.0002)

tweet_word_matrix = tfidf.fit_transform(corpus)

# get_feature_names() was deprecated and later removed in favour of
# get_feature_names_out(); keep a fallback so older installations still work.
# Either way `vocab` ends up as a plain list of terms, in matrix column order.
if hasattr(tfidf, 'get_feature_names_out'):
    vocab = tfidf.get_feature_names_out().tolist()
else:
    vocab = tfidf.get_feature_names()

## NMF

In [63]:
# Factorize the TF-IDF matrix into 3 topics.
# random_state pins the randomized parts of the fit so that re-running the
# notebook yields the same topics under the same topic_0/topic_1/topic_2 labels;
# without it the per-topic interpretation can silently go stale.
nmf = NMF(n_components=3, random_state=42)
nmf.fit(tweet_word_matrix)

NMF(n_components=3)

### Tweet/Topic Matrix

In [64]:
# Project every tweet onto the fitted topics: one row per tweet, one column per topic.
tweet_topic_matrix = nmf.transform(X=tweet_word_matrix)

In [65]:
# Wrap the tweet/topic weights in a DataFrame with topic_<k> column names and
# attach each tweet's text and token list (matched on the row index) so the
# weights can be read next to the tweet they belong to.
topic_columns = [f'topic_{idx}' for idx in range(tweet_topic_matrix.shape[1])]
tweet_topic_matrix_df = pd.DataFrame(tweet_topic_matrix, columns=topic_columns).assign(
    tweet=df_tweets['tweet'],
    tokens=df_tweets['tokens'],
)
tweet_topic_matrix_df.head()

Unnamed: 0,topic_0,topic_1,topic_2,tweet,tokens
0,0.089789,0.00011,6e-05,yworld finance airdrop liveclick link particip...,"['yworld', 'finance', 'airdrop', 'liveclick', ..."
1,1e-05,0.00065,0.001778,makes new ath,"['makes', 'new', 'ath']"
2,0.0,0.0,0.12848,btc usd btc usd,"['btc', 'usd', 'btc', 'usd']"
3,0.14531,0.0,0.0,royal defi airdrop liveclick link participate ...,"['royal', 'defi', 'airdrop', 'liveclick', 'lin..."
4,0.0,0.0,0.0,,[]


### Word/Topic Matrix

In [66]:
# nmf.components_ has one row per topic and one column per vocabulary term;
# transpose it so each row is a word and each column a topic_<k> weight.
word_topic_matrix_df = pd.DataFrame(nmf.components_.T, index=vocab).add_prefix('topic_')
word_topic_matrix_df

Unnamed: 0,topic_0,topic_1,topic_2
aan,0.000000e+00,0.000111,0.000417
aapl,0.000000e+00,0.000022,0.000288
aave,9.798329e-05,0.000000,0.003653
abajo,0.000000e+00,0.000000,0.000419
abd,0.000000e+00,0.000000,0.001129
...,...,...,...
zrx,1.390053e-04,0.000010,0.002475
zuckerberg,1.728993e-07,0.000090,0.000203
zukunft,0.000000e+00,0.001072,0.000082
zum,0.000000e+00,0.003091,0.001185


## Topic Interpretation

In [67]:
# Print the ten tweets with the highest topic_0 weight, separated by blank lines.
topic0_ranked = tweet_topic_matrix_df.sort_values(by='topic_0', ascending=False)
for text in topic0_ranked['tweet'].iloc[:10].values:
    print(text, end='\n\n')

royal defi airdrop liveclick link participate amazing rewards rld value referral

royal defi airdrop liveclick link participate amazing rewards rld value referral

royal defi airdrop liveclick link participate amazing rewards rld value referral

royal defi airdrop liveclick link participate amazing rewards rld value referral

royal defi airdrop liveclick link participate amazing rewards rld value referral

royal defi airdrop liveclick link participate amazing rewards rld value referral

royal defi airdrop liveclick link participate amazing rewards rld value referral

royal defi airdrop liveclick link participate amazing rewards rld value referral

royal defi airdrop liveclick link participate amazing rewards rld value referral

royal defi airdrop liveclick link participate amazing rewards rld value referral



In [68]:
# Ten words with the highest topic_0 weight (all topic columns shown for comparison).
word_topic_matrix_df.sort_values('topic_0', ascending=False).iloc[:10]

Unnamed: 0,topic_0,topic_1,topic_2
liveclick,2.108045,0.0,0.0
participate,2.103336,0.0,0.0
rewards,2.102987,0.0,0.0
airdrop,2.097099,0.0,0.0
amazing,2.09561,0.007078,0.0
referral,2.087534,0.0,0.0
value,2.078319,0.000595,0.001055
link,2.03435,0.004544,0.016534
rld,1.871876,0.0,0.0
royal,1.864863,0.0,0.0


Topic 0: Hype topic — airdrop and referral promotions meant to get people to buy the coin.

In [69]:
def top_tweets(tweet_topic_matrix_df, topic, n_tweets):
    """Return the text of the tweets that weigh most heavily on a topic.

    Parameters
    ----------
    tweet_topic_matrix_df : DataFrame with a 'tweet' column and a column named `topic`.
    topic : name of the column to rank by (e.g. 'topic_1').
    n_tweets : number of top-ranked tweets to keep.

    Returns
    -------
    Array of the 'tweet' values of the `n_tweets` highest-scoring rows,
    ordered from highest to lowest score. The input frame is not modified.
    """
    ranked = tweet_topic_matrix_df.sort_values(by=topic, ascending=False)
    strongest = ranked.iloc[:n_tweets]
    return strongest['tweet'].values

def top_words(word_topic_matrix_df, topic, n_words):
    """Return the highest-weighted words for a topic.

    Parameters
    ----------
    word_topic_matrix_df : DataFrame indexed by word, with a column named `topic`.
    topic : name of the column to rank by (e.g. 'topic_1').
    n_words : number of top-ranked words to keep.

    Returns
    -------
    Series named `topic` holding the `n_words` largest weights, indexed by
    word and ordered from highest to lowest. The input frame is not modified.
    """
    ranked = word_topic_matrix_df.sort_values(by=topic, ascending=False)
    strongest = ranked.iloc[:n_words]
    return strongest[topic]

In [70]:
# Five tweets that load most heavily on topic_1, separated by blank lines.
for text in top_tweets(tweet_topic_matrix_df, topic='topic_1', n_tweets=5):
    print(text, end='\n\n')

happy birthday

happy birthday

happy birthday

happy birthday

happy birthday



In [71]:
# Ten highest-weighted words for topic_1.
top_words(word_topic_matrix_df, topic='topic_1', n_words=10)

happy          3.492094
birthday       3.102414
whitepaper     0.395780
day            0.378788
paper          0.354764
white          0.351577
halloween      0.282808
anniversary    0.280152
years          0.214214
today          0.176124
Name: topic_1, dtype: float64

Topic 1: Celebrating Bitcoin's birthday, i.e. the anniversary of the white paper's initial release.

In [72]:
# Five tweets that load most heavily on topic_2, separated by blank lines.
for text in top_tweets(tweet_topic_matrix_df, topic='topic_2', n_tweets=5):
    print(text, end='\n\n')

spndbuy btc sell btc ratio buy btc sell btc ratio

ionbuy btc sell btc ratio buy btc sell btc ratio

dmtbuy btc sell btc ratio buy btc sell btc ratio

ognbuy btc sell btc ratio buy btc sell btc ratio

aergobuy btc sell btc ratio buy btc sell btc ratio



In [73]:
# Ten highest-weighted words for topic_2.
top_words(word_topic_matrix_df, topic='topic_2', n_words=10)

btc       2.818242
ratio     1.462762
sell      1.448524
usdt      0.940838
buy       0.514040
price     0.344453
eth       0.327546
usd       0.263664
update    0.181511
yen       0.150731
Name: topic_2, dtype: float64

Topic 2: Seems to be about updates on Bitcoin's price (buy/sell ratios) and what will happen to it.