In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load data
# use skip initialspace in case headers has hidden space

df = pd.read_csv("C:/Users/storm/Desktop/METIS/Project/project 4_TOPIC MODELLING/dataset/crypto_currency_reddit_data.csv", usecols = ['title'], skipinitialspace=True)


In [3]:
df.head()

Unnamed: 0,title
0,CryptoNick is deleting all of his BitConnect v...
1,"I will tell you exactly what is going on here,..."
2,Robinhood is launching a Crypto Trading app to...
3,"Checkmate, Bill."
4,Delta's app store description seems appropriat...


In [4]:
#Remove Numbers

df['title'] = df['title'].str.replace('\d+', '')

In [5]:
#Convert dataframe to series object to be able to parse into CountVectorizer

tex = df['title']
type(tex)

pandas.core.series.Series

In [6]:

vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, ngram_range=(1,3), max_df=10, use_idf=True)
doc_word = vectorizer.fit_transform(tex)

In [7]:
# vectorizer = CountVectorizer(stop_words='english', lowercase=True, ngram_range=(1,3), max_df=10)
# doc_word = vectorizer.fit_transform(tex)
# doc_word.shape

In [8]:
pd.DataFrame(doc_word.toarray(), index=df, columns=vectorizer.get_feature_names()).head()

Unnamed: 0,abandoned,abandoned reddit,abandoned reddit account,ability,ability buy,ability buy sell,able,able security,able security audit,able use,...,zero bulletproofs,zero bulletproofs upgrade,zero customer,zero customer support,zero fee,zero fee exchange,zimbabwe,zimbabwe banned,zimbabwe banned dollar,zoom
"(CryptoNick is deleting all of his BitConnect videos, and so are his buddies. Please never forget what he and his cohorts did to so many people, and how much money those people lost in the process thanks to CryptoNick, Trevon James, and Craig Grant!,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(I will tell you exactly what is going on here, this is critical information to understand if you are going to make money in this space. How prices work, and what moves them - and it's not money invested/withdrawn.,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(Robinhood is launching a Crypto Trading app to compete with Coinbase,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(Checkmate, Bill.,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(Delta's app store description seems appropriate today.,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
#Latent Semantic Analysis (LSA) 

lsa = TruncatedSVD(4)
doc_topic = lsa.fit_transform(doc_word)
lsa.explained_variance_ratio_

array([0.00253011, 0.00193535, 0.0021144 , 0.00208001])

In [10]:
#Vt Matrix
#May add index = ["component_1","component_2", "component_3", "component_4"]

topic_word = pd.DataFrame(lsa.components_.round(3),             
             columns = vectorizer.get_feature_names())
topic_word

Unnamed: 0,abandoned,abandoned reddit,abandoned reddit account,ability,ability buy,ability buy sell,able,able security,able security audit,able use,...,zero bulletproofs,zero bulletproofs upgrade,zero customer,zero customer support,zero fee,zero fee exchange,zimbabwe,zimbabwe banned,zimbabwe banned dollar,zoom
0,-0.001,-0.001,-0.001,0.0,0.0,0.0,0.001,-0.0,-0.0,0.001,...,0.0,0.0,0.0,0.0,0.003,0.003,0.001,0.001,0.001,0.016
1,0.003,0.003,0.003,0.001,0.001,0.001,0.004,0.002,0.002,0.003,...,0.001,0.001,0.001,0.001,0.0,0.0,0.001,0.001,0.001,-0.028
2,-0.002,-0.002,-0.002,-0.002,-0.002,-0.002,0.002,-0.0,-0.0,0.002,...,-0.003,-0.003,-0.002,-0.002,-0.002,-0.002,-0.001,-0.001,-0.001,0.04
3,-0.003,-0.003,-0.003,-0.002,-0.002,-0.002,0.0,0.001,0.001,-0.0,...,-0.002,-0.002,-0.002,-0.002,-0.003,-0.003,0.002,0.002,0.002,-0.035


In [11]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [12]:
display_topics(lsa, vectorizer.get_feature_names(), 10)


Topic  0
reduced, withdrawal, withdrawal fees, fees reduced, withdrawal fees reduced, come, make known, known binance, known binance want, want withdrawal

Topic  1
request, request network, update, th, musk, elon, elon musk, project, hodl, project update

Topic  2
win, request, request network, update, thanks, remember, crypto market, craig, wright, craig wright

Topic  3
fraud scam, bitcoin fraud scam, bitcoin fraud, fraud, moon, lambo, moon lambo, thank, wright, thank cryptocurrency


In [13]:
#U Matrix
#May add columns = ["component_1","component_2","component_3","component_4"]

Vt = pd.DataFrame(doc_topic.round(5),
             index = df)
Vt

Unnamed: 0,0,1,2,3
"(CryptoNick is deleting all of his BitConnect videos, and so are his buddies. Please never forget what he and his cohorts did to so many people, and how much money those people lost in the process thanks to CryptoNick, Trevon James, and Craig Grant!,)",0.01054,0.00352,0.03979,-0.00647
"(I will tell you exactly what is going on here, this is critical information to understand if you are going to make money in this space. How prices work, and what moves them - and it's not money invested/withdrawn.,)",0.00364,0.02744,-0.00798,-0.03105
"(Robinhood is launching a Crypto Trading app to compete with Coinbase,)",0.00998,0.01442,0.00068,-0.01538
"(Checkmate, Bill.,)",-0.00060,-0.00505,-0.00625,0.00501
"(Delta's app store description seems appropriate today.,)",0.00999,0.02465,-0.00449,0.00150
...,...,...,...,...
"(Elon Musk: ""Whoever owns the early BTC deserves a Nobel prize in delayed gratification."",)",0.00769,0.14197,-0.08662,-0.16663
"(Harvesting the body heat of , people can mine BTC/month,)",-0.00548,-0.01098,-0.01706,-0.00219
"(IBM Launches A Blockchain Based Global Payments Network Using Stellar's Cryptocurrency,)",0.00451,0.01624,0.00995,-0.00862
"( Million Tethers have been printed in the past days. Tether has still not provided an audit of funds backing USDT.,)",0.01288,0.01913,0.00183,-0.00503


In [14]:
#NMF

In [15]:
nmf_model = NMF(4)
doc_topic = nmf_model.fit_transform(doc_word)

In [16]:
#The H matrix (topic_word) shows the 2 resulting topics, and the terms associated with each topic
#May add index = ["component_1","component_2", "component_3", "component_4"]

topic_word = pd.DataFrame(nmf_model.components_.round(3),
                          columns = vectorizer.get_feature_names())
topic_word

Unnamed: 0,abandoned,abandoned reddit,abandoned reddit account,ability,ability buy,ability buy sell,able,able security,able security audit,able use,...,zero bulletproofs,zero bulletproofs upgrade,zero customer,zero customer support,zero fee,zero fee exchange,zimbabwe,zimbabwe banned,zimbabwe banned dollar,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.003,0.003,0.0,0.0,0.0,0.0
1,0.001,0.001,0.001,0.001,0.001,0.001,0.0,0.0,0.0,0.0,...,0.0,0.0,0.001,0.001,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.001,0.001,0.001,0.0,0.0,0.0,0.0,...,0.0,0.0,0.001,0.001,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.002,0.002,0.0,0.0,0.0,0.0


In [17]:
display_topics(nmf_model, vectorizer.get_feature_names(), 10)


Topic  0
reduced, withdrawal, withdrawal fees, withdrawal fees reduced, fees reduced, come, want withdrawal fees, lets make, make known, make known binance

Topic  1
request, request network, update, th, satoshi, project, network project, project update, request network project, network project update

Topic  2
win, thanks, real, real world adoption, world adoption, real world, venezuela, hyperinflation, adoption, usd

Topic  3
fraud scam, bitcoin fraud scam, bitcoin fraud, fraud, wright, craig, craig wright, charlie, charlie lee, lee


In [18]:
#The W matrix shows the comments started with, and how each comments is made up of the 2 resulting topics
#May add columns = ["component_1","component_2","component_3","component_4"]

H = pd.DataFrame(doc_topic.round(5),
             index = df)
H

Unnamed: 0,0,1,2,3
"(CryptoNick is deleting all of his BitConnect videos, and so are his buddies. Please never forget what he and his cohorts did to so many people, and how much money those people lost in the process thanks to CryptoNick, Trevon James, and Craig Grant!,)",0.00482,0.00724,0.01317,0.02372
"(I will tell you exactly what is going on here, this is critical information to understand if you are going to make money in this space. How prices work, and what moves them - and it's not money invested/withdrawn.,)",0.00264,0.01245,0.00012,0.00117
"(Robinhood is launching a Crypto Trading app to compete with Coinbase,)",0.00000,0.00415,0.00012,0.00121
"(Checkmate, Bill.,)",0.00000,0.00000,0.00000,0.00000
"(Delta's app store description seems appropriate today.,)",0.00000,0.00488,0.00000,0.00014
...,...,...,...,...
"(Elon Musk: ""Whoever owns the early BTC deserves a Nobel prize in delayed gratification."",)",0.00000,0.01768,0.00000,0.00000
"(Harvesting the body heat of , people can mine BTC/month,)",0.00001,0.00158,0.00053,0.00009
"(IBM Launches A Blockchain Based Global Payments Network Using Stellar's Cryptocurrency,)",0.00052,0.00978,0.00649,0.00304
"( Million Tethers have been printed in the past days. Tether has still not provided an audit of funds backing USDT.,)",0.00000,0.00757,0.00024,0.00077


In [19]:
#LDA

In [20]:
# gensim
from gensim import corpora, models, similarities, matutils

# sklearn
from sklearn.feature_extraction.text import CountVectorizer

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [21]:
#convert sparse scipy matrix to a gensim-friendly object(Corpus)

corpus = matutils.Sparse2Corpus(doc_word)

In [22]:
#save a mapping (dict) of row id to word (token) for later use by gensim:

id2word = dict((v, k) for k, v in vectorizer.vocabulary_.items())

In [23]:
len(id2word)

13709

In [24]:
# Create lda model (equivalent to "fit" in sklearn)
lda = models.LdaModel(corpus=corpus, num_topics=4, id2word=id2word, passes=10)

2020-06-07 19:05:40,944 : INFO : using symmetric alpha at 0.25
2020-06-07 19:05:40,944 : INFO : using symmetric eta at 0.25
2020-06-07 19:05:40,947 : INFO : using serial LDA version on this node
2020-06-07 19:05:40,952 : INFO : running online (multi-pass) LDA training, 4 topics, 10 passes over the supplied corpus of 13709 documents, updating model once every 2000 documents, evaluating perplexity every 13709 documents, iterating 50x with a convergence threshold of 0.001000
2020-06-07 19:05:40,958 : INFO : PROGRESS: pass 0, at document #2000/13709
2020-06-07 19:05:41,178 : INFO : merging changes from 2000 documents into a model of 13709 documents
2020-06-07 19:05:41,182 : INFO : topic #0 (0.250): 0.003*"announce today" + 0.002*"appeal rejected explanation" + 0.002*"afford lambo" + 0.002*"acceptable" + 0.002*"amazon petition accept" + 0.002*"action video" + 0.002*"attack outed paying" + 0.002*"ago invested nzd" + 0.002*"ask coins" + 0.002*"appeals explaining"
2020-06-07 19:05:41,182 : INF

2020-06-07 19:05:42,199 : INFO : topic #1 (0.250): 0.002*"anonymity" + 0.002*"active time posted" + 0.002*"assets market" + 0.002*"accept crypto" + 0.002*"approach bitcoin" + 0.002*"africa" + 0.002*"accept ethereum" + 0.002*"august showing shaky" + 0.002*"announce support" + 0.002*"bank punished overcharging"
2020-06-07 19:05:42,200 : INFO : topic #2 (0.250): 0.002*"answered" + 0.002*"amex mooneybookers et" + 0.002*"absolute fucking" + 0.002*"allegedly" + 0.002*"anonymous creator" + 0.002*"allow" + 0.002*"actually going price" + 0.002*"bang" + 0.002*"advertised" + 0.002*"america"
2020-06-07 19:05:42,201 : INFO : topic #3 (0.250): 0.002*"allowing risk free" + 0.002*"accidentally" + 0.002*"bank directly sell" + 0.002*"accept bitcoin" + 0.002*"annual transaction" + 0.002*"banks credit" + 0.002*"anchored" + 0.002*"appeals explaining appeal" + 0.002*"assets pay" + 0.002*"bad collapsed"
2020-06-07 19:05:42,201 : INFO : topic diff=0.040277, rho=0.408248
2020-06-07 19:05:42,422 : INFO : -11.06

2020-06-07 19:05:43,192 : INFO : topic diff=0.022284, rho=0.336061
2020-06-07 19:05:43,196 : INFO : PROGRESS: pass 1, at document #10000/13709
2020-06-07 19:05:43,334 : INFO : merging changes from 2000 documents into a model of 13709 documents
2020-06-07 19:05:43,338 : INFO : topic #0 (0.250): 0.002*"acceptable" + 0.002*"anonymity features" + 0.002*"actually won webby" + 0.002*"admits cryptocurrencies threat" + 0.002*"authorities make" + 0.002*"algorithm make attack" + 0.002*"banned dollar" + 0.002*"avoid traps" + 0.002*"audit confirms" + 0.002*"actually"
2020-06-07 19:05:43,338 : INFO : topic #1 (0.250): 0.002*"author" + 0.002*"anonymity" + 0.002*"asks" + 0.002*"aug" + 0.002*"asked" + 0.002*"account registration opened" + 0.002*"australian banks" + 0.002*"allow cryptocurrency trading" + 0.002*"accept ethereum" + 0.002*"authorities"
2020-06-07 19:05:43,339 : INFO : topic #2 (0.250): 0.002*"affair block" + 0.002*"answered" + 0.002*"account day btc" + 0.002*"america" + 0.002*"anonymous c

2020-06-07 19:05:44,197 : INFO : topic #2 (0.250): 0.003*"absurd journey reason" + 0.002*"backed partnership recently" + 0.002*"americans congressman" + 0.002*"announces ripple" + 0.002*"attack obsolete" + 0.002*"affair" + 0.002*"amex mooneybookers et" + 0.002*"asked government" + 0.002*"account provided zero" + 0.002*"account weekly"
2020-06-07 19:05:44,198 : INFO : topic #3 (0.250): 0.002*"algorithm make" + 0.002*"active" + 0.002*"banks credit" + 0.002*"ans" + 0.002*"banks gatekeepers" + 0.002*"athene" + 0.002*"adopt" + 0.002*"absolutely unacceptable" + 0.002*"aware anybody team" + 0.002*"available"
2020-06-07 19:05:44,198 : INFO : topic diff=0.014317, rho=0.318554
2020-06-07 19:05:44,203 : INFO : PROGRESS: pass 2, at document #8000/13709
2020-06-07 19:05:44,332 : INFO : merging changes from 2000 documents into a model of 13709 documents
2020-06-07 19:05:44,335 : INFO : topic #0 (0.250): 0.002*"actually" + 0.002*"bank profit years" + 0.002*"avoid traps" + 0.002*"bank liechtenstein gr

2020-06-07 19:05:45,198 : INFO : topic #0 (0.250): 0.002*"ad morning" + 0.002*"acceptable" + 0.002*"actually" + 0.002*"authorities make" + 0.002*"antonopoulos lashes" + 0.002*"audit confirms" + 0.002*"ago day normal" + 0.002*"ban south" + 0.002*"accepted sign million" + 0.002*"anonymity features"
2020-06-07 19:05:45,199 : INFO : topic #1 (0.250): 0.002*"australian banks announce" + 0.002*"aug" + 0.002*"attack" + 0.002*"accessible think" + 0.002*"africa" + 0.002*"anonymously easily make" + 0.002*"anonymity" + 0.002*"author rich" + 0.002*"allegedly help pay" + 0.002*"algorithm"
2020-06-07 19:05:45,199 : INFO : topic #2 (0.250): 0.002*"americans congressman" + 0.002*"absurd journey reason" + 0.002*"banking financial crisis" + 0.002*"affair" + 0.002*"apl just engaged" + 0.002*"amex mooneybookers et" + 0.002*"backed partnership recently" + 0.002*"alerts bitcoin" + 0.002*"apollo currency team" + 0.002*"america"
2020-06-07 19:05:45,200 : INFO : topic #3 (0.250): 0.002*"absolutely unacceptable

2020-06-07 19:05:46,042 : INFO : topic #3 (0.250): 0.002*"banana" + 0.002*"anchored" + 0.002*"banks credit" + 0.002*"allows promotion phishing" + 0.002*"banks gatekeepers" + 0.002*"absolutely unacceptable" + 0.002*"available" + 0.002*"announced insights" + 0.002*"abandoned reddit" + 0.002*"aware anybody team"
2020-06-07 19:05:46,042 : INFO : topic diff=0.011539, rho=0.303525
2020-06-07 19:05:46,074 : INFO : PROGRESS: pass 4, at document #2000/13709
2020-06-07 19:05:46,195 : INFO : merging changes from 2000 documents into a model of 13709 documents
2020-06-07 19:05:46,199 : INFO : topic #0 (0.250): 0.002*"authorities make" + 0.002*"ago invested nzd" + 0.002*"ago day normal" + 0.002*"acceptable" + 0.002*"attack trinity" + 0.002*"bank liechtenstein grew" + 0.002*"anonymity features" + 0.002*"announcing million" + 0.002*"announce today breyer" + 0.002*"ask ye"
2020-06-07 19:05:46,199 : INFO : topic #1 (0.250): 0.002*"account looks" + 0.002*"asked" + 0.002*"available people" + 0.002*"annual

2020-06-07 19:05:46,899 : INFO : topic #2 (0.250): 0.002*"answered" + 0.002*"amex mooneybookers et" + 0.002*"americans congressman" + 0.002*"anonymous creator" + 0.002*"allegedly" + 0.002*"allow" + 0.002*"bang" + 0.002*"america" + 0.002*"asked government" + 0.002*"andrew yang accepting"
2020-06-07 19:05:46,900 : INFO : topic #3 (0.250): 0.002*"allowing risk free" + 0.002*"bad collapsed" + 0.002*"banks credit" + 0.002*"accidentally" + 0.002*"active" + 0.002*"algorithm make" + 0.002*"banks gatekeepers" + 0.002*"anchored" + 0.002*"absolutely unacceptable" + 0.002*"account furniture outlets"
2020-06-07 19:05:46,900 : INFO : topic diff=0.011865, rho=0.290441
2020-06-07 19:05:47,095 : INFO : -10.625 per-word bound, 1579.8 perplexity estimate based on a held-out corpus of 1709 documents with 458 words
2020-06-07 19:05:47,096 : INFO : PROGRESS: pass 4, at document #13709/13709
2020-06-07 19:05:47,201 : INFO : merging changes from 1709 documents into a model of 13709 documents
2020-06-07 19:05:

2020-06-07 19:05:47,877 : INFO : topic #1 (0.250): 0.002*"asked" + 0.002*"allow cryptocurrency trading" + 0.002*"annual inflation drops" + 0.002*"aug" + 0.002*"anonymity" + 0.002*"authorities" + 0.002*"author" + 0.002*"asks" + 0.002*"attack" + 0.002*"account registration opened"
2020-06-07 19:05:47,878 : INFO : topic #2 (0.250): 0.002*"affair block" + 0.002*"answered" + 0.002*"account day btc" + 0.002*"america" + 0.002*"anonymous creator" + 0.002*"americans congressman" + 0.002*"andrew yang accepting" + 0.002*"asked government" + 0.002*"backed partnership recently" + 0.002*"absurd journey reason"
2020-06-07 19:05:47,878 : INFO : topic #3 (0.250): 0.002*"banks gatekeepers" + 0.002*"accept bitcoin" + 0.002*"banks credit" + 0.002*"bad collapsed" + 0.002*"algorithm make" + 0.002*"anchored" + 0.002*"active" + 0.002*"abandoned reddit" + 0.002*"abandoned" + 0.002*"affiliate program"
2020-06-07 19:05:47,879 : INFO : topic diff=0.007633, rho=0.278915
2020-06-07 19:05:47,884 : INFO : PROGRESS: p

2020-06-07 19:05:48,716 : INFO : PROGRESS: pass 6, at document #8000/13709
2020-06-07 19:05:48,849 : INFO : merging changes from 2000 documents into a model of 13709 documents
2020-06-07 19:05:48,853 : INFO : topic #0 (0.250): 0.002*"actually" + 0.002*"atls" + 0.002*"bank profit years" + 0.002*"acceptable" + 0.002*"avoid traps" + 0.002*"bank liechtenstein grew" + 0.002*"actually won webby" + 0.002*"anonymity features" + 0.002*"bank instalment" + 0.002*"ad morning"
2020-06-07 19:05:48,854 : INFO : topic #1 (0.250): 0.002*"allow cryptocurrency trading" + 0.002*"annual inflation drops" + 0.002*"attack" + 0.002*"allowing employees" + 0.002*"anonymity" + 0.002*"algorithm" + 0.002*"africa" + 0.002*"art" + 0.002*"aug" + 0.002*"ability buy sell"
2020-06-07 19:05:48,855 : INFO : topic #2 (0.250): 0.002*"ago day" + 0.002*"anonymous creator" + 0.002*"americans congressman" + 0.002*"affair" + 0.002*"absurd journey reason" + 0.002*"amex mooneybookers et" + 0.002*"attack obsolete" + 0.002*"apps nati

2020-06-07 19:05:49,706 : INFO : topic #3 (0.250): 0.002*"absolutely unacceptable" + 0.002*"ans" + 0.002*"argentina use crypto" + 0.002*"abandoned" + 0.002*"algorithm make" + 0.002*"absolutely" + 0.002*"affiliate" + 0.002*"antifiat protester" + 0.002*"banks credit" + 0.002*"announces coming"
2020-06-07 19:05:49,707 : INFO : topic diff=0.009501, rho=0.259460
2020-06-07 19:05:49,712 : INFO : PROGRESS: pass 7, at document #6000/13709
2020-06-07 19:05:49,836 : INFO : merging changes from 2000 documents into a model of 13709 documents
2020-06-07 19:05:49,839 : INFO : topic #0 (0.250): 0.002*"actually won webby" + 0.002*"allowing scams" + 0.002*"bank instalment" + 0.002*"acceptable" + 0.002*"actually" + 0.002*"audit confirms" + 0.002*"anonymity features" + 0.002*"ad morning" + 0.002*"atls" + 0.002*"ask"
2020-06-07 19:05:49,839 : INFO : topic #1 (0.250): 0.002*"anonymity" + 0.002*"algorithm" + 0.002*"ability buy sell" + 0.002*"aug" + 0.002*"attack" + 0.002*"authorities" + 0.002*"banking live"

2020-06-07 19:05:50,661 : INFO : topic #2 (0.250): 0.002*"americans congressman" + 0.002*"affair block" + 0.002*"asked government" + 0.002*"account provided zero" + 0.002*"affair" + 0.002*"backed partnership recently" + 0.002*"amex mooneybookers et" + 0.002*"alerts bitcoin" + 0.002*"america" + 0.002*"app developed"
2020-06-07 19:05:50,662 : INFO : topic #3 (0.250): 0.002*"ans" + 0.002*"bank account malta" + 0.002*"algorithm make" + 0.002*"banks credit" + 0.002*"absolutely unacceptable" + 0.002*"active users" + 0.002*"banana" + 0.002*"alts fomos bitcoin" + 0.002*"accidentally" + 0.002*"bad collapsed"
2020-06-07 19:05:50,663 : INFO : topic diff=0.012424, rho=0.251145
2020-06-07 19:05:50,669 : INFO : PROGRESS: pass 8, at document #4000/13709
2020-06-07 19:05:50,793 : INFO : merging changes from 2000 documents into a model of 13709 documents
2020-06-07 19:05:50,796 : INFO : topic #0 (0.250): 0.002*"ad morning" + 0.002*"acceptable" + 0.002*"actually" + 0.002*"authorities make" + 0.002*"audi

2020-06-07 19:05:51,604 : INFO : topic #1 (0.250): 0.002*"anonymity" + 0.002*"account looks" + 0.002*"avoid confusion newcomers" + 0.002*"accepts crypto" + 0.002*"authorities" + 0.002*"banks help enable" + 0.002*"active time posted" + 0.002*"annual inflation drops" + 0.002*"allegedly help pay" + 0.002*"appeal"
2020-06-07 19:05:51,605 : INFO : topic #2 (0.250): 0.002*"backed partnership recently" + 0.002*"amex mooneybookers et" + 0.002*"account provided zero" + 0.002*"approved" + 0.002*"allow" + 0.002*"answered" + 0.002*"app developed" + 0.002*"allegedly" + 0.002*"anonymous creator" + 0.002*"affair"
2020-06-07 19:05:51,606 : INFO : topic #3 (0.250): 0.002*"banana" + 0.002*"banks credit" + 0.002*"anchored" + 0.002*"banks gatekeepers" + 0.002*"absolutely unacceptable" + 0.002*"available" + 0.002*"allows promotion phishing" + 0.002*"abandoned reddit" + 0.002*"aware anybody team" + 0.002*"announced insights"
2020-06-07 19:05:51,607 : INFO : topic diff=0.008944, rho=0.251145
2020-06-07 19:05

2020-06-07 19:05:52,406 : INFO : topic #0 (0.250): 0.002*"atls" + 0.002*"americans buying" + 0.002*"actually won webby" + 0.002*"authorities make" + 0.002*"bank liechtenstein grew" + 0.002*"actually" + 0.002*"banning" + 0.002*"anonymity features" + 0.002*"audit confirms" + 0.002*"acceptable"
2020-06-07 19:05:52,406 : INFO : topic #1 (0.250): 0.002*"anonymity" + 0.002*"authorities" + 0.002*"annual inflation drops" + 0.002*"africa" + 0.002*"assets market" + 0.002*"active time posted" + 0.002*"accept crypto" + 0.002*"ability buy sell" + 0.002*"accessible think" + 0.002*"aug"
2020-06-07 19:05:52,407 : INFO : topic #2 (0.250): 0.002*"answered" + 0.002*"amex mooneybookers et" + 0.002*"americans congressman" + 0.002*"anonymous creator" + 0.002*"allow" + 0.002*"allegedly" + 0.002*"america" + 0.002*"asked government" + 0.002*"bang" + 0.002*"andrew yang accepting"
2020-06-07 19:05:52,408 : INFO : topic #3 (0.250): 0.002*"allowing risk free" + 0.002*"bad collapsed" + 0.002*"banks credit" + 0.002*

In [25]:
#Here are the 10 most important words for each of the 6 topics we found:
    
lda.print_topics()

2020-06-07 19:05:52,730 : INFO : topic #0 (0.250): 0.002*"banned dollar" + 0.002*"authorities make" + 0.002*"actually won webby" + 0.002*"actually" + 0.002*"announcing million" + 0.002*"atls" + 0.002*"americans buying" + 0.002*"banned" + 0.002*"bank liechtenstein grew" + 0.002*"ad morning"
2020-06-07 19:05:52,731 : INFO : topic #1 (0.250): 0.002*"anonymity" + 0.002*"account looks" + 0.002*"avoid confusion newcomers" + 0.002*"accepts crypto" + 0.002*"authorities" + 0.002*"banks help enable" + 0.002*"active time posted" + 0.002*"annual inflation drops" + 0.002*"algorithm" + 0.002*"allegedly help pay"
2020-06-07 19:05:52,732 : INFO : topic #2 (0.250): 0.002*"backed partnership recently" + 0.002*"amex mooneybookers et" + 0.002*"account provided zero" + 0.002*"approved" + 0.002*"allow" + 0.002*"answered" + 0.002*"app developed" + 0.002*"anonymous creator" + 0.002*"allegedly" + 0.002*"affair"
2020-06-07 19:05:52,733 : INFO : topic #3 (0.250): 0.002*"banana" + 0.002*"banks credit" + 0.002*"an

[(0,
  '0.002*"banned dollar" + 0.002*"authorities make" + 0.002*"actually won webby" + 0.002*"actually" + 0.002*"announcing million" + 0.002*"atls" + 0.002*"americans buying" + 0.002*"banned" + 0.002*"bank liechtenstein grew" + 0.002*"ad morning"'),
 (1,
  '0.002*"anonymity" + 0.002*"account looks" + 0.002*"avoid confusion newcomers" + 0.002*"accepts crypto" + 0.002*"authorities" + 0.002*"banks help enable" + 0.002*"active time posted" + 0.002*"annual inflation drops" + 0.002*"algorithm" + 0.002*"allegedly help pay"'),
 (2,
  '0.002*"backed partnership recently" + 0.002*"amex mooneybookers et" + 0.002*"account provided zero" + 0.002*"approved" + 0.002*"allow" + 0.002*"answered" + 0.002*"app developed" + 0.002*"anonymous creator" + 0.002*"allegedly" + 0.002*"affair"'),
 (3,
  '0.002*"banana" + 0.002*"banks credit" + 0.002*"anchored" + 0.002*"banks gatekeepers" + 0.002*"absolutely unacceptable" + 0.002*"available" + 0.002*"allows promotion phishing" + 0.002*"abandoned reddit" + 0.002*"a

In [26]:
# Transform the docs from the word space to the topic space 

lda_corpus = lda[corpus]
lda_corpus

<gensim.interfaces.TransformedCorpus at 0x2367d8fdc88>

In [27]:
# Store the documents' topic vectors in a list 

lda_docs = [doc for doc in lda_corpus]

In [28]:
# Check out the document vectors in the topic space 

lda_docs[0]

[(0, 0.21273905), (1, 0.21273752), (2, 0.21274012), (3, 0.36178333)]