In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load data
# use skip initialspace in case headers has hidden space

df = pd.read_csv("C:/Users/storm/Desktop/METIS/Project/project 4_TOPIC MODELLING/dataset/crypto_currency_reddit_data.csv", usecols = ['title'], skipinitialspace=True)


In [3]:
df.head()

Unnamed: 0,title
0,CryptoNick is deleting all of his BitConnect v...
1,"I will tell you exactly what is going on here,..."
2,Robinhood is launching a Crypto Trading app to...
3,"Checkmate, Bill."
4,Delta's app store description seems appropriat...


In [4]:
#Remove Numbers

df['title'] = df['title'].str.replace('\d+', '')

In [5]:
#Convert dataframe to series object to be able to parse into CountVectorizer

tex = df['title']
type(tex)

pandas.core.series.Series

In [6]:
vectorizer = CountVectorizer(stop_words='english', lowercase=True, ngram_range=(1,3), max_df=10)
doc_word = vectorizer.fit_transform(tex)
doc_word.shape

(934, 13709)

In [7]:
pd.DataFrame(doc_word.toarray(), index=df, columns=vectorizer.get_feature_names()).head()

Unnamed: 0,abandoned,abandoned reddit,abandoned reddit account,ability,ability buy,ability buy sell,able,able security,able security audit,able use,...,zero bulletproofs,zero bulletproofs upgrade,zero customer,zero customer support,zero fee,zero fee exchange,zimbabwe,zimbabwe banned,zimbabwe banned dollar,zoom
"(CryptoNick is deleting all of his BitConnect videos, and so are his buddies. Please never forget what he and his cohorts did to so many people, and how much money those people lost in the process thanks to CryptoNick, Trevon James, and Craig Grant!,)",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"(I will tell you exactly what is going on here, this is critical information to understand if you are going to make money in this space. How prices work, and what moves them - and it's not money invested/withdrawn.,)",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"(Robinhood is launching a Crypto Trading app to compete with Coinbase,)",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"(Checkmate, Bill.,)",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"(Delta's app store description seems appropriate today.,)",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
#Latent Semantic Analysis (LSA) 

lsa = TruncatedSVD(4)
doc_topic = lsa.fit_transform(doc_word)
lsa.explained_variance_ratio_

array([0.00565127, 0.00563794, 0.00473545, 0.00477535])

In [9]:
#Vt Matrix
#May add index = ["component_1","component_2", "component_3", "component_4"]

topic_word = pd.DataFrame(lsa.components_.round(3),             
             columns = vectorizer.get_feature_names())
topic_word

Unnamed: 0,abandoned,abandoned reddit,abandoned reddit account,ability,ability buy,ability buy sell,able,able security,able security audit,able use,...,zero bulletproofs,zero bulletproofs upgrade,zero customer,zero customer support,zero fee,zero fee exchange,zimbabwe,zimbabwe banned,zimbabwe banned dollar,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,-0.001,-0.001,-0.001,-0.0,...,0.0,0.0,0.0,0.0,0.003,0.003,0.001,0.001,0.001,0.0
1,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.001,-0.001,-0.001,-0.0,...,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
2,0.001,0.001,0.001,0.001,0.001,0.001,-0.004,-0.003,-0.003,-0.0,...,0.0,0.0,0.001,0.001,-0.001,-0.001,0.0,0.0,0.0,-0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.001,0.001,0.0,...,0.0,0.0,0.0,0.0,0.001,0.001,-0.001,-0.001,-0.001,0.0


In [10]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [11]:
display_topics(lsa, vectorizer.get_feature_names(), 10)


Topic  0
credit, fee, card, transaction, fraud, credit card, department, fraud department, cash, pay

Topic  1
privacy, duckduckgo, search, data, great, information, user, brave, companies, shows

Topic  2
total, fines penalties, total fines, fines, total fines penalties, penalties, billion total fines, billion total, allowing, mining

Topic  3
total, fines penalties, penalties, billion total, billion total fines, total fines, total fines penalties, fines, profit, wells fargo


In [12]:
#U Matrix
#May add columns = ["component_1","component_2","component_3","component_4"]

Vt = pd.DataFrame(doc_topic.round(5),
             index = df)
Vt

Unnamed: 0,0,1,2,3
"(CryptoNick is deleting all of his BitConnect videos, and so are his buddies. Please never forget what he and his cohorts did to so many people, and how much money those people lost in the process thanks to CryptoNick, Trevon James, and Craig Grant!,)",0.15491,0.19865,-0.11298,-0.36904
"(I will tell you exactly what is going on here, this is critical information to understand if you are going to make money in this space. How prices work, and what moves them - and it's not money invested/withdrawn.,)",0.08717,0.21879,0.04825,0.02985
"(Robinhood is launching a Crypto Trading app to compete with Coinbase,)",0.02325,0.02055,0.05670,0.06837
"(Checkmate, Bill.,)",0.00000,0.00000,-0.00000,0.00000
"(Delta's app store description seems appropriate today.,)",0.02653,0.02236,0.06381,0.09739
...,...,...,...,...
"(Elon Musk: ""Whoever owns the early BTC deserves a Nobel prize in delayed gratification."",)",0.00756,0.00777,-0.00977,0.00543
"(Harvesting the body heat of , people can mine BTC/month,)",-0.00669,0.01103,0.02363,0.00350
"(IBM Launches A Blockchain Based Global Payments Network Using Stellar's Cryptocurrency,)",0.05813,0.04090,-0.01495,-0.00043
"( Million Tethers have been printed in the past days. Tether has still not provided an audit of funds backing USDT.,)",0.00836,0.01055,0.00546,-0.00321


In [13]:
#NMF

In [14]:
nmf_model = NMF(4)
doc_topic = nmf_model.fit_transform(doc_word)

In [15]:
#The H matrix (topic_word) shows the 2 resulting topics, and the terms associated with each topic
#May add index = ["component_1","component_2", "component_3", "component_4"]

topic_word = pd.DataFrame(nmf_model.components_.round(3),
                          columns = vectorizer.get_feature_names())
topic_word

Unnamed: 0,abandoned,abandoned reddit,abandoned reddit account,ability,ability buy,ability buy sell,able,able security,able security audit,able use,...,zero bulletproofs,zero bulletproofs upgrade,zero customer,zero customer support,zero fee,zero fee exchange,zimbabwe,zimbabwe banned,zimbabwe banned dollar,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.012,0.012,0.001,0.001,0.001,0.0
1,0.0,0.0,0.0,0.001,0.001,0.001,0.001,0.001,0.001,0.0,...,0.0,0.0,0.001,0.001,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.001,0.001,0.0
3,0.001,0.001,0.001,0.003,0.003,0.003,0.0,0.0,0.0,0.0,...,0.0,0.0,0.003,0.003,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
display_topics(nmf_model, vectorizer.get_feature_names(), 10)


Topic  0
fee, transaction, credit, fraud, department, fraud department, cash, card, pay, credit card

Topic  1
privacy, duckduckgo, search, data, great, information, brave, user, companies, shows

Topic  2
mining, sherman, rep, card, campaign, donor, brad, campaign donor, rep brad, rep brad sherman

Topic  3
total, billion total fines, billion total, total fines penalties, total fines, fines, fines penalties, penalties, fargo, wells fargo


In [17]:
#The W matrix shows the comments started with, and how each comments is made up of the 2 resulting topics
#May add columns = ["component_1","component_2","component_3","component_4"]

H = pd.DataFrame(doc_topic.round(5),
             index = df)
H

Unnamed: 0,0,1,2,3
"(CryptoNick is deleting all of his BitConnect videos, and so are his buddies. Please never forget what he and his cohorts did to so many people, and how much money those people lost in the process thanks to CryptoNick, Trevon James, and Craig Grant!,)",0.01677,0.00720,0.00217,0.03556
"(I will tell you exactly what is going on here, this is critical information to understand if you are going to make money in this space. How prices work, and what moves them - and it's not money invested/withdrawn.,)",0.01275,0.06851,0.00000,0.03765
"(Robinhood is launching a Crypto Trading app to compete with Coinbase,)",0.00000,0.00148,0.02454,0.00975
"(Checkmate, Bill.,)",0.00000,0.00000,0.00000,0.00000
"(Delta's app store description seems appropriate today.,)",0.00000,0.00197,0.04390,0.00000
...,...,...,...,...
"(Elon Musk: ""Whoever owns the early BTC deserves a Nobel prize in delayed gratification."",)",0.00026,0.00044,0.00020,0.00053
"(Harvesting the body heat of , people can mine BTC/month,)",0.00010,0.00036,0.00172,0.00073
"(IBM Launches A Blockchain Based Global Payments Network Using Stellar's Cryptocurrency,)",0.00600,0.00704,0.01285,0.00185
"( Million Tethers have been printed in the past days. Tether has still not provided an audit of funds backing USDT.,)",0.00004,0.00366,0.00250,0.00213


In [18]:
#LDA

In [19]:
# gensim
from gensim import corpora, models, similarities, matutils

# sklearn
from sklearn.feature_extraction.text import CountVectorizer

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [20]:
#convert sparse scipy matrix to a gensim-friendly object(Corpus)

corpus = matutils.Sparse2Corpus(doc_word)

In [21]:
#save a mapping (dict) of row id to word (token) for later use by gensim:

id2word = dict((v, k) for k, v in vectorizer.vocabulary_.items())

In [22]:
len(id2word)

13709

In [23]:
# Create lda model (equivalent to "fit" in sklearn)
lda = models.LdaModel(corpus=corpus, num_topics=4, id2word=id2word, passes=10)

2020-06-07 19:05:59,201 : INFO : using symmetric alpha at 0.25
2020-06-07 19:05:59,202 : INFO : using symmetric eta at 0.25
2020-06-07 19:05:59,205 : INFO : using serial LDA version on this node
2020-06-07 19:05:59,211 : INFO : running online (multi-pass) LDA training, 4 topics, 10 passes over the supplied corpus of 13709 documents, updating model once every 2000 documents, evaluating perplexity every 13709 documents, iterating 50x with a convergence threshold of 0.001000
2020-06-07 19:05:59,215 : INFO : PROGRESS: pass 0, at document #2000/13709
2020-06-07 19:05:59,652 : INFO : merging changes from 2000 documents into a model of 13709 documents
2020-06-07 19:05:59,655 : INFO : topic #0 (0.250): 0.013*"authorities make" + 0.013*"americans congressman" + 0.011*"algorithm make" + 0.010*"abstraction destroyed wallet" + 0.009*"affair block" + 0.009*"anonymity features" + 0.009*"bank asked" + 0.008*"avoidance aim" + 0.007*"asks crypto" + 0.007*"banks credit"
2020-06-07 19:05:59,656 : INFO : 

2020-06-07 19:06:00,697 : INFO : topic #2 (0.250): 0.017*"anonymity" + 0.012*"answered" + 0.011*"allowing risk free" + 0.009*"anonymous creator" + 0.009*"allow" + 0.008*"bang" + 0.008*"aug" + 0.008*"algorithm" + 0.008*"audit confirms" + 0.007*"affiliate program"
2020-06-07 19:06:00,697 : INFO : topic #3 (0.250): 0.009*"asked government" + 0.009*"bank liechtenstein grew" + 0.008*"active time posted" + 0.008*"americans buying" + 0.007*"backed partnership recently" + 0.007*"account looks" + 0.007*"account provided zero" + 0.007*"allow cryptocurrency trading" + 0.007*"avoid confusion newcomers" + 0.007*"account day btc"
2020-06-07 19:06:00,698 : INFO : topic diff=0.037827, rho=0.408248
2020-06-07 19:06:00,922 : INFO : -8.183 per-word bound, 290.7 perplexity estimate based on a held-out corpus of 1709 documents with 2157 words
2020-06-07 19:06:00,923 : INFO : PROGRESS: pass 0, at document #13709/13709
2020-06-07 19:06:01,045 : INFO : merging changes from 1709 documents into a model of 13709

2020-06-07 19:06:01,751 : INFO : topic #0 (0.250): 0.011*"affair block" + 0.010*"banks credit" + 0.010*"americans congressman" + 0.010*"algorithm make" + 0.010*"anonymity features" + 0.009*"banned dollar" + 0.008*"authorities make" + 0.008*"anchored" + 0.008*"andrew yang accepting" + 0.008*"absolutely unacceptable"
2020-06-07 19:06:01,752 : INFO : topic #1 (0.250): 0.008*"annual inflation drops" + 0.008*"banks gatekeepers" + 0.008*"authorities" + 0.007*"absurd journey reason" + 0.007*"america" + 0.007*"actually won webby" + 0.007*"amex mooneybookers et" + 0.007*"bad collapsed" + 0.007*"active" + 0.006*"banking financial crisis"
2020-06-07 19:06:01,752 : INFO : topic #2 (0.250): 0.012*"anonymity" + 0.011*"asked" + 0.010*"aug" + 0.010*"answered" + 0.009*"acceptable" + 0.009*"anonymous creator" + 0.009*"author" + 0.008*"algorithm" + 0.007*"affiliate program" + 0.007*"algorithm make attack"
2020-06-07 19:06:01,753 : INFO : topic #3 (0.250): 0.010*"asked government" + 0.009*"account provide

2020-06-07 19:06:02,590 : INFO : topic #3 (0.250): 0.012*"backed partnership recently" + 0.011*"asked government" + 0.010*"account provided zero" + 0.007*"account looks" + 0.007*"allow cryptocurrency trading" + 0.007*"avoid confusion newcomers" + 0.006*"banks profit" + 0.006*"ago day normal" + 0.006*"america buys" + 0.006*"adopt"
2020-06-07 19:06:02,591 : INFO : topic diff=0.025278, rho=0.318554
2020-06-07 19:06:02,596 : INFO : PROGRESS: pass 2, at document #8000/13709
2020-06-07 19:06:02,720 : INFO : merging changes from 2000 documents into a model of 13709 documents
2020-06-07 19:06:02,723 : INFO : topic #0 (0.250): 0.011*"banks credit" + 0.011*"americans congressman" + 0.010*"algorithm make" + 0.010*"attack" + 0.008*"apl just engaged" + 0.008*"ability buy sell" + 0.008*"affair block" + 0.008*"andrew yang accepting" + 0.008*"absolutely unacceptable" + 0.007*"aware anybody team"
2020-06-07 19:06:02,724 : INFO : topic #1 (0.250): 0.009*"annual inflation drops" + 0.009*"absurd journey r

2020-06-07 19:06:03,566 : INFO : topic #1 (0.250): 0.010*"absurd journey reason" + 0.007*"amex mooneybookers et" + 0.007*"annual inflation drops" + 0.007*"banking financial crisis" + 0.007*"africa" + 0.007*"actually" + 0.007*"authorities" + 0.006*"america" + 0.006*"active" + 0.006*"anonymously easily make"
2020-06-07 19:06:03,567 : INFO : topic #2 (0.250): 0.012*"aug" + 0.012*"anonymity" + 0.011*"australian banks announce" + 0.010*"affair" + 0.009*"ans" + 0.009*"acceptable" + 0.009*"algorithm" + 0.009*"accessible think" + 0.009*"asked" + 0.008*"answered"
2020-06-07 19:06:03,567 : INFO : topic #3 (0.250): 0.010*"backed partnership recently" + 0.009*"asked government" + 0.009*"account provided zero" + 0.008*"account looks" + 0.008*"ago day normal" + 0.008*"avoid confusion newcomers" + 0.007*"allow cryptocurrency trading" + 0.007*"alerts bitcoin" + 0.007*"available people" + 0.007*"active time posted"
2020-06-07 19:06:03,568 : INFO : topic diff=0.051553, rho=0.303525
2020-06-07 19:06:03,5

2020-06-07 19:06:04,432 : INFO : topic diff=0.020874, rho=0.303525
2020-06-07 19:06:04,437 : INFO : PROGRESS: pass 4, at document #2000/13709
2020-06-07 19:06:04,560 : INFO : merging changes from 2000 documents into a model of 13709 documents
2020-06-07 19:06:04,563 : INFO : topic #0 (0.250): 0.012*"americans congressman" + 0.011*"authorities make" + 0.009*"affair block" + 0.009*"algorithm make" + 0.009*"banks credit" + 0.009*"absolutely unacceptable" + 0.008*"anonymity features" + 0.008*"banned dollar" + 0.007*"bank asked" + 0.007*"anchored"
2020-06-07 19:06:04,564 : INFO : topic #1 (0.250): 0.009*"annual inflation drops" + 0.007*"amex mooneybookers et" + 0.007*"ago invested nzd" + 0.007*"america" + 0.006*"allegedly help pay" + 0.006*"authorities" + 0.006*"active" + 0.006*"actually" + 0.006*"approved" + 0.006*"announce cryptocurrency month"
2020-06-07 19:06:04,564 : INFO : topic #2 (0.250): 0.013*"anonymity" + 0.011*"asked" + 0.010*"aug" + 0.009*"acceptable" + 0.009*"affair" + 0.008*"

2020-06-07 19:06:05,229 : INFO : topic #3 (0.250): 0.009*"asked government" + 0.008*"backed partnership recently" + 0.008*"bank liechtenstein grew" + 0.008*"active time posted" + 0.008*"account looks" + 0.008*"account provided zero" + 0.007*"americans buying" + 0.007*"allow cryptocurrency trading" + 0.007*"avoid confusion newcomers" + 0.007*"account day btc"
2020-06-07 19:06:05,230 : INFO : topic diff=0.014562, rho=0.290441
2020-06-07 19:06:05,424 : INFO : -8.000 per-word bound, 256.0 perplexity estimate based on a held-out corpus of 1709 documents with 2157 words
2020-06-07 19:06:05,424 : INFO : PROGRESS: pass 4, at document #13709/13709
2020-06-07 19:06:05,529 : INFO : merging changes from 1709 documents into a model of 13709 documents
2020-06-07 19:06:05,532 : INFO : topic #0 (0.250): 0.010*"americans congressman" + 0.010*"banned dollar" + 0.010*"banks credit" + 0.009*"authorities make" + 0.009*"absolutely unacceptable" + 0.009*"algorithm make" + 0.008*"anchored" + 0.008*"attack" + 

2020-06-07 19:06:06,180 : INFO : topic #1 (0.250): 0.008*"annual inflation drops" + 0.008*"absurd journey reason" + 0.007*"banks gatekeepers" + 0.007*"authorities" + 0.007*"america" + 0.007*"amex mooneybookers et" + 0.007*"actually won webby" + 0.007*"active" + 0.006*"bad collapsed" + 0.006*"actually"
2020-06-07 19:06:06,181 : INFO : topic #2 (0.250): 0.012*"anonymity" + 0.010*"asked" + 0.010*"aug" + 0.010*"answered" + 0.009*"acceptable" + 0.009*"anonymous creator" + 0.008*"author" + 0.008*"algorithm" + 0.007*"australian banks announce" + 0.007*"accessible think"
2020-06-07 19:06:06,181 : INFO : topic #3 (0.250): 0.010*"asked government" + 0.009*"account provided zero" + 0.009*"allow cryptocurrency trading" + 0.009*"backed partnership recently" + 0.008*"account day btc" + 0.007*"account looks" + 0.006*"avoid confusion newcomers" + 0.006*"accept ethereum" + 0.005*"bank liechtenstein grew" + 0.005*"ago day normal"
2020-06-07 19:06:06,182 : INFO : topic diff=0.012346, rho=0.278915
2020-06

2020-06-07 19:06:07,009 : INFO : topic diff=0.019101, rho=0.268661
2020-06-07 19:06:07,014 : INFO : PROGRESS: pass 6, at document #8000/13709
2020-06-07 19:06:07,136 : INFO : merging changes from 2000 documents into a model of 13709 documents
2020-06-07 19:06:07,139 : INFO : topic #0 (0.250): 0.011*"banks credit" + 0.011*"americans congressman" + 0.010*"algorithm make" + 0.009*"attack" + 0.008*"apl just engaged" + 0.008*"ability buy sell" + 0.008*"affair block" + 0.008*"absolutely unacceptable" + 0.008*"andrew yang accepting" + 0.007*"aware anybody team"
2020-06-07 19:06:07,140 : INFO : topic #1 (0.250): 0.009*"annual inflation drops" + 0.009*"absurd journey reason" + 0.007*"amex mooneybookers et" + 0.007*"africa" + 0.007*"available" + 0.007*"authorities" + 0.007*"actually" + 0.006*"active" + 0.006*"assets market" + 0.006*"atls"
2020-06-07 19:06:07,141 : INFO : topic #2 (0.250): 0.012*"anonymity" + 0.010*"algorithm" + 0.010*"anonymous creator" + 0.010*"aug" + 0.009*"affair" + 0.009*"ag

2020-06-07 19:06:07,972 : INFO : topic #2 (0.250): 0.012*"anonymity" + 0.012*"aug" + 0.010*"australian banks announce" + 0.010*"affair" + 0.009*"acceptable" + 0.009*"algorithm" + 0.009*"ans" + 0.009*"asked" + 0.009*"accessible think" + 0.008*"answered"
2020-06-07 19:06:07,973 : INFO : topic #3 (0.250): 0.010*"backed partnership recently" + 0.009*"asked government" + 0.009*"account provided zero" + 0.008*"account looks" + 0.008*"ago day normal" + 0.008*"avoid confusion newcomers" + 0.008*"allow cryptocurrency trading" + 0.007*"alerts bitcoin" + 0.007*"available people" + 0.007*"active time posted"
2020-06-07 19:06:07,973 : INFO : topic diff=0.042127, rho=0.259460
2020-06-07 19:06:07,978 : INFO : PROGRESS: pass 7, at document #6000/13709
2020-06-07 19:06:08,100 : INFO : merging changes from 2000 documents into a model of 13709 documents
2020-06-07 19:06:08,103 : INFO : topic #0 (0.250): 0.014*"americans congressman" + 0.011*"algorithm make" + 0.010*"banks credit" + 0.009*"absolutely unac

2020-06-07 19:06:08,935 : INFO : topic #0 (0.250): 0.012*"americans congressman" + 0.010*"authorities make" + 0.009*"algorithm make" + 0.009*"affair block" + 0.009*"banks credit" + 0.009*"absolutely unacceptable" + 0.008*"anonymity features" + 0.008*"banned dollar" + 0.007*"anchored" + 0.007*"apl just engaged"
2020-06-07 19:06:08,936 : INFO : topic #1 (0.250): 0.009*"annual inflation drops" + 0.008*"amex mooneybookers et" + 0.007*"america" + 0.007*"ago invested nzd" + 0.006*"authorities" + 0.006*"active" + 0.006*"actually" + 0.006*"allegedly help pay" + 0.006*"absurd journey reason" + 0.006*"available"
2020-06-07 19:06:08,937 : INFO : topic #2 (0.250): 0.013*"anonymity" + 0.011*"asked" + 0.010*"aug" + 0.009*"acceptable" + 0.009*"affair" + 0.008*"ans" + 0.008*"answered" + 0.008*"algorithm" + 0.008*"australian banks announce" + 0.007*"allow"
2020-06-07 19:06:08,937 : INFO : topic #3 (0.250): 0.010*"account looks" + 0.010*"asked government" + 0.010*"account provided zero" + 0.009*"backed 

2020-06-07 19:06:09,605 : INFO : topic diff=0.012563, rho=0.251145
2020-06-07 19:06:09,799 : INFO : -7.976 per-word bound, 251.7 perplexity estimate based on a held-out corpus of 1709 documents with 2157 words
2020-06-07 19:06:09,800 : INFO : PROGRESS: pass 8, at document #13709/13709
2020-06-07 19:06:09,905 : INFO : merging changes from 1709 documents into a model of 13709 documents
2020-06-07 19:06:09,909 : INFO : topic #0 (0.250): 0.010*"americans congressman" + 0.010*"banks credit" + 0.009*"banned dollar" + 0.009*"authorities make" + 0.009*"absolutely unacceptable" + 0.009*"algorithm make" + 0.008*"anchored" + 0.008*"attack" + 0.008*"anonymity features" + 0.007*"aware anybody team"
2020-06-07 19:06:09,909 : INFO : topic #1 (0.250): 0.009*"amex mooneybookers et" + 0.008*"authorities" + 0.008*"annual inflation drops" + 0.007*"absurd journey reason" + 0.007*"actually" + 0.007*"america" + 0.007*"actually won webby" + 0.006*"atls" + 0.006*"approved" + 0.006*"banks gatekeepers"
2020-06-0

2020-06-07 19:06:10,574 : INFO : topic #2 (0.250): 0.012*"anonymity" + 0.010*"aug" + 0.010*"asked" + 0.009*"answered" + 0.009*"acceptable" + 0.009*"anonymous creator" + 0.008*"algorithm" + 0.008*"author" + 0.007*"australian banks announce" + 0.007*"accessible think"
2020-06-07 19:06:10,574 : INFO : topic #3 (0.250): 0.010*"asked government" + 0.009*"account provided zero" + 0.009*"backed partnership recently" + 0.009*"allow cryptocurrency trading" + 0.008*"account day btc" + 0.007*"account looks" + 0.006*"avoid confusion newcomers" + 0.006*"bank liechtenstein grew" + 0.006*"ago day normal" + 0.006*"accept ethereum"
2020-06-07 19:06:10,575 : INFO : topic diff=0.010608, rho=0.243580
2020-06-07 19:06:10,580 : INFO : PROGRESS: pass 9, at document #12000/13709
2020-06-07 19:06:10,702 : INFO : merging changes from 2000 documents into a model of 13709 documents
2020-06-07 19:06:10,705 : INFO : topic #0 (0.250): 0.011*"americans congressman" + 0.010*"banks credit" + 0.009*"algorithm make" + 0.

In [24]:
#Here are the 10 most important words for each of the 6 topics we found:
    
lda.print_topics()

2020-06-07 19:06:11,018 : INFO : topic #0 (0.250): 0.010*"americans congressman" + 0.010*"banks credit" + 0.009*"banned dollar" + 0.009*"authorities make" + 0.009*"absolutely unacceptable" + 0.009*"algorithm make" + 0.008*"anchored" + 0.008*"attack" + 0.008*"anonymity features" + 0.007*"aware anybody team"
2020-06-07 19:06:11,019 : INFO : topic #1 (0.250): 0.009*"amex mooneybookers et" + 0.008*"authorities" + 0.008*"annual inflation drops" + 0.007*"absurd journey reason" + 0.007*"actually" + 0.007*"america" + 0.006*"actually won webby" + 0.006*"atls" + 0.006*"banks gatekeepers" + 0.006*"approved"
2020-06-07 19:06:11,020 : INFO : topic #2 (0.250): 0.014*"anonymity" + 0.010*"answered" + 0.009*"algorithm" + 0.009*"allow" + 0.009*"affair" + 0.009*"anonymous creator" + 0.009*"aug" + 0.008*"asked" + 0.007*"ago day" + 0.007*"bang"
2020-06-07 19:06:11,021 : INFO : topic #3 (0.250): 0.011*"backed partnership recently" + 0.011*"account provided zero" + 0.010*"account looks" + 0.009*"avoid confus

[(0,
  '0.010*"americans congressman" + 0.010*"banks credit" + 0.009*"banned dollar" + 0.009*"authorities make" + 0.009*"absolutely unacceptable" + 0.009*"algorithm make" + 0.008*"anchored" + 0.008*"attack" + 0.008*"anonymity features" + 0.007*"aware anybody team"'),
 (1,
  '0.009*"amex mooneybookers et" + 0.008*"authorities" + 0.008*"annual inflation drops" + 0.007*"absurd journey reason" + 0.007*"actually" + 0.007*"america" + 0.006*"actually won webby" + 0.006*"atls" + 0.006*"banks gatekeepers" + 0.006*"approved"'),
 (2,
  '0.014*"anonymity" + 0.010*"answered" + 0.009*"algorithm" + 0.009*"allow" + 0.009*"affair" + 0.009*"anonymous creator" + 0.009*"aug" + 0.008*"asked" + 0.007*"ago day" + 0.007*"bang"'),
 (3,
  '0.011*"backed partnership recently" + 0.011*"account provided zero" + 0.010*"account looks" + 0.009*"avoid confusion newcomers" + 0.008*"asked government" + 0.008*"accepts crypto" + 0.007*"active time posted" + 0.007*"allow cryptocurrency trading" + 0.007*"bank liechtenstein 

In [25]:
# Transform the docs from the word space to the topic space 

lda_corpus = lda[corpus]
lda_corpus

<gensim.interfaces.TransformedCorpus at 0x1e226818408>

In [26]:
# Store the documents' topic vectors in a list 

lda_docs = [doc for doc in lda_corpus]

In [27]:
# Check out the document vectors in the topic space 

lda_docs[0]

[(0, 0.1250074), (1, 0.12500624), (2, 0.62497926), (3, 0.12500711)]