# LDA - Latent Dirichlet Allocation

## Topic Modeling

In [3]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

In [4]:
f = fetch_20newsgroups()
d = f['data']
len(d)

11314

In [15]:
import re
corpus = []
for doc in d:
    corpus.append(re.sub('\w*\d+\w*', ' ', doc)) # remove numbers

In [8]:
# f['target_names']
# we don't use the labels -> unsupervised learning

In [16]:
data = ' '.join(corpus)
len(data)

21528103

In [20]:
corpus[1]

"From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley. \nOrganization: University of Washington\nLines:  \nNNTP-Posting-Host: carson.u.washington.edu\n\nA fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with   and  .  m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't answered this\npoll. Thanks.\n\nGuy Kuo <guykuo@u.washington.edu>\n"

In [17]:
cv = CountVectorizer(stop_words='english')

In [38]:
vec = cv.fit_transform(corpus[:500])

In [39]:
len(cv.vocabulary_)

16026

In [40]:
vec

<500x16026 sparse matrix of type '<class 'numpy.int64'>'
	with 48044 stored elements in Compressed Sparse Row format>

### Why is this a sparse matrix, not a normal Numpy array?

10000 docs * 10000 unique words -> 100 million cells

sparse matrix only stores nonzero values -> smaller, faster

In [41]:
lda = LatentDirichletAllocation(n_components=10)
lda.fit(vec)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [42]:
c = lda.components_
c.shape  # topics x words

(10, 16026)

In [43]:
words = list(sorted(cv.vocabulary_.keys()))
len(words)

16026

In [44]:
ctrans = c.T
ctrans.shape

(16026, 10)

In [45]:
df = pd.DataFrame(ctrans, index=words)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
__,0.58516,0.139139,0.114512,0.11299,0.113316,0.110422,1.205384,8.360709,35.266096,1.872802
___,0.144512,0.146269,0.112228,0.113768,0.113673,0.112231,0.136749,0.114249,21.882438,0.113812
____,0.626206,0.118898,0.112343,0.11225,0.112022,0.112688,0.110138,0.111611,5.714276,0.110535
_____,4.601887,0.126271,0.110959,0.11141,0.110915,0.112372,0.112742,0.112746,9.833511,0.112925
______,0.119251,0.112605,0.111515,0.113781,0.112536,0.112534,0.115748,0.110288,4.121874,0.11237


In [46]:
for i in range(10):
    print(df.sort_values(by=i, ascending=False)[i].head(10))
    print('-' * 40)

morality      18.452477
optilink      16.300565
cramer        13.145780
option        10.723806
ssf            8.505126
gay            8.450591
moral          8.334245
flights        7.763190
homosexual     7.486132
capability     7.030870
Name: 0, dtype: float64
----------------------------------------
armenian     47.307601
jews         37.067077
people       29.439944
war          28.391072
turkish      26.664749
stratus      25.854943
article      25.829962
com          25.390373
armenians    23.959080
russian      23.876034
Name: 1, dtype: float64
----------------------------------------
conference     7.873722
csiro          6.149802
catalog        6.032918
dicta          5.293551
ericsson       5.288643
information    4.487303
recognition    4.427881
aprs           4.421850
division       3.723620
prof           3.566210
Name: 2, dtype: float64
----------------------------------------
space      26.545815
shuttle    23.406696
launch     19.386984
probe      15.939794
audio      

## Gibbs Sampling in LDA

1. assign random topics to each word in each document.
2. go through each word in each document
3. calculate observed p(ki|dj) and p(wi|kj)
4. re-roll new topics for each word in each document
5. repeat 2.-5. for some number of iterations

### Main Appliction: Segment text corpora