# Latent Dirichlet Allocation (LDA) and Topic Modeling

In [1]:
from sklearn.datasets import fetch_20newsgroups

ng_train = fetch_20newsgroups(subset='train', remove=('headers','footers','quotes'))

print("Data has {0:d} documents".format(len(ng_train.data)))

Data has 11314 documents


In [2]:
print(ng_train.data[1][:100])

A fair number of brave souls who upgraded their SI clock oscillator have
shared their experiences fo


In [3]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(ngram_range=(1,2), stop_words='english',
                                   token_pattern="\\b[a-z][a-z]+\\b",lowercase=True,max_features=1000)

# “X” is now our transformed data
X = count_vectorizer.fit_transform(ng_train.data)

In [4]:
import pandas as pd

print(count_vectorizer.get_feature_names()[145:150])  # print 5 random columns

#create DataFrame
df = pd.DataFrame(X.toarray(), columns=count_vectorizer.get_feature_names())

# values of these features on documents 10-15
print(df.iloc[10:15, 145:150])

['check', 'chicago', 'child', 'children', 'chip']
    check  chicago  child  children  chip
10      0        0      0         0     0
11      0        0      6         2     0
12      0        0      0         0     0
13      0        0      0         0     0
14      0        0      0         0     0


In [5]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=4, random_state=42,learning_method='online' )
data = lda.fit_transform(X)

print(data[0])

[0.90361994 0.01121763 0.01000007 0.07516236]


In [6]:
for word in lda.components_[2].argsort()[:10:-1]:
    print(word)

77
78
541
542
79
656
216
92
363
975
657
18
157
505
817
364
120
99
711
578
583
416
579
19
894
832
91
522
70
838
544
402
20
418
414
581
680
365
287
615
506
225
279
137
918
121
642
614
628
939
949
594
268
993
422
641
982
393
212
334
653
636
223
915
472
592
431
941
790
94
772
898
322
215
224
488
755
712
816
90
460
812
2
676
352
881
411
423
925
329
539
254
731
277
135
328
16
965
468
830
346
839
646
486
176
521
972
558
73
173
404
359
456
761
603
687
98
450
775
517
928
619
552
107
406
395
725
186
866
362
435
410
777
440
318
877
513
503
116
219
634
677
523
508
658
108
240
564
234
686
892
903
525
53
957
543
854
307
9
545
858
959
511
717
305
285
337
792
97
310
360
821
905
81
498
562
757
987
611
391
908
535
666
546
293
997
501
466
895
114
735
140
177
567
495
270
482
331
713
952
368
805
883
659
537
836
188
896
570
496
199
233
64
589
388
886
596
769
481
338
478
303
794
252
716
194
924
824
863
760
24
330
823
519
60
679
171
880
122
397
372
105
514
857
139
119
167
723
901
840
471
758
929
306
631
376
8