In [35]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [36]:
sample_df = pd.read_csv('final.csv', index_col=0)

In [37]:
sample_df.shape

(173107, 16)

In [38]:
sample_df.head()

Unnamed: 0_level_0,author_name,book_id,gutenbergbookid,title,text,text_lines,authoryearofbirth,authoryearofdeath,downloads,subjects,topic,Sub_A,Sub_B,Sub_C,str_text_lines,passage_key
author_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
494,"Lincoln, Abraham",PG8,PG8,Abraham Lincoln's Second Inaugural Address,"['to header material.', '', '***', '', ""Lincol...",17,1809.0,1865.0,25.0,{'United States -- Politics and government -- ...,United States,1861-1865,Politics and government,Presidents,17,PG8_17
494,"Lincoln, Abraham",PG9,PG9,Abraham Lincoln's First Inaugural Address,"['I have no lawful right to do so, and I have ...",77,1809.0,1865.0,36.0,{'United States -- Politics and government -- ...,United States,1861-1865,Politics and government,Presidents,77,PG9_77
494,"Lincoln, Abraham",PG9,PG9,Abraham Lincoln's First Inaugural Address,['for a minority of their own will secede from...,277,1809.0,1865.0,36.0,{'United States -- Politics and government -- ...,United States,1861-1865,Politics and government,Presidents,277,PG9_277
494,"Lincoln, Abraham",PG9,PG9,Abraham Lincoln's First Inaugural Address,"['in both cases, and a few break over in each....",327,1809.0,1865.0,36.0,{'United States -- Politics and government -- ...,United States,1861-1865,Politics and government,Presidents,327,PG9_327
494,"Lincoln, Abraham",PG9,PG9,Abraham Lincoln's First Inaugural Address,['There is some difference of opinion whether ...,127,1809.0,1865.0,36.0,{'United States -- Politics and government -- ...,United States,1861-1865,Politics and government,Presidents,127,PG9_127


In [39]:
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words="english")
dtm = cv.fit_transform(sample_df['text'])

In [40]:
dtm

<173107x166727 sparse matrix of type '<class 'numpy.int64'>'
	with 25892329 stored elements in Compressed Sparse Row format>

In [41]:
feature_names = cv.get_feature_names_out()
len(feature_names)
 # show the total number of distinct words


166727

In [118]:
feature_names = cv.get_feature_names_out()
feature_names

array(['00', '000', '000_l', ..., 'ὄρος', 'ὑπὸ', 'ὡς'], dtype=object)

In [42]:
feature_names[166000:]

array(['zebralope', 'zebras', 'zebulon', 'zebus', 'zecchin', 'zech',
       'zechariah', 'zechins', 'zeckler', 'zed', 'zedekiah', 'zee',
       'zeebrugge', 'zeed', 'zeeing', 'zeeland', 'zeem', 'zeemed',
       'zeenar', 'zeerjeek', 'zees', 'zeggensburg', 'zegri', 'zeherit',
       'zehr', 'zehren', 'zehrendorf', 'zeib', 'zeiglemann', 'zeigler',
       'zeit', 'zeit_', 'zeitgeist', 'zeitung_', 'zeke', 'zekiel',
       'zekle', 'zelaya', 'zelda', 'zele', 'zeleia', 'zelie', 'zelinda',
       'zell', 'zella', 'zelle', 'zellerndorf', 'zelo', 'zelotes', 'zelt',
       'zem', 'zembabwans', 'zembabwei', 'zembei', 'zembin', 'zembla',
       'zemindar', 'zemstvo', 'zemzem', 'zen', 'zena', 'zenana',
       'zenanas', 'zenas', 'zend', 'zenda', 'zendavesta', 'zenelophon',
       'zenian', 'zenians', 'zenith', 'zeniths', 'zenithwards', 'zeno',
       'zenobia', 'zenobie', 'zenoburg', 'zenocrate', 'zenone', 'zens',
       'zent', 'zenz', 'zep', 'zeph', 'zephania', 'zephaniah',
       'zephirine', 'z

# LDA

In [72]:
NUM_TOPICS = 7
LDA_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=30, random_state=42)


In [73]:
LDA_model.fit(dtm)

### Show Stored Words.

In [45]:
len(feature_names)

166727

In [49]:
import random

# get the feature names
feature_names = cv.get_feature_names_out()

# print 15 random words
for index in range(15):
    random_word_ID = random.randint(0, len(feature_names)-1)
    print(feature_names[random_word_ID])


azelma
ascendancy
onions
pie
oceola
_solemnly
faulchions
marzak
sparkling
washstands
banked
unecclesiastical
trollop
blemishing
snickered


### Top Words Per Topic

In [50]:
len(LDA_model.components_[0])

166727

In [99]:
# Pick a single topic
a_topic = LDA_model.components_[0]

# Get the indices that would sort this array
a_topic.argsort()

array([ 68562,  52706, 137521, ..., 148710,  70124, 148294])

In [100]:
# The word least representative of this topic
a_topic[597]

0.1438157561857944

In [101]:
# The word most representative of this topic
a_topic[3598]

12.656333063244391

Let have a look at the top 10 words for the topic we previously took

In [102]:
# get the feature names
feature_names = cv.get_feature_names_out()

# get the indices of the top 10 words in the topic
top_10_words_indices = a_topic.argsort()[-10:]

# print the top 10 words
for i in top_10_words_indices:
    print(feature_names[i])


let
king
love
lord
man
thee
shall
thy
god
thou


In [103]:
# get the feature names
feature_names = cv.get_feature_names_out()

# print the top 10 words for each topic
for i, topic in enumerate(LDA_model.components_):
    print("THE TOP {} WORDS FOR TOPIC #{}".format(10, i))
    print([feature_names[index] for index in topic.argsort()[-10:]])
    print("\n")


THE TOP 10 WORDS FOR TOPIC #0
['let', 'king', 'love', 'lord', 'man', 'thee', 'shall', 'thy', 'god', 'thou']


THE TOP 10 WORDS FOR TOPIC #1
['mother', 'say', 'like', 'did', 'think', 'mrs', 'know', 'good', 'little', 'said']


THE TOP 10 WORDS FOR TOPIC #2
['like', 'just', 'got', 've', 'man', 'know', 'mr', 'll', 'don', 'said']


THE TOP 10 WORDS FOR TOPIC #3
['time', 'white', 'men', 'great', 'away', 'long', 'came', 'water', 'little', 'like']


THE TOP 10 WORDS FOR TOPIC #4
['came', 'like', 'hand', 'little', 'room', 'did', 'man', 'face', 'eyes', 'said']


THE TOP 10 WORDS FOR TOPIC #5
['mr', 'world', 'men', 'years', 'new', 'time', 'people', 'life', 'man', 'great']


THE TOP 10 WORDS FOR TOPIC #6
['great', 'man', 'general', 'day', 'did', 'captain', 'said', 'time', 'king', 'men']




In [104]:
### Attach Discovered Topic Labels to Original text

In [105]:
final_topics = LDA_model.transform(dtm)
final_topics.shape

(173107, 7)

In [106]:
final_topics[0]

array([0.08435078, 0.0007924 , 0.00079286, 0.00079236, 0.0007938 ,
       0.64756399, 0.2649138 ])

In [107]:
final_topics[0].argmax()

5

### Combination with the original data
Let's create a new column called **Topic N°** that will correspond to the topic value to which each document belongs to.

In [108]:
sample_df["Topic N°"] = final_topics.argmax(axis=1)

In [109]:
sample_df.head()

Unnamed: 0_level_0,author_name,book_id,gutenbergbookid,title,text,text_lines,authoryearofbirth,authoryearofdeath,downloads,subjects,topic,Sub_A,Sub_B,Sub_C,str_text_lines,passage_key,Topic N°
author_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
494,"Lincoln, Abraham",PG8,PG8,Abraham Lincoln's Second Inaugural Address,"['to header material.', '', '***', '', ""Lincol...",17,1809.0,1865.0,25.0,{'United States -- Politics and government -- ...,United States,1861-1865,Politics and government,Presidents,17,PG8_17,5
494,"Lincoln, Abraham",PG9,PG9,Abraham Lincoln's First Inaugural Address,"['I have no lawful right to do so, and I have ...",77,1809.0,1865.0,36.0,{'United States -- Politics and government -- ...,United States,1861-1865,Politics and government,Presidents,77,PG9_77,5
494,"Lincoln, Abraham",PG9,PG9,Abraham Lincoln's First Inaugural Address,['for a minority of their own will secede from...,277,1809.0,1865.0,36.0,{'United States -- Politics and government -- ...,United States,1861-1865,Politics and government,Presidents,277,PG9_277,5
494,"Lincoln, Abraham",PG9,PG9,Abraham Lincoln's First Inaugural Address,"['in both cases, and a few break over in each....",327,1809.0,1865.0,36.0,{'United States -- Politics and government -- ...,United States,1861-1865,Politics and government,Presidents,327,PG9_327,5
494,"Lincoln, Abraham",PG9,PG9,Abraham Lincoln's First Inaugural Address,['There is some difference of opinion whether ...,127,1809.0,1865.0,36.0,{'United States -- Politics and government -- ...,United States,1861-1865,Politics and government,Presidents,127,PG9_127,5


According to our LDA model:
- the first document belongs to 4th topic.
- the second document belongs to 4th topic.
- the third document belongs to 6th topic.
etc.

In [110]:
import pyLDAvis.sklearn

In [111]:
pyLDAvis.enable_notebook() # To enable the visualization on the notebook

In [None]:
# # get the feature names
# panel = pyLDAvis.sklearn.prepare(LDA_model, dtm, cv, mds='tsne',feature_names=feature_names) # Create the panel for the visualization
# panel

In [121]:
feature_names = cv.get_feature_names_out()


In [122]:
import pyLDAvis.sklearn
import matplotlib.pyplot as plt

# create the visualization
pyLDAvis.enable_notebook()

panel = pyLDAvis.sklearn.prepare(LDA_model, dtm, cv, mds='tsne', feature_names=feature_names)

# display the visualization
pyLDAvis.display(panel)
plt.show()


AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'