## Topic Modelling
This notebook contains a demo of LDA and LSA using the gensim library. The dataset's link can be found in the `BookSummaries_Link.md` file under the Data folder in Ch7.

In [1]:
!pip install nltk
!pip install gensim



In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')

from gensim.models import LdaModel
from gensim.corpora import Dictionary
from pprint import pprint


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/etherealenvy/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
#tokenize, remove stopwords, non-alphabetic words, lowercase
def preprocess(textstring):
   stops =  set(stopwords.words('english'))
   tokens = word_tokenize(textstring)
   return [token.lower() for token in tokens if token.isalpha() and token not in stops]

data_path = "/home/etherealenvy/Downloads/booksummaries/booksummaries.txt"
summaries = []
for line in open(data_path, encoding="utf-8"):
   temp = line.split("\t")
   summaries.append(preprocess(temp[6]))

# Create a dictionary representation of the documents.
dictionary = Dictionary(summaries)
# Filter infrequent or too frequent words.
dictionary.filter_extremes(no_below=10, no_above=0.5)
corpus = [dictionary.doc2bow(summary) for summary in summaries]
# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token
#Train the topic model
model = LdaModel(corpus=corpus, id2word=id2word,iterations=400, num_topics=10)
top_topics = list(model.top_topics(corpus))
pprint(top_topics)


[([(0.007026231, 'life'),
   (0.0060395496, 'love'),
   (0.006031805, 'family'),
   (0.0060214815, 'father'),
   (0.0059033865, 'he'),
   (0.0056308284, 'novel'),
   (0.004568777, 'young'),
   (0.0045522423, 'she'),
   (0.004382369, 'story'),
   (0.004264761, 'one'),
   (0.0042424756, 'also'),
   (0.004066663, 'in'),
   (0.0040207854, 'mother'),
   (0.0038233518, 'two'),
   (0.0037485228, 'becomes'),
   (0.0036681942, 'time'),
   (0.0036503694, 'first'),
   (0.0035617696, 'new'),
   (0.0035586786, 'years'),
   (0.003553522, 'son')],
  -0.9155499921177572),
 ([(0.007956107, 'he'),
   (0.0074816775, 'she'),
   (0.0064061387, 'mother'),
   (0.005793881, 'one'),
   (0.005216782, 'tells'),
   (0.005172581, 'back'),
   (0.004947877, 'house'),
   (0.004940073, 'father'),
   (0.004542262, 'school'),
   (0.0045250333, 'go'),
   (0.0044640102, 'home'),
   (0.004325223, 'they'),
   (0.0041368674, 'day'),
   (0.004133858, 'family'),
   (0.003949343, 'get'),
   (0.0039093546, 'when'),
   (0.0038658

In [24]:
for idx in range(10):
    print("Topic #%s:" % idx, model.print_topic(idx, 10))
print("=" * 20)

Topic #0: 0.013*"jacky" + 0.006*"dahlia" + 0.005*"novel" + 0.005*"one" + 0.004*"story" + 0.004*"also" + 0.004*"book" + 0.004*"team" + 0.004*"narrator" + 0.003*"jeremy"
Topic #1: 0.010*"book" + 0.009*"war" + 0.006*"in" + 0.006*"world" + 0.005*"novel" + 0.005*"states" + 0.004*"also" + 0.004*"new" + 0.004*"chapter" + 0.004*"story"
Topic #2: 0.008*"he" + 0.007*"she" + 0.006*"mother" + 0.006*"one" + 0.005*"tells" + 0.005*"back" + 0.005*"house" + 0.005*"father" + 0.005*"school" + 0.005*"go"
Topic #3: 0.007*"life" + 0.006*"love" + 0.006*"family" + 0.006*"father" + 0.006*"he" + 0.006*"novel" + 0.005*"young" + 0.005*"she" + 0.004*"story" + 0.004*"one"
Topic #4: 0.007*"he" + 0.006*"one" + 0.004*"murder" + 0.004*"police" + 0.004*"man" + 0.003*"two" + 0.003*"case" + 0.003*"also" + 0.003*"would" + 0.003*"time"
Topic #5: 0.007*"earth" + 0.006*"one" + 0.005*"time" + 0.005*"human" + 0.005*"world" + 0.004*"new" + 0.004*"planet" + 0.004*"life" + 0.003*"space" + 0.003*"he"
Topic #6: 0.006*"he" + 0.005*"t

In [14]:
from gensim.models import LsiModel
lsamodel = LsiModel(corpus, num_topics=10, id2word = id2word)  # train model

pprint(lsamodel.print_topics(num_topics=10, num_words=10))


[(0,
  '0.305*"he" + 0.215*"one" + 0.150*"she" + 0.140*"time" + 0.132*"back" + '
  '0.131*"also" + 0.127*"two" + 0.125*"they" + 0.123*"tells" + 0.118*"in"'),
 (1,
  '0.493*"tom" + 0.226*"sophia" + 0.182*"mrs" + 0.178*"house" + 0.161*"she" + '
  '0.154*"father" + 0.147*"mr" + 0.146*"he" + 0.138*"tells" + -0.126*"one"'),
 (2,
  '-0.558*"tom" + -0.252*"sophia" + 0.213*"she" + 0.190*"he" + -0.185*"mrs" + '
  '0.163*"tells" + 0.144*"mother" + -0.138*"mr" + -0.129*"western" + '
  '-0.102*"however"'),
 (3,
  '-0.233*"they" + -0.203*"ship" + 0.187*"he" + -0.183*"david" + -0.178*"back" '
  '+ -0.165*"tells" + 0.161*"life" + 0.160*"family" + 0.154*"narrator" + '
  '-0.154*"find"'),
 (4,
  '0.664*"he" + -0.258*"mother" + -0.213*"she" + -0.195*"father" + '
  '-0.180*"family" + 0.121*"narrator" + 0.120*"monk" + -0.100*"school" + '
  '-0.099*"novel" + -0.095*"children"'),
 (5,
  '0.486*"david" + -0.241*"king" + 0.169*"rosa" + 0.162*"book" + '
  '0.126*"harlan" + -0.120*"he" + 0.111*"she" + 0.108*"go

In [19]:
for idx in range(10):
    print("Topic #%s:" % idx, lsamodel.print_topic(idx, 10))
 
print("=" * 20)

Topic #0: 0.305*"he" + 0.215*"one" + 0.150*"she" + 0.140*"time" + 0.132*"back" + 0.131*"also" + 0.127*"two" + 0.125*"they" + 0.123*"tells" + 0.118*"in"
Topic #1: 0.493*"tom" + 0.226*"sophia" + 0.182*"mrs" + 0.178*"house" + 0.161*"she" + 0.154*"father" + 0.147*"mr" + 0.146*"he" + 0.138*"tells" + -0.126*"one"
Topic #2: -0.558*"tom" + -0.252*"sophia" + 0.213*"she" + 0.190*"he" + -0.185*"mrs" + 0.163*"tells" + 0.144*"mother" + -0.138*"mr" + -0.129*"western" + -0.102*"however"
Topic #3: -0.233*"they" + -0.203*"ship" + 0.187*"he" + -0.183*"david" + -0.178*"back" + -0.165*"tells" + 0.161*"life" + 0.160*"family" + 0.154*"narrator" + -0.154*"find"
Topic #4: 0.664*"he" + -0.258*"mother" + -0.213*"she" + -0.195*"father" + -0.180*"family" + 0.121*"narrator" + 0.120*"monk" + -0.100*"school" + -0.099*"novel" + -0.095*"children"
Topic #5: 0.486*"david" + -0.241*"king" + 0.169*"rosa" + 0.162*"book" + 0.126*"harlan" + -0.120*"he" + 0.111*"she" + 0.108*"gould" + -0.108*"anita" + 0.103*"would"
Topic #6: 