## Topic Modelling
This notebook contains a demo of LDA (Latent Dirichlet Allocation) and LSA (Latent Semantic Analysis) using the gensim library. The link to the dataset can be found in the `BookSummaries_Link.md` file under the Data folder in Ch7.

In [None]:
# Import OS
import os
# For NLTK virtual environments are high recommended and it requires python verisions higher than 3.5
!pip install gensim
!pip install nltk



In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Download the NLTK resources used by stopwords.words() and word_tokenize().
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from the 'punkt_tab' resource;
# without it word_tokenize() raises LookupError. Older releases that do not know
# this resource only print a download error here and keep using 'punkt'.
nltk.download('punkt_tab')

from gensim.models import LdaModel
from gensim.corpora import Dictionary
from pprint import pprint


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
#tokenize, remove stopwords, non-alphabetic words, lowercase
_STOPWORDS_BY_LANGUAGE = {}  # language name -> frozenset of stopwords, filled on first use


def _stopword_set(language='english'):
    """Return the NLTK stopword list for `language` as a frozenset, built only once."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def preprocess(textstring):
    """Tokenize a text and return its cleaned, lowercased tokens.

    Args:
        textstring: The raw text to tokenize.

    Returns:
        A list of lowercased tokens, in their original order, keeping only
        purely alphabetic tokens that are not English stopwords.
    """
    stops = _stopword_set('english')
    # Lowercase BEFORE the stopword test: the stopword list is lowercase, so
    # testing the raw token would let capitalised stopwords ("They", "She",
    # "Will") slip through.
    lowered = (token.lower() for token in word_tokenize(textstring) if token.isalpha())
    return [token for token in lowered if token not in stops]

# Please update it to your actual download path regardless of your choice of operating system
try:
    # On Google Colab, prompt for an upload of the dataset into the working directory.
    from google.colab import files
    uploaded = files.upload()
    data_path='booksummaries.txt'
except ModuleNotFoundError:
    # Outside Colab, read the copy stored under the local Data folder.
    data_path='Data/booksummaries.txt'

# Every line is one tab-separated record; the field at index 6 (presumably the
# plot summary) is the text that gets tokenized.
summaries = []
# The `with` block closes the file handle as soon as reading is done.
with open(data_path, encoding="utf-8") as data_file:
    for line in data_file:
        fields = line.split("\t")
        summaries.append(preprocess(fields[6]))

# Build the gensim vocabulary (token <-> integer id) from the tokenized summaries.
dictionary = Dictionary(summaries)

# Drop tokens that occur in fewer than 10 documents or in more than half of them.
dictionary.filter_extremes(no_below=10, no_above=0.5)

# Encode every summary as a bag-of-words vector over the filtered vocabulary.
corpus = list(map(dictionary.doc2bow, summaries))

# Looking up one entry is only done to "load" the dictionary, so that the
# id -> token mapping read on the next line is populated.
temp = dictionary[0]
id2word = dictionary.id2token

#Train the topic model
# LDA training is stochastic; a fixed random_state makes the learned topics
# repeatable across "Restart & Run All" executions.
model = LdaModel(corpus=corpus, id2word=id2word, iterations=400, num_topics=10, random_state=42)
# Each entry pairs a topic's (probability, word) list with its coherence score.
top_topics = list(model.top_topics(corpus))
pprint(top_topics)

Saving booksummaries.txt to booksummaries (1).txt




[([(0.023265265, 'will'),
   (0.006791828, 'sandy'),
   (0.005802573, 'man'),
   (0.00545891, 'they'),
   (0.005361698, 'says'),
   (0.0052492092, 'she'),
   (0.0050261193, 'house'),
   (0.00495469, 'new'),
   (0.0049020336, 'place'),
   (0.0043071937, 'life'),
   (0.004191512, 'old'),
   (0.0040845894, 'back'),
   (0.0039467746, 'book'),
   (0.0039076605, 'love'),
   (0.0037853858, 'replaced'),
   (0.0037836144, 'family'),
   (0.0034425468, 'asks'),
   (0.0031122135, 'world'),
   (0.0030991007, 'two'),
   (0.0030360203, 'time')],
  -1.0425441686974948),
 ([(0.0046427725, 'time'),
   (0.0039827097, 'death'),
   (0.0036928887, 'two'),
   (0.0036362822, 'people'),
   (0.0036056098, 'new'),
   (0.00346534, 'back'),
   (0.0033212684, 'house'),
   (0.0032688887, 'family'),
   (0.0032591114, 'after'),
   (0.0032214858, 'john'),
   (0.003128154, 'first'),
   (0.0030789988, 'they'),
   (0.0030488728, 'however'),
   (0.0030121296, 'book'),
   (0.0029799026, 'mother'),
   (0.0029058016, 'man'),


In [None]:
# Show the 10 highest-weighted words of every LDA topic. The loop bound is read
# from the trained model so it stays in sync with the num_topics used for training.
for idx in range(model.num_topics):
    print("Topic #%s:" % idx, model.print_topic(idx, 10))
print("=" * 20)

Topic #0: 0.005*"time" + 0.004*"death" + 0.004*"two" + 0.004*"people" + 0.004*"new" + 0.003*"back" + 0.003*"house" + 0.003*"family" + 0.003*"after" + 0.003*"john"
Topic #1: 0.006*"new" + 0.006*"wife" + 0.004*"bath" + 0.004*"man" + 0.003*"york" + 0.003*"young" + 0.003*"world" + 0.003*"first" + 0.003*"miss" + 0.003*"another"
Topic #2: 0.023*"will" + 0.007*"sandy" + 0.006*"man" + 0.005*"they" + 0.005*"says" + 0.005*"she" + 0.005*"house" + 0.005*"new" + 0.005*"place" + 0.004*"life"
Topic #3: 0.006*"novel" + 0.005*"book" + 0.004*"story" + 0.004*"first" + 0.004*"family" + 0.004*"time" + 0.004*"love" + 0.003*"two" + 0.003*"world" + 0.003*"characters"
Topic #4: 0.008*"nbsp" + 0.005*"novel" + 0.004*"translations" + 0.004*"time" + 0.004*"world" + 0.003*"new" + 0.003*"young" + 0.003*"bible" + 0.003*"two" + 0.003*"battle"
Topic #5: 0.008*"liberty" + 0.007*"sandy" + 0.006*"rats" + 0.006*"they" + 0.005*"florence" + 0.004*"love" + 0.004*"ron" + 0.004*"she" + 0.004*"mr" + 0.004*"house"
Topic #6: 0.010

In [None]:
from gensim.models import LsiModel

# Fit a 10-topic LSI (a.k.a. LSA) model on the same bag-of-words corpus.
lsamodel = LsiModel(corpus=corpus, id2word=id2word, num_topics=10)

# Pretty-print each topic as a signed, weighted combination of its top 10 words.
lsa_topics = lsamodel.print_topics(num_topics=10, num_words=10)
pprint(lsa_topics)


[(0,
  '0.181*"tom" + 0.147*"two" + 0.143*"time" + 0.126*"father" + 0.126*"man" + '
  '0.120*"however" + 0.116*"house" + 0.114*"she" + 0.114*"back" + '
  '0.112*"first"'),
 (1,
  '-0.717*"tom" + -0.308*"mrs" + -0.228*"mr" + -0.196*"western" + '
  '-0.136*"lady" + -0.107*"house" + -0.106*"jones" + -0.092*"honour" + '
  '0.076*"new" + -0.069*"squire"'),
 (2,
  '-0.910*"harry" + -0.167*"ron" + -0.164*"hermione" + -0.090*"professor" + '
  '-0.079*"stone" + 0.070*"prince" + -0.066*"black" + 0.057*"narrator" + '
  '-0.050*"school" + -0.044*"sirius"'),
 (3,
  '0.789*"narrator" + 0.201*"de" + 0.137*"mme" + -0.091*"hill" + '
  '-0.080*"people" + 0.066*"friend" + -0.064*"world" + 0.063*"alice" + '
  '-0.061*"labour" + 0.061*"home"'),
 (4,
  '0.725*"prince" + 0.183*"narrator" + 0.118*"harry" + -0.110*"she" + '
  '0.109*"don" + 0.102*"people" + 0.101*"hill" + 0.099*"narration" + '
  '0.094*"as" + 0.088*"family"'),
 (5,
  '-0.355*"prince" + 0.338*"narrator" + 0.267*"hill" + -0.167*"family" + '
  '-

In [None]:
# Show the 10 highest-weighted words of every LSA topic. The loop bound is read
# from the trained model so it stays in sync with the num_topics used for training.
for idx in range(lsamodel.num_topics):
    print("Topic #%s:" % idx, lsamodel.print_topic(idx, 10))

print("=" * 20)

Topic #0: 0.181*"tom" + 0.147*"two" + 0.143*"time" + 0.126*"father" + 0.126*"man" + 0.120*"however" + 0.116*"house" + 0.114*"she" + 0.114*"back" + 0.112*"first"
Topic #1: -0.717*"tom" + -0.308*"mrs" + -0.228*"mr" + -0.196*"western" + -0.136*"lady" + -0.107*"house" + -0.106*"jones" + -0.092*"honour" + 0.076*"new" + -0.069*"squire"
Topic #2: -0.910*"harry" + -0.167*"ron" + -0.164*"hermione" + -0.090*"professor" + -0.079*"stone" + 0.070*"prince" + -0.066*"black" + 0.057*"narrator" + -0.050*"school" + -0.044*"sirius"
Topic #3: 0.789*"narrator" + 0.201*"de" + 0.137*"mme" + -0.091*"hill" + -0.080*"people" + 0.066*"friend" + -0.064*"world" + 0.063*"alice" + -0.061*"labour" + 0.061*"home"
Topic #4: 0.725*"prince" + 0.183*"narrator" + 0.118*"harry" + -0.110*"she" + 0.109*"don" + 0.102*"people" + 0.101*"hill" + 0.099*"narration" + 0.094*"as" + 0.088*"family"
Topic #5: -0.355*"prince" + 0.338*"narrator" + 0.267*"hill" + -0.167*"family" + -0.166*"father" + 0.139*"labour" + 0.136*"lesson" + 0.129*"