In [38]:
import nltk
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np

In [95]:
# List the file ids of every text in NLTK's Gutenberg corpus
# NOTE(review): presumably needs the corpus to be available locally (nltk.download('gutenberg')) — confirm
bookNames = nltk.corpus.gutenberg.fileids()
print(bookNames)

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [60]:
# Load each text as a sequence of word tokens, in the same order as bookNames
books = list(map(nltk.corpus.gutenberg.words, bookNames))
print(len(books))

18


In [36]:
# Peek at the first 30 raw tokens of the first book
books[0][:30]

['[',
 'Emma',
 'by',
 'Jane',
 'Austen',
 '1816',
 ']',
 'VOLUME',
 'I',
 'CHAPTER',
 'I',
 'Emma',
 'Woodhouse',
 ',',
 'handsome',
 ',',
 'clever',
 ',',
 'and',
 'rich',
 ',',
 'with',
 'a',
 'comfortable',
 'home',
 'and',
 'happy',
 'disposition',
 ',',
 'seemed']

In [45]:
def preprocess(book):
    """Turn a tokenized book into a list of normalized content words.

    Steps, in order:
      1. Keep only purely alphabetic tokens longer than two characters and
         convert them to lowercase.
      2. Drop gensim stopwords.
      3. Lemmatize each remaining token as a verb (WordNet), then reduce it
         with the English Snowball stemmer.

    Args:
        book: iterable of string tokens.

    Returns:
        list of str: the processed tokens, in their original order.
    """
    stemmer = SnowballStemmer('english')
    # Build the lemmatizer once instead of once per token.
    lemmatizer = WordNetLemmatizer()
    stopwords = gensim.parsing.preprocessing.STOPWORDS

    # The alphabetic/length test runs on the raw token, before lowercasing.
    lowered = (word.lower() for word in book if word.isalpha() and len(word) > 2)
    return [
        stemmer.stem(lemmatizer.lemmatize(word, pos='v'))
        for word in lowered
        if word not in stopwords
    ]

In [46]:
# Try the preprocessing on the first book only
book = preprocess(books[0])

In [47]:
# Count how often each processed token occurs in `book`
nltk.FreqDist(book)

FreqDist({'emma': 865, 'mrs': 699, 'think': 692, 'miss': 611, 'say': 566, 'know': 541, 'harriet': 506, 'thing': 460, 'weston': 448, 'elton': 407, ...})

In [48]:
# Run the same preprocessing over every book
cleanBooks = list(map(preprocess, books))

In [71]:
# Create a dictionary
# Put simply, the dictionary stores every unique token from the books together with an integer id
booksDictionary = gensim.corpora.Dictionary(cleanBooks)

In [72]:
# Show the dictionary summary (number of unique tokens and a few sample tokens)
print(booksDictionary)

Dictionary(24410 unique tokens: ['abbey', 'abbot', 'abdi', 'abhor', 'abid']...)


In [73]:
# Print the tokens stored under dictionary ids 0-9
# (a plain loop: a list comprehension used only for printing would just build a list of None)
for token_id, token in booksDictionary.iteritems():
    if token_id < 10:
        print(token)

abbey
abbot
abdi
abhor
abid
abil
abl
abolit
abomin
abroad


In [99]:
# Prune the vocabulary (this modifies booksDictionary in place, so re-running the cell filters it again):
# 1. Remove words that appear in fewer than 5 books (remove words that are TOO specific)
# 2. Remove words that appear in more than 50% of the books (remove words that are TOO common)
# 3. After #1 and #2, keep only the 10k most frequent of the remaining words
# filter_extremes documentation: https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.filter_extremes
booksDictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=10000)

In [100]:
# Look up the token stored under id 2
print(booksDictionary[2])

abrupt


In [101]:
# Build the bag-of-words (BoW) corpus
# doc2bow turns a tokenized document into a list of (token id, count) pairs:
# the id comes from the dictionary, the count is how often the token occurs in that document
# E.g., for ["Who let the dogs out?", "Who? Who? Who? Who?"]
# the BoW would be something like: [[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(4, 4)]]
# Here (4, 1) says that "Who" (id 4) appears once in the first sentence and
# (4, 4) says that it appears 4 times in the second sentence
# Once we have the BoW, word order and word positions are lost.
cleanBooks_BoW_Corpus = list(map(booksDictionary.doc2bow, cleanBooks))
cleanBooks_BoW_Corpus[2][:10]

[(0, 3),
 (1, 1),
 (2, 4),
 (5, 3),
 (7, 6),
 (8, 13),
 (9, 4),
 (10, 1),
 (12, 5),
 (14, 3)]

In [102]:
# To read a BoW with the token ids replaced by the actual tokens, we can do this
# (a plain loop: a list comprehension used only for printing would just build a list of None)
for token_id, count in cleanBooks_BoW_Corpus[2][:10]:
    print(booksDictionary[token_id] + ', ' + str(count))

abhor, 3
abomin, 1
abrupt, 4
abund, 3
accident, 6
accommod, 13
accomplish, 4
accost, 1
ach, 5
acquiesc, 3


In [121]:
# Run LDA Model
# Train on every book except the FIRST one (index 0)
# The first book is held out for testing the generated model
# random_state fixes the seed so re-running the notebook gives comparable topics
# NOTE(review): with several workers the result may still vary slightly between runs — confirm
lda_model = gensim.models.LdaMulticore(cleanBooks_BoW_Corpus[1:], num_topics=10, id2word=booksDictionary, passes=2, workers=2, random_state=42)

In [122]:
# Show each learned topic together with its highest-weighted terms
for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

Topic: 0 
Words: 0.025*"mrs" + 0.020*"edward" + 0.013*"big" + 0.012*"colonel" + 0.011*"farmer" + 0.010*"mous" + 0.008*"rabbit" + 0.007*"unto" + 0.006*"turtl" + 0.006*"couldn"
Topic: 1 
Words: 0.159*"unto" + 0.042*"israel" + 0.034*"hath" + 0.030*"shalt" + 0.018*"thine" + 0.016*"jesus" + 0.014*"thereof" + 0.012*"mose" + 0.011*"egypt" + 0.011*"spake"
Topic: 2 
Words: 0.025*"unto" + 0.023*"hath" + 0.016*"whale" + 0.008*"israel" + 0.007*"shalt" + 0.006*"thine" + 0.006*"boat" + 0.005*"spake" + 0.005*"adam" + 0.004*"eve"
Topic: 3 
Words: 0.021*"mrs" + 0.012*"guinea" + 0.011*"arthur" + 0.007*"whilst" + 0.007*"hath" + 0.006*"farmer" + 0.006*"price" + 0.006*"adam" + 0.005*"eve" + 0.005*"hardi"
Topic: 4 
Words: 0.008*"professor" + 0.005*"whale" + 0.005*"bull" + 0.004*"presid" + 0.004*"comrad" + 0.004*"boat" + 0.004*"colonel" + 0.004*"vast" + 0.003*"mrs" + 0.003*"big"
Topic: 5 
Words: 0.042*"ham" + 0.028*"caesar" + 0.014*"hath" + 0.013*"hamlet" + 0.012*"unto" + 0.008*"doth" + 0.006*"shew" + 0.006*

In [128]:
# Score the first book (index 0) against the model and list its topics, highest score first
first_book_topics = lda_model[cleanBooks_BoW_Corpus[0]]
for topic_id, topic_score in sorted(first_book_topics, key=lambda pair: pair[1], reverse=True):
    print("Score: {}\t Topic: {}".format(topic_score, lda_model.print_topic(topic_id, 5)))

Score: 0.7590150237083435	 Topic: 0.058*"whale" + 0.024*"mrs" + 0.020*"ann" + 0.017*"boat" + 0.008*"chapter"
Score: 0.12667332589626312	 Topic: 0.025*"mrs" + 0.020*"edward" + 0.013*"big" + 0.012*"colonel" + 0.011*"farmer"
Score: 0.11383391171693802	 Topic: 0.021*"mrs" + 0.012*"guinea" + 0.011*"arthur" + 0.007*"whilst" + 0.007*"hath"
