Finds key topics from all documents extracted from the books. Documents are text segments. Segment size is defined by users.
Using topic modeling techniques such as LDA, LSI, key topics are extracted from the documents.

Text segments/documents are clustered (using k-means) into topics. 

In [1]:
from textblob import TextBlob
from nltk.tokenize import word_tokenize

In [2]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [3]:
import os
from os import path
import sys
import string

In [4]:
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import cm
%matplotlib inline

In [5]:
BASE_DIR = "~/HarryPotter/"

In [246]:
sys.path.append(path.abspath(BASE_DIR+'NLP/SupportModules/'))
import data_transform_module as dtmod
import book_module as bmod

In [247]:
BOOK_PATH = BASE_DIR+'NLP/BookData/'

### Topic modeling with Gensim LDA
* generate term-matrix for LDA (sklearn --> countVectorizer --> term-matrix)

In [91]:
# imports for LDA
from gensim import corpora, models, similarities, matutils
from sklearn.feature_extraction.text import CountVectorizer
#import lda

In [92]:
class Document():  
    """
    input is one book as dictionary.
    each list in the dictionary is a chapter.
    """
    def __init__(self, book, chunk_size):
        self.book = book
        self.chunk_size = chunk_size
        self.docs = self.get_bookDocuments()
    
    def get_bookDocuments(self):
        book_documents = []
        for key in self.book:
            chapter = self.book[key].lower()
            chapter_tokens = word_tokenize(chapter)         
            
            if len(chapter_tokens) >= self.chunk_size: 
                start_ind = list(range(0,len(chapter_tokens), self.chunk_size))
                end_ind = start_ind[1:] + [len(chapter_tokens)]
                
                for ind in range(0, len(start_ind)):
                    tokenized_chunk = chapter_tokens[start_ind[ind]:end_ind[ind]]
                    chunk = ' '.join(x for x in tokenized_chunk)
                    book_documents.append(chunk)
        return(book_documents)



In [93]:
def get_LDA_topicModel(all_documents):    
    # Create a CountVectorizer 
    count_vectorizer = CountVectorizer(ngram_range=(1, 2),  stop_words='english')
    # Fit it on the document data (trining?)
    count_vectorizer.fit(all_documents)
    # Create the term-document matrix (Transpose places terms on the rows)
    counts = count_vectorizer.transform(all_documents).transpose()
    # Convert sparse matrix of counts to a gensim corpus
    corpus = matutils.Sparse2Corpus(counts)
    # Map matrix rows to words (tokens)
    id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())
    
    # Create lda model
    lda = models.LdaModel(corpus=corpus, num_topics=10, id2word=id2word, passes=10)
    #lda topics
    lda_print_topics = lda.print_topics()
    # Transform the docs from the word space to the topic space
    lda_corpus = lda[corpus]
    
    return lda_corpus, lda_print_topics

In [97]:
all_documents = []
chunk_size = 400
for seq in range(1, 8):
    book_dict = dtmod.unpickleSomething(BOOK_PATH, "book{}_chapters.p".format(seq))
    doc = Document(book_dict, chunk_size)
    book_documents = doc.docs
    all_documents = all_documents + book_documents

print(len(all_documents), "obtained from all 7 books with a chunk size of", chunk_size )   


2902 obtained from all 7 books with a chunk size of 400


In [98]:
%time lda_corpus_output, lda_print_topics  = get_LDA_topicModel(all_documents)

CPU times: user 7min 38s, sys: 22.9 s, total: 8min 1s
Wall time: 8min 3s


In [99]:
lda_print_topics

[(45,
  '0.001*"dumbledore" + 0.001*"morfin" + 0.001*"belby" + 0.001*"horcruxes" + 0.001*"fleur" + 0.001*"mclaggen" + 0.001*"known" + 0.001*"ze" + 0.001*"scoop" + 0.001*"meant"'),
 (8,
  '0.004*"dirk" + 0.001*"cresswell" + 0.001*"dirk cresswell" + 0.001*"crackled" + 0.001*"rons" + 0.001*"dean" + 0.001*"thought hear" + 0.001*"going don" + 0.001*"lifted air" + 0.001*"irksome"'),
 (12,
  '0.008*"said" + 0.007*"weasley" + 0.006*"harry" + 0.004*"mrs" + 0.004*"mrs weasley" + 0.003*"ginny" + 0.003*"mr" + 0.002*"mr weasley" + 0.002*"fred" + 0.002*"george"'),
 (44,
  '0.005*"bellatrix" + 0.004*"draco" + 0.003*"lord" + 0.003*"greyback" + 0.003*"lucius" + 0.003*"ted" + 0.003*"narcissa" + 0.002*"death" + 0.002*"voldemort" + 0.002*"malfoy"'),
 (29,
  '0.005*"professor" + 0.001*"professor mcgonagall" + 0.001*"nearly headless" + 0.001*"mcgonagall" + 0.001*"said professor" + 0.001*"headless" + 0.001*"nick" + 0.001*"headless nick" + 0.001*"trelawney" + 0.001*"professor umbridge"'),
 (48,
  '0.022*"vern

In [100]:
lda_docs = [doc for doc in lda_corpus_output]
len(lda_docs)

2902

lda_docs

In [116]:
lda_print_topics

[(45,
  '0.001*"dumbledore" + 0.001*"morfin" + 0.001*"belby" + 0.001*"horcruxes" + 0.001*"fleur" + 0.001*"mclaggen" + 0.001*"known" + 0.001*"ze" + 0.001*"scoop" + 0.001*"meant"'),
 (8,
  '0.004*"dirk" + 0.001*"cresswell" + 0.001*"dirk cresswell" + 0.001*"crackled" + 0.001*"rons" + 0.001*"dean" + 0.001*"thought hear" + 0.001*"going don" + 0.001*"lifted air" + 0.001*"irksome"'),
 (12,
  '0.008*"said" + 0.007*"weasley" + 0.006*"harry" + 0.004*"mrs" + 0.004*"mrs weasley" + 0.003*"ginny" + 0.003*"mr" + 0.002*"mr weasley" + 0.002*"fred" + 0.002*"george"'),
 (44,
  '0.005*"bellatrix" + 0.004*"draco" + 0.003*"lord" + 0.003*"greyback" + 0.003*"lucius" + 0.003*"ted" + 0.003*"narcissa" + 0.002*"death" + 0.002*"voldemort" + 0.002*"malfoy"'),
 (29,
  '0.005*"professor" + 0.001*"professor mcgonagall" + 0.001*"nearly headless" + 0.001*"mcgonagall" + 0.001*"said professor" + 0.001*"headless" + 0.001*"nick" + 0.001*"headless nick" + 0.001*"trelawney" + 0.001*"professor umbridge"'),
 (48,
  '0.022*"vern

### Topic modeling with Gensim LSI

In [102]:
class Document():  
    """
    input is one book as dictionary.
    each list in the dictionary is a chapter.
    """
    def __init__(self, book, chunk_size):
        self.book = book
        self.chunk_size = chunk_size
        self.docs = self.get_bookDocuments()
    
    def get_bookDocuments(self):
        book_documents = []
        for key in self.book:
            chapter = self.book[key].lower()
            chapter_tokens = word_tokenize(chapter)         
            
            if len(chapter_tokens) >= self.chunk_size: 
                start_ind = list(range(0,len(chapter_tokens), self.chunk_size))
                end_ind = start_ind[1:] + [len(chapter_tokens)]
                
                for ind in range(0, len(start_ind)):
                    tokenized_chunk = chapter_tokens[start_ind[ind]:end_ind[ind]]
                    chunk = ' '.join(x for x in tokenized_chunk)
                    book_documents.append(chunk)
        return(book_documents)




In [185]:
def get_LSI_topicModel(all_documents):
    
    # Create a CountVectorizer 
    count_vectorizer = CountVectorizer(ngram_range=(1, 2),  stop_words='english')
    # Fit it on the document data (trining?)
    count_vectorizer.fit(all_documents)
    
    # Create the term-document matrix (Transpose places terms on the rows)
    counts = count_vectorizer.transform(all_documents).transpose()
    print("shape of counts", counts.shape)
    # Convert sparse matrix of counts to a gensim corpus
    corpus = matutils.Sparse2Corpus(counts)
    # Map matrix rows to words (tokens)
    id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())
    
    # Create lda model
    lsi = models.LsiModel(corpus=corpus, id2word=id2word, num_topics=50)
    #lda topics
    lsi_print_topics = lsi.print_topics()
    length = len(lsi_print_topics)
    # Transform the docs from the word space to the topic space
    lsi_corpus = lsi[corpus]
    
    return lsi_corpus, lsi_print_topics, length

In [186]:
all_documents = []
chunk_size = 400
for seq in range(1, 8):
    book_dict = dtmod.unpickleSomething(BOOK_PATH, "book{}_chapters.p".format(seq))
    doc = Document(book_dict, chunk_size)
    book_documents = doc.docs
    all_documents = all_documents + book_documents

print(len(all_documents), "obtained from all 7 books with a chunk size of", chunk_size )   

2902 obtained from all 7 books with a chunk size of 400


In [187]:
%time lsi_corpus_output, lsi_print_topics, length  = get_LSI_topicModel(all_documents)

shape of counts (354132, 2902)
CPU times: user 58.2 s, sys: 2.86 s, total: 1min 1s
Wall time: 44.1 s


In [182]:
length, len(lsi_corpus_output)

(20, 354132)

In [165]:
lsi_docs = [doc for doc in lsi_corpus_output]
#lsi_docs are coordinates in topic space

In [183]:
lsi_print_topics

[(0,
  '0.042*"accident send" + 0.040*"able timid" + 0.039*"accidents don" + 0.039*"accountant talk" + 0.039*"address harry" + 0.038*"achingly" + 0.037*"abuse said" + 0.037*"added drops" + 0.036*"actually attempting" + 0.036*"added voldemort"'),
 (1,
  '-0.092*"accepts harry" + -0.077*"abysmal" + -0.076*"ache wanted" + -0.066*"abuse said" + -0.065*"accepting voldemort" + -0.060*"able blast" + -0.060*"abstinence precisely" + 0.059*"aberforth glared" + -0.057*"addressed silent" + -0.057*"accordance educational"'),
 (2,
  '0.078*"actively prevent" + 0.072*"account terrible" + 0.071*"accio locker" + 0.071*"accompanied certain" + 0.068*"abou clamberin" + 0.067*"admit hagrid" + -0.063*"advice ron" + -0.062*"advice sending" + 0.061*"accio horcrux" + -0.060*"added voicing"'),
 (3,
  '0.127*"absentmindedly touched" + 0.100*"absently dug" + 0.099*"absently new" + 0.093*"accurate title" + 0.092*"advancing ron" + 0.089*"absorbed going" + 0.088*"absentmindedly large" + 0.086*"abstained" + 0.086*"ad

In [168]:
len(lsi_print_topics)

20

In [146]:
import numpy as np
def convert_LSIdocs2matrix(lsi_docs):
    topic_coord_array = []
    for doc in lsi_docs:
        doc_coords = []
        for tup in doc:
            doc_coords.append(tup[1])
        topic_coord_array.append(doc_coords)
    
    return np.array(topic_coord_array)

In [147]:
topic_coord_array = convert_LSIdocs2matrix(lsi_docs)

In [148]:
topic_coord_array

array([[  2.49633013,   0.76659269,  -1.64917045, ...,  -0.10815428,
         -2.38524346,   0.62878254],
       [  2.49891891,  -0.03404812,  -1.66072778, ...,   0.16852346,
         -1.02784256,   1.507728  ],
       [  4.1530947 ,   2.29593405,   0.02389489, ...,  -1.68104024,
         -1.79290312,   0.08273479],
       ..., 
       [ 11.28442844,  -5.26381999,   0.23202536, ...,   0.133109  ,
          0.5382131 ,   0.71849217],
       [  9.11452896,   1.95393419,  -1.25900534, ...,   0.61754305,
          1.0340638 ,   0.44500571],
       [  5.58497671,   2.92222343,   1.43343451, ...,   0.26630126,
         -0.05622959,  -0.28190311]])

In [149]:
topic_coord_array.shape

(2902, 50)

### Cluster LSI topics in topic coordinate space 

In [152]:
from sklearn.cluster import KMeans

In [154]:
def runKmeans(data, num_clusters=10):
    km = KMeans(n_clusters=num_clusters, init='k-means++', n_init=10)
    %time km.fit(data)
    cluster_id = km.labels_.tolist()
    return cluster_id

In [155]:
cluster_id = runKmeans(topic_coord_array, num_clusters=10)

CPU times: user 438 ms, sys: 14.3 ms, total: 452 ms
Wall time: 446 ms


In [159]:
len(cluster_id), np.unique(cluster_id)

(2902, array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

### Sklearn:
Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation

In [208]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [229]:
all_documents = []
chunk_size = 400
for seq in range(1, 8):
    book_dict = dtmod.unpickleSomething(BOOK_PATH, "book{}_chapters.p".format(seq))
    doc = Document(book_dict, chunk_size)
    book_documents = doc.docs
    all_documents = all_documents + book_documents

print(len(all_documents), "chunks obtained from all 7 books with a chunk size of", chunk_size )   

2902 chunks obtained from all 7 books with a chunk size of 400


In [236]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

def run_sklearnLDA(data_samples):
    # count vectorization for LDA
    tf_vectorizer = CountVectorizer(max_df=0.75, min_df=2,
                                max_features=10000,
                                stop_words='english')
    tf = tf_vectorizer.fit_transform(data_samples)
    
    # Fit LDA model
    lda = LatentDirichletAllocation(n_topics=20, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
    lda.fit(tf)
    data = lda.transform(tf)
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words=20)
    
    sklearn_lda_components = lda.components_
    
    return sklearn_lda_components, data

In [237]:
sklearn_lda_components, data = run_sklearnLDA(all_documents)

Topic #0:
ter got hagrid wand yeh did ollivander yer neville potion mr hogwarts er bit expelled black books look firs right
Topic #1:
hagrid malfoy just like got knew uncle vernon potter aunt way looking petunia broom letters car crash lily told eyes
Topic #2:
ron hermione dumbledore did know like looked just snape got professor wand think time looking eyes face right voldemort voice
Topic #3:
troll fasten baron ending mountain liking knocking quidditch share sinks did desert fastest people steel lumbered playing plates hagrid hermione
Topic #4:
james evans snivellus snivelly lily hamburger spattering mature ticket exists hagrid know plastic yeh escalator professor dumbledore dursleys petrificus beginning
Topic #5:
bean saturn uranus decipher dumbfounded cartoon captioned flavor crush mar margins mimed birth zabini mystical timetables consultation dud toadless gringotts
Topic #6:
team gryffindor wood snitch quidditch angelina katie match broom slytherin field fred quaffle crowd bludger

In [238]:
data.shape

(2902, 20)

In [235]:
sklearn_lda_components

array([[  0.06004512,   0.0587493 ,   0.06014815, ...,   0.05867293,
          0.05947637,   0.05714464],
       [  0.05956537,   0.05872073,   0.05907155, ...,   0.05817783,
          0.05915199,   0.05905397],
       [  0.07544881,   4.49015457,   3.73589282, ...,   6.09488489,
         29.11613497,  18.49202694],
       ..., 
       [  2.95413696,   0.07746157,   0.06544147, ...,   0.059033  ,
          0.0613584 ,   0.28619427],
       [  0.05774296,   0.0582823 ,   0.05796075, ...,   0.06008137,
          0.05696976,   0.05853718],
       [  0.05874129,   0.05864225,   0.05767886, ...,   0.05722428,
          0.05830336,   0.05757705]])

**Cluster sklearn/lda topics in topic coordinate space**

In [239]:
def runKmeans(data, num_clusters=10):
    km = KMeans(n_clusters=num_clusters, init='k-means++', n_init=10)
    %time km.fit(data)
    cluster_id = km.labels_.tolist()
    return cluster_id

In [248]:
cluster_id = runKmeans(data, num_clusters=10)

CPU times: user 171 ms, sys: 8.46 ms, total: 179 ms
Wall time: 175 ms


In [249]:
cluster_id

[5,
 8,
 8,
 8,
 8,
 7,
 8,
 0,
 0,
 8,
 1,
 8,
 5,
 5,
 5,
 5,
 5,
 5,
 8,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 1,
 2,
 9,
 2,
 1,
 9,
 2,
 2,
 5,
 9,
 9,
 9,
 2,
 1,
 1,
 9,
 4,
 9,
 2,
 2,
 8,
 2,
 2,
 2,
 2,
 9,
 2,
 1,
 5,
 5,
 0,
 0,
 0,
 0,
 0,
 8,
 4,
 8,
 8,
 0,
 8,
 0,
 0,
 7,
 4,
 0,
 8,
 8,
 1,
 8,
 0,
 7,
 7,
 0,
 8,
 4,
 0,
 4,
 7,
 0,
 0,
 0,
 8,
 9,
 9,
 6,
 7,
 6,
 6,
 8,
 6,
 6,
 8,
 7,
 4,
 7,
 7,
 8,
 8,
 8,
 3,
 3,
 3,
 8,
 0,
 0,
 0,
 0,
 0,
 1,
 6,
 0,
 6,
 3,
 3,
 3,
 6,
 6,
 9,
 0,
 1,
 1,
 8,
 5,
 0,
 7,
 7,
 7,
 8,
 0,
 0,
 8,
 0,
 0,
 6,
 6,
 8,
 8,
 8,
 6,
 6,
 8,
 1,
 8,
 9,
 2,
 9,
 1,
 2,
 1,
 1,
 4,
 4,
 7,
 6,
 6,
 8,
 7,
 9,
 9,
 9,
 9,
 1,
 2,
 2,
 1,
 0,
 0,
 8,
 9,
 1,
 0,
 0,
 7,
 4,
 7,
 0,
 0,
 0,
 0,
 7,
 0,
 0,
 0,
 0,
 8,
 8,
 0,
 0,
 8,
 0,
 0,
 0,
 0,
 2,
 6,
 8,
 8,
 5,
 5,
 5,
 5,
 5,
 8,
 8,
 5,
 8,
 0,
 0,
 0,
 8,
 5,
 8,
 8,
 0,
 8,
 8,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 8,
 2,
 0,
 0,
 1,
 8,
 2,
 2,
 2,
 0,
 0,


In [250]:
cluster_id.index(3)

111

In [251]:
all_documents[111]

'where he would be learning to play that night he bolted his dinner that evening without noticing what he was eating and then rushed upstairs with ron to unwrap the nimbus two thousand at last wow ron sighed as the broomstick rolled onto harry s bedspread even harry who knew nothing about the different brooms thought it looked wonderful sleek and shiny with a mahogany handle it had a long tail of neat straight twigs and nimbus two thousand written in gold near the top as seven of the clock drew nearer harry left the castle and set off in the dusk toward the quidditch field held never been inside the stadium before hundreds of seats were raised in stands around the field so that the spectators were high enough to see what was going on at either end of the field were three golden poles with hoops on the end they reminded harry of the little plastic sticks muggle children blew bubbles through except that they were fifty feet high too eager to fly again to wait for wood harry mounted his b