In [1]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()

In [5]:
DATASET_LOCATION = './sir_arthur_conan_doyle_dataset/'

In [6]:
# Cleanup gutenberg data and record time

import json,os
import glob
import pickle
from nltk.tokenize import RegexpTokenizer    
data = []
doc_map = {}
doc_id = 1
for filename in glob.glob(os.path.join(DATASET_LOCATION, '*.txt')):
    with open(filename,'r',) as content_file:
        (head,tail)=os.path.split(filename) 
        #print("Processing file: ", tail)
        doc_map[doc_id] = tail
        raw = content_file.read().replace('\n','')
        data.append(raw)
        doc_id += 1

print(doc_map)

{1: 'Sir Arthur Conan Doyle___The Refugees.txt', 2: 'Sir Arthur Conan Doyle___The Great Keinplatz Experiment and Other Tales of Twilight and the Unseen.txt', 3: 'Sir Arthur Conan Doyle___Danger! and Other Stories.txt', 4: 'Sir Arthur Conan Doyle___The Exploits Of Brigadier Gerard.txt', 5: 'Sir Arthur Conan Doyle___The Green Flag.txt', 6: 'Sir Arthur Conan Doyle___Sir Nigel.txt', 7: 'Sir Arthur Conan Doyle___Round the Red Lamp.txt', 8: 'Sir Arthur Conan Doyle___A Visit to Three Fronts.txt', 9: 'Sir Arthur Conan Doyle___The Adventures of Gerard.txt', 10: 'Sir Arthur Conan Doyle___The Disappearance of Lady Frances Carfax.txt', 11: 'Sir Arthur Conan Doyle___The Adventure of the Bruce-Partington Plans.txt', 12: 'Sir Arthur Conan Doyle___The Tragedy of The Korosko.txt', 13: 'Sir Arthur Conan Doyle___The Poison Belt.txt', 14: 'Sir Arthur Conan Doyle___The Last of the Legions and Other Tales of Long Ago.txt', 15: 'Sir Arthur Conan Doyle___The Last Galley.txt', 16: 'Sir Arthur Conan Doyle___The

In [7]:
# Functions to prepare for LDA Topic modeling format required by Gensim
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        else:
            lda_tokens.append(token.lower_) 
    return lda_tokens     

import nltk
nltk.download('wordnet')

from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

#nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))
en_stop.add("could")
en_stop.add("would")
en_stop.add("still")
en_stop.add("shall")
from nltk import word_tokenize
def prepare_text_for_lda(text,count):
    #print(count)
    #tokens = tokenize(text)
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens



[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/prananth/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
print(en_stop)

{'ourselves', 'shan', 'into', 'both', 'its', 'just', 'that', 'when', "weren't", 'few', "shan't", 'but', 'above', 'me', 'now', "couldn't", 'their', "that'll", 'll', 'had', "you've", 'having', 'some', 'him', 'before', 'wouldn', 'has', 'mightn', "wouldn't", "she's", 'our', 'more', "mightn't", 'myself', 'was', 'below', 'herself', 'or', 'during', 'then', 'until', 'is', 'd', 'shall', 'yourselves', 'very', 'his', 'don', 'through', 'out', 'o', 'what', 'would', 'with', 'under', 'no', 'a', 'itself', 'not', "won't", 'y', 'aren', 'were', 'how', "don't", 'we', 'couldn', "it's", 'further', 'an', 'down', 'in', 'here', 'nor', 'haven', 'yours', 'other', "hadn't", 'mustn', 'between', 'same', 'yourself', 'theirs', 'each', 'can', "shouldn't", 'shouldn', "you're", 'again', 'could', 'didn', 'be', 'and', 'while', "you'd", 'been', 'any', 'he', 'about', 'there', 'hers', 'themselves', 'against', 'so', "mustn't", 's', 'will', 'wasn', 'she', 'from', 'after', 'than', 'this', 'ain', 'i', 'own', 'still', 'hasn', 'on

In [9]:
# Prepare text for LDA
token_data = []
count = 1
for doc in data:
    tokens = prepare_text_for_lda(doc,count)
    token_data.append(tokens)
    count += 1

In [10]:
with open('acd_doc_map.pickle', 'wb') as handle:
    pickle.dump(doc_map, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('acd_token_data.pickle','wb') as handle:
    pickle.dump(token_data,handle,protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# Save corpus and dictionary
from gensim import corpora
dictionary = corpora.Dictionary(token_data)
corpus = [dictionary.doc2bow(text) for text in token_data]
import pickle
pickle.dump(corpus, open('corpus_acd.pkl', 'wb'))
dictionary.save('dictionary_acd.gensim')

In [12]:
# LDA Modeling and save
import gensim
NUM_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model_acd.gensim')

In [13]:
ldamodel.print_topics()

[(0,
  '0.000*"There" + 0.000*"little" + 0.000*"might" + 0.000*"cry" + 0.000*"night" + 0.000*"never" + 0.000*"think" + 0.000*"thought" + 0.000*"Holmes" + 0.000*"round"'),
 (1,
  '0.004*"There" + 0.003*"State" + 0.003*"rubber" + 0.003*"people" + 0.003*"native" + 0.003*"little" + 0.003*"Congo" + 0.003*"Holmes" + 0.002*"great" + 0.002*"matter"'),
 (2,
  '0.005*"force" + 0.005*"British" + 0.004*"Boers" + 0.004*"Nigel" + 0.003*"great" + 0.003*"enemy" + 0.003*"position" + 0.003*"horse" + 0.003*"attack" + 0.003*"small"'),
 (3,
  '0.006*"British" + 0.004*"Boers" + 0.003*"Government" + 0.002*"force" + 0.002*"country" + 0.002*"power" + 0.002*"woman" + 0.002*"Transvaal" + 0.002*"South" + 0.002*"medium"'),
 (4,
  '0.000*"little" + 0.000*"never" + 0.000*"great" + 0.000*"cry" + 0.000*"might" + 0.000*"think" + 0.000*"There" + 0.000*"first" + 0.000*"round" + 0.000*"thought"'),
 (5,
  '0.004*"little" + 0.004*"cry" + 0.004*"great" + 0.004*"Emperor" + 0.003*"think" + 0.003*"There" + 0.003*"Colonel" + 0.0

In [14]:
import pyLDAvis.gensim
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))
