In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import nltk

# Gensim
import gensim as gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [5]:
list_books = ['The-Economist-USA.txt','The_Descent_of_Man.txt',
             'The First Men In The Moon.txt','The Evolution of Man.txt',
             'Problems of Genetics.txt','Islands of Space.txt'
              ]
file = [open(book,'r',encoding="utf8") for book in list_books]

file_text = [fil.read() for fil in file]
df_text = pd.DataFrame(file_text)
df_text['index'] = df_text.index
documents = df_text
for fil in file:
    fil.close()

In [6]:
documents.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
0        6 non-null object
index    6 non-null int64
dtypes: int64(1), object(1)
memory usage: 176.0+ bytes


In [7]:
np.random.seed(2019)

In [8]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SK\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
stemmer = SnowballStemmer('english')

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [10]:
doc_sample = documents[documents['index'] == 1].values[0][0]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)

original document: 




In [11]:
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))



 tokenized and lemmatized document: 


['project', 'gutenberg', 'ebook', 'descent', 'charl', 'darwin', 'ebook', 'cost', 'restrict', 'whatsoev', 'copi', 'away', 'term', 'project', 'gutenberg', 'licens', 'includ', 'ebook', 'onlin', 'gutenberg', 'titl', 'descent', 'author', 'charl', 'darwin', 'post', 'date', 'januari', 'ebook', 'post', 'novemb', 'updat', 'octob', 'languag', 'english', 'start', 'project', 'gutenberg', 'ebook', 'descent', 'produc', 'asscher', 'descent', 'select', 'relat', 'work', 'charl', 'darwin', 'life', 'letter', 'charl', 'darwin', 'chapter', 'edit', 'franci', 'darwin', 'portrait', 'volum', 'popular', 'edit', 'condens', 'volum', 'naturalist', 'journal', 'research', 'natur', 'histori', 'geolog', 'countri', 'visit', 'voyag', 'round', 'world', 'illustr', 'pritchett', 'popular', 'edit', 'woodcut', 'cheaper', 'edit', 'origin', 'speci', 'mean', 'natur', 'select', 'preserv', 'favour', 'race', 'struggl', 'life', 'larg', 'type', 'edit', 'volum', 'popular', 'edit', 'cheaper', 'edit', 'portrait', 'contriv', 'orchid', 'f

In [12]:
processed_docs = documents[0].map(preprocess)

In [13]:
processed_docs[:5]

0    [come, brexit, elect, german, reunif, save, pe...
1    [project, gutenberg, ebook, descent, charl, da...
2    [project, gutenberg, ebook, moon, well, ebook,...
3    [project, gutenberg, ebook, evolut, ernst, hae...
4    [project, gutenberg, ebook, problem, genet, wi...
Name: 0, dtype: object

In [14]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [15]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 ababa
1 abandon
2 abat
3 abattoir
4 abaya
5 abbasid
6 abbey
7 abduct
8 abdul
9 abdulmahdi
10 abil


In [69]:
# dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [16]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
print(len(bow_corpus))
bow_corpus[1]

6


[(1, 1),
 (10, 7),
 (12, 83),
 (13, 2),
 (15, 26),
 (17, 2),
 (18, 5),
 (21, 2),
 (22, 76),
 (23, 30),
 (24, 7),
 (25, 4),
 (26, 2),
 (27, 10),
 (30, 12),
 (32, 2),
 (34, 45),
 (35, 14),
 (36, 8),
 (37, 13),
 (39, 15),
 (41, 193),
 (42, 163),
 (43, 1),
 (44, 3),
 (46, 7),
 (48, 1),
 (49, 312),
 (50, 11),
 (53, 53),
 (54, 159),
 (55, 39),
 (58, 1),
 (59, 54),
 (60, 8),
 (61, 8),
 (63, 58),
 (64, 63),
 (66, 40),
 (67, 10),
 (76, 105),
 (77, 3),
 (78, 127),
 (80, 10),
 (83, 259),
 (86, 80),
 (87, 91),
 (88, 3),
 (89, 4),
 (92, 2),
 (93, 3),
 (95, 4),
 (97, 14),
 (98, 46),
 (99, 1),
 (100, 2),
 (101, 101),
 (102, 49),
 (103, 1),
 (104, 56),
 (105, 32),
 (107, 5),
 (110, 3),
 (111, 7),
 (113, 33),
 (114, 28),
 (116, 1),
 (120, 28),
 (121, 1),
 (132, 7),
 (141, 1),
 (142, 6),
 (145, 2),
 (151, 1),
 (152, 6),
 (153, 56),
 (154, 10),
 (163, 124),
 (164, 4),
 (165, 5),
 (167, 17),
 (174, 23),
 (175, 9),
 (178, 21),
 (182, 13),
 (183, 2),
 (184, 1),
 (191, 118),
 (192, 154),
 (196, 1),
 (198, 4)

In [17]:
bow_doc_1 = bow_corpus[1]

for i in range(len(bow_doc_1)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_1[i][0], 
                                                     dictionary[bow_doc_1[i][0]], 
                                                     bow_doc_1[i][1]))

Word 1 ("abandon") appears 1 time.
Word 10 ("abil") appears 7 time.
Word 12 ("abl") appears 83 time.
Word 13 ("abli") appears 2 time.
Word 15 ("abnorm") appears 26 time.
Word 17 ("abolish") appears 2 time.
Word 18 ("abort") appears 5 time.
Word 21 ("abroad") appears 2 time.
Word 22 ("absenc") appears 76 time.
Word 23 ("absolut") appears 30 time.
Word 24 ("absorb") appears 7 time.
Word 25 ("absorpt") appears 4 time.
Word 26 ("absurd") appears 2 time.
Word 27 ("abund") appears 10 time.
Word 30 ("academi") appears 12 time.
Word 32 ("acceler") appears 2 time.
Word 34 ("accept") appears 45 time.
Word 35 ("access") appears 14 time.
Word 36 ("accid") appears 8 time.
Word 37 ("accident") appears 13 time.
Word 39 ("accompani") appears 15 time.
Word 41 ("accord") appears 193 time.
Word 42 ("account") appears 163 time.
Word 43 ("accru") appears 1 time.
Word 44 ("accus") appears 3 time.
Word 46 ("acknowledg") appears 7 time.
Word 48 ("acquiesc") appears 1 time.
Word 49 ("acquir") appears 312 time.

Word 517 ("believ") appears 198 time.
Word 520 ("belong") appears 114 time.
Word 521 ("belov") appears 5 time.
Word 522 ("belt") appears 17 time.
Word 524 ("bend") appears 11 time.
Word 525 ("beneath") appears 21 time.
Word 531 ("bennett") appears 5 time.
Word 536 ("berlin") appears 1 time.
Word 540 ("berri") appears 1 time.
Word 543 ("best") appears 104 time.
Word 545 ("betray") appears 6 time.
Word 546 ("better") appears 58 time.
Word 547 ("bewild") appears 1 time.
Word 550 ("bias") appears 1 time.
Word 558 ("bilater") appears 1 time.
Word 561 ("bill") appears 6 time.
Word 564 ("bind") appears 8 time.
Word 568 ("biograph") appears 1 time.
Word 569 ("biographi") appears 28 time.
Word 570 ("biolog") appears 1 time.
Word 572 ("bird") appears 1103 time.
Word 573 ("birmingham") appears 1 time.
Word 574 ("birth") appears 108 time.
Word 576 ("birthplac") appears 10 time.
Word 579 ("bishop") appears 12 time.
Word 582 ("bit") appears 3 time.
Word 584 ("black") appears 272 time.
Word 586 ("bla

Word 1066 ("column") appears 7 time.
Word 1069 ("combat") appears 12 time.
Word 1070 ("combin") appears 15 time.
Word 1071 ("come") appears 140 time.
Word 1073 ("comfort") appears 7 time.
Word 1074 ("command") appears 11 time.
Word 1077 ("comment") appears 5 time.
Word 1080 ("commerci") appears 1 time.
Word 1081 ("commiss") appears 1 time.
Word 1083 ("commit") appears 5 time.
Word 1086 ("common") appears 298 time.
Word 1088 ("communal") appears 12 time.
Word 1089 ("communic") appears 18 time.
Word 1091 ("communism") appears 1 time.
Word 1092 ("communist") appears 1 time.
Word 1094 ("communiti") appears 48 time.
Word 1096 ("comp") appears 4 time.
Word 1097 ("compani") appears 4 time.
Word 1098 ("companion") appears 13 time.
Word 1099 ("compar") appears 109 time.
Word 1100 ("comparison") appears 45 time.
Word 1102 ("compel") appears 20 time.
Word 1103 ("compens") appears 10 time.
Word 1104 ("compet") appears 11 time.
Word 1105 ("competit") appears 10 time.
Word 1106 ("compil") appears 2 

Word 1493 ("despis") appears 1 time.
Word 1494 ("despit") appears 1 time.
Word 1495 ("destin") appears 1 time.
Word 1496 ("destini") appears 1 time.
Word 1497 ("destroy") appears 31 time.
Word 1498 ("destruct") appears 26 time.
Word 1499 ("detail") appears 27 time.
Word 1500 ("detain") appears 1 time.
Word 1501 ("detect") appears 23 time.
Word 1504 ("deterior") appears 7 time.
Word 1505 ("determin") appears 50 time.
Word 1513 ("develop") appears 608 time.
Word 1515 ("devoid") appears 1 time.
Word 1516 ("devot") appears 11 time.
Word 1527 ("dictat") appears 1 time.
Word 1530 ("die") appears 20 time.
Word 1532 ("diet") appears 4 time.
Word 1533 ("differ") appears 1408 time.
Word 1535 ("digit") appears 15 time.
Word 1536 ("digniti") appears 4 time.
Word 1540 ("diminish") appears 13 time.
Word 1541 ("diminut") appears 13 time.
Word 1546 ("direct") appears 143 time.
Word 1547 ("director") appears 3 time.
Word 1548 ("dirt") appears 2 time.
Word 1549 ("dirti") appears 2 time.
Word 1551 ("disa

Word 1953 ("expans") appears 7 time.
Word 1954 ("expect") appears 35 time.
Word 1955 ("expedit") appears 6 time.
Word 1956 ("expel") appears 7 time.
Word 1957 ("expenditur") appears 2 time.
Word 1958 ("expens") appears 6 time.
Word 1959 ("experi") appears 72 time.
Word 1961 ("expert") appears 1 time.
Word 1964 ("explain") appears 51 time.
Word 1965 ("explan") appears 41 time.
Word 1969 ("explor") appears 2 time.
Word 1973 ("export") appears 1 time.
Word 1974 ("expos") appears 75 time.
Word 1975 ("exposur") appears 13 time.
Word 1976 ("express") appears 82 time.
Word 1978 ("extend") appears 49 time.
Word 1979 ("extens") appears 12 time.
Word 1980 ("extent") appears 58 time.
Word 1981 ("extern") appears 43 time.
Word 1982 ("extinct") appears 56 time.
Word 1983 ("extort") appears 1 time.
Word 1984 ("extra") appears 1 time.
Word 1985 ("extract") appears 14 time.
Word 1987 ("extraordinari") appears 61 time.
Word 1988 ("extraordinarili") appears 7 time.
Word 1990 ("extravag") appears 6 time.

Word 2448 ("hat") appears 1 time.
Word 2449 ("hatchet") appears 1 time.
Word 2450 ("hate") appears 2 time.
Word 2451 ("haunt") appears 9 time.
Word 2452 ("have") appears 237 time.
Word 2457 ("hawaii") appears 2 time.
Word 2458 ("hay") appears 2 time.
Word 2460 ("hazard") appears 1 time.
Word 2461 ("head") appears 301 time.
Word 2466 ("health") appears 25 time.
Word 2467 ("healthi") appears 10 time.
Word 2468 ("heap") appears 1 time.
Word 2469 ("hear") appears 94 time.
Word 2470 ("heart") appears 11 time.
Word 2472 ("heat") appears 12 time.
Word 2474 ("heavi") appears 10 time.
Word 2475 ("heavili") appears 1 time.
Word 2476 ("hedg") appears 5 time.
Word 2477 ("heel") appears 2 time.
Word 2478 ("heft") appears 6 time.
Word 2485 ("help") appears 7 time.
Word 2487 ("helpless") appears 7 time.
Word 2493 ("hermann") appears 3 time.
Word 2494 ("heroic") appears 3 time.
Word 2496 ("hesit") appears 12 time.
Word 2500 ("hide") appears 18 time.
Word 2502 ("high") appears 242 time.
Word 2503 ("hig

Word 2951 ("key") appears 1 time.
Word 2959 ("kick") appears 1 time.
Word 2962 ("kill") appears 67 time.
Word 2965 ("kind") appears 256 time.
Word 2966 ("king") appears 22 time.
Word 2967 ("kingdom") appears 51 time.
Word 2968 ("kinship") appears 1 time.
Word 2970 ("kitchen") appears 1 time.
Word 2975 ("knee") appears 2 time.
Word 2976 ("kneel") appears 5 time.
Word 2977 ("knight") appears 1 time.
Word 2978 ("knock") appears 6 time.
Word 2979 ("know") appears 328 time.
Word 2981 ("knowledg") appears 27 time.
Word 3008 ("label") appears 3 time.
Word 3009 ("laboratori") appears 1 time.
Word 3010 ("labour") appears 21 time.
Word 3011 ("lace") appears 5 time.
Word 3014 ("lade") appears 1 time.
Word 3015 ("lake") appears 3 time.
Word 3018 ("lament") appears 2 time.
Word 3019 ("land") appears 48 time.
Word 3021 ("landscap") appears 1 time.
Word 3023 ("languag") appears 115 time.
Word 3027 ("laps") appears 8 time.
Word 3029 ("larg") appears 247 time.
Word 3030 ("larger") appears 151 time.
Wor

Word 3549 ("museum") appears 17 time.
Word 3550 ("music") appears 133 time.
Word 3555 ("mutual") appears 32 time.
Word 3557 ("myriad") appears 1 time.
Word 3558 ("mysteri") appears 7 time.
Word 3566 ("nake") appears 56 time.
Word 3567 ("name") appears 39 time.
Word 3573 ("narrat") appears 6 time.
Word 3574 ("narrow") appears 26 time.
Word 3575 ("narrowli") appears 3 time.
Word 3585 ("nation") appears 94 time.
Word 3589 ("nativ") appears 115 time.
Word 3591 ("natur") appears 526 time.
Word 3592 ("navig") appears 1 time.
Word 3597 ("neanderth") appears 3 time.
Word 3598 ("near") appears 205 time.
Word 3600 ("necessari") appears 26 time.
Word 3601 ("necessarili") appears 18 time.
Word 3602 ("necessit") appears 3 time.
Word 3603 ("need") appears 36 time.
Word 3604 ("needl") appears 1 time.
Word 3606 ("negat") appears 2 time.
Word 3607 ("neglect") appears 4 time.
Word 3609 ("neighbour") appears 12 time.
Word 3610 ("neighbourhood") appears 3 time.
Word 3615 ("nerv") appears 13 time.
Word 361

Word 4096 ("popular") appears 12 time.
Word 4098 ("pore") appears 8 time.
Word 4099 ("port") appears 1 time.
Word 4101 ("portion") appears 32 time.
Word 4103 ("portrait") appears 3 time.
Word 4106 ("portugues") appears 2 time.
Word 4109 ("posit") appears 72 time.
Word 4110 ("possibl") appears 101 time.
Word 4111 ("post") appears 10 time.
Word 4121 ("potent") appears 5 time.
Word 4125 ("poultri") appears 12 time.
Word 4126 ("pound") appears 12 time.
Word 4127 ("poverti") appears 5 time.
Word 4128 ("powel") appears 2 time.
Word 4129 ("power") appears 357 time.
Word 4132 ("practic") appears 56 time.
Word 4134 ("prais") appears 16 time.
Word 4141 ("precious") appears 1 time.
Word 4142 ("precis") appears 17 time.
Word 4145 ("predecessor") appears 2 time.
Word 4146 ("predict") appears 2 time.
Word 4148 ("prefer") appears 93 time.
Word 4150 ("prematur") appears 5 time.
Word 4152 ("premier") appears 2 time.
Word 4154 ("prepar") appears 12 time.
Word 4157 ("presenc") appears 48 time.
Word 4158 

Word 4542 ("resolv") appears 3 time.
Word 4543 ("reson") appears 3 time.
Word 4544 ("resort") appears 3 time.
Word 4546 ("respect") appears 223 time.
Word 4547 ("respond") appears 1 time.
Word 4550 ("rest") appears 39 time.
Word 4553 ("restor") appears 1 time.
Word 4554 ("restraint") appears 5 time.
Word 4555 ("restrict") appears 6 time.
Word 4557 ("result") appears 142 time.
Word 4558 ("resum") appears 1 time.
Word 4562 ("retain") appears 104 time.
Word 4567 ("retir") appears 3 time.
Word 4568 ("retract") appears 1 time.
Word 4570 ("retreat") appears 1 time.
Word 4571 ("retriev") appears 12 time.
Word 4572 ("return") appears 53 time.
Word 4578 ("reveal") appears 4 time.
Word 4580 ("reveng") appears 11 time.
Word 4581 ("revenu") appears 1 time.
Word 4582 ("rever") appears 4 time.
Word 4583 ("revers") appears 72 time.
Word 4584 ("review") appears 82 time.
Word 4587 ("reviv") appears 1 time.
Word 4589 ("revolut") appears 2 time.
Word 4591 ("reward") appears 1 time.
Word 4602 ("rich") app

Word 5048 ("solut") appears 4 time.
Word 5049 ("solv") appears 8 time.
Word 5052 ("somewhat") appears 87 time.
Word 5053 ("song") appears 77 time.
Word 5056 ("soon") appears 75 time.
Word 5057 ("sooner") appears 11 time.
Word 5061 ("sort") appears 14 time.
Word 5062 ("soul") appears 10 time.
Word 5063 ("sound") appears 154 time.
Word 5066 ("sourc") appears 14 time.
Word 5067 ("south") appears 81 time.
Word 5068 ("southern") appears 21 time.
Word 5075 ("space") appears 24 time.
Word 5076 ("spain") appears 2 time.
Word 5079 ("spare") appears 1 time.
Word 5080 ("spark") appears 1 time.
Word 5085 ("speak") appears 69 time.
Word 5088 ("speci") appears 1100 time.
Word 5089 ("special") appears 115 time.
Word 5092 ("spectat") appears 6 time.
Word 5094 ("spectrum") appears 3 time.
Word 5095 ("specul") appears 6 time.
Word 5096 ("speech") appears 22 time.
Word 5097 ("speed") appears 3 time.
Word 5098 ("spell") appears 1 time.
Word 5099 ("spend") appears 2 time.
Word 5103 ("spike") appears 7 time

Word 5494 ("throw") appears 35 time.
Word 5495 ("thug") appears 2 time.
Word 5496 ("thumb") appears 8 time.
Word 5498 ("thump") appears 1 time.
Word 5499 ("thunder") appears 2 time.
Word 5505 ("tie") appears 4 time.
Word 5506 ("tight") appears 2 time.
Word 5512 ("till") appears 14 time.
Word 5513 ("time") appears 314 time.
Word 5516 ("timid") appears 9 time.
Word 5518 ("ting") appears 6 time.
Word 5521 ("tire") appears 3 time.
Word 5525 ("titl") appears 10 time.
Word 5530 ("toe") appears 6 time.
Word 5533 ("toler") appears 5 time.
Word 5537 ("tone") appears 23 time.
Word 5538 ("tongu") appears 7 time.
Word 5541 ("tool") appears 24 time.
Word 5542 ("tooth") appears 7 time.
Word 5552 ("tortur") appears 5 time.
Word 5553 ("toss") appears 1 time.
Word 5554 ("total") appears 10 time.
Word 5556 ("touch") appears 20 time.
Word 5557 ("tough") appears 1 time.
Word 5560 ("tour") appears 2 time.
Word 5563 ("town") appears 9 time.
Word 5565 ("trace") appears 69 time.
Word 5566 ("track") appears 3 

Word 6056 ("wire") appears 5 time.
Word 6057 ("wisdom") appears 4 time.
Word 6058 ("wise") appears 2 time.
Word 6059 ("wish") appears 44 time.
Word 6060 ("wit") appears 15 time.
Word 6061 ("withdraw") appears 3 time.
Word 6062 ("wither") appears 2 time.
Word 6066 ("wolf") appears 6 time.
Word 6068 ("woman") appears 60 time.
Word 6069 ("women") appears 207 time.
Word 6070 ("wonder") appears 70 time.
Word 6073 ("woo") appears 3 time.
Word 6074 ("wool") appears 2 time.
Word 6078 ("word") appears 50 time.
Word 6079 ("work") appears 224 time.
Word 6080 ("worker") appears 6 time.
Word 6084 ("world") appears 110 time.
Word 6086 ("worri") appears 2 time.
Word 6087 ("wors") appears 2 time.
Word 6089 ("worst") appears 8 time.
Word 6090 ("worth") appears 17 time.
Word 6091 ("worthless") appears 2 time.
Word 6092 ("wound") appears 10 time.
Word 6093 ("wrap") appears 1 time.
Word 6094 ("wriggl") appears 1 time.
Word 6096 ("write") appears 53 time.
Word 6097 ("writer") appears 35 time.
Word 6099 ("w

Word 6399 ("ana") appears 10 time.
Word 6400 ("anal") appears 10 time.
Word 6401 ("anastomus") appears 7 time.
Word 6402 ("anat") appears 11 time.
Word 6403 ("anatida") appears 5 time.
Word 6404 ("anatom") appears 3 time.
Word 6405 ("anatomi") appears 48 time.
Word 6406 ("anatomiqu") appears 1 time.
Word 6407 ("anatomist") appears 13 time.
Word 6408 ("anax") appears 3 time.
Word 6409 ("anblick") appears 1 time.
Word 6410 ("ancestri") appears 1 time.
Word 6411 ("anchor") appears 1 time.
Word 6412 ("anchylos") appears 1 time.
Word 6413 ("andaman") appears 4 time.
Word 6414 ("ander") appears 1 time.
Word 6415 ("andes") appears 1 time.
Word 6416 ("andraena") appears 1 time.
Word 6417 ("andreana") appears 1 time.
Word 6418 ("andremiaja") appears 1 time.
Word 6419 ("andrew") appears 14 time.
Word 6420 ("androgyn") appears 8 time.
Word 6421 ("anecdot") appears 1 time.
Word 6422 ("anemon") appears 2 time.
Word 6423 ("angl") appears 4 time.
Word 6424 ("angular") appears 2 time.
Word 6425 ("angu

Word 6612 ("attir") appears 2 time.
Word 6613 ("auditori") appears 6 time.
Word 6614 ("audobon") appears 1 time.
Word 6615 ("audouin") appears 3 time.
Word 6616 ("audubon") appears 56 time.
Word 6617 ("aufgaben") appears 1 time.
Word 6618 ("auflag") appears 1 time.
Word 6619 ("aughey") appears 2 time.
Word 6620 ("augment") appears 13 time.
Word 6621 ("aura") appears 2 time.
Word 6622 ("auratus") appears 3 time.
Word 6623 ("aurelius") appears 6 time.
Word 6624 ("auriculata") appears 2 time.
Word 6625 ("auritum") appears 8 time.
Word 6626 ("auritus") appears 4 time.
Word 6627 ("aussi") appears 1 time.
Word 6628 ("aussterben") appears 4 time.
Word 6629 ("ausstossend") appears 1 time.
Word 6630 ("austen") appears 3 time.
Word 6631 ("australasian") appears 1 time.
Word 6632 ("australi") appears 5 time.
Word 6633 ("authent") appears 1 time.
Word 6634 ("autorisé") appears 1 time.
Word 6635 ("autr") appears 2 time.
Word 6636 ("auxiliari") appears 1 time.
Word 6637 ("avanc") appears 1 time.
Wor

Word 6829 ("blenni") appears 2 time.
Word 6830 ("blethisa") appears 3 time.
Word 6831 ("bleu") appears 1 time.
Word 6832 ("blister") appears 2 time.
Word 6833 ("bloch") appears 2 time.
Word 6834 ("bloodhound") appears 1 time.
Word 6835 ("blotch") appears 1 time.
Word 6836 ("blotti") appears 1 time.
Word 6837 ("blubber") appears 2 time.
Word 6838 ("bluebreast") appears 1 time.
Word 6839 ("blueness") appears 2 time.
Word 6840 ("bluish") appears 7 time.
Word 6841 ("blumenbach") appears 7 time.
Word 6842 ("blush") appears 1 time.
Word 6843 ("blyth") appears 52 time.
Word 6844 ("boar") appears 20 time.
Word 6845 ("boardman") appears 2 time.
Word 6846 ("boarula") appears 1 time.
Word 6847 ("bodili") appears 24 time.
Word 6848 ("bog") appears 1 time.
Word 6849 ("bogota") appears 2 time.
Word 6850 ("boi") appears 1 time.
Word 6851 ("boir") appears 1 time.
Word 6852 ("boitard") appears 4 time.
Word 6853 ("bolder") appears 2 time.
Word 6854 ("bologna") appears 1 time.
Word 6855 ("bombet") appear

Word 7045 ("campbel") appears 4 time.
Word 7046 ("camper") appears 2 time.
Word 7047 ("campestri") appears 7 time.
Word 7048 ("campylopterus") appears 2 time.
Word 7049 ("canadensi") appears 11 time.
Word 7050 ("canal") appears 4 time.
Word 7051 ("canari") appears 21 time.
Word 7052 ("candicantia") appears 1 time.
Word 7053 ("candoll") appears 2 time.
Word 7054 ("cane") appears 1 time.
Word 7055 ("canestrini") appears 17 time.
Word 7056 ("canfield") appears 3 time.
Word 7057 ("canin") appears 67 time.
Word 7058 ("cann") appears 1 time.
Word 7059 ("canna") appears 2 time.
Word 7060 ("cannabina") appears 3 time.
Word 7061 ("cannib") appears 1 time.
Word 7062 ("cano") appears 8 time.
Word 7063 ("canthari") appears 2 time.
Word 7064 ("cantharus") appears 3 time.
Word 7065 ("cantori") appears 2 time.
Word 7066 ("canutus") appears 1 time.
Word 7067 ("capelin") appears 1 time.
Word 7068 ("capensi") appears 7 time.
Word 7069 ("caper") appears 1 time.
Word 7070 ("capercailzi") appears 25 time.


Word 7257 ("chillingham") appears 2 time.
Word 7258 ("chilo") appears 3 time.
Word 7259 ("chimaera") appears 3 time.
Word 7260 ("chimaeroid") appears 2 time.
Word 7261 ("chimney") appears 1 time.
Word 7262 ("chimpanze") appears 37 time.
Word 7263 ("chin") appears 7 time.
Word 7264 ("chinqui") appears 5 time.
Word 7265 ("chinsurdi") appears 3 time.
Word 7266 ("chiroptera") appears 1 time.
Word 7267 ("chirp") appears 5 time.
Word 7268 ("chirurg") appears 4 time.
Word 7269 ("chisel") appears 1 time.
Word 7270 ("chlamydera") appears 2 time.
Word 7271 ("chloeon") appears 4 time.
Word 7272 ("chloephaga") appears 2 time.
Word 7273 ("chlorocoelus") appears 2 time.
Word 7274 ("chloropus") appears 3 time.
Word 7275 ("chocol") appears 2 time.
Word 7276 ("cholera") appears 1 time.
Word 7277 ("chorda") appears 3 time.
Word 7278 ("chough") appears 2 time.
Word 7279 ("chromid") appears 3 time.
Word 7280 ("chromida") appears 2 time.
Word 7281 ("chrysaeto") appears 2 time.
Word 7282 ("chrysemi") appear

Word 7469 ("contagion") appears 1 time.
Word 7470 ("contemporan") appears 1 time.
Word 7471 ("contempt") appears 2 time.
Word 7472 ("contin") appears 21 time.
Word 7473 ("conting") appears 8 time.
Word 7474 ("contradictori") appears 1 time.
Word 7475 ("contravent") appears 1 time.
Word 7476 ("contriv") appears 7 time.
Word 7477 ("convex") appears 5 time.
Word 7478 ("coo") appears 5 time.
Word 7479 ("copepoden") appears 1 time.
Word 7480 ("cophoti") appears 4 time.
Word 7481 ("copious") appears 5 time.
Word 7482 ("copperi") appears 2 time.
Word 7483 ("copri") appears 6 time.
Word 7484 ("coprida") appears 1 time.
Word 7485 ("coprini") appears 2 time.
Word 7486 ("copul") appears 4 time.
Word 7487 ("copyright") appears 14 time.
Word 7488 ("coquet") appears 3 time.
Word 7489 ("coracia") appears 1 time.
Word 7490 ("coral") appears 13 time.
Word 7491 ("corbi") appears 4 time.
Word 7492 ("cordillera") appears 2 time.
Word 7493 ("cordylus") appears 2 time.
Word 7494 ("corfu") appears 3 time.
Wo

Word 7683 ("debar") appears 2 time.
Word 7684 ("debas") appears 1 time.
Word 7685 ("decenc") appears 2 time.
Word 7686 ("decenni") appears 3 time.
Word 7687 ("decept") appears 3 time.
Word 7688 ("deciduari") appears 2 time.
Word 7689 ("deck") appears 6 time.
Word 7690 ("declens") appears 2 time.
Word 7691 ("decompos") appears 5 time.
Word 7692 ("decor") appears 36 time.
Word 7693 ("decoy") appears 4 time.
Word 7694 ("decticus") appears 4 time.
Word 7695 ("deduc") appears 3 time.
Word 7696 ("deduct") appears 3 time.
Word 7697 ("deed") appears 1 time.
Word 7698 ("deepest") appears 2 time.
Word 7699 ("deer") appears 112 time.
Word 7700 ("deerhound") appears 8 time.
Word 7701 ("defect") appears 9 time.
Word 7702 ("defenceless") appears 7 time.
Word 7703 ("defianc") appears 3 time.
Word 7704 ("defiant") appears 1 time.
Word 7705 ("defici") appears 14 time.
Word 7706 ("defin") appears 18 time.
Word 7707 ("definit") appears 37 time.
Word 7708 ("deform") appears 9 time.
Word 7709 ("deg") appea

Word 7896 ("draco") appears 3 time.
Word 7897 ("dragon") appears 19 time.
Word 7898 ("dragonet") appears 4 time.
Word 7899 ("dragonfli") appears 1 time.
Word 7900 ("drain") appears 1 time.
Word 7901 ("drake") appears 14 time.
Word 7902 ("dreamer") appears 1 time.
Word 7903 ("dress") appears 30 time.
Word 7904 ("drink") appears 4 time.
Word 7905 ("dromaeus") appears 2 time.
Word 7906 ("dromoeus") appears 1 time.
Word 7907 ("dromolaea") appears 2 time.
Word 7908 ("drongo") appears 8 time.
Word 7909 ("droop") appears 2 time.
Word 7910 ("druggist") appears 1 time.
Word 7911 ("dryness") appears 4 time.
Word 7912 ("dryopithecus") appears 4 time.
Word 7913 ("dubio") appears 1 time.
Word 7914 ("dublin") appears 3 time.
Word 7915 ("duck") appears 51 time.
Word 7916 ("duckl") appears 1 time.
Word 7917 ("duel") appears 2 time.
Word 7918 ("dufoss") appears 2 time.
Word 7919 ("dugong") appears 4 time.
Word 7920 ("dujardin") appears 2 time.
Word 7921 ("duller") appears 8 time.
Word 7922 ("dullest") 

Word 8113 ("erithacus") appears 1 time.
Word 8114 ("erlangen") appears 1 time.
Word 8115 ("err") appears 2 time.
Word 8116 ("errand") appears 2 time.
Word 8117 ("errat") appears 2 time.
Word 8118 ("erratica") appears 1 time.
Word 8119 ("erregt") appears 1 time.
Word 8120 ("erron") appears 7 time.
Word 8121 ("erscheint") appears 1 time.
Word 8122 ("erycina") appears 2 time.
Word 8123 ("erythrogastra") appears 2 time.
Word 8124 ("erythrop") appears 2 time.
Word 8125 ("erythrorhynchus") appears 3 time.
Word 8126 ("eschricht") appears 9 time.
Word 8127 ("esculenta") appears 2 time.
Word 8128 ("esmeralda") appears 2 time.
Word 8129 ("esox") appears 4 time.
Word 8130 ("espac") appears 1 time.
Word 8131 ("espec") appears 1 time.
Word 8132 ("espèc") appears 15 time.
Word 8133 ("esquil") appears 1 time.
Word 8134 ("esquimaux") appears 8 time.
Word 8135 ("ess") appears 1 time.
Word 8136 ("essai") appears 1 time.
Word 8137 ("essenc") appears 3 time.
Word 8138 ("esteem") appears 13 time.
Word 8139

Word 8331 ("flash") appears 1 time.
Word 8332 ("flat") appears 15 time.
Word 8333 ("flatten") appears 9 time.
Word 8334 ("flatter") appears 1 time.
Word 8335 ("flatteri") appears 1 time.
Word 8336 ("flavour") appears 2 time.
Word 8337 ("fledg") appears 1 time.
Word 8338 ("fleeter") appears 1 time.
Word 8339 ("fleischmann") appears 1 time.
Word 8340 ("flesh") appears 6 time.
Word 8341 ("fleshi") appears 6 time.
Word 8342 ("flexibl") appears 6 time.
Word 8343 ("flexor") appears 4 time.
Word 8344 ("fli") appears 39 time.
Word 8345 ("flight") appears 24 time.
Word 8346 ("flinder") appears 1 time.
Word 8347 ("fling") appears 3 time.
Word 8348 ("flipper") appears 1 time.
Word 8349 ("flit") appears 2 time.
Word 8350 ("float") appears 5 time.
Word 8351 ("flock") appears 15 time.
Word 8352 ("flood") appears 1 time.
Word 8353 ("florenc") appears 1 time.
Word 8354 ("floricolen") appears 1 time.
Word 8355 ("florid") appears 1 time.
Word 8356 ("florisuga") appears 2 time.
Word 8357 ("flounder") app

Word 8546 ("generalis") appears 3 time.
Word 8547 ("generell") appears 6 time.
Word 8548 ("generibus") appears 1 time.
Word 8549 ("generic") appears 1 time.
Word 8550 ("geneva") appears 2 time.
Word 8551 ("gent") appears 1 time.
Word 8552 ("gentleman") appears 4 time.
Word 8553 ("gentlemen") appears 6 time.
Word 8554 ("gentri") appears 1 time.
Word 8555 ("genus") appears 113 time.
Word 8556 ("genutia") appears 3 time.
Word 8557 ("geoffroy") appears 17 time.
Word 8558 ("geograph") appears 18 time.
Word 8559 ("geol") appears 1 time.
Word 8560 ("geolog") appears 15 time.
Word 8561 ("geometra") appears 3 time.
Word 8562 ("geophagus") appears 4 time.
Word 8563 ("georgian") appears 3 time.
Word 8564 ("geotrup") appears 5 time.
Word 8565 ("gerb") appears 2 time.
Word 8566 ("gerland") appears 7 time.
Word 8567 ("germen") appears 2 time.
Word 8568 ("germin") appears 2 time.
Word 8569 ("gervai") appears 7 time.
Word 8570 ("geröthet") appears 1 time.
Word 8571 ("geschicht") appears 3 time.
Word 8

Word 8763 ("harden") appears 2 time.
Word 8764 ("hardwick") appears 2 time.
Word 8765 ("hardwickii") appears 3 time.
Word 8766 ("hare") appears 9 time.
Word 8767 ("harelda") appears 3 time.
Word 8768 ("harem") appears 8 time.
Word 8769 ("harlan") appears 2 time.
Word 8770 ("harlequin") appears 3 time.
Word 8771 ("harmless") appears 3 time.
Word 8772 ("harmon") appears 1 time.
Word 8773 ("harmoni") appears 11 time.
Word 8774 ("harri") appears 8 time.
Word 8775 ("harrison") appears 7 time.
Word 8776 ("hart") appears 5 time.
Word 8777 ("hartman") appears 3 time.
Word 8778 ("hartmann") appears 1 time.
Word 8779 ("hartshorn") appears 1 time.
Word 8780 ("hastili") appears 2 time.
Word 8781 ("hatch") appears 34 time.
Word 8782 ("hath") appears 1 time.
Word 8783 ("hatr") appears 7 time.
Word 8784 ("hatt") appears 1 time.
Word 8785 ("haughton") appears 2 time.
Word 8786 ("hawaiian") appears 2 time.
Word 8787 ("hawk") appears 9 time.
Word 8788 ("haydn") appears 1 time.
Word 8789 ("haymond") appe

Word 8978 ("ibi") appears 50 time.
Word 8979 ("ibid") appears 74 time.
Word 8980 ("ibis") appears 5 time.
Word 8981 ("ichneumon") appears 1 time.
Word 8982 ("ichneumonid") appears 2 time.
Word 8983 ("ichneumonida") appears 3 time.
Word 8984 ("ichthyopterygia") appears 4 time.
Word 8985 ("ichthyosaurian") appears 4 time.
Word 8986 ("icon") appears 1 time.
Word 8987 ("idem") appears 2 time.
Word 8988 ("identif") appears 1 time.
Word 8989 ("idiot") appears 21 time.
Word 8990 ("idolatri") appears 2 time.
Word 8991 ("idyl") appears 1 time.
Word 8992 ("iguana") appears 4 time.
Word 8993 ("ihr") appears 1 time.
Word 8994 ("ile") appears 1 time.
Word 8995 ("illegitim") appears 10 time.
Word 8996 ("illinoi") appears 3 time.
Word 8997 ("illumin") appears 5 time.
Word 8998 ("illustrissimus") appears 2 time.
Word 8999 ("imaginari") appears 2 time.
Word 9000 ("imago") appears 9 time.
Word 9001 ("imb") appears 7 time.
Word 9002 ("imbecil") appears 5 time.
Word 9003 ("immanuel") appears 1 time.
Word 

Word 9190 ("irrespect") appears 2 time.
Word 9191 ("irroratus") appears 3 time.
Word 9192 ("isabellin") appears 1 time.
Word 9193 ("ischial") appears 1 time.
Word 9194 ("ischio") appears 4 time.
Word 9195 ("isi") appears 3 time.
Word 9196 ("isid") appears 1 time.
Word 9197 ("isidi") appears 2 time.
Word 9198 ("isidor") appears 10 time.
Word 9199 ("isl") appears 1 time.
Word 9200 ("isolirung") appears 1 time.
Word 9201 ("ital") appears 3 time.
Word 9202 ("italiana") appears 2 time.
Word 9203 ("itch") appears 1 time.
Word 9204 ("ithagini") appears 3 time.
Word 9205 ("iulus") appears 2 time.
Word 9206 ("ivori") appears 1 time.
Word 9207 ("jack") appears 2 time.
Word 9208 ("jackal") appears 5 time.
Word 9209 ("jackdaw") appears 1 time.
Word 9210 ("jacobin") appears 1 time.
Word 9211 ("jacquinot") appears 2 time.
Word 9212 ("jaeger") appears 4 time.
Word 9213 ("jaguar") appears 2 time.
Word 9214 ("jahrg") appears 5 time.
Word 9215 ("janeiro") appears 2 time.
Word 9216 ("janira") appears 4 t

Word 9409 ("lemurida") appears 11 time.
Word 9410 ("lemurin") appears 1 time.
Word 9411 ("lemuroidea") appears 11 time.
Word 9412 ("lengthi") appears 1 time.
Word 9413 ("lengua") appears 2 time.
Word 9414 ("lennan") appears 14 time.
Word 9415 ("lenteur") appears 1 time.
Word 9416 ("leon") appears 1 time.
Word 9417 ("leopard") appears 4 time.
Word 9418 ("lepadida") appears 1 time.
Word 9419 ("lepidoptera") appears 48 time.
Word 9420 ("lepidopteren") appears 1 time.
Word 9421 ("lepidosiren") appears 3 time.
Word 9422 ("leptalid") appears 4 time.
Word 9423 ("leptorhynchus") appears 4 time.
Word 9424 ("leptura") appears 2 time.
Word 9425 ("lequel") appears 1 time.
Word 9426 ("leroy") appears 3 time.
Word 9427 ("lesli") appears 6 time.
Word 9428 ("lesquell") appears 1 time.
Word 9429 ("less") appears 2 time.
Word 9430 ("lessona") appears 2 time.
Word 9431 ("lethrus") appears 4 time.
Word 9432 ("lettera") appears 1 time.
Word 9433 ("lettr") appears 2 time.
Word 9434 ("leuciscus") appears 5 t

Word 9625 ("mallotus") appears 3 time.
Word 9626 ("malthus") appears 4 time.
Word 9627 ("maluri") appears 2 time.
Word 9628 ("malurida") appears 2 time.
Word 9629 ("malurus") appears 1 time.
Word 9630 ("mamm") appears 4 time.
Word 9631 ("mamma") appears 24 time.
Word 9632 ("mammal") appears 159 time.
Word 9633 ("mammalia") appears 13 time.
Word 9634 ("mammalian") appears 12 time.
Word 9635 ("mammalogi") appears 11 time.
Word 9636 ("mammari") appears 8 time.
Word 9637 ("mammifèr") appears 8 time.
Word 9638 ("man") appears 2 time.
Word 9639 ("manakin") appears 1 time.
Word 9640 ("mandan") appears 4 time.
Word 9641 ("mandarin") appears 2 time.
Word 9642 ("mandibl") appears 21 time.
Word 9643 ("mandril") appears 25 time.
Word 9644 ("mandschu") appears 1 time.
Word 9645 ("mane") appears 30 time.
Word 9646 ("manger") appears 1 time.
Word 9647 ("manhood") appears 7 time.
Word 9648 ("manicatum") appears 2 time.
Word 9649 ("mankind") appears 90 time.
Word 9650 ("manlik") appears 2 time.
Word 96

Word 9839 ("monachus") appears 1 time.
Word 9840 ("monarch") appears 2 time.
Word 9841 ("monarqu") appears 1 time.
Word 9842 ("monatsbericht") appears 1 time.
Word 9843 ("monboddo") appears 2 time.
Word 9844 ("monbuttoo") appears 1 time.
Word 9845 ("monck") appears 1 time.
Word 9846 ("mond") appears 3 time.
Word 9847 ("mongol") appears 1 time.
Word 9848 ("mongolian") appears 8 time.
Word 9849 ("mongrel") appears 6 time.
Word 9850 ("monk") appears 1 time.
Word 9851 ("monkey") appears 244 time.
Word 9852 ("monogam") appears 34 time.
Word 9853 ("monogami") appears 2 time.
Word 9854 ("monogenist") appears 5 time.
Word 9855 ("monograph") appears 6 time.
Word 9856 ("monographi") appears 1 time.
Word 9857 ("mononychus") appears 2 time.
Word 9858 ("monoth") appears 1 time.
Word 9859 ("monotremata") appears 13 time.
Word 9860 ("monoynchus") appears 1 time.
Word 9861 ("monstros") appears 12 time.
Word 9862 ("monstrosa") appears 3 time.
Word 9863 ("monstrous") appears 2 time.
Word 9864 ("mont") a

Word 10052 ("nigrescen") appears 3 time.
Word 10053 ("nigrican") appears 2 time.
Word 10054 ("nigrilabri") appears 2 time.
Word 10055 ("nigripenni") appears 2 time.
Word 10056 ("nihil") appears 1 time.
Word 10057 ("nile") appears 6 time.
Word 10058 ("nilghau") appears 2 time.
Word 10059 ("nilsson") appears 3 time.
Word 10060 ("nineteen") appears 2 time.
Word 10061 ("nineteenth") appears 1 time.
Word 10062 ("ninth") appears 2 time.
Word 10063 ("nippl") appears 11 time.
Word 10064 ("nitsch") appears 2 time.
Word 10065 ("nitzsch") appears 2 time.
Word 10066 ("nivali") appears 3 time.
Word 10067 ("niveus") appears 2 time.
Word 10068 ("nobi") appears 1 time.
Word 10069 ("nobler") appears 1 time.
Word 10070 ("noblest") appears 4 time.
Word 10071 ("noctua") appears 2 time.
Word 10072 ("noctuida") appears 2 time.
Word 10073 ("nocturn") appears 7 time.
Word 10074 ("nois") appears 68 time.
Word 10075 ("noisi") appears 4 time.
Word 10076 ("noisier") appears 2 time.
Word 10077 ("noisiest") appears

Word 10262 ("paddl") appears 4 time.
Word 10263 ("padova") appears 1 time.
Word 10264 ("pagan") appears 1 time.
Word 10265 ("paget") appears 4 time.
Word 10266 ("pagurus") appears 4 time.
Word 10267 ("palaemon") appears 2 time.
Word 10268 ("palaeont") appears 1 time.
Word 10269 ("palaeontolog") appears 1 time.
Word 10270 ("palaeorni") appears 10 time.
Word 10271 ("palamedea") appears 6 time.
Word 10272 ("palat") appears 4 time.
Word 10273 ("pale") appears 36 time.
Word 10274 ("paleolith") appears 2 time.
Word 10275 ("paler") appears 14 time.
Word 10276 ("palestin") appears 6 time.
Word 10277 ("palla") appears 16 time.
Word 10278 ("pallasian") appears 1 time.
Word 10279 ("palm") appears 5 time.
Word 10280 ("palmari") appears 3 time.
Word 10281 ("palmat") appears 1 time.
Word 10282 ("palmata") appears 2 time.
Word 10283 ("palmip") appears 2 time.
Word 10284 ("palpit") appears 1 time.
Word 10285 ("paludosus") appears 4 time.
Word 10286 ("pampa") appears 5 time.
Word 10287 ("pamper") appea

Word 10468 ("phys") appears 17 time.
Word 10469 ("physic") appears 31 time.
Word 10470 ("physician") appears 1 time.
Word 10471 ("physiognomi") appears 5 time.
Word 10472 ("physiolog") appears 30 time.
Word 10473 ("physiologist") appears 1 time.
Word 10474 ("pica") appears 2 time.
Word 10475 ("picide") appears 1 time.
Word 10476 ("pickard") appears 1 time.
Word 10477 ("picker") appears 2 time.
Word 10478 ("picta") appears 10 time.
Word 10479 ("picton") appears 2 time.
Word 10480 ("pictori") appears 1 time.
Word 10481 ("picu") appears 1 time.
Word 10482 ("picus") appears 4 time.
Word 10483 ("pie") appears 11 time.
Word 10484 ("piebald") appears 7 time.
Word 10485 ("pierc") appears 12 time.
Word 10486 ("pieri") appears 5 time.
Word 10487 ("pierr") appears 2 time.
Word 10488 ("pig") appears 15 time.
Word 10489 ("pigeon") appears 84 time.
Word 10490 ("pike") appears 15 time.
Word 10491 ("pillar") appears 2 time.
Word 10492 ("pimelia") appears 3 time.
Word 10493 ("pin") appears 3 time.
Word

Word 10675 ("probosci") appears 4 time.
Word 10676 ("proboscideus") appears 2 time.
Word 10677 ("proc") appears 54 time.
Word 10678 ("procced") appears 1 time.
Word 10679 ("procreat") appears 4 time.
Word 10680 ("proctotretus") appears 5 time.
Word 10681 ("procur") appears 18 time.
Word 10682 ("prodigi") appears 9 time.
Word 10683 ("produir") appears 1 time.
Word 10684 ("prof") appears 148 time.
Word 10685 ("profess") appears 1 time.
Word 10686 ("profici") appears 2 time.
Word 10687 ("profil") appears 1 time.
Word 10688 ("profit") appears 11 time.
Word 10689 ("proflig") appears 8 time.
Word 10690 ("profligaci") appears 8 time.
Word 10691 ("profond") appears 1 time.
Word 10692 ("progenitor") appears 145 time.
Word 10693 ("progn") appears 5 time.
Word 10694 ("prognath") appears 3 time.
Word 10695 ("prohibit") appears 1 time.
Word 10696 ("prolif") appears 5 time.
Word 10697 ("pronounc") appears 20 time.
Word 10698 ("pronuba") appears 1 time.
Word 10699 ("pronunci") appears 1 time.
Word 10

Word 10884 ("reflect") appears 31 time.
Word 10885 ("reflex") appears 3 time.
Word 10886 ("refug") appears 3 time.
Word 10887 ("refund") appears 10 time.
Word 10888 ("refut") appears 1 time.
Word 10889 ("regal") appears 3 time.
Word 10890 ("regener") appears 2 time.
Word 10891 ("regent") appears 4 time.
Word 10892 ("regiment") appears 4 time.
Word 10893 ("registrar") appears 3 time.
Word 10894 ("regn") appears 4 time.
Word 10895 ("regrowth") appears 2 time.
Word 10896 ("rehears") appears 1 time.
Word 10897 ("reichert") appears 1 time.
Word 10898 ("reign") appears 3 time.
Word 10899 ("reindeer") appears 27 time.
Word 10900 ("reintroduc") appears 1 time.
Word 10901 ("reis") appears 6 time.
Word 10902 ("reiter") appears 3 time.
Word 10903 ("rejoic") appears 1 time.
Word 10904 ("rejoind") appears 1 time.
Word 10905 ("relativ") appears 1 time.
Word 10906 ("relic") appears 3 time.
Word 10907 ("remaind") appears 2 time.
Word 10908 ("remarqu") appears 1 time.
Word 10909 ("remembr") appears 1 t

Word 11095 ("saturniida") appears 4 time.
Word 11096 ("saurait") appears 1 time.
Word 11097 ("sauratus") appears 1 time.
Word 11098 ("savag") appears 247 time.
Word 11099 ("savageri") appears 1 time.
Word 11100 ("savill") appears 1 time.
Word 11101 ("saviotti") appears 2 time.
Word 11102 ("saxicola") appears 3 time.
Word 11103 ("saxon") appears 6 time.
Word 11104 ("scabi") appears 1 time.
Word 11105 ("scallop") appears 1 time.
Word 11106 ("scan") appears 1 time.
Word 11107 ("scandalis") appears 1 time.
Word 11108 ("scandinavia") appears 7 time.
Word 11109 ("scandinavian") appears 2 time.
Word 11110 ("scanti") appears 5 time.
Word 11111 ("scantili") appears 1 time.
Word 11112 ("scapula") appears 1 time.
Word 11113 ("scapular") appears 2 time.
Word 11114 ("scapulatus") appears 2 time.
Word 11115 ("scar") appears 7 time.
Word 11116 ("scarc") appears 18 time.
Word 11117 ("scarciti") appears 5 time.
Word 11118 ("scarlet") appears 16 time.
Word 11119 ("scatter") appears 5 time.
Word 11120 ("

In [18]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [19]:
corpus_tfidf = tfidf[bow_corpus]

In [20]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.014342104054779061),
 (1, 0.0008756336273511451),
 (2, 0.0017587641679075762),
 (3, 0.0017587641679075762),
 (4, 0.002868420810955812),
 (5, 0.005736841621911624),
 (6, 0.002868420810955812),
 (7, 0.002868420810955812),
 (8, 0.005736841621911624),
 (9, 0.002868420810955812),
 (10, 0.007767596501337654),
 (11, 0.028684208109558122),
 (12, 0.0023350230062697203),
 (13, 0.0017587641679075762),
 (14, 0.002868420810955812),
 (15, 0.0006491075248593397),
 (16, 0.0017587641679075762),
 (17, 0.005276292503722729),
 (18, 0.007767596501337654),
 (19, 0.002868420810955812),
 (20, 0.005736841621911624),
 (21, 0.006657939858289418),
 (22, 0.00029187787578371504),
 (23, 0.0008756336273511451),
 (24, 0.0011675115031348601),
 (25, 0.0006491075248593397),
 (26, 0.0006491075248593397),
 (27, 0.0032455376242966983),
 (28, 0.017210524865734874),
 (29, 0.022947366487646496),
 (30, 0.003328969929144709),
 (31, 0.002868420810955812),
 (32, 0.0017512672547022902),
 (33, 0.002868420810955812),
 (35, 0.0

 (288, 0.002868420810955812),
 (289, 0.002868420810955812),
 (290, 0.057368416219116244),
 (291, 0.002868420810955812),
 (292, 0.005837557515674301),
 (293, 0.003210656633620865),
 (294, 0.0006491075248593397),
 (296, 0.002868420810955812),
 (297, 0.0011096566430482363),
 (298, 0.002868420810955812),
 (299, 0.0017587641679075762),
 (300, 0.00408629026097201),
 (301, 0.0017587641679075762),
 (302, 0.020078945676690687),
 (303, 0.016644849645723544),
 (304, 0.002868420810955812),
 (305, 0.002868420810955812),
 (306, 0.0011096566430482363),
 (308, 0.00029187787578371504),
 (309, 0.0008756336273511451),
 (310, 0.005192860198874718),
 (311, 0.0011096566430482363),
 (312, 0.008605262432867437),
 (313, 0.005548283215241182),
 (314, 0.002868420810955812),
 (315, 0.002868420810955812),
 (316, 0.013315879716578835),
 (317, 0.007789290298312076),
 (318, 0.011473683243823248),
 (319, 0.002868420810955812),
 (320, 0.002868420810955812),
 (321, 0.0012982150497186794),
 (322, 0.0006491075248593397),


 (565, 0.014342104054779061),
 (566, 0.002868420810955812),
 (567, 0.002868420810955812),
 (568, 0.0017587641679075762),
 (569, 0.0017587641679075762),
 (570, 0.002596430099437359),
 (571, 0.0011096566430482363),
 (572, 0.0008756336273511451),
 (573, 0.0022193132860964726),
 (574, 0.0029187787578371506),
 (575, 0.002868420810955812),
 (576, 0.0017587641679075762),
 (577, 0.002868420810955812),
 (578, 0.002868420810955812),
 (579, 0.0022193132860964726),
 (580, 0.020078945676690687),
 (581, 0.002868420810955812),
 (582, 0.0035175283358151524),
 (583, 0.002868420810955812),
 (584, 0.00408629026097201),
 (585, 0.002868420810955812),
 (586, 0.005276292503722729),
 (587, 0.003210656633620865),
 (588, 0.002868420810955812),
 (589, 0.0017587641679075762),
 (590, 0.0012982150497186794),
 (591, 0.0006491075248593397),
 (592, 0.0022193132860964726),
 (593, 0.0006491075248593397),
 (594, 0.0017587641679075762),
 (595, 0.0014593893789185753),
 (596, 0.002868420810955812),
 (597, 0.0028684208109558

 (837, 0.002868420810955812),
 (838, 0.0022193132860964726),
 (839, 0.0029187787578371506),
 (840, 0.0011096566430482363),
 (841, 0.0011096566430482363),
 (842, 0.002868420810955812),
 (843, 0.002868420810955812),
 (844, 0.0017587641679075762),
 (845, 0.002868420810955812),
 (846, 0.002868420810955812),
 (848, 0.05803921754095001),
 (849, 0.005736841621911624),
 (850, 0.002043145130486005),
 (851, 0.0035175283358151524),
 (852, 0.0023350230062697203),
 (853, 0.002868420810955812),
 (854, 0.0011096566430482363),
 (855, 0.005736841621911624),
 (856, 0.007035056671630305),
 (857, 0.003328969929144709),
 (859, 0.0035175283358151524),
 (860, 0.00029187787578371504),
 (861, 0.002868420810955812),
 (862, 0.0005837557515674301),
 (863, 0.011473683243823248),
 (864, 0.002868420810955812),
 (865, 0.004438626572192945),
 (866, 0.003210656633620865),
 (867, 0.0011675115031348601),
 (868, 0.011473683243823248),
 (869, 0.011473683243823248),
 (870, 0.005736841621911624),
 (871, 0.005276292503722729)

 (1114, 0.0011096566430482363),
 (1115, 0.0022193132860964726),
 (1116, 0.0014593893789185753),
 (1117, 0.004438626572192945),
 (1118, 0.0011096566430482363),
 (1119, 0.0006491075248593397),
 (1120, 0.0017587641679075762),
 (1121, 0.004543752674015378),
 (1123, 0.0017587641679075762),
 (1124, 0.0035175283358151524),
 (1125, 0.004438626572192945),
 (1126, 0.0005837557515674301),
 (1127, 0.00029187787578371504),
 (1128, 0.004378168136755726),
 (1129, 0.0017587641679075762),
 (1130, 0.0035175283358151524),
 (1131, 0.002868420810955812),
 (1132, 0.002868420810955812),
 (1133, 0.0014593893789185753),
 (1134, 0.0011675115031348601),
 (1135, 0.02108347621791649),
 (1136, 0.0017587641679075762),
 (1137, 0.002868420810955812),
 (1138, 0.004378168136755726),
 (1139, 0.003210656633620865),
 (1140, 0.0012982150497186794),
 (1141, 0.0005837557515674301),
 (1142, 0.0011096566430482363),
 (1143, 0.002868420810955812),
 (1144, 0.002043145130486005),
 (1145, 0.005736841621911624),
 (1146, 0.00286842081

 (1396, 0.008438397823171417),
 (1397, 0.002868420810955812),
 (1398, 0.012311349175353033),
 (1399, 0.013315879716578835),
 (1400, 0.0366186692205918),
 (1401, 0.0017587641679075762),
 (1402, 0.0011096566430482363),
 (1403, 0.003328969929144709),
 (1404, 0.0017587641679075762),
 (1405, 0.004670046012539441),
 (1406, 0.02581578729860231),
 (1407, 0.0017512672547022902),
 (1408, 0.0022193132860964726),
 (1409, 0.00408629026097201),
 (1410, 0.009087505348030756),
 (1411, 0.018864162931820017),
 (1412, 0.002868420810955812),
 (1413, 0.0006491075248593397),
 (1414, 0.0022193132860964726),
 (1415, 0.004378168136755726),
 (1416, 0.001947322574578019),
 (1417, 0.002596430099437359),
 (1418, 0.002868420810955812),
 (1419, 0.0035175283358151524),
 (1420, 0.005548283215241182),
 (1421, 0.012206223073530599),
 (1422, 0.007140182773452736),
 (1423, 0.0011675115031348601),
 (1424, 0.0011096566430482363),
 (1425, 0.002868420810955812),
 (1426, 0.002868420810955812),
 (1427, 0.002868420810955812),
 (

 (1669, 0.002868420810955812),
 (1670, 0.002868420810955812),
 (1671, 0.002868420810955812),
 (1672, 0.002868420810955812),
 (1673, 0.005736841621911624),
 (1674, 0.00029187787578371504),
 (1675, 0.0026269008820534357),
 (1676, 0.0011096566430482363),
 (1677, 0.003328969929144709),
 (1678, 0.003894645149156038),
 (1679, 0.005276292503722729),
 (1680, 0.002868420810955812),
 (1681, 0.0017587641679075762),
 (1682, 0.0049619238883231555),
 (1683, 0.0011096566430482363),
 (1684, 0.001947322574578019),
 (1685, 0.002868420810955812),
 (1686, 0.002868420810955812),
 (1687, 0.007035056671630305),
 (1688, 0.0006491075248593397),
 (1689, 0.0006491075248593397),
 (1690, 0.007035056671630305),
 (1691, 0.002868420810955812),
 (1692, 0.0049619238883231555),
 (1693, 0.0011096566430482363),
 (1694, 0.0011096566430482363),
 (1695, 0.005545679639890585),
 (1696, 0.0017587641679075762),
 (1697, 0.0011096566430482363),
 (1698, 0.003894645149156038),
 (1699, 0.0011096566430482363),
 (1700, 0.00175876416790

 (1942, 0.0017587641679075762),
 (1943, 0.00029187787578371504),
 (1944, 0.0008756336273511451),
 (1945, 0.002868420810955812),
 (1946, 0.0022193132860964726),
 (1947, 0.0023350230062697203),
 (1948, 0.002868420810955812),
 (1949, 0.011473683243823248),
 (1950, 0.005736841621911624),
 (1951, 0.0022193132860964726),
 (1952, 0.005841967723734057),
 (1953, 0.0032455376242966983),
 (1954, 0.011675115031348602),
 (1955, 0.0006491075248593397),
 (1956, 0.0022193132860964726),
 (1957, 0.0035175283358151524),
 (1959, 0.00642131326724173),
 (1960, 0.0006491075248593397),
 (1961, 0.003894645149156038),
 (1962, 0.002868420810955812),
 (1963, 0.002868420810955812),
 (1964, 0.003210656633620865),
 (1965, 0.0011675115031348601),
 (1966, 0.0017587641679075762),
 (1967, 0.0011096566430482363),
 (1968, 0.008605262432867437),
 (1969, 0.0008756336273511451),
 (1970, 0.0011096566430482363),
 (1971, 0.0017587641679075762),
 (1972, 0.0017587641679075762),
 (1973, 0.004670046012539441),
 (1974, 0.00116751150

 (2215, 0.002868420810955812),
 (2216, 0.002868420810955812),
 (2217, 0.0011675115031348601),
 (2218, 0.0035175283358151524),
 (2219, 0.002868420810955812),
 (2220, 0.005276292503722729),
 (2221, 0.002868420810955812),
 (2222, 0.002868420810955812),
 (2223, 0.002868420810955812),
 (2224, 0.008605262432867437),
 (2225, 0.014425536359627072),
 (2226, 0.0017587641679075762),
 (2227, 0.0011096566430482363),
 (2228, 0.002868420810955812),
 (2229, 0.002868420810955812),
 (2230, 0.0017587641679075762),
 (2231, 0.002868420810955812),
 (2232, 0.002868420810955812),
 (2233, 0.002868420810955812),
 (2234, 0.002868420810955812),
 (2235, 0.001947322574578019),
 (2236, 0.002868420810955812),
 (2237, 0.0026269008820534357),
 (2238, 0.002868420810955812),
 (2239, 0.002868420810955812),
 (2240, 0.002868420810955812),
 (2241, 0.005736841621911624),
 (2242, 0.002868420810955812),
 (2243, 0.002868420810955812),
 (2244, 0.008605262432867437),
 (2245, 0.01109135927978117),
 (2246, 0.005545679639890585),
 (2

 (2484, 0.005736841621911624),
 (2486, 0.0017587641679075762),
 (2487, 0.0006491075248593397),
 (2488, 0.002868420810955812),
 (2489, 0.002868420810955812),
 (2490, 0.017210524865734874),
 (2491, 0.0035175283358151524),
 (2492, 0.002868420810955812),
 (2493, 0.0017587641679075762),
 (2494, 0.0017587641679075762),
 (2495, 0.002868420810955812),
 (2496, 0.0005837557515674301),
 (2497, 0.002868420810955812),
 (2498, 0.002868420810955812),
 (2499, 0.005736841621911624),
 (2500, 0.002043145130486005),
 (2501, 0.002868420810955812),
 (2502, 0.009923847776646311),
 (2503, 0.007005069018809161),
 (2504, 0.005192860198874718),
 (2505, 0.008605262432867437),
 (2506, 0.005736841621911624),
 (2507, 0.002868420810955812),
 (2508, 0.0005837557515674301),
 (2509, 0.005736841621911624),
 (2510, 0.002868420810955812),
 (2511, 0.0006491075248593397),
 (2512, 0.005736841621911624),
 (2513, 0.002868420810955812),
 (2514, 0.0035175283358151524),
 (2515, 0.0012982150497186794),
 (2516, 0.00887725314438589),

 (2760, 0.0022193132860964726),
 (2761, 0.002868420810955812),
 (2763, 0.001947322574578019),
 (2764, 0.008605262432867437),
 (2765, 0.0012982150497186794),
 (2766, 0.005276292503722729),
 (2768, 0.005276292503722729),
 (2769, 0.0005837557515674301),
 (2770, 0.002868420810955812),
 (2771, 0.0011096566430482363),
 (2772, 0.002868420810955812),
 (2773, 0.0005837557515674301),
 (2774, 0.001947322574578019),
 (2775, 0.002868420810955812),
 (2776, 0.0035175283358151524),
 (2777, 0.0017587641679075762),
 (2778, 0.0017587641679075762),
 (2779, 0.002868420810955812),
 (2780, 0.002868420810955812),
 (2781, 0.004438626572192945),
 (2782, 0.0032455376242966983),
 (2783, 0.002868420810955812),
 (2784, 0.002868420810955812),
 (2785, 0.008793820839537881),
 (2786, 0.0006491075248593397),
 (2787, 0.003328969929144709),
 (2788, 0.0023350230062697203),
 (2789, 0.0017587641679075762),
 (2790, 0.0012982150497186794),
 (2791, 0.0006491075248593397),
 (2792, 0.05326351886631534),
 (2793, 0.0055456796398905

 (5631, 0.002868420810955812),
 (5632, 0.005736841621911624),
 (5633, 0.0035175283358151524),
 (5634, 0.0017587641679075762),
 (5635, 0.002868420810955812),
 (5636, 0.0035025345094045804),
 (5637, 0.002868420810955812),
 (5638, 0.003328969929144709),
 (5639, 0.16349998622448128),
 (5640, 0.002868420810955812),
 (5641, 0.002868420810955812),
 (5642, 0.0023350230062697203),
 (5643, 0.0017587641679075762),
 (5644, 0.0037944123851882954),
 (5645, 0.002868420810955812),
 (5646, 0.002868420810955812),
 (5647, 0.0022193132860964726),
 (5648, 0.017210524865734874),
 (5649, 0.002868420810955812),
 (5650, 0.0011096566430482363),
 (5651, 0.0006491075248593397),
 (5652, 0.003328969929144709),
 (5653, 0.002868420810955812),
 (5654, 0.002868420810955812),
 (5655, 0.005736841621911624),
 (5656, 0.04748663253350456),
 (5657, 0.002868420810955812),
 (5658, 0.029898990854428795),
 (5659, 0.008793820839537881),
 (5660, 0.009340092025078881),
 (5661, 0.002868420810955812),
 (5662, 0.002868420810955812),
 

 (5898, 0.014342104054779061),
 (5899, 0.002868420810955812),
 (5900, 0.005736841621911624),
 (5901, 0.002868420810955812),
 (5902, 0.005736841621911624),
 (5904, 0.002868420810955812),
 (5905, 0.0011096566430482363),
 (5906, 0.0006491075248593397),
 (5907, 0.0022193132860964726),
 (5908, 0.002868420810955812),
 (5909, 0.002868420810955812),
 (5910, 0.0035175283358151524),
 (5911, 0.014425536359627072),
 (5912, 0.002868420810955812),
 (5913, 0.002868420810955812),
 (5914, 0.002868420810955812),
 (5915, 0.013315879716578835),
 (5916, 0.0011675115031348601),
 (5917, 0.0011096566430482363),
 (5918, 0.005736841621911624),
 (5919, 0.0011096566430482363),
 (5920, 0.007035056671630305),
 (5921, 0.0012982150497186794),
 (5922, 0.011473683243823248),
 (5923, 0.0005837557515674301),
 (5924, 0.005548283215241182),
 (5925, 0.002868420810955812),
 (5926, 0.0026269008820534357),
 (5927, 0.0011096566430482363),
 (5928, 0.0011096566430482363),
 (5929, 0.003894645149156038),
 (5930, 0.01407011334326061

 (6172, 0.04589473297529299),
 (6173, 0.10039472838345342),
 (6174, 0.002868420810955812),
 (6175, 0.014342104054779061),
 (6176, 0.005736841621911624),
 (6177, 0.005736841621911624),
 (6178, 0.002868420810955812),
 (6179, 0.008605262432867437),
 (6180, 0.002868420810955812),
 (6181, 0.002868420810955812),
 (6182, 0.002868420810955812),
 (6183, 0.15202630298065806),
 (6184, 0.17784209027926035),
 (6185, 0.022947366487646496),
 (6186, 0.005736841621911624),
 (6187, 0.002868420810955812),
 (6188, 0.002868420810955812),
 (6189, 0.005736841621911624),
 (6190, 0.002868420810955812),
 (6191, 0.002868420810955812),
 (6192, 0.002868420810955812),
 (6193, 0.005736841621911624),
 (6194, 0.002868420810955812),
 (6195, 0.002868420810955812),
 (6196, 0.002868420810955812),
 (6197, 0.002868420810955812),
 (6198, 0.005736841621911624),
 (6199, 0.002868420810955812),
 (6200, 0.011473683243823248),
 (6201, 0.002868420810955812),
 (6202, 0.002868420810955812),
 (6203, 0.002868420810955812),
 (6204, 0.00

In [21]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=1, workers=2)

In [24]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.015*"male" + 0.011*"femal" + 0.010*"differ" + 0.009*"speci" + 0.008*"colour" + 0.006*"case" + 0.006*"sex" + 0.005*"bird" + 0.005*"sexual" + 0.004*"form"
Topic: 1 
Words: 0.008*"male" + 0.006*"femal" + 0.005*"speci" + 0.005*"say" + 0.005*"differ" + 0.004*"colour" + 0.004*"case" + 0.004*"arcot" + 0.004*"know" + 0.004*"come"
Topic: 2 
Words: 0.009*"male" + 0.006*"femal" + 0.006*"speci" + 0.005*"differ" + 0.005*"colour" + 0.005*"form" + 0.004*"case" + 0.004*"sex" + 0.003*"time" + 0.003*"say"
Topic: 3 
Words: 0.012*"male" + 0.008*"differ" + 0.007*"femal" + 0.006*"colour" + 0.004*"speci" + 0.004*"bird" + 0.004*"anim" + 0.004*"like" + 0.004*"say" + 0.004*"sex"
Topic: 4 
Words: 0.010*"male" + 0.008*"femal" + 0.007*"colour" + 0.007*"differ" + 0.006*"speci" + 0.005*"bird" + 0.004*"form" + 0.004*"say" + 0.003*"charact" + 0.003*"sex"
Topic: 5 
Words: 0.010*"male" + 0.007*"femal" + 0.006*"differ" + 0.006*"speci" + 0.005*"form" + 0.005*"colour" + 0.004*"bird" + 0.004*"sex" + 0.004

In [25]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [26]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.000*"ebook" + 0.000*"ernst" + 0.000*"gutenberg" + 0.000*"html" + 0.000*"header" + 0.000*"haeckel" + 0.000*"asscher" + 0.000*"derek" + 0.000*"midnight" + 0.000*"ibiblio"
Topic: 1 Word: 0.000*"arcot" + 0.000*"ebook" + 0.000*"morey" + 0.000*"gutenberg" + 0.000*"ernst" + 0.000*"wade" + 0.000*"asscher" + 0.000*"cavor" + 0.000*"html" + 0.000*"torlo"
Topic: 2 Word: 0.000*"economist" + 0.000*"brexit" + 0.000*"polit" + 0.000*"ﬁrst" + 0.000*"aramco" + 0.000*"trump" + 0.000*"ﬁrms" + 0.000*"saudi" + 0.000*"voter" + 0.000*"minist"
Topic: 3 Word: 0.000*"speci" + 0.000*"variat" + 0.000*"femal" + 0.000*"colour" + 0.000*"intermedi" + 0.000*"male" + 0.000*"breed" + 0.000*"pigment" + 0.000*"differenti" + 0.000*"phenomena"
Topic: 4 Word: 0.001*"arcot" + 0.001*"cavor" + 0.000*"selenit" + 0.000*"morey" + 0.000*"wade" + 0.000*"torlo" + 0.000*"ship" + 0.000*"sphere" + 0.000*"moon" + 0.000*"say"
Topic: 5 Word: 0.000*"male" + 0.000*"femal" + 0.000*"sex" + 0.000*"colour" + 0.000*"sexual" + 0.000

In [27]:
processed_docs[2]

['project',
 'gutenberg',
 'ebook',
 'moon',
 'well',
 'ebook',
 'cost',
 'restrict',
 'whatsoev',
 'copi',
 'away',
 'term',
 'project',
 'gutenberg',
 'licens',
 'includ',
 'ebook',
 'onlin',
 'gutenberg',
 'titl',
 'moon',
 'author',
 'well',
 'releas',
 'date',
 'octob',
 'ebook',
 'updat',
 'juli',
 'languag',
 'english',
 'start',
 'project',
 'gutenberg',
 'ebook',
 'moon',
 'project',
 'gutenberg',
 'etext',
 'prepar',
 'barri',
 'haworth',
 'moon',
 'well',
 'chapter',
 'bedford',
 'meet',
 'cavor',
 'lympn',
 'write',
 'amidst',
 'shadow',
 'vine',
 'leav',
 'blue',
 'southern',
 'itali',
 'come',
 'certain',
 'qualiti',
 'astonish',
 'particip',
 'amaz',
 'adventur',
 'cavor',
 'outcom',
 'purest',
 'accid',
 'fell',
 'thing',
 'time',
 'think',
 'remov',
 'slightest',
 'possibl',
 'disturb',
 'experi',
 'go',
 'lympn',
 'imagin',
 'unev',
 'place',
 'world',
 'rate',
 'say',
 'shall',
 'peac',
 'chanc',
 'work',
 'book',
 'sequel',
 'utter',
 'varianc',
 'destini',
 'littl'

In [28]:
for index, score in sorted(lda_model[bow_corpus[1]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.9988711476325989	 
Topic: 0.015*"male" + 0.011*"femal" + 0.010*"differ" + 0.009*"speci" + 0.008*"colour" + 0.006*"case" + 0.006*"sex" + 0.005*"bird" + 0.005*"sexual" + 0.004*"form"


In [29]:
for index, score in sorted(lda_model_tfidf[bow_corpus[2]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.9743684530258179	 
Topic: 0.001*"arcot" + 0.001*"cavor" + 0.000*"selenit" + 0.000*"morey" + 0.000*"wade" + 0.000*"torlo" + 0.000*"ship" + 0.000*"sphere" + 0.000*"moon" + 0.000*"say"

Score: 0.022432474419474602	 
Topic: 0.000*"ebook" + 0.000*"ernst" + 0.000*"gutenberg" + 0.000*"html" + 0.000*"header" + 0.000*"haeckel" + 0.000*"asscher" + 0.000*"derek" + 0.000*"midnight" + 0.000*"ibiblio"


In [30]:
test_file = open('Democracy in America.txt','r')
test_file_text = test_file.read()
test_file.close()

tokens = test_file_text.split()
vocab = list(set(tokens))

unseen_document = test_file_text
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.2765149176120758	 Topic: 0.011*"male" + 0.010*"femal" + 0.007*"colour" + 0.004*"differ" + 0.004*"speci"
Score: 0.24740447103977203	 Topic: 0.008*"male" + 0.006*"femal" + 0.005*"speci" + 0.005*"say" + 0.005*"differ"
Score: 0.19391782581806183	 Topic: 0.010*"male" + 0.008*"femal" + 0.007*"colour" + 0.007*"differ" + 0.006*"speci"
Score: 0.12378450483083725	 Topic: 0.009*"male" + 0.009*"femal" + 0.006*"colour" + 0.005*"speci" + 0.005*"case"
Score: 0.0821586474776268	 Topic: 0.015*"male" + 0.011*"femal" + 0.010*"differ" + 0.009*"speci" + 0.008*"colour"
Score: 0.03690759465098381	 Topic: 0.010*"male" + 0.007*"femal" + 0.006*"differ" + 0.006*"speci" + 0.005*"form"
Score: 0.017821181565523148	 Topic: 0.009*"male" + 0.007*"femal" + 0.005*"differ" + 0.004*"speci" + 0.004*"case"
Score: 0.01691647619009018	 Topic: 0.012*"male" + 0.008*"differ" + 0.007*"femal" + 0.006*"colour" + 0.004*"speci"


In [31]:
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.598385679209368

Coherence Score:  0.37675070977806047


In [33]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary,  mds='mmds')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [34]:
pyLDAvis.display(vis)

In [36]:
list_books = ['The-Economist-USA.txt','The_Descent_of_Man.txt','Problems of Genetics.txt']
file = [open(book,'r', encoding="utf8") for book in list_books]

file_text = [fil.read() for fil in file]
df_text = pd.DataFrame(file_text)
df_text['index'] = df_text.index
documents = df_text
for fil in file:
    fil.close()

In [37]:
stemmer = SnowballStemmer('english')

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [38]:
doc_sample = documents[documents['index'] == 1].values[0][0]

print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))




 tokenized and lemmatized document: 


['project', 'gutenberg', 'ebook', 'descent', 'charl', 'darwin', 'ebook', 'cost', 'restrict', 'whatsoev', 'copi', 'away', 'term', 'project', 'gutenberg', 'licens', 'includ', 'ebook', 'onlin', 'gutenberg', 'titl', 'descent', 'author', 'charl', 'darwin', 'post', 'date', 'januari', 'ebook', 'post', 'novemb', 'updat', 'octob', 'languag', 'english', 'start', 'project', 'gutenberg', 'ebook', 'descent', 'produc', 'asscher', 'descent', 'select', 'relat', 'work', 'charl', 'darwin', 'life', 'letter', 'charl', 'darwin', 'chapter', 'edit', 'franci', 'darwin', 'portrait', 'volum', 'popular', 'edit', 'condens', 'volum', 'naturalist', 'journal', 'research', 'natur', 'histori', 'geolog', 'countri', 'visit', 'voyag', 'round', 'world', 'illustr', 'pritchett', 'popular', 'edit', 'woodcut', 'cheaper', 'edit', 'origin', 'speci', 'mean', 'natur', 'select', 'preserv', 'favour', 'race', 'struggl', 'life', 'larg', 'type', 'edit', 'volum', 'popular', 'edit', 'cheaper', 'edit', 'portrait', 'contriv', 'orchid', 'f

In [39]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [40]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
print(len(bow_corpus))
bow_corpus[1]

6


[(1, 1),
 (10, 7),
 (12, 83),
 (13, 2),
 (15, 26),
 (17, 2),
 (18, 5),
 (21, 2),
 (22, 76),
 (23, 30),
 (24, 7),
 (25, 4),
 (26, 2),
 (27, 10),
 (30, 12),
 (32, 2),
 (34, 45),
 (35, 14),
 (36, 8),
 (37, 13),
 (39, 15),
 (41, 193),
 (42, 163),
 (43, 1),
 (44, 3),
 (46, 7),
 (48, 1),
 (49, 312),
 (50, 11),
 (53, 53),
 (54, 159),
 (55, 39),
 (58, 1),
 (59, 54),
 (60, 8),
 (61, 8),
 (63, 58),
 (64, 63),
 (66, 40),
 (67, 10),
 (76, 105),
 (77, 3),
 (78, 127),
 (80, 10),
 (83, 259),
 (86, 80),
 (87, 91),
 (88, 3),
 (89, 4),
 (92, 2),
 (93, 3),
 (95, 4),
 (97, 14),
 (98, 46),
 (99, 1),
 (100, 2),
 (101, 101),
 (102, 49),
 (103, 1),
 (104, 56),
 (105, 32),
 (107, 5),
 (110, 3),
 (111, 7),
 (113, 33),
 (114, 28),
 (116, 1),
 (120, 28),
 (121, 1),
 (132, 7),
 (141, 1),
 (142, 6),
 (145, 2),
 (151, 1),
 (152, 6),
 (153, 56),
 (154, 10),
 (163, 124),
 (164, 4),
 (165, 5),
 (167, 17),
 (174, 23),
 (175, 9),
 (178, 21),
 (182, 13),
 (183, 2),
 (184, 1),
 (191, 118),
 (192, 154),
 (196, 1),
 (198, 4)

In [41]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=20, id2word=dictionary, passes=1, workers=2)

In [42]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.014*"male" + 0.008*"femal" + 0.005*"colour" + 0.005*"speci" + 0.005*"differ" + 0.004*"form" + 0.004*"case" + 0.004*"bird" + 0.004*"come" + 0.004*"natur"
Topic: 1 
Words: 0.005*"femal" + 0.005*"male" + 0.004*"say" + 0.004*"work" + 0.004*"arcot" + 0.004*"form" + 0.004*"power" + 0.003*"case" + 0.003*"come" + 0.003*"ship"
Topic: 2 
Words: 0.014*"male" + 0.012*"femal" + 0.008*"differ" + 0.005*"bird" + 0.005*"colour" + 0.005*"speci" + 0.005*"sex" + 0.004*"anim" + 0.003*"charact" + 0.003*"probabl"
Topic: 3 
Words: 0.013*"male" + 0.009*"femal" + 0.007*"differ" + 0.007*"colour" + 0.006*"speci" + 0.005*"bird" + 0.005*"sex" + 0.005*"anim" + 0.004*"case" + 0.004*"like"
Topic: 4 
Words: 0.013*"male" + 0.012*"femal" + 0.007*"differ" + 0.007*"colour" + 0.006*"bird" + 0.006*"speci" + 0.005*"case" + 0.005*"anim" + 0.005*"sex" + 0.004*"sexual"
Topic: 5 
Words: 0.009*"femal" + 0.009*"male" + 0.007*"differ" + 0.005*"colour" + 0.004*"speci" + 0.004*"say" + 0.004*"know" + 0.004*"bird" + 0

In [43]:
for index, score in sorted(lda_model[bow_corpus[1]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.3536083996295929	 
Topic: 0.012*"male" + 0.012*"femal" + 0.009*"speci" + 0.008*"differ" + 0.007*"colour" + 0.006*"sex" + 0.005*"bird" + 0.004*"case" + 0.004*"anim" + 0.004*"great"

Score: 0.3498832583427429	 
Topic: 0.013*"male" + 0.012*"femal" + 0.007*"differ" + 0.007*"colour" + 0.006*"bird" + 0.006*"speci" + 0.005*"case" + 0.005*"anim" + 0.005*"sex" + 0.004*"sexual"

Score: 0.293453186750412	 
Topic: 0.018*"male" + 0.011*"colour" + 0.010*"femal" + 0.009*"differ" + 0.007*"case" + 0.006*"speci" + 0.006*"sex" + 0.006*"bird" + 0.005*"anim" + 0.005*"sexual"


In [44]:
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.346355973021442

Coherence Score:  0.3599786976498738


In [46]:
test_file = open('The Last Evolution.txt','r')
test_file_text = test_file.read()
test_file.close()

tokens = test_file_text.split()
vocab = list(set(tokens))

unseen_document = test_file_text
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.37628817558288574	 Topic: 0.007*"form" + 0.006*"male" + 0.006*"femal" + 0.006*"differ" + 0.005*"colour"
Score: 0.36085566878318787	 Topic: 0.008*"male" + 0.006*"say" + 0.005*"colour" + 0.005*"femal" + 0.005*"speci"
Score: 0.1585923284292221	 Topic: 0.006*"differ" + 0.006*"male" + 0.005*"speci" + 0.005*"form" + 0.005*"femal"
Score: 0.06490256637334824	 Topic: 0.009*"male" + 0.006*"femal" + 0.005*"colour" + 0.005*"come" + 0.005*"like"
Score: 0.013996445573866367	 Topic: 0.008*"male" + 0.006*"differ" + 0.005*"colour" + 0.005*"femal" + 0.005*"bird"
Score: 0.010114007629454136	 Topic: 0.005*"male" + 0.005*"say" + 0.004*"femal" + 0.004*"speci" + 0.004*"differ"


In [47]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary, mds='mmds')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [48]:
pyLDAvis.display(vis)