In [1]:
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html
from sklearn.cluster import AgglomerativeClustering

# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html
from sklearn.cluster import FeatureAgglomeration

# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
from sklearn.cluster import KMeans

# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html
from sklearn.cluster import MiniBatchKMeans

In [2]:
import numpy as np
from nltk.corpus import stopwords
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import pandas as pd
import seaborn as sns
import timeit
import markovify
import re
from collections import Counter

In [3]:
df = pd.read_csv('../data/lem_stem_text.csv')

In [4]:
# load pickle with text files filtered 
other_files = pickle.load(open('../data/file_w_text.pkl', 'rb'))
#print(len(other_files))

files_w_text = df[df['stemmed_text'].isna()]
other_files.extend(list(files_w_text['filename'].values))
#print(len(other_files))
pickle.dump(list({*other_files}), open('../data/file_w_text.pkl', 'wb'))

df.drop(index=files_w_text.index, inplace=True)

# Parameters
## [TFIDF](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)
**tfidf.stop_words_**
    Terms that were ignored because they either:

- occurred in too many documents (max_df)

- occurred in too few documents (min_df)

- were cut off by feature selection (max_features)

In [21]:
params = {
    'vectorizer': {
        'analyzer': 'word',
        'stop_words': stopwords.words('english'),
        'ngram_range': (1, 1),
        'token_pattern': '[a-z]{3,}',
        'max_df': 0.1,
        'lowercase': True
    },
    'km_params': {
        'random_state': 30,
        'n_clusters': 10,
    }
}

tfidf = TfidfVectorizer(**params['vectorizer'])
X = tfidf.fit_transform(df['stemmed_text'])
KMeans(**params['km_params']).fit(X).labels_

array([6, 2, 6, ..., 2, 2, 2], dtype=int32)

In [22]:
labls = KMeans(**params['km_params']).fit(X).labels_

In [26]:
for i in zip(df['filename'].values, labls):
    print(i)
    break

('000083.text', 6)


In [5]:
params = {
    'vectorizer': {
        'analyzer': 'word',
        'stop_words': stopwords.words('english'),
        'ngram_range': (1, 1),
        'token_pattern': '[a-z]{3,}',
        'max_df': 0.1,
        'lowercase': True
    },
    'raw_documents': df['stemmed_text'],
}

In [6]:
tfidf = TfidfVectorizer(**params['vectorizer'])
X_tfidf = tfidf.fit_transform(df['stemmed_text'])

tfidf_doc_word = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names())
tfidf_doc_word

Unnamed: 0,aaa,aaaa,aaaaacgccacatggcttggtc,aaaaacgccacatggcttggtctcttggacatgctgcaaata,aaaaagttgg,aaaacctcattctcggacccatct,aaaacgctgctgggagatggtc,aaagtacttggagcttgcaggtgcg,aaah,aaasouth,...,zygot,zymogen,zyprexa,zyrtecd,zzeve,zzi,zznobo,zzp,zzuser,zzwk
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3225,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
len(tfidf.stop_words_)

812

In [8]:
for i in tfidf_doc_word.columns:
    print(i)

aaa
aaaa
aaaaacgccacatggcttggtc
aaaaacgccacatggcttggtctcttggacatgctgcaaata
aaaaagttgg
aaaacctcattctcggacccatct
aaaacgctgctgggagatggtc
aaagtacttggagcttgcaggtgcg
aaah
aaasouth
aab
aac
aacaaaataa
aaccggctttctgggcttttg
aacgggactggcatgctacagg
aacgroup
aactcttccgcccgcttgtgaacagcctcagtctccacag
aactgccacatcctttgcgt
aactiv
aacton
aacw
aad
aaeavmaga
aaf
aafa
aafm
aafrsocwhjxca
aag
aagcccaccgacccatct
aagelkngfaiirppghhaeestamgfcffnsvaitakllqqklnvgkvlivdwdihhgn
aagfcyfnnaaiaaeyairelgvdsvaildwdahhgdgtqeifydrddvlyvsihqdgrt
aaggaggggcgacaaaggaaaat
aaggaggggcgacaaaggaaaatatgaccagtttcaccacacc
aagggtggcgggaggaagcacactgccagtctccagc
aahd
aahn
aai
aaipeegiaridadttaspkswqaviaaigaanaavddvfagradnvfvaarppghhaek
aaiqng
aajpguk
aak
aal
aalib
aalpd
aaltckeetgealiidfdahhgngtqeifwndpevvhvdlherdi
aam
aamacdowell
aandbpub
aandem
aaot
aap
aapcc
aapex
aapj
aaq
aaquatics
aaqvlrkqaarvaildvdlhhgngtqgifyarpdvftvslhadpvrfypffwghaderg
aar
aardvark
aarp
aarvaildvdlhhgngtqgifyarpdvft
aarvaildvdlhhgngtqgifyarpdvftvslhadpv
aas
a

afrika
afrikan
afrims
afro
afroamericano
afrocentr
afrohuman
afrosea
afs
afscf
afsi
aft
afterbirth
afterbodi
afterburner
aftercar
aftercut
afterdamp
afterglow
afterhour
afterimag
afterimage
afterload
aftermarket
aftermath
afternoon
afterschool
aftershock
afterthought
aftertreat
aftertreatment
afterward
afterword
aftfter
aftua
afudc
afunctional
afwa
afwholesale
afya
afyqspdilylsihrhddgnffpgtggptecgsgaglgfnvniswsgalnpplgdaeyia
afz
aga
agaaacaggctgcaaaggtggac
agaagtcctgggcattgtcgg
agaarlaknymkkviiidfdvhhgngtqeifwndnrvihidfhqrgi
agaccggcgcacagaggaag
againfo
agajnst
agak
agalsidas
agama
agana
agar
agaricit
agarkov
agaros
agarsupply
agat
agav
agaw
agbahey
agbanzo
agbodjan
agboyibor
agc
agccatagctaggccaccaatctggttctttgtgaaggcagc
agcccctgaactgtgatgatgagg
agcggccccggtagcact
agcggccccggtagcactgaggaagtggaactgcgtg
agcgtcgcgccaagag
agcm
agctgccggtcatcgtaagtgc
agdufa
ageappropri
aged
agee
ageg
ageing
agen
agenci
agencies
agency
agencywid
agenda
agengies
agengineering
agent
agents
agentss
agenzia
age

andth
andthen
andther
andthousa
andtransport
andvari
andwholesal
andy
andyandes
ane
anecdot
anelli
anem
anemia
anemomet
anemon
anen
anencephali
anerg
anergen
anergic
anergy
anesthesia
anesthesiolog
anesthesiologist
anesthesiology
anesthet
anesthetic
anesthetise
anesthetize
aneuploid
aneuploidy
aneurysm
anew
anex
anexa
ang
anga
angel
angela
angelabonobana
angeles
angelfish
anger
angereichert
angina
angio
angioedema
angiogen
angiogenesi
angiographi
angioneurot
angioplasti
angiosperm
angiotensin
angl
anglais
angle
angler
angles
anglican
anglicization
anglicize
anglo
anglophil
angloplat
angri
angrili
angsana
angst
angstrom
anguish
anguished
angul
angular
angularity
angulate
angulated
angulation
angus
angustifolia
anharmon
anhydr
anhydras
anhydrid
anhydridetermin
anhydrit
anhydrou
ani
anil
anileridin
anilid
anilin
anim
animadversion
animadvert
animal
animalcul
animalia
animate
animation
animaux
anime
animos
animus
anion
anis
anisindion
anisoplia
anisotrop
anisotropi
anistreplas
aniswave
ani

asexu
asf
asghegtampftwptrglrgdvppkrvdallgyysfdaga
asgm
ash
asham
ashbon
ashelman
ashen
ashepherd
ashford
ashi
ashland
asho
ashor
ashp
ashra
ashtabula
ashtray
ashw
asi
asia
asian
asiat
asic
asid
asifood
asil
asilida
asimilar
asinh
asisst
asisten
asistencia
asistir
asiz
asize
asjoab
aska
askew
askhertz
askoff
askon
asl
asleep
asm
asmfc
asmx
asn
aso
asociaci
asociada
asociado
asoii
asolut
asopao
asopisu
asotin
asotus
asp
aspac
asparagin
asparaginas
asparagus
aspart
aspartam
aspe
aspectratio
aspecular
aspel
aspen
aspergillus
asphalt
aspher
asphyxi
asphyxia
asphyxiant
aspic
aspir
aspiraci
aspiration
aspire
aspirin
asplen
aspnet
asprin
aspx
asqc
asr
asrar
asrba
ass
assail
assassin
assassinate
assassination
assault
assay
assciat
assedt
assef
assem
assembl
assemblag
assembleproducts
assembly
assenizatsii
assenmbl
assent
assert
asserted
assertion
assertive
assertiveness
asservation
asses
assessment
assessments
assessor
assest
asset
assetpath
assets
assever
asseverate
assez
assi
assiduous
assid

banksia
banksiana
bankstabil
bankstreet
banktop
bankwir
banner
bannerasset
bannerimg
bannermap
bannert
bannerurl
banquet
bantam
bao
bap
baptise
baptism
bar
baraga
baraja
barangay
barb
barbadian
barbar
barbarian
barbaric
barbarism
barbata
barbecu
barbel
barber
barberri
barbershop
barbitur
barbiturates
barborderclass
barc
barcel
barclass
barclay
barcod
bard
bare
barebon
barefoot
barenboim
bareness
barg
bargain
barge
bargeman
barger
barikmo
barium
bark
barl
barley
barlow
barmore
barn
barnacl
barnstabl
barnstorm
barnyard
baroclin
baromet
barometr
baron
baronet
baronetcy
baroqu
baroreceptor
barotrop
barotropic
barouch
barr
barrack
barracuda
barrag
barranca
barrel
barren
barreto
barri
barricad
barrier
barrierbust
barrigada
barrist
barrlabs
barroom
barrow
barrrack
barrymac
bartel
bartelso
barter
bartholin
bartoe
bartonflat
bartschi
bartt
barva
barwidth
barx
baryon
bas
basada
basado
basain
basal
basalt
bascial
basebal
baseband
baseboard
basecarriercategori
basecontentcategori
based
basedow
bas

blockioelectron
blockiol
blockiomuon
blockiorawtrig
blockiotrack
blockram
blocks
blocksign
blog
blogg
blogger
blogs
blond
bloo
blood
bloodborn
bloodgood
bloodi
bloodless
bloodlet
bloodlett
bloodmeal
bloodsh
bloodshot
bloodspot
bloodstream
bloodsuck
bloom
blossom
blot
blotch
blous
blow
blowdown
blower
blowfish
blowhol
blown
blowout
blowup
blqkrddw
blrt
blrts
bls
blt
blttl
bltvl
blu
blubber
bludgeon
blue
bluearrow
bluebeehatch
bluebel
blueberri
bluebird
bluefin
bluefish
bluegil
bluegrass
bluegreen
bluejay
blueness
blueprint
bluesman
bluespin
bluespond
bluestem
blueston
bluestraightbkgd
blueway
bluewin
blueyonder
bluez
bluff
bluish
blunder
blunderbuss
blunt
bluntfac
bluntness
bluntnos
blur
blurfocus
blurri
blurt
blush
bluster
blusteri
blv
blw
bma
bmaa
bmag
bmaped
bmawmvjp
bmaxdeltar
bmc
bmcs
bmd
bmeglen
bmenke
bmeyer
bmi
bmiller
bmir
bmjped
bmlh
bmm
bmoxckutjsg
bmp
bmpft
bmr
bms
bmsincorp
bmukherjee
bmxph
bname
bnc
bncterial
bncteriolog
bnd
bndri
bnetworks
bnf
bnfdux
bnk
bnl
bnorris
bnp
b

cambi
cambia
cambiar
cambio
cambium
cambler
cambodian
cambrian
cambridge
camcord
camcra
camcrasvicw
came
camel
camellia
cameo
camera
cameraman
cameras
camerawork
cameron
cameroonian
camhmi
camhmis
camille
caminando
caminar
camisa
camouflag
camp
campaign
campanologist
campanology
campb
campbell
campbellsci
camper
campesino
campestri
campfir
campfire
campground
camphor
camphorata
campimet
campion
campral
campsit
camptosar
campus
campylobact
canada
canadensi
canadian
canajohari
canal
canalization
canalize
canalla
canard
canari
canarsiecc
canc
cancel
cancelbubbl
cancellation
cancer
cancerchromosom
cancersociety
cancertopics
canci
cancom
cancriform
cand
candela
candetectplugin
candi
candid
candida
candidaci
candidacy
candidate
candidates
candidness
candl
candlelight
candlepow
candor
candy
cane
cani
canin
canis
canism
canist
canisters
cannabi
cannabidiol
cannabinoid
canneri
canni
cannib
cannibal
cannibalism
cannibalisminduc
cannibalist
cannibalistic
canning
cannon
cannot
cannula
cano
canoe
c

chatt
chattanooga
chattanoogadiscgolf
chattanoogatrackclub
chattanoogawritersguild
chattaroy
chattbike
chatter
chaud
chaudhuri
chauffag
chaussur
chauvinist
chbounds
chc
chch
chcocbxx
chd
chdir
chdr
che
cheap
cheaper
cheapli
cheapness
cheat
cheater
cheatgrass
chec
checkbook
checkbox
checkcomment
checkconnect
checked
checker
checkerboard
checkin
checkit
checklist
checkmark
checkobj
checkout
checkpoint
checkroom
checks
checksecur
checksum
checkup
checkur
checkwrit
cheek
cheekbon
cheer
cheerful
cheerfulness
cheerlead
cheerless
cheerlessness
chees
chef
chehalis
chej
chelat
chelate
chelation
chelip
chell
chem
chemcial
chemicaleffectsof
chemically
chemicalprotect
chemin
chemine
chemiosmot
chemisorb
chemisorpt
chemist
chemistri
chemkal
chemnitz
chemo
chemoattract
chemoautotroph
chemofossil
chemoheterotroph
chemoheterotrophic
chemokin
chemokinesis
chemokinetic
chemoprevent
chemosensit
chemostat
chemotaxi
chemotherapeut
chemotherapi
chemotyp
chempc
chen
chenevo
cheney
cheneyhall
chenier
cheniqua

codification
codify
coding
codistribut
codmac
codman
codomin
codominant
codon
coe
coec
coedit
coeduc
coef
coeff
coeffici
coelenter
coelentrazin
coeliac
coelom
coelut
coenocyt
coenzym
coerc
coercion
coerciv
coerciviti
coercivity
coeruleu
coeval
coevolut
coexist
coexpress
coextens
cofactor
cofactsheet
cofe
coffe
coffea
coffer
cofferdam
coffi
coffin
cofn
cog
cogaids
cogener
cogent
cognat
cognit
cognitive
cogniz
cognizance
cognize
cognoscent
cogrweeg
cogseiv
coh
cohabit
cohabitation
cohb
cohcsion
cohcsionless
cohen
coher
cohes
cohesionl
cohesionless
cohesiv
cohnert
coho
cohonina
cohor
cohort
cohosh
coiffures
coii
coil
coilabl
coillin
coin
coinag
coinc
coincid
coincidence
coincident
coinfect
coinsur
coir
coitus
cojet
cojr
cok
coke
col
cola
colaboraci
colaborado
colaborar
colaf
colagr
colar
colascholars
colband
colbert
colbornpapfnl
colby
colchicin
cold
colder
coldest
coldmass
coldplat
coldtest
coldwat
cole
colecci
coleoptil
colestipol
colgando
coli
colibri
colic
coliform
colilert
colinvaux


corpora
corporation
corporatist
corport
corps
corpsman
corpul
corpulence
corpulent
corpus
corpuscular
corr
corral
corre
correcta
correctament
correctbyteshading
correctcolorshading
correctivo
correctly
corrector
correctorstrength
correctshading
correctshortshading
corredor
correentr
corregir
correl
correlationstructur
correlogram
correo
correpond
corresond
correspondan
corridor
corridordevelop
corrl
corrlprcssion
corrobo
corrobor
corrod
corros
corrug
corrupt
corruption
corset
corsvcrgcnc
cortex
cortic
corticos
corticosteroid
cortisol
cortison
corvett
corvid
corymbosum
cos
cosa
cosech
cosecha
cosechado
cosechar
cosegreg
cosh
cosid
cosign
cosin
cosmet
cosmetolog
cosmic
cosmo
cosmolog
cosmological
cosmologist
cosmonaut
cosmopolis
cosmopolitan
cosmos
cosmotron
cosn
cosponsor
cosq
costa
costal
costar
costaricensi
costeffect
costen
costh
costheta
costimulatori
costliness
costly
costneutr
costo
costs
costum
costumbr
cosurfact
cot
cota
cotazym
cotect
cotermin
coth
cotidiana
cotinin
coton
cotr


delicatessen
delici
deliciosum
delicto
delight
delim
delimit
delin
delineated
delinqu
delirium
delist
deliv
deliver
deliveredthre
deliveri
delivery
dell
della
deloc
delpag
delphacida
delphinium
delpijl
delstr
delt
delta
deltaguaba
deltaic
deltar
deltat
deltav
deltavirg
deltavis
deltoid
deltor
delug
delus
delux
delv
delveri
delx
dem
demadex
demagnet
demagogu
demagogueri
demakova
demand
demanda
demandeur
demarc
demarcada
demasiado
demat
demco
demean
demeanor
dement
dementia
demer
demerol
demers
demersum
demet
demeur
demilitar
deminer
demiolog
demis
demix
demo
demob
demobil
demobilizion
democraci
democracia
democrat
democratization
demodul
demograph
demographi
demogrpah
demokratom
demolhuman
demolish
demolit
demon
demonstrated
demor
demora
demoralis
demostr
demostrar
demot
demount
demplt
dempsey
demultiplex
demur
demutu
demux
demyelin
demystifi
den
denatur
denc
dendrim
dendrit
dendritic
dendritogenesi
dendro
dendrobatid
dendrochronolog
dendrogram
denesiuk
dengu
deni
denial
denied
denigr
d

dismal
dismantl
dismast
dismay
dismemb
dismember
disminuir
disminuy
dismiss
dismissal
dismissed
dismount
dismuk
dismutas
disney
disobedi
disobey
disodium
disom
disopyramid
disord
disorder
disorders
disorgan
disori
disorient
disorigin
disown
disp
dispar
disparag
dispariti
disparlur
dispartiy
dispatch
dispel
dispens
dispensari
dispensatori
dispers
dispersa
dispersiviti
dispirit
displac
displacement
displacements
displaybar
displayhorizontaldrawinggridevery
displayverticaldrawinggrideveri
displayverticaldrawinggridevery
displeas
displeasur
displex
dispon
dispos
disposici
disposicion
disposit
dispositif
disposition
dispositivo
dispossess
dispossessori
disproport
disproportion
disproportionat
disprov
disproven
disput
disputa
disputar
dispute
disqualif
disqualifi
disquiet
disquisit
disr
disre
disregard
disrepair
disreput
disrespect
disrob
disrupt
disruptor
diss
dissapeardelay
dissapoint
dissappoint
dissatisfact
dissatisfi
dissect
dissembl
dissemin
disseminafion
dissens
dissent
dissenting
dis

eaeff
eaest
eafi
eagan
eagbahey
eager
eagl
eaigklglpsvvvqeggylcdalsdnltaf
eak
eal
ealan
ealbajqymq
ealllgrppghhagirgralgaptagfciv
ealth
ealthcar
ealthcare
ealtl
eanalmeth
eanes
eap
eaq
eaquafarm
ear
earch
eardrum
earenholz
earin
eariy
earlbquak
earless
earlhq
earlhqmkc
earlier
early
earlyed
earlypay
earlystartinc
earmark
earmarks
earn
earnabl
earner
earnest
earnings
earphon
earplug
earring
earshap
earspool
eartbquakc
earth
earthact
earthbound
earthcall
earthen
earthern
earthexplor
earthfil
earthlink
earthlodg
earthmedi
earthquak
earthquakes
earthshin
earthstar
earthwatch
earthwork
earthworks
earwax
earwig
eas
easel
easement
easibl
easier
easiest
easili
easp
east
eastbay
eastbound
easter
eastern
easternmost
easton
eastward
easur
eat
eatabl
eater
eaton
eatonville
eatpost
eattdyqb
eav
eavesdrop
eavesdropp
eavigat
eax
eaxxgdg
eay
eaz
ebam
ebb
ebbot
ebbr
ebc
ebd
ebdvg
ebglaw
ebhmub
ebi
ebian
ebillpay
ebkq
ebm
ebmast
ebmatc
ebo
eboni
ebp
ebrat
ebs
ebt
ebulletin
eburwd
eby
eca
ecad
ecal
ecals

entor
entorpec
entourag
entr
entrada
entrain
entran
entranc
entranceway
entrant
entrap
entrar
entre
entregado
entregu
entrench
entrepreneaur
entrepreneur
entrepreneuri
entrepreneurship
entrepris
entretenir
entrevista
entrevu
entrez
entrezsystem
entries
entrop
entropi
entropy
entrust
entry
entryrescu
entryway
entsm
entsmend
entt
entual
entwin
entz
enum
enumclaw
enumer
enumerated
enunci
env
envejecimiento
envelop
envi
enviada
enviar
envion
envious
enviparamed
envir
enviro
envirofact
enviroment
enviromncnt
environagents
environemtn
environhsi
environmen
environmenta
environmental
environmentaldisturb
environmentalist
environments
environn
envirosmart
envis
envisag
envisioned
envoy
envrpt
envstudies
envz
eny
enzoot
enzym
enzymat
enzymax
enzyme
enzymol
enzymolog
eob
eoc
eocen
eoconnor
eodraft
eodtmldr
eof
eoff
eofvaswjq
eog
eok
eol
eolian
eomon
eompression
eon
eop
eopl
eoqmkwqlt
eor
eorgi
eorrstrsiinl
eorun
eos
eosdis
eosin
eospso
eoswa
eot
eov
eowcpjwel
epa
epaapprov
epact
epafiles
epanci


fabrick
fabul
fabz
fac
faca
facad
facebook
facecent
facelift
facepl
faces
faceshield
facet
faceti
facgmp
faci
facial
facid
faciil
facililight
facilit
faciliti
facilities
facilitiesexpo
facility
facilityi
facilti
facin
facl
faco
facpi
facsimil
facsimile
facsimili
facstaff
factfind
faction
factiti
facto
factoid
factori
factors
factory
facts
factsag
factsaircraftsurveys
factsheet
factsheets
factsmsr
factual
facturar
facu
facult
faculti
faculty
fad
fadc
fadccca
faddl
fade
fae
faecal
faekir
faeri
faet
fafafa
fafc
faggot
faheem
fahgkivlaleggynleslgksslacvq
fahmed
fahrenheit
faibl
faiecliti
failed
failov
failsaf
failures
failureto
fainer
faint
fair
fairfax
fairfaxcounty
fairforest
fairground
fairi
fairsaac
fairway
fairwaymed
faisant
fait
faith
faithbas
fake
fakeroot
fakn
fal
falais
falcat
falcata
falciparum
falcon
falcrppghhaggdfmggycflnnaaiat
falirppghhavpkgpmgfcvfgnvaiaarhaqrthglkrifiidfdvhhgngtndaft
fallaci
fallback
fallen
fallibl
falll
falloff
fallopian
fallout
fallow
falls
fals
false
fal

garabet
garag
garage
garamycin
garb
garbag
garbanzo
garbolog
garcia
gard
garden
gardenoflife
gardenoflifeusa
gareth
garfield
gargoyl
gari
garland
garlic
garment
garner
garnet
garnett
garnish
garret
garrison
garrot
garrvaildvdyhhgngtqd
garsafalcrppghhaggdfmggycflnnaaiat
garsafalcrppghhaggdfmggycflnnaaiatqafldqgarrvaildvdyhhgngtqd
garter
gartman
gary
gasb
gase
gaseous
gaser
gases
gasfield
gash
gasif
gasket
gaslight
gasohol
gasolin
gasolina
gasolinepow
gasomet
gasp
gass
gassihoun
gastar
gastechnology
gaster
gastight
gastric
gastriti
gastro
gastrocnemius
gastroenter
gastroenterolog
gastroenterologist
gastroesophag
gastroid
gastroinfect
gastrointes
gastrointestin
gastropod
gastropoda
gastrostomi
gastrul
gat
gata
gate
gateabl
gatech
gatedrain
gatekeep
gateserver
gateway
gatf
gatgcccctatcagatgtgttgaa
gather
gatifloxacin
gation
gator
gatorbob
gatsonis
gatz
gau
gauch
gaucho
gaudi
gaug
gaunt
gauntlet
gaur
gausian
gauss
gaussian
gauz
gavag
gave
gavel
gavh
gawk
gawronski
gax
gaxg
gaxh
gay
gayw
gaz

goldwork
golf
golfbal
golfer
golik
golux
golyt
golytely
gom
gonad
gonadotroph
gonadotropin
gondola
goni
goniomet
gonioscop
gonna
gonochor
gonococcus
gonopodi
gonopodium
gonorrhea
gonorrheae
gonorrhoeae
gonumb
gonumber
gonzalez
goo
goodby
gooddataqu
goodeast
goodei
goodfieldhorz
goodfieldvert
goodi
goodies
goods
goodwil
goodwinprocter
goodyear
gooey
goofi
googl
google
googlebot
googlemobiles
goop
goopi
goops
goos
goose
gooseberri
gooseneck
gopher
goraya
gordon
gorg
gorgeous
gorget
gorgonian
gori
gorilla
gorlando
gorrilla
gorskii
gortex
gorup
goryeo
gosh
gosiut
gosjnuewy
goskaya
gospel
gosps
gossam
gossip
got
goth
gothere
gothic
gothicist
gothicness
goto
gott
gouach
goubi
goudronn
goudybookletter
goug
gouger
gourmet
gout
gouvern
gove
govem
governm
government
governmentissu
governmentpublications
governmentwid
governor
governorgener
governorship
govert
govertschilling
govpubs
govt
govtech
gown
goyer
gpa
gpb
gpc
gpd
gpdr
gpg
gpglgynfnlplprksadaaflealgvafqrirafspdalvvalgldafegdpfgglsvt
gpgv

hexahedr
hexahedron
hexahydr
hexakis
hexal
hexan
hexanchiform
hexaploid
hexapod
hexapol
hexaval
hexazinon
hexos
hexosaminidase
hexr
hexterock
hexulos
hexuron
hexyl
hey
heyday
heyison
hfc
hfd
hff
hfh
hfhh
hfill
hfir
hfm
hfo
hfootprint
hfs
hfscontent
hfsfdummy
hfsfh
hfsfooterhack
hfti
hfu
hfv
hgbdebt
hgbgarth
hgdprveypyflgyad
hgieozasps
hgsv
hgt
hgtracks
hgv
hgvbutton
hgvfooter
hgvlist
hhafadvaggfcfinnsavaaqvlrkqaarvaildvdlhhgngtqgifyarpdvftvslh
hhaggdfmg
hhg
hhgn
hhgng
hhgngtq
hhgngtqaiveinpqiaycslhqypc
hhgngtqgifyarpdvftvslhadpvrfypffwghadergegpglgynfnlplprksada
hhlaw
hhmm
hhmmss
hhnura
hhp
hhs
hhsc
hhvd
hhwicion
hhz
hia
hiac
hiatus
hibern
hibiscus
hibit
hic
hici
hick
hickori
hid
hidden
hiddenconst
hide
hidecp
hidedelay
hidediv
hidegrammaticalerrors
hideinlineeditsplash
hideinlineschedulingsplash
hidelist
hidelistbutton
hidemenu
hideonmouseout
hiderppwarn
hides
hidetarget
hidetop
hidewash
hidrogr
hidta
hier
hierach
hierarch
hierarchi
hierarchy
hieroglyph
hig
higgs
highaffin
highalertme

inferences
inferenti
inferior
infernali
inferred
infertil
infest
infesta
inffar
infidel
infield
infil
infill
infiltr
infiltromet
infimum
infin
infinet
infinit
infinitesim
infinitum
infirm
infirmari
infix
inflam
inflamm
inflammation
inflammatori
inflat
inflatedclav
inflationari
inflect
inflector
inflex
inflict
inflight
inflo
inflores
infloresc
inflow
influ
influence
influenci
influencia
influent
influenti
influenza
influx
infm
info
infobank
infodb
infodir
infofact
infoinc
infold
infom
infomerci
infomil
infopap
infor
inforev
informa
informaci
informaloc
informan
informant
informar
informat
informath
informati
informatio
information
informationc
informationweek
informedthat
infoseek
infowars
infozen
infozip
infra
infracabbrevi
infract
infraorbit
infrar
infrared
infrastruc
infrastructur
infratechnolog
infratechnologi
infrecuentement
infrequ
infring
infringement
infringers
infrutesc
infuri
infus
infusedsolutions
infusor
ing
ingaa
ingdal
ingebretson
ingeni
ingenu
ingest
ingl
ingok
ingot
ingr

joda
jody
joe
joeb
jog
jogger
johann
johansson
johl
john
johnkokou
johnni
johns
johnson
johnston
joiinam
join
joinder
joiner
joinstyle
joint
jointcommission
joist
jokamoto
joke
jolesz
jolina
jolt
jon
jonathan
jonell
jones
jont
joppa
joqa
jordan
jordanian
jorgensen
jorkey
jose
joseph
jossey
jostl
jou
joul
joule
jouml
jour
journ
journal
journalist
journals
journaux
journey
journui
jouzwounpjjprcj
jovial
jovian
joy
joyce
joyceb
joycelyn
joyous
jpc
jpe
jpeg
jpegint
jpeglib
jpg
jpi
jpl
jpmchase
jratcliffe
jre
jreznik
jrnasiatka
jrow
jrsmith
jsa
jsb
jsberg
jsc
jscp
jsdlgloader
jsessionid
jsetiwqhdwrw
jshay
jsi
jsibio
jsmith
jsp
jstone
jsu
jsub
jsutils
jsutton
jsversion
jtag
jtbxw
jtfloyd
jth
jti
juan
juana
juanes
jubil
jucateo
judeo
judg
judgebefor
judgement
judgeship
judgment
judi
judic
judicata
judici
judiciair
judiciari
judith
judy
judyjust
juelich
juev
jufq
juft
jug
jugat
juggernaut
juggl
juggler
juic
juici
juillet
juin
jukebox
jul
juli
julia
julian
julians
julie
juliett
jumbl
jumbo
jump

lavra
lavxl
lavy
lawbreak
lawec
lawenforc
lawftilli
lawless
lawmak
lawman
lawn
lawrel
lawrence
laws
lawson
lawsuit
lawyer
lax
laxat
laxiti
lay
layback
layer
layernam
layers
layery
layin
layman
layoff
layout
layoutrawtablewidth
layouttablerowsapart
layov
layperson
layup
lazaroo
laze
lazi
lba
lbar
lbarnard
lber
lbl
lbm
lbnd
lbnl
lbp
lbrmer
lbrown
lbs
lburkhead
lbwn
lca
lcala
lcd
lcdmml
lcdper
lch
lchclearnet
lcib
lcif
lck
lcl
lclass
lcm
lcmpor
lcood
lcoody
lcp
lcrbmrp
lcrmscp
lcrqk
lcs
lcsh
lcsja
lcst
lctatd
lcv
lcweb
lda
ldac
ldap
ldata
ldb
ldcm
lddacvriag
ldecrees
ldejager
ldeq
ldeu
ldivid
ldivide
ldk
ldkt
ldl
ldlcruz
ldllelynpsyvl
ldmi
ldquo
ldrd
ldri
ldrw
lds
ldspdylrmgeriarlglptlfimeggyaveaiginavnvlqgyeg
ldu
ldvdlhhgngtqgifyarpdvftvslha
lea
leach
leachat
leader
leaders
leadership
leadlin
leaf
leafhopp
leafi
leafless
leaflet
leaflik
leafstalk
leagu
leak
leakag
leakcheck
leaker
leaki
leakproof
lean
leaner
leap
leapfrog
leapsecond
leapto
learnabl
learner
learning
learns
leary
learyabas

loneli
loner
lonesom
longa
longact
longan
longbow
longdist
longer
longeron
longestablish
longev
longhand
longhorn
longicorni
longifolia
longip
longipenni
longish
longissima
longitud
longitude
longitudin
longitudm
longleaf
longlin
longnam
longnames
longnamesdefin
longnamesdefined
longnamesused
longobard
longproc
longrang
longserv
longshor
longshorecurr
longshoredrift
longstand
longterm
longtim
longulus
longum
longus
longvalu
longview
longwal
longwav
longwind
lonq
loo
lookalik
lookback
looker
lookout
lookup
loom
loomis
loon
looneylimey
loonrider
looooong
loop
loopedit
loophol
loopi
looprat
loos
loosa
loosconnect
looseconnect
looseleaf
loosen
loosestrif
loot
looter
lop
loperamid
lopholeucaspi
lopid
lopinavir
lorabid
loral
lorant
loratadin
lorazepam
lord
lordcharr
lordi
lore
lorentzian
lorenzojim
loretto
lori
lorica
loricata
loriinid
lorikeet
lorri
lorsqu
lortab
los
losartan
losd
loser
losninos
losses
lossi
lossless
lost
loswego
lot
lotic
lotion
lotronex
lotteri
lotuc
lotus
lou
loud
louder

maverick
maviflowermarket
mawuvi
max
maxalt
maxcap
maxcharact
maxconsum
maxdelta
maxens
maxepa
maxfil
maxfiles
maxilla
maxillari
maxillofaci
maxim
maxima
maximimum
maximis
maximized
maximowiscziana
maxiofaci
maxipp
maxit
maxiumum
maxlab
maxmem
maxmiz
maxpage
maxportal
maxreaderror
maxsalas
maxscm
maxsecond
maxstr
maxwellian
mayan
mayb
mayfli
mayor
mayoritaria
mayorment
maypol
mazandaran
maze
mazon
mbai
mbar
mbarresi
mbcn
mbcooper
mbe
mber
mbere
mberger
mbest
mbhaha
mbi
mbirdims
mbit
mbm
mbnms
mbps
mbricament
mbrico
mbscientific
mbx
mbyte
mbzphccmkyf
mca
mcaapp
mcb
mcbstone
mcbx
mcc
mccarti
mccarty
mcchanism
mcchanisnl
mcclain
mcclellan
mccone
mccorkl
mccoy
mccullah
mcd
mcdonaldbradley
mcdougallii
mcel
mcendep
mcentire
mcetamaximum
mceventcollect
mcf
mcfarland
mcg
mcgeehan
mcgill
mcgilliard
mcginnis
mcguirewoods
mcheck
mchsi
mchurley
mci
mcj
mckenna
mckerrow
mckinney
mcl
mclhod
mclpos
mcm
mcmaximumeta
mcmillan
mcmunns
mcnari
mcnc
mcnp
mcnpx
mcnutt
mco
mcompton
mcparticleid
mcpsure
mcral

mineralocorticoid
mineralog
minerotroph
mines
mineshaft
minesweep
mineta
minework
mingl
mini
miniata
miniatur
miniatus
miniblind
minibus
minichromosom
minicomput
minidendrit
minier
minigun
minima
minimap
minimi
minimil
minimized
minimumcalip
minimun
minimus
mininitiatoret
minion
minipl
miniplates
minirhizotron
miniscul
minist
ministeri
ministerstva
ministr
ministrati
ministri
minitiar
minitron
minivan
mink
minkowski
minmax
minneapolis
minnesota
minnick
minnow
mino
minocyclin
minopim
minor
minority
minoxidil
minstr
minstrel
minstrelsi
mint
mintflavor
mints
mintz
minuend
minumum
minus
minuscul
minutes
minutia
minuto
minutus
minv
minveloc
mio
miocen
mip
mipav
mipspro
miq
mir
miracl
miracul
mirand
miranda
mire
miriamrylands
mirror
mirsky
mirtazapin
mis
misadventur
misalign
misalloc
misappl
misappli
misapprehend
misapprehens
misappropri
misayeed
misbegotten
misbehav
misbehavior
misbeliev
misbrand
misc
miscalcul
miscalibr
miscarriag
miscatalogu
miscellan
miscellaneou
miscellani
mischaracter


murashig
murder
murein
murf
muricata
murieron
murin
murinum
murk
murki
murmer
murmur
muro
murphy
murr
murray
murrel
murrietavw
mus
musano
muscl
muscle
muscovit
muscular
musculatur
musculoskelet
musculus
muse
museologist
museum
mush
mushi
mushroom
music
musician
musicolog
musk
muskeg
musket
muskmelon
muskrat
muslem
muslim
muslin
mussel
mustach
mustang
mustard
mustazi
muster
musti
mutabl
mutagen
mutagenesi
mutagensis
mutant
mutat
mutcd
mute
mutil
mutin
mutini
mutism
mutl
mutt
mutter
mutton
mutual
muy
muzaaq
muzhskago
muzzl
mvallejo
mvb
mvbi
mvd
mve
mvf
mvi
mvk
mvkt
mvm
mvmi
mvoltgeb
mvpd
mvr
mwe
mweiskott
mwerks
mwfld
mwg
mwh
mwhinnova
mwhitaker
mwldgq
mwrhyypgw
mwt
mwvud
mxcmd
mxl
mxscm
mxstcjs
mxv
mxvel
mya
mybar
mybl
myc
mycamin
myccorhiz
myceli
mycelium
mycetophilida
mychoic
mycobacteri
mycobacterium
mycolog
myconfig
mycophenol
mycophenolate
mycoplasma
mycorrhiza
mycotoxin
myd
mydaida
mydat
mydate
mydocumentel
mydomain
myelin
myeloblast
myelocytomatosi
myelogen
myeloid
myeloma
myelom

niter
niti
nitidulida
nitinol
nitrat
nitrazin
nitric
nitrid
nitrif
nitril
nitrit
nitro
nitroaromat
nitrobenzen
nitrobenzotrifluorid
nitrocellulos
nitrocellulose
nitrocompound
nitrogen
nitrogenas
nitroglycerin
nitromethan
nitromethane
nitronic
nitrop
nitrophenol
nitrophenyl
nitrosamin
nitrosoanabasin
nitrosoanatabin
nitrotoluen
nitrous
nitroux
nitti
nitwit
nitzschia
nivali
niveau
nivers
nives
niveum
nivium
niw
nix
nixon
nizat
nize
njastad
njdep
njei
njeis
nji
njjs
njql
nkathuria
nkelez
nkflds
nlat
nlatiml
nlcd
nlcheck
nlcm
nldqc
nlea
nlet
nlfron
nlhyvd
nli
nlik
nlin
nlinist
nlist
nlm
nlon
nlr
nlrac
nlrr
nlrsisasatavik
nltn
nluuzpuk
nlvl
nmai
nmaintactions
nmaintevas
nmat
nmb
nmcquinn
nmd
nme
nmfs
nmg
nmha
nmhc
nmi
nmilj
nml
nmr
nms
nmt
nmu
nmul
nmusfu
nnaa
nnagle
nname
nnd
nndss
nnfa
nnimnl
nniss
nnlm
nnod
nnode
nnos
nnq
nnr
nnrmulticultural
nnsavaaqvlrkqaarvaildvdlhhgngtqgifyarpdvftvslhadpvrfypffwgha
nnt
nntb
nnth
nntp
nnuz
nnw
noaa
noahwebsterhouse
noao
nobc
nobil
nobili
nobilissimus


oacmailinglistsignup
oacute
oacwindow
oadway
oae
oak
oakden
oakesdale
oakland
oaplot
oaqg
oar
oard
oarfish
oas
oasi
oat
oath
oatmeal
obama
obannual
obc
obclav
obdomin
obedi
obeld
obelisk
obemail
oberserv
obert
obes
obespecheni
obesus
obet
obey
obfusc
obi
obiect
obinson
obituari
obj
objcctivc
objcont
objdir
objec
objectid
objectif
objection
objectiv
objectives
objector
objectp
objet
objetivo
objeto
objimg
oblat
oblig
obligaci
obligations
obligatori
oblige
obligor
obliqu
obliquus
obliter
oblivi
oblivion
oblong
oblongata
obninsk
obnoxi
obo
obourg
obov
obp
obpr
obr
obra
obrabotano
obrazoch
obrien
obrovac
obs
obsc
obscen
obscur
obscura
obscurantist
obscurciss
obscurus
obscwcd
observa
observaci
observacion
observations
observatori
observatories
observed
observedrespond
observer
observewidths
obsess
obsidian
obsolesc
obsolet
obst
obstack
obstacl
obstamountain
obstant
obstetr
obstetrician
obstreper
obstruct
obstructionist
obta
obtai
obtectomeran
obten
obtenir
obtienn
obtnin
obtrus
obtur
obtus


ostiol
oston
ostp
ostrac
ostracod
ostrich
ostudi
osvietitel
oswsa
oswsp
ota
otal
otbd
otc
otdel
ote
otect
otecti
otero
otf
otghq
oth
otheract
othercoastalareas
othercontribut
otherdir
otherjuror
othermonthdaystyl
otherprograms
otherrespond
others
othewis
othyy
otic
otis
otiti
otitida
otma
otmrwl
otoacoust
otoemiss
otogr
otograph
otoh
otolaryngolog
otolaryngologist
otolaryngology
otolith
otolog
otopleni
otor
otordrivcn
otorga
otorgu
otorrhea
otorrhoea
otoscopi
ototox
otra
otrinidad
otro
otter
ottermobil
ottom
ottoman
otvorenej
otx
otxcezusjpk
oue
ouest
ouf
oufd
ougbqw
ougd
ough
ought
oui
oultri
oun
ounalacheka
ounc
ound
oung
ounsel
ount
ounti
ouovz
ouradversari
ourn
ournal
oursid
ourt
ous
ousin
ousli
oust
outag
outback
outbackeclipsefestival
outbi
outboard
outbound
outbreak
outbreaknetwork
outbuild
outburst
outcast
outcom
outcome
outcomes
outcompet
outcri
outcrop
outdat
outdate
outdatedpages
outdiffus
outdo
outdoor
outdoorchattanooga
outdoors
outdoorsi
outdoorsman
outdow
outer
outerboun

pdp
pdps
pdr
pds
pdsf
pdt
pdufa
pdviisqngadahyydplthlsatiniyeei
pdw
pea
peabody
peac
peaceabl
peacefulwolftaichi
peacekeep
peacetim
peach
peafant
peak
peakcount
peaker
peakfield
peakposit
peakwin
peal
pean
peanut
pear
pearanc
pearc
pearl
pearlstein
pearsonfuels
peasant
peat
peatbog
peati
peatland
peatman
pebb
pebbl
pec
pecan
pecari
peccari
pechanga
pecif
peck
pecker
peclet
pecosensi
pectat
pecten
pectic
pectin
pectinmethylesteras
pectolyt
pector
pectori
peculiar
pecuniari
pecuniarili
pecvd
ped
pedagog
pedagogi
pedal
pedant
peddler
pedest
pedestal
pedestrian
pedi
pediatr
pediatrician
pediatrics
pedicel
pedicl
pediculicid
pedigre
pediment
pediocactus
pedir
pedo
pedogen
pedogenet
pedogeomorpholog
pedon
pedotransf
pedotypi
pedstudi
peduncl
pedvaxhib
pee
peek
peel
peeler
peem
peep
peeper
peer
peernam
peerport
peerreview
peev
pefloxacin
peg
pega
pegan
pegaptanib
pegasus
peggs
peginterferon
pehpenknrlisilehveksgikka
pei
pein
peintr
pejor
pel
pelag
pelican
peligro
peligroso
pelizaeus
pelki
pel

planter
plantes
plantfinder
plantid
plantiff
plantless
plantlet
plantlist
planto
plantphysiol
plants
plantspecif
planula
plaqu
plasbumin
plasm
plasma
plasmaclean
plasmapheresi
plasmaspher
plasmid
plasminogen
plasmodium
plasmoid
plaster
plastic
plasticco
plasticwar
plastid
plastidi
plastiqu
plastocyanin
plat
plate
plateau
plateaus
plateaux
platelet
plateletpheresi
platelik
platen
platform
plati
platin
platinum
platitud
platon
platoon
platter
platycarenum
platyhelminth
platypezida
platypus
plausibl
plavix
playa
playback
playbil
player
playground
playingfield
playlist
playpen
playscapes
playthings
playtim
playwright
playwrisht
playwrit
plaza
plb
plc
plcor
plcx
ple
plea
plead
pleader
pleasant
please
pleasent
pleaser
pleasur
pleat
plebiscit
pledg
pledge
plef
plein
pleiotrop
pleiotropi
pleistocen
pleistocene
plenari
plenaria
plendil
pleniglaci
pleno
plenti
plenum
pleomorph
pleopod
pletcfi
plete
plethodontid
plethodontin
plethora
plethysmographi
pletion
pleura
pleural
pleurisi
pleurostigma
pl

preditor
predivis
predivision
prednisolon
prednison
predomin
predominant
predov
predstavite
predupreditel
preemi
preemin
preemploy
preempt
preemptibl
preemption
preemptiv
preen
preequilibrium
preevntion
preexist
prefabr
prefac
preface
prefactor
prefatori
prefectur
preferenti
preferr
prefig
prefil
prefir
prefix
preflar
preflight
preflush
prefor
prefork
preform
preformat
prefund
preg
pregabalin
pregnanc
pregnancv
pregnant
pregnyl
pregones
pregrowth
pregunt
pregunta
pregunto
preharv
preharvest
prehear
preheat
prehist
prehistor
prehistori
prehistoria
prehistoric
prehistoriqu
prehospit
prehybrid
preimag
preimpact
preimplant
preincub
preindustri
preinjector
preirradi
prejud
prejudg
prejudic
prejudice
prejudici
prejudicialimpact
prekallikrein
prelaunch
preli
prelim
prelimi
preliminar
preliminari
preliminarili
preliminary
prelingu
prelinguist
prelinguisti
preliteraci
preload
preloadimages
prelud
premagnet
premake
premalign
premarit
premarket
prematur
premeasur
premedit
premeiot
premelt
premeno

punctat
punctata
punctatus
punctuat
punctur
pungen
pungent
pungentsc
punish
punishment
punit
punt
punter
punto
puntur
punuk
pup
pupa
pupal
pupat
pupfish
pupil
pupillari
pupilli
pupillograph
pupillomet
puppet
puppetri
puppi
puppies
pur
purch
purchase
purchaseid
purchasers
purchasing
purdue
pure
purebr
purg
purgatori
purgeabl
purif
purifi
purin
purist
puriti
purothionin
purpl
purplish
purport
purpose
purposes
purpur
purpura
purpurascen
purpurata
purpurescen
purpusii
purs
purser
pursh
purshii
pursu
pursuanc
pursuant
pursuer
pursuit
purti
purvey
purveyor
purview
push
pushbroom
pushbutton
pusher
pushpin
pushrod
puss
putat
putc
puter
putlog
putresc
putrid
putti
puttin
puyallup
puzzl
puzzle
puzzlement
pvavleggi
pvc
pvce
pvd
pvdf
pvep
pvfagmheaasllvggtltaadwvmsgqalhaanlg
pvii
pvinfo
pvlist
pvoss
pvp
pvplot
pvr
pvs
pvsavhtpeyldflehiferwqriegasaevipnihpiarngsypasavgqagyhmadta
pvsprsydvallavsawln
pvt
pvwave
pwaggoner
pwc
pwd
pwdent
pwendorf
pwgpolicystatemktturmoil
pwir
pwl
pwp
pwr
pwrc
pwrnet
pw

rekindl
reklt
rekonstruktsii
reksn
rel
rela
relabel
relaci
relacionada
relacionado
relal
relaps
relate
related
relatedtarget
relati
relations
relationshipbas
relativament
relative
relativist
relativo
relax
relay
rele
relea
release
releases
releasesmay
releg
relegada
relend
relent
relentless
relevant
relevel
reli
reliability
reliabl
relianc
reliant
relic
relicens
relict
relief
reliefweb
reliev
relig
religi
religieux
religion
relin
relinquish
relish
relitig
relo
reload
reloadpag
reloadpage
reloc
relst
relt
reluct
relyr
rem
remaind
remainderman
remaining
remainng
remak
reman
remand
remanufactur
remap
remark
remarkab
remarriag
reme
remeasur
remed
remedi
remedies
remedy
remelt
rememb
remembr
remeron
remeslennikov
remettr
remind
reminisc
remiss
remit
remitt
remittitur
remix
remmedi
remnanc
remnant
remnarrt
remobil
remobilization
remodel
remodifi
remonstr
remont
remontu
remors
remot
remote
remotehostclosederror
remotely
removal
remove
removed
removehbpmdevicelist
removesocket
removevbpmdevice

ropa
rope
ropemak
roperi
rophor
rophylact
ropinirol
ropl
ropos
ror
rorism
rorist
ros
rosaceus
rosado
rosaearl
rosalia
rosamond
rosari
rosburg
rosc
rose
rosea
roseat
rosebud
rosella
rosemari
rosenbaum
rosenblatti
rosenfeldt
rosestocks
rosett
rosewood
rosgen
rosi
rosiglitazon
rosiglitazona
rosin
roslyn
rosquet
ross
rossmanith
rostellata
rostellum
roster
rostral
rostrat
roswellensi
rot
rota
rotari
rotarycentrifug
rotat
rotate
rotation
rotationalvibr
rotativa
rotator
rotavirus
rotenon
rotensi
rotflmao
roth
roti
rotif
roton
rotor
rotorcraft
rotordynam
rots
rotten
rotunda
rotundata
rotundifolia
rotylenchulus
rou
rouf
rough
roughen
rougher
roughi
roughness
rought
roulant
rouler
roulett
round
roundabout
roundcont
roundel
roundoff
roundtabl
roundtail
roundtrip
roundup
roundworm
rous
rout
route
router
routin
routinenam
roux
rov
roval
rove
rover
rovercom
rovfaqt
rovid
row
rowband
rowdi
rowel
rowlands
rows
rowsi
rowth
rox
roxby
roy
royal
royalti
royalty
royaltyfre
royb
roygbiv
rozerem
rozhovor
roz

seepi
seesaw
seeth
seewarn
seflsit
seg
segement
segid
segment
segreg
segu
seguirs
segundo
seguridad
seguro
sehinc
seibright
seich
seichert
seientiu
seiko
sein
seism
seismic
seismici
seismicili
seismiciri
seismogram
seismolog
seismologist
seismomet
seisndciti
seisnlic
seisrnic
seisrnicifi
seisrnicili
seisrnicitv
seiz
seized
seizur
sel
selca
selct
seldom
sele
seleccionada
seleccionar
selecte
selected
selecteddaystyl
selectedindex
selectentirefieldwithstartorend
selectev
selectin
selection
selectionchang
selectivesite
selectman
selectmen
selector
selectorstyl
selecttool
selegilen
selegilin
selegiline
selenbp
selenium
selenocystein
selenoenzym
selenoprotein
selfacceler
selfadminist
selfadminister
selfassembl
selfassess
selfattent
selfcal
selfcertif
selfclos
selfcontain
selfdefens
selfdirect
selfesteem
selfexculpatori
selfextr
selfhelp
selfhood
selfish
selfless
selflim
selfpollin
selfsam
selfschema
selftaught
selinda
sell
sellback
seller
sells
seln
selobj
selon
selv
selva
selvalu
selves
sel

sisyphean
sit
sitagliptin
sitagre
sitchensi
sitebas
sitechar
sitecontent
sitem
sitemap
sitenam
sites
sitespecdir
sitespecif
sitez
siti
sitio
sition
sitkana
sitkha
sito
sitopsi
sitter
sitti
situ
situaci
situacion
situada
situaient
situp
situs
situt
siu
siuslaw
sivaprakash
sive
siw
six
sixfold
sixfor
sixmonth
sixteenth
sixtermin
sixth
sixti
sixtieth
sixty
sixyear
siz
sizabl
sizahl
sizeabl
sizedown
sizelspe
sizeof
sizer
sizestr
sizeup
sizzl
sjhandco
sjhousing
sji
sjiang
sjogren
sjp
sjs
skago
skarn
skate
skateboard
skatepark
skater
skay
skb
skean
skeeter
skelet
skeletal
skeleton
skeptic
sketch
sketchbook
sketchi
sketchingact
sketchingincid
sketchof
skew
skewer
skey
skeyword
skg
skgf
ski
skid
skidder
skidgin
skier
skiff
skikh
skill
skillsuse
skim
skimmer
skin
skineth
sking
skinhead
skinni
skip
skipfil
skipfiles
skipjack
skipov
skipp
skipper
skiptonproperties
skirmish
skirrow
skirt
skit
skittish
sko
skoog
skookumchuck
skopje
skq
sku
skua
skulk
skull
skunk
skuto
skutterudite
sky
skydiv
skykom

spint
spintron
spinup
spinus
spippet
spir
spiraea
spiral
spirali
spiranth
spiratori
spire
spirit
spiritu
spirochetosi
spironolacton
spisula
spit
spite
spittlebug
spiveyla
spiy
spk
spl
splash
splat
splatter
splay
spleen
splenda
splendid
splendor
splendour
splenic
splib
splice
spline
splint
splinter
split
splitc
splitcylind
splitresult
splitrock
splitt
splittabl
splitter
splitvalue
splnt
splotch
spmd
spnav
spnp
spnt
spo
spoff
spoil
spoilag
spoiler
spokane
spoke
spoken
spokesman
spokespeopl
spokesperson
spokesphenomen
spokeswoman
spoliat
spolo
spolok
spon
spond
spondent
spondyl
spondylitis
spondyloarthropathi
spong
sponge
spongi
spongiform
spongiosum
spons
sponsibl
sponsor
sponsoredlegislation
sponsorsearch
sponsorship
spont
spontan
spoof
spook
spooki
spool
spoon
spoonbil
sporad
sporangi
sporangium
spore
sporifer
sporocarp
sporotrichioid
sport
sportella
sportfish
sports
sportsmanlik
sportsmanship
sporul
spot
spotfire
spotlight
spots
spott
spotter
spotti
spottili
spous
spousal
spout
spozn


subinerti
subintegra
subir
subiss
subitem
subitum
subj
subjac
subjects
subjoin
subjug
subjunct
sublamellateradi
sublattic
subleas
sublet
subleth
sublevel
sublim
sublinear
sublingu
subliterari
sublluli
submarin
submenu
submenuon
submerg
submers
submet
submeter
submicro
submicron
submillimet
submiss
submission
submitadvanc
submitforapprovaldialog
submitsearch
submitt
submodel
submuc
subnet
subnetwork
subniv
subnod
subnuclear
suboptim
suborbit
subord
subordin
subox
subpag
subpanel
subpar
subparagraph
subparallel
subpart
subphas
subpicosecond
subpixel
subplot
subpoena
subpoint
subpolar
subpopul
subpot
subprim
subproblem
subprocedur
subprocess
subproject
subpyriform
subqueri
subreach
subrectangular
subregion
subrequir
subrfuc
subrog
subroge
subrogor
subrostr
subround
subroutin
subs
subsaharan
subsalt
subsampl
subscal
subscrib
subscribe
subscript
subsea
subsecretari
subsect
subseg
subsequentdiscuss
subseri
subserotyp
subserv
subservi
subset
subsett
subshadow
subshrub
subsid
subsidi
subsidian

thb
thbty
thc
thcgc
thcix
thcm
thcnsendacapyofthatleuerto
thd
theaccus
thealliantgroup
thealthmaint
theantidrug
theaquaculturenews
theater
theaterleague
theatr
theatre
theatric
thecaret
thecircuit
thecircumst
theclass
thecocain
theconfer
theconfront
theconspiraci
theconstitut
thecount
thecourt
thecrier
thecrim
thedefend
thedestin
thedetect
thedickshow
thedir
thedirectorintheclassroom
thediv
thedonor
thee
theerror
thefcn
thefe
thefeder
thefield
thefirst
theform
theft
theftpron
thefullstr
thefullstring
thefunct
thegat
thegenericmethod
theground
theincid
theinform
theinnovestgroup
theirabl
theirselv
theirwilling
theismann
theiv
thejnet
thejob
thejuror
thelayer
thelength
thelink
thelma
themat
thematic
theme
themeno
thementornetwork
themenu
themenuoffsetx
themenuoffsety
themenustyle
themes
themessag
themestile
themfil
themiddleages
themochem
themonospot
themoplast
themost
themurph
thenc
thennoregul
thenrapcdbyamakcnlagcnt
theo
theobj
theodolit
theokritos
theolog
theologian
theon
theophrasti

trammell
tramp
trampl
tran
tranc
tranch
tranchant
trandolapril
tranfer
tranform
trangress
tranist
tranmiss
tranpar
tranport
tranquil
tranquiliser
tranquilizer
trans
transact
transactions
transactionswith
transactiv
transamidas
transaminas
transatlant
transbronchi
transceiv
transcend
transcendent
transcipt
transcod
transcon
transcond
transconduct
transcontinent
transcrib
transcript
transcriptas
transcription
transcriptionist
transcripts
transction
transderm
transdisciplinari
transduc
transducin
transduct
transect
transesterif
transf
transfect
transferas
transfere
transferor
transferr
transferrin
transfers
transfert
transfor
transform
transforma
transfront
transfus
transfusion
transgen
transgend
transgender
transglob
transglutaminas
transglycosylas
transgranular
transgres
transgress
transgressor
transienc
transient
transillumin
transimped
transistor
transition
transitional
transitori
transitu
transketolas
transl
translat
translatar
translate
translation
transles
transliter
transloc
trans

ucalgari
ucalgary
ucar
ucati
ucc
ucdavis
uch
uchastnikov
uchc
uchello
uchitelei
uchl
uci
uck
ucla
uclink
uconn
ucontext
ucrel
ucrl
ucsb
ucsc
ucsd
ucsf
ucsgrp
ucst
uct
uctuat
udapest
udd
udda
ude
udeb
udev
udf
udfargu
udic
udit
uditor
udky
udlp
udmget
udp
udpgluc
udpkg
udsg
udt
udyz
uec
ueki
uenc
uesday
ufe
uff
ufl
ufoltd
ufrj
ufsm
ufupgi
ugandan
ugh
ugli
uglies
ugust
uhe
uhleri
uhlhorn
uhnr
uhpc
uhug
uid
uidaho
uids
uifsa
uil
uild
uilx
uio
uis
uitaxtopic
uith
uiuc
ujem
ujf
ujguj
ujjnlqow
ujmemhg
ukas
ukmi
ukqyo
ukradn
ukradol
ukraine
ukrainian
ukrainka
ukranian
ukrsyp
uktxptjme
ula
ulati
ulcer
uld
ule
uler
ulf
uliginosum
ulimit
ulius
uljpajahkdemsabp
ullag
ullah
ullca
ulletin
ulli
ulligan
ulmus
ulna
ulnar
ulrich
uls
ulterior
ultim
ultimatebb
ultra
ultrafast
ultrafiltr
ultrahigh
ultrahivac
ultralarg
ultralight
ultralit
ultralumin
ultralyt
ultramarin
ultraprecis
ultrapur
ultrasmal
ultrason
ultrasonographi
ultrasound
ultrastructur
ultrathin
ultrav
ultraviolet
ultrawideband
ulu
umabnet
uma

usnea
usno
usovershenstvovaniami
usp
uspnf
uspolicy
uspro
uspstor
uspto
uspverified
usqu
usr
usrd
ussectech
usselm
usselman
ussen
ussit
usspacecom
ust
ustat
usted
ustreas
ustri
usuabl
usula
usuri
usurp
usv
uswest
uta
utaf
utah
utahensi
utc
utctim
utczuhpflldqlstvewmuyg
ute
utensil
utep
uterin
utero
uterus
utexas
utf
uthor
uti
utic
utico
utilic
utilis
utilisez
utilitarian
utilitario
utiliti
utility
utiliza
utilizado
utilizando
utilizar
utilization
utils
utime
utk
utm
utmck
utmost
utmpx
utograph
utopian
utoronto
utr
utricl
utriculata
uts
utter
utwcxhmvn
utyx
uua
uucp
uud
uuf
uuid
uulib
uunet
uurnal
uus
uva
uvb
uvm
uvnp
uvot
uvoxamin
uvr
uvs
uvtap
uvw
uwaterloo
uwbqx
uwm
uwmta
uwn
uwo
uwthph
uwuy
uwwbwxkjiy
uwy
uwyo
uxa
uxqq
uyeoka
uzatv
uzma
uzn
uznt
vab
vac
vaca
vacanc
vacant
vacareers
vacat
vacation
vacatur
vacc
vaccin
vaccine
vaccinia
vaccinium
vachir
vacpump
vacuatech
vacuol
vacuolar
vacuu
vacuum
vada
vader
vados
vafqrirafspdalvvalgldafegdpfg
vag
vagari
vagin
vagina
vaginatum
vagrant

vvwsldwq
vvztpsepw
vwater
vwb
vwbroadway
vwc
vwm
vwr
vxp
vxptmtg
vxqxjkc
vxw
vxwork
vyd
vydania
vydavetel
vydhdl
vymizli
vyp
vypozorov
vypylr
vyqbx
vyqptw
vyrie
vysok
vytorin
vytrvalos
vyucel
vzdel
vzig
vzil
vzj
vzlotnik
vznikol
waan
waay
waba
wabarletta
wac
waccamaw
waceil
wacki
wacko
wad
wada
wade
wader
wadi
wadsworthatheneum
waf
wafer
waff
waffl
wag
wage
wager
waggon
wagleri
wagner
wagneri
wagon
wahkiakum
wahluk
wahpeton
waif
waikato
wail
waist
waistband
waistlin
wait
waitalittlebit
waiter
waitflags
waitforaborttofinish
waitfordtptoclos
waitfordtptoconnect
waitlist
waitsburg
waitstatus
waiv
waiver
wake
wakec
waken
wakeup
wakeuptour
wal
wala
waldenstrom
waldron
walk
walkabl
walkabout
walkaway
walker
walkerlr
walki
walkway
wall
walla
wallabi
wallace
wallacelk
wallbard
wallboard
wallconfin
wallcov
walled
wallen
waller
wallet
walley
wallop
wallpap
wallsm
wallula
walmart
walnut
walrus
walter
walton
waltz
walv
wam
wampanoag
wan
wand
wanda
wander
wane
wangnmr
wangtq
wanna
wantin
wanton
wap

wrongdo
wrongdoer
wrongwaiv
wrought
wrt
wrtstr
wsaal
wsaalddacvria
wsbclaw
wsbu
wsc
wsdl
wse
wsee
wseo
wsexson
wsfi
wsfo
wsgr
wsgxfyzolsev
wsi
wsjz
wslv
wsm
wsmp
wsmv
wsnt
wsos
wsp
wsra
wsrc
wsrs
wssl
wssn
wstp
wstu
wsuk
wsvo
wswc
wsyt
wtbind
wtc
wtdsrr
wth
wthn
wtid
wtjp
wtnt
wtvj
wtype
wue
wuhr
wujeyo
wuppertal
wur
wus
wustl
wuthier
wuvn
wvby
wvdot
wviy
wvnl
wvsn
wvtm
wvuk
wwa
wwcc
wwd
wwhhvu
wwinter
wwt
wwu
wwwcache
wwwdev
wwworca
wwwphp
wwwroot
wwwvalids
wxa
wxb
wxeduc
wxf
wxh
wxlog
wxmjwrfcs
wxvo
wyeth
wyko
wyolink
wyom
wytka
wytooxfa
wzjf
wznqrcl
xabc
xal
xanax
xandra
xanthan
xanthin
xanthomela
xanthomona
xanthophor
xarchives
xat
xation
xauth
xbai
xbarc
xbcd
xbjwqw
xbp
xbtpert
xbyw
xcc
xcdo
xcfd
xcharsiz
xcm
xcr
xcursio
xcvzt
xdead
xdim
xdmcp
xdsl
xducer
xed
xen
xenobiot
xenograft
xenon
xenophob
xenophobia
xenotransplant
xeric
xeroform
xerograph
xerophthalmia
xerophyt
xerorth
xerox
xetel
xfa
xfba
xfcp
xfeed
xffff
xfmr
xfs
xft
xftseba
xfuncs
xfxexjq
xga
xgax
xgc
xglauca
xglk
xgsta

In [9]:
X = tfidf_doc_word.values

# KMeans


In [20]:
KMeans(**km_params).fit(X_tfidf).labels_

array([6, 2, 6, ..., 2, 2, 2], dtype=int32)

In [10]:
km_params: {
    'random_state': 30,
    'n_clusters': 10,
}

In [11]:
km = KMeans(**km_params).fit(X)

In [12]:
len(km.labels_)

3230

In [13]:
km_doc = pd.DataFrame([km.labels_, df['filename'], df['raw_text']]).T.rename(columns={0: 'km_label', 1: 'filename', 2: 'text'})

In [14]:
km_doc.groupby(by='km_label').count()

Unnamed: 0_level_0,filename,text
km_label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,39,39
1,90,90
2,1771,1771
3,389,389
4,108,108
5,76,76
6,489,489
7,50,50
8,58,58
9,160,160


# Most common words in the clusters

In [15]:
from nltk.corpus import stopwords
sw = stopwords.words('english')
sw.extend(['shall', 'would'])

def name_cluster_labels_common(sw, words=8):
    cluster_names = {}
    
    for i in km_doc.km_label.unique():
        label_text = ' '.join(km_doc[km_doc['km_label'] == i]['text'].values)
        corpus = re.findall('[a-zA-Z]{4,}', label_text)
        corpus = [w for w in corpus if (w.lower() not in sw)]
        label_dict = Counter(corpus)
        cluster_names[i] = '-'.join([w[0] for w in label_dict.most_common(words)])
    
    return cluster_names

In [16]:
name_cluster_labels_common(sw)

{6: 'information-State-also-program-public-Commission-time-must',
 2: 'data-image-time-used-noun-also-system-tiff',
 3: 'water-area-species-data-soil-areas-used-site',
 4: 'court-Court-case-States-United-district-evidence-also',
 0: 'pubmed-entrezsystem-pentrez-links-resultspanel-name-type-gene',
 5: 'generic-drug-Generic-drugs-brand-name-National-product',
 7: 'font-color-text-family-size-none-verdana-level',
 9: 'children-patient-patients-care-health-information-medical-Health',
 8: 'para-como-sobre-puede-usted-diabetes-informaci-pueden',
 1: 'beam-corrector-current-magnet-none-correction-traveler-scanning'}

In [243]:
sw = stopwords.words('english')

In [282]:
km_doc[km_doc.km_label == 1]['filename'].values

array(['004239.text', '004760.text', '004783.text', '004689.xls',
       '004690.xls', '004691.xls', '004693.xls', '004696.xls',
       '003914.html', '004180.html', '004184.html', '004376.html',
       '004712.html', '004713.html', '004714.html', '004716.html',
       '004718.html', '004722.html', '004726.html', '004727.html',
       '004730.html', '004734.html', '004738.html', '004742.html',
       '004751.html', '004754.html', '004043.doc', '004045.doc',
       '004053.doc', '004055.doc', '004057.doc', '004058.doc',
       '004062.doc', '004069.doc', '004083.doc', '004544.doc',
       '004545.doc', '004547.doc', '004548.doc', '004549.doc',
       '004551.doc', '004558.doc', '004559.doc', '004561.doc',
       '004562.doc', '004564.doc', '004572.doc', '004578.doc',
       '004963.doc', '002724.pdf', '002997.pdf', '004117.pdf',
       '004124.pdf', '004125.pdf', '004126.pdf', '004130.pdf',
       '004133.pdf', '004136.pdf', '004137.pdf', '004141.pdf',
       '004148.pdf', '004149.pdf',

In [None]:
MiniBatchKMeans()