In [20]:
# we will work on a partial dataset with only 4 categories out of the 20 
#available in the dataset:
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

In [84]:
# load the list of files matching those categories
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
twenty_train.data[:2]

['From: sd345@city.ac.uk (Michael Collier)\nSubject: Converting images to HP LaserJet III?\nNntp-Posting-Host: hampton\nOrganization: The City University\nLines: 14\n\nDoes anyone know of a good way (standard PC application/PD utility) to\nconvert tif/img/tga files into LaserJet III format.  We would also like to\ndo the same, converting to HPGL (HP plotter) files.\n\nPlease email any response.\n\nIs this the correct group?\n\nThanks in advance.  Michael.\n-- \nMichael Collier (Programmer)                 The Computer Unit,\nEmail: M.P.Collier@uk.ac.city                The City University,\nTel: 071 477-8000 x3769                      London,\nFax: 071 477-8565                            EC1V 0HB.\n',
 "From: ani@ms.uky.edu (Aniruddha B. Deglurkar)\nSubject: help: Splitting a trimming region along a mesh \nOrganization: University Of Kentucky, Dept. of Math Sciences\nLines: 28\n\n\n\n\tHi,\n\n\tI have a problem, I hope some of the 'gurus' can help me solve.\n\n\tBackground of the probl

In [17]:
# The returned dataset is a scikit-learn “bunch”: 
# a simple holder object with fields that can be both accessed 
# as python dict keys or object attributes for convenience, 
# for instance the target_names holds the list of the 
# requested category names:
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [18]:
# The files themselves are loaded in memory in the data attribute.
len(twenty_train.data)

2257

In [42]:
twenty_train.data[0].split("\n")[:3]

['From: sd345@city.ac.uk (Michael Collier)',
 'Subject: Converting images to HP LaserJet III?',
 'Nntp-Posting-Host: hampton']

In [85]:
# print the first lines of the first loaded file
print("\n".join(twenty_train.data[0].split("\n")[5:10]))
print("*")
print(twenty_train.target_names[twenty_train.target[0]])


Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

*
comp.graphics


In [34]:
# For speed and space efficiency reasons scikit-learn 
# loads the target attribute as an array of integers that 
# corresponds to the index of the category name in the 
# target_names list. The category integer id 
# of each sample is stored in the target attribute:
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2])

In [35]:
for t in twenty_train.target[:10]:
    print(twenty_train.target_names[t])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


## turn the text content into numerical feature vectors.

In [50]:
# Text preprocessing, tokenizing and filtering of stopwords 
# use count vectorize for bag of words to represent sentences as 
# feature vectors with word count for each word vector
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(2257, 35788)

In [51]:
# CountVectorizer supports counts of N-grams of words or consecutive characters. 
# Once fitted, the vectorizer has built a dictionary of feature indices:
# The index value of a word in the vocabulary is linked to its frequency in the whole training corpus.
count_vect.vocabulary_.get(u'algorithm')

4690

In [89]:
# print counts of each word
for wc in zip(count_vect.get_feature_names(),
    np.asarray(X_train_counts.sum(axis=0)).ravel()):
    print(wc)

('00', 134)
('000', 92)
('0000', 1)
('0000001200', 2)
('000005102000', 1)
('0001', 3)
('000100255pixel', 1)
('00014', 1)
('000406', 1)
('0007', 1)
('000usd', 2)
('0010', 1)
('001004', 1)
('0010580b', 3)
('001125', 1)
('001200201pixel', 1)
('0014', 1)
('001642', 2)
('00196', 1)
('002', 3)
('0028', 1)
('003258u19250', 2)
('0033', 1)
('0038', 1)
('0039', 1)
('004021809', 2)
('004158', 1)
('004627', 1)
('0049', 1)
('00500', 1)
('005148', 1)
('00630', 1)
('008561', 1)
('0094', 1)
('00am', 5)
('00index', 1)
('00pm', 13)
('01', 95)
('0100', 4)
('010116', 1)
('010702', 1)
('011255', 1)
('011308pxf3', 1)
('011605', 1)
('011720', 2)
('012019', 1)
('012536', 1)
('012946', 1)
('013', 2)
('013034', 1)
('0131', 1)
('013423tan102', 1)
('013657', 1)
('0138', 2)
('013846', 1)
('0150', 1)
('015518', 1)
('01580', 4)
('015931', 2)
('01720', 1)
('01752', 1)
('0179', 1)
('01821', 1)
('01830', 1)
('0184', 2)
('01852', 1)
('01854', 1)
('01890', 1)
('0195', 1)
('0199', 1)
('01a', 2)
('01wb', 1)
('02', 59)
('02

('210631', 1)
('2108', 2)
('210916', 1)
('211', 2)
('2110', 1)
('211108', 1)
('2112', 3)
('211216', 1)
('2113', 1)
('211458', 1)
('21154', 1)
('2118', 3)
('21198', 1)
('212', 22)
('21205', 1)
('212139', 1)
('21222', 1)
('212441', 3)
('21258', 1)
('212706', 3)
('212943', 3)
('213', 7)
('213055', 2)
('213356', 1)
('213522', 1)
('213527', 1)
('213633', 1)
('214', 18)
('2140', 1)
('21400', 1)
('21409', 1)
('21438', 2)
('21443', 1)
('214557', 1)
('2147', 2)
('214741', 1)
('215', 10)
('2150', 2)
('215342', 1)
('21547', 1)
('215826', 1)
('215833', 1)
('216', 13)
('21652', 1)
('217', 7)
('21772', 2)
('2178', 1)
('218', 3)
('2181', 1)
('21864', 2)
('219', 3)
('2190', 1)
('2192', 1)
('2194', 1)
('2197', 1)
('2199', 2)
('21995', 2)
('21apr199308571323', 3)
('21apr199316170714', 1)
('21h', 1)
('21st', 2)
('22', 142)
('220', 1)
('220100', 1)
('220493145727', 1)
('220817', 1)
('221', 5)
('22102', 4)
('221101', 1)
('221111', 1)
('2213', 4)
('221357', 2)
('2215', 1)
('221508', 1)
('22169', 1)
('22176'

('5200', 1)
('52000', 1)
('5215', 1)
('522', 5)
('52223', 1)
('523', 1)
('523296', 3)
('524', 1)
('5245', 1)
('525', 2)
('5252', 1)
('5254', 1)
('525714', 3)
('5258', 3)
('527', 2)
('529', 1)
('5298', 1)
('5299', 1)
('52nd', 1)
('53', 36)
('530', 3)
('53051', 1)
('5316', 1)
('5322844', 1)
('5322970', 1)
('532p', 1)
('533', 2)
('5330', 1)
('534', 3)
('5353', 1)
('536', 1)
('5363', 7)
('5366', 1)
('537', 1)
('53701', 1)
('53706', 1)
('5371', 1)
('53766', 3)
('53792', 1)
('539', 1)
('53iss6', 8)
('54', 24)
('540', 1)
('5402', 3)
('541', 1)
('5418', 2)
('542', 29)
('5420', 4)
('5424', 1)
('543', 4)
('5430', 1)
('5432', 1)
('5435', 1)
('5438', 2)
('5447', 1)
('5448', 1)
('545', 3)
('5458', 1)
('5473', 4)
('548', 1)
('5485', 1)
('5494', 1)
('5497', 1)
('5499', 1)
('55', 35)
('5505', 1)
('55100', 1)
('5515', 1)
('5527', 1)
('553', 3)
('5541', 1)
('5543', 4)
('55444', 1)
('5555', 1)
('5556', 2)
('5560074', 1)
('557', 1)
('55773', 1)
('5580', 1)
('55905', 1)
('55970', 1)
('55e', 1)
('55th', 1)


('97231', 5)
('9729', 1)
('974', 2)
('975', 4)
('9760', 24)
('9770', 1)
('9777', 1)
('978', 4)
('9781bmu', 1)
('98', 14)
('980', 5)
('981', 4)
('98103', 1)
('98104', 1)
('98188', 1)
('985', 1)
('9898', 1)
('99', 10)
('9900', 1)
('9908', 1)
('9909', 1)
('99164', 2)
('994', 1)
('995', 2)
('99517', 1)
('9964', 2)
('9973', 1)
('998', 1)
('999', 1)
('9999', 1)
('9a', 2)
('9b', 1)
('9bv9oke', 1)
('9bx', 1)
('9gram', 1)
('9ig', 2)
('9l', 2)
('9o3', 1)
('9pl', 3)
('9pm', 2)
('9s2', 2)
('9th', 3)
('9ti', 1)
('9v4', 1)
('9x', 1)
('_24x_', 3)
('__', 108)
('___', 161)
('____', 40)
('_____', 21)
('______', 13)
('_______', 7)
('________', 7)
('_________', 1)
('__________', 8)
('___________', 3)
('_______________', 2)
('_________________', 1)
('__________________', 2)
('_______________________', 2)
('____________________________', 2)
('______________________________', 1)
('_______________________________', 15)
('________________________________', 2)
('__________________________________', 1)
('_______

('afar', 3)
('affair', 5)
('affairs', 13)
('affect', 37)
('affected', 13)
('affecting', 10)
('affectionate', 1)
('affects', 8)
('affff', 2)
('afffff', 1)
('affiars', 1)
('affiliated', 7)
('affiliation', 1)
('affiliation_____________________________________', 1)
('affiliations', 3)
('affine', 2)
('affinity', 2)
('affirm', 8)
('affirmation', 3)
('affirmative', 2)
('affirmed', 5)
('affirms', 1)
('afflicted', 6)
('afflicting', 1)
('affliction', 2)
('afflicts', 1)
('afford', 4)
('affordable', 1)
('affraid', 1)
('afghans', 1)
('afil', 1)
('afix', 1)
('aflatoxin', 3)
('afoul', 2)
('afoxx', 1)
('afp', 1)
('afraid', 35)
('africa', 19)
('african', 17)
('afrin', 2)
('afro', 1)
('afront', 1)
('afronted', 1)
('after', 489)
('afterall', 2)
('aftereffects', 4)
('afterlife', 12)
('afterlifes', 1)
('afternoon', 7)
('afternoons', 1)
('afterward', 4)
('afterwards', 16)
('ag', 3)
('ag1', 1)
('ag167', 1)
('ag2', 1)
('aga', 1)
('agabus', 1)
('again', 315)
('against', 213)
('agaist', 1)
('agar', 3)
('agate',

('arguer', 4)
('argues', 5)
('arguing', 31)
('argument', 287)
('argumentation', 3)
('arguments', 104)
('argumentum', 24)
('argus', 4)
('ari', 8)
('ariadne', 6)
('arian', 5)
('arianism', 2)
('arians', 2)
('ariel', 1)
('aries', 3)
('aright', 2)
('arimathea', 1)
('aris', 2)
('arise', 14)
('arisen', 2)
('arises', 8)
('arising', 4)
('aristides', 1)
('aristotelian', 1)
('aristotle', 8)
('aritcle', 1)
('arithmetic', 22)
('arius', 4)
('arizona', 64)
('arizvm1', 1)
('arizvms', 3)
('arj', 3)
('arjed', 1)
('arkansas', 4)
('arl10', 1)
('arlington', 3)
('arlunya', 1)
('arlut', 9)
('arm', 58)
('arm6x', 1)
('arm6xx', 1)
('armaggedon', 1)
('armed', 6)
('armenian', 2)
('armenians', 3)
('armentrout', 1)
('armies', 1)
('armond', 2)
('armor', 1)
('armour', 1)
('arms', 14)
('armstrng', 8)
('armstrong', 13)
('army', 35)
('arndt', 2)
('arno', 5)
('arnold', 12)
('arogant', 1)
('aromatic', 3)
('aron', 6)
('aron01', 1)
('arose', 4)
('around', 246)
('arouse', 2)
('aroused', 1)
('arowatt', 1)
('arp', 1)
('arpa', 

('baron', 4)
('barr', 5)
('barre', 5)
('barred', 2)
('barreno', 1)
('barrett', 2)
('barrier', 23)
('barring', 1)
('barris', 4)
('barry', 14)
('bars', 12)
('bart', 5)
('bartleson', 1)
('bartletts', 1)
('bartman', 1)
('barto', 1)
('bartokomas', 3)
('barton', 2)
('bartoshuk', 2)
('baruch', 2)
('base', 48)
('baseball', 6)
('basecalling', 1)
('based', 286)
('basel', 2)
('baseless', 3)
('baseline', 1)
('basement', 2)
('bases', 7)
('bash', 1)
('bashevis', 1)
('bashings', 1)
('basic', 100)
('basica', 1)
('basically', 76)
('basicaly', 4)
('basics', 8)
('basil', 1)
('basiliensis', 1)
('basin', 3)
('basin04', 1)
('basing', 2)
('basinger', 1)
('basis', 116)
('bask', 1)
('basket', 4)
('basketball', 1)
('basking', 1)
('bass', 8)
('basser', 1)
('bassili', 8)
('bastard', 4)
('bastardisation', 1)
('bastards', 2)
('bastyr', 1)
('bat', 11)
('batavia', 3)
('batch', 9)
('bates', 12)
('bath', 21)
('bathe', 1)
('bathroom', 6)
('baths', 1)
('bathtub', 1)
('bathwater', 1)
('batman', 46)
('baton', 1)
('battery'

('bundoora', 1)
('bungled', 1)
('bunk', 5)
('bunker', 1)
('bunkyo', 1)
('bunnies', 4)
('bunny', 5)
('bunyip', 2)
('buoyancy', 3)
('buphy', 50)
('burden', 24)
('burdens', 1)
('bureau', 25)
('bureaucratic', 2)
('bureaucrats', 1)
('bureaud', 1)
('burholme', 1)
('burial', 1)
('buried', 15)
('burke', 1)
('burkhard', 4)
('burks', 2)
('burlesque', 1)
('burlington', 2)
('burn', 17)
('burnaby', 7)
('burned', 13)
('burner', 1)
('burnet', 3)
('burning', 20)
('burns', 5)
('burnt', 3)
('burroughs', 2)
('burrow', 1)
('burrowed', 1)
('bursitis', 4)
('burton', 5)
('bury', 4)
('burying', 1)
('burzynki', 1)
('burzynski', 19)
('bus', 13)
('buses', 4)
('bush', 18)
('bushnell', 1)
('busily', 2)
('business', 67)
('businesses', 1)
('businessman', 1)
('businesswoman', 1)
('bust', 1)
('buster', 2)
('bustingly', 1)
('busy', 13)
('but', 3453)
('buted', 1)
('butera', 6)
('butler', 3)
('butt', 3)
('butte', 3)
('butter', 1)
('butterflies', 1)
('buttocks', 1)
('button', 2)
('buttons', 9)
('butyl', 1)
('butylated', 1

('categorized', 5)
('categorizing', 1)
('category', 17)
('catfish', 2)
('catharines', 5)
('cathechism', 1)
('cathedra', 4)
('cathedral', 1)
('catherine', 4)
('catheterization', 1)
('catholic', 174)
('catholicism', 3)
('catholics', 48)
('catholique', 2)
('cathy', 1)
('cathye', 1)
('catpe', 1)
('cats', 5)
('catt', 2)
('cattle', 3)
('caucasus', 1)
('caught', 20)
('caulmny', 1)
('causa', 6)
('causal', 5)
('causality', 3)
('causally', 1)
('cause', 262)
('caused', 103)
('causeless', 4)
('causes', 78)
('causeway', 3)
('causing', 40)
('causitive', 1)
('caution', 8)
('cautionary', 2)
('cautious', 4)
('cavalier', 3)
('cavano', 4)
('cave', 1)
('caveat', 3)
('cavities', 1)
('cb', 29)
('cb2', 1)
('cbc', 2)
('cbc5b', 1)
('cbf', 1)
('cblph', 1)
('cbm', 2)
('cbmehq', 1)
('cbmswi', 1)
('cbmvax', 2)
('cbnewsb', 1)
('cbnewsd', 5)
('cbnewse', 1)
('cbnewsh', 2)
('cbnewsi', 1)
('cbnewsj', 4)
('cbnewsk', 12)
('cbnewsl', 8)
('cboesel', 17)
('cbrasted', 1)
('cbs', 2)
('cbw790s', 4)
('cc', 278)
('cc_sysh', 2)
(

('coated', 8)
('coating', 4)
('coats', 2)
('cob', 1)
('cobb', 57)
('cobol', 3)
('cobra', 4)
('coca', 7)
('cocaine', 5)
('coccidiodes', 1)
('coccidiomycosis', 2)
('cochairman', 1)
('cochran', 10)
('cocked', 1)
('cockpit', 1)
('cocktail', 1)
('cocky', 1)
('cocoa', 1)
('coconut', 2)
('cod', 1)
('code', 205)
('code3', 2)
('code______________country_________________', 1)
('codec', 7)
('codecs', 1)
('coded', 7)
('codeine', 3)
('codes', 12)
('codex', 1)
('codexes', 1)
('codification', 1)
('codified', 6)
('codine', 1)
('coding', 16)
('codings', 1)
('coe', 2)
('coefficient', 2)
('coefficients', 1)
('coerced', 2)
('coercion', 1)
('coeur', 1)
('coffee', 8)
('coffey', 4)
('cofmments', 1)
('cogently', 1)
('cogito', 2)
('cognate', 2)
('cognitive', 4)
('cognitively', 1)
('cognitivists', 6)
('cognizant', 1)
('cogs', 1)
('cohen', 7)
('coherence', 1)
('coherency', 1)
('coherent', 5)
('cohesion', 1)
('cohesive', 2)
('coiera', 1)
('coil', 2)
('coin', 4)
('coincide', 3)
('coincidence', 8)
('coincident', 1)

('coronas', 2)
('corp', 52)
('corporate', 7)
('corporation', 44)
('corporeal', 2)
('corps', 5)
('corpus', 6)
('corrallary', 2)
('correct', 172)
('corrected', 6)
('correcting', 5)
('correction', 13)
('corrections', 13)
('correctly', 33)
('correctness', 5)
('corrects', 3)
('correlate', 1)
('correlated', 7)
('correlates', 1)
('correlation', 10)
('correlations', 1)
('correspodence', 1)
('correspond', 6)
('correspondence', 3)
('correspondents', 1)
('corresponding', 7)
('correspondingly', 2)
('corresponds', 1)
('corroboration', 1)
('corrupt', 4)
('corrupted', 11)
('corruptimg', 1)
('corrupting', 1)
('corruption', 7)
('corruptions', 1)
('corte', 1)
('cortex', 5)
('cortical', 5)
('cortical_cell', 2)
('cortico', 1)
('corticosteroids', 1)
('cortisone', 2)
('cory', 1)
('cos', 10)
('cosc', 2)
('cosine', 1)
('cosmetic', 9)
('cosmic', 17)
('cosmicly', 1)
('cosmogony', 1)
('cosmological', 2)
('cosmologists', 1)
('cosmology', 1)
('cosmopolitan', 1)
('cosmos', 3)
('cosponsored', 1)
('cossack', 3)
('cos

('dia', 1)
('diab', 2)
('diabetes', 33)
('diabetic', 5)
('diabetics', 4)
('diablo', 29)
('diag', 1)
('diagnose', 20)
('diagnosed', 42)
('diagnoses', 6)
('diagnosing', 2)
('diagnosis', 35)
('diagnostic', 10)
('diagnostics', 6)
('diagonal', 1)
('diagonistic', 2)
('diagram', 6)
('diagrams', 5)
('diahrea', 1)
('dial', 3)
('dialect', 1)
('dialectic', 1)
('dialectical', 1)
('dialed', 1)
('dialix', 5)
('dialo', 1)
('dialog', 6)
('dialogue', 8)
('dialogues', 1)
('dialysis', 1)
('diameter', 5)
('diamond', 15)
('diane', 6)
('dianetics', 1)
('dianne', 8)
('diaries', 1)
('diarrhea', 13)
('diarrheal', 2)
('diary', 2)
('diatribe', 1)
('dibromide', 2)
('dicarboxylic', 2)
('dice', 5)
('dicer', 1)
('dichotomy', 3)
('diciple', 2)
('diciples', 3)
('dick', 14)
('dickens', 2)
('dickey', 1)
('dickie', 1)
('dickinson', 2)
('dickson', 1)
('diclaimer', 2)
('dicotomy', 1)
('dicta', 7)
('dicta93', 1)
('dictate', 4)
('dictated', 1)
('dictates', 1)
('dictating', 6)
('dictation', 1)
('dictator', 2)
('dictators', 1)

('dittman', 2)
('ditto', 3)
('ditucci', 3)
('diuretic', 3)
('diuretics', 3)
('div', 12)
('diverge', 5)
('diverging', 1)
('diverse', 3)
('diversity', 9)
('divert', 3)
('diverted', 2)
('diverting', 1)
('divide', 5)
('divided', 6)
('dividend', 1)
('divides', 1)
('dividians', 1)
('dividing', 1)
('divil', 1)
('divination', 1)
('divine', 85)
('divinely', 5)
('divinitation', 2)
('divinity', 16)
('division', 74)
('divisions', 5)
('divisive', 2)
('divnity', 3)
('divorce', 26)
('divorced', 4)
('divulge', 1)
('dixie', 4)
('dixon', 6)
('dizziness', 15)
('dizzy', 2)
('dizzying', 3)
('dj', 2)
('djcoyle', 1)
('djgp', 1)
('djgpp', 14)
('djlewis', 4)
('djmst19', 1)
('djohnson', 2)
('djpeg', 3)
('dk', 13)
('dkauni2', 2)
('dkb', 8)
('dkbtrace', 6)
('dkennett', 2)
('dkoresh', 1)
('dkusswur', 2)
('dl', 6)
('dl2021', 3)
('dl8', 2)
('dl8dtl', 1)
('dla', 2)
('dlb', 6)
('dlecoint', 16)
('dleonar', 9)
('dlg', 1)
('dlg1', 1)
('dls', 1)
('dlsg', 1)
('dm', 5)
('dma', 4)
('dmapub', 2)
('dmarble', 1)
('dmhc', 3)
('d

('employed', 5)
('employee', 2)
('employees', 4)
('employer', 17)
('employing', 1)
('employment', 2)
('employs', 1)
('empowers', 3)
('empress', 1)
('empros', 3)
('empties', 1)
('emptiness', 1)
('emptive', 1)
('empty', 37)
('empty__', 1)
('emptying', 2)
('emr', 1)
('ems', 1)
('emstation', 1)
('emu', 2)
('emu387', 3)
('emulate', 2)
('emulates', 1)
('emulation', 3)
('emulator', 1)
('emunix', 1)
('emx', 5)
('en', 8)
('enable', 6)
('enabled', 2)
('enables', 5)
('enabling', 2)
('enacted', 4)
('enamored', 1)
('enamoured', 2)
('enantiomers', 2)
('encapsulated', 1)
('encapsulation', 1)
('encephalocele', 1)
('encir', 1)
('encircles', 1)
('enclose', 5)
('enclosed', 3)
('encloses', 1)
('encod', 1)
('encode', 3)
('encoded', 1)
('encoders', 1)
('encoding', 3)
('encompass', 6)
('encompassed', 4)
('encompasses', 4)
('encore', 9)
('encounter', 12)
('encountered', 8)
('encounters', 3)
('encourage', 26)
('encouraged', 16)
('encouragement', 2)
('encourages', 6)
('encouraging', 8)
('encroaching', 1)
('ency

('expound', 1)
('expounded', 1)
('expounding', 1)
('express', 34)
('expressed', 30)
('expresses', 11)
('expressing', 11)
('expression', 19)
('expressions', 4)
('expressive', 2)
('expressivity', 1)
('ext', 8)
('extant', 6)
('extend', 7)
('extendable', 1)
('extended', 27)
('extender', 3)
('extending', 3)
('extends', 2)
('extensible', 3)
('extension', 29)
('extensions', 10)
('extensive', 21)
('extensively', 2)
('extent', 31)
('exterior', 2)
('exterminate', 6)
('exterminated', 1)
('extermination', 4)
('external', 19)
('externally', 3)
('externals', 2)
('externel', 2)
('extinction', 2)
('extirmination', 1)
('extn', 4)
('extoll', 1)
('extolling', 4)
('extortion', 1)
('extra', 61)
('extract', 13)
('extracted', 15)
('extraction', 6)
('extradinary', 2)
('extramarital', 2)
('extraneous', 1)
('extraordinary', 11)
('extrapolate', 10)
('extrapolating', 1)
('extras', 1)
('extrascriptural', 1)
('extraversion', 1)
('extreemists', 1)
('extreme', 38)
('extremely', 48)
('extremes', 3)
('extremism', 2)
('

('flattening', 5)
('flavor', 28)
('flavored', 2)
('flavoring', 7)
('flavorings', 1)
('flavors', 2)
('flavour', 1)
('flavours', 1)
('flaw', 7)
('flawed', 8)
('flawless', 11)
('flaws', 5)
('flc', 1)
('fled', 1)
('fledging', 1)
('flee', 1)
('fleeing', 2)
('flees', 1)
('fleming', 10)
('flesh', 15)
('fleshing', 1)
('flew', 4)
('flex', 3)
('flexibel', 1)
('flexibility', 5)
('flexible', 10)
('flexion', 1)
('flexpro', 5)
('fli', 11)
('flibble', 3)
('flick', 3)
('flies', 2)
('flight', 15)
('flights', 10)
('fling', 1)
('flint', 5)
('flintstone', 1)
('flip', 4)
('flipped', 1)
('flipping', 3)
('flips', 1)
('flirting', 1)
('flis', 1)
('floating', 8)
('floats', 1)
('flock', 3)
('flocking', 3)
('flogged', 3)
('floggings', 6)
('flood', 5)
('flooded', 2)
('floodfill', 1)
('floor', 19)
('floored', 1)
('flop', 1)
('floppies', 4)
('floppy', 15)
('flora', 5)
('florian', 2)
('florida', 31)
('florin', 1)
('flourish', 1)
('flourished', 3)
('flourishes', 6)
('flourishing', 1)
('flout', 2)
('flow', 9)
('flowed'

('glanced', 2)
('glances', 1)
('gland', 6)
('glands', 4)
('glandular', 1)
('glasgow', 22)
('glasnost', 1)
('glass', 20)
('glass_', 1)
('glasses', 9)
('glassner', 3)
('glaucoma', 1)
('gle', 2)
('glean', 2)
('gleaned', 2)
('glelist', 2)
('glen', 7)
('glen_fullmer', 3)
('glendenning', 1)
('glenn', 7)
('glenne', 1)
('glens', 2)
('glewis', 2)
('glial', 2)
('glib', 2)
('gliding', 1)
('glimpses', 1)
('glioblastoma', 2)
('glitch', 1)
('gloat', 1)
('global', 14)
('globe', 8)
('globular', 1)
('globule', 1)
('globules', 1)
('gloom', 2)
('glop', 3)
('glorification', 3)
('glorified', 3)
('glorifies', 2)
('glorify', 3)
('glorifying', 2)
('glorious', 5)
('glory', 27)
('glory_', 1)
('gloss', 1)
('glossary', 2)
('glouberman', 1)
('glove', 1)
('glover', 1)
('gloves', 1)
('glowing', 3)
('glp', 4)
('glu', 13)
('glucocerebroside', 1)
('glucose', 19)
('glue', 2)
('glued', 1)
('glutamate', 51)
('glutamic', 2)
('gluttony', 1)
('glycemic', 3)
('glycine', 4)
('glycogen', 1)
('gmark', 1)
('gmbh', 1)
('gmeds', 3)

('helper', 3)
('helpers', 1)
('helpful', 49)
('helphelp', 1)
('helphlphelphelp', 1)
('helping', 28)
('helpless', 4)
('helplessness', 1)
('helps', 34)
('helsinki', 11)
('hematomas', 1)
('hemel', 7)
('hemi', 4)
('hemicrania', 1)
('hemingway', 1)
('hemispheres', 1)
('hemoglobin', 1)
('hemolytic', 3)
('hemophilia', 1)
('hemophilus', 2)
('hemorhages', 1)
('hemorrhage', 2)
('hemorrhages', 1)
('hemorrhagic', 1)
('hemorrhaging', 1)
('hemorrhoid', 1)
('hemorrhoids', 1)
('hemorroids', 2)
('hempel', 1)
('hemul', 3)
('hen', 1)
('hence', 34)
('henders', 1)
('henderson', 1)
('hendrix', 9)
('henein', 1)
('henize', 6)
('henling', 11)
('henrietta', 2)
('henrik', 6)
('henry', 4)
('hens', 3)
('hensa', 2)
('hense', 1)
('hep', 5)
('hepatic', 3)
('hepatitis', 7)
('hepburn', 1)
('hepis', 2)
('hepnet', 4)
('her', 390)
('herald', 1)
('herb', 4)
('herbal', 1)
('herbalism', 1)
('herbalists', 2)
('herbert', 5)
('herbison', 2)
('herbs', 2)
('hercules', 1)
('herd', 1)
('herds', 2)
('here', 619)
('here______________

('iniquities', 5)
('iniquity', 3)
('init', 1)
('initial', 19)
('initial_display_type', 1)
('initialisers', 1)
('initialization', 2)
('initially', 8)
('initials', 1)
('initiated', 4)
('initiates', 2)
('initiating', 1)
('initiation', 3)
('initiative', 1)
('initworld', 1)
('inject', 15)
('injectable', 1)
('injected', 6)
('injecting', 4)
('injection', 19)
('injections', 4)
('injector', 1)
('injunction', 1)
('injure', 2)
('injured', 3)
('injuries', 25)
('injury', 41)
('injustice', 1)
('injustices', 1)
('ink', 5)
('inkjet', 1)
('inks', 1)
('inland', 2)
('inlcude', 1)
('inlcudes', 1)
('inline', 3)
('inls1', 1)
('inmet', 7)
('inn', 2)
('innappropriate', 2)
('inner', 14)
('innermost', 1)
('innervation', 1)
('innocence', 5)
('innocence_', 2)
('innocent', 74)
('innocents', 16)
('innoculate', 2)
('innoculated', 1)
('innoculation', 2)
('innovations', 4)
('innovative', 4)
('innovator', 1)
('innovision', 1)
('innsbruck', 1)
('inoculated', 2)
('inordinate', 1)
('inpatient', 1)
('inpenatrable', 1)
('in

('jono', 9)
('jonsson', 1)
('joo', 2)
('joon', 2)
('jordahl', 2)
('jordan', 3)
('jorgensen', 4)
('jorgensonke', 1)
('jory', 1)
('jose', 15)
('josef', 1)
('joseph', 91)
('josephine', 2)
('josephson', 1)
('josephus', 4)
('josh', 12)
('joshua', 19)
('joshuaf', 4)
('joslin', 2)
('jot', 1)
('joth', 2)
('jotted', 2)
('joubert', 2)
('jouney', 1)
('journal', 29)
('journalists', 2)
('journalix', 1)
('journals', 10)
('journey', 8)
('journeyed', 1)
('journeyman', 2)
('jove', 1)
('joy', 17)
('joyce', 2)
('joyful', 1)
('joyfully', 1)
('joyous', 1)
('joystick', 2)
('jp', 12)
('jpeg', 280)
('jpeg4', 2)
('jpeg4386', 1)
('jpeg4bin', 1)
('jpeged', 1)
('jpeging', 2)
('jpegs', 7)
('jpegsrc', 2)
('jpegsrc4', 1)
('jpegv4', 1)
('jpegview', 6)
('jperkski', 3)
('jpg', 20)
('jpg25', 1)
('jpg2gif', 2)
('jpg50', 1)
('jpg5o', 1)
('jpg75', 1)
('jpg95', 1)
('jpgs', 1)
('jpl', 28)
('jplpost', 3)
('jprzybyl', 5)
('jps', 1)
('jpsrc4', 1)
('jpsum00', 1)
('jr', 49)
('jr0930', 6)
('jrl', 4)
('jroberts', 1)
('jsb30', 1)
('

('learn', 94)
('learned', 43)
('learning', 52)
('learns', 3)
('leary', 3)
('leashed', 1)
('least', 319)
('leather', 6)
('leave', 75)
('leaven', 1)
('leavening', 1)
('leaves', 29)
('leaving', 29)
('lebanon', 2)
('lebeau', 1)
('leberle', 2)
('lebrun', 3)
('lecointe', 2)
('lector', 2)
('lecture', 10)
('lecturer', 2)
('lectures', 1)
('lecturing', 1)
('led', 52)
('lederberg', 1)
('leds', 1)
('lee', 51)
('leeds', 1)
('leehian', 1)
('lefebvre', 17)
('lefebvrist', 1)
('lefevre', 1)
('leffler', 1)
('left', 137)
('leg', 10)
('legacy', 1)
('legal', 66)
('legalistic', 1)
('legalists', 1)
('legalities', 1)
('legalize', 3)
('legally', 11)
('legend', 8)
('legendary', 1)
('legends', 3)
('leger', 8)
('legibly', 1)
('legionnaires', 1)
('legislate', 2)
('legislation', 5)
('legislator', 1)
('legislators', 2)
('legitimacy', 3)
('legitimate', 32)
('legitimately', 5)
('legitimization', 3)
('legitmacy', 1)
('legs', 21)
('lehigh', 16)
('lehman', 2)
('lehr', 5)
('lehrer', 1)
('lehtori', 8)
('lehtotie', 1)
('lei

('mandeldern', 1)
('mandell', 1)
('mandlebrot', 1)
('mandock', 2)
('mandtbacka', 12)
('mandy', 2)
('manfred', 1)
('manfredo', 1)
('manfully', 1)
('manganese', 1)
('mangled', 3)
('mangoe', 54)
('manhattan', 23)
('mania', 3)
('manias', 1)
('manifest', 2)
('manifestation', 4)
('manifestations', 9)
('manifested', 5)
('manifesting', 1)
('manifesto', 1)
('manifests', 1)
('manifold', 2)
('manifolds', 1)
('manip', 1)
('manipulate', 8)
('manipulated', 2)
('manipulates', 1)
('manipulating', 9)
('manipulation', 44)
('manipulations', 3)
('manipulative', 2)
('manish', 1)
('manitoba', 4)
('mankind', 35)
('manley', 1)
('manly', 1)
('manner', 35)
('mannered', 1)
('mannheim', 1)
('mannikin', 2)
('manning', 5)
('manny', 1)
('manocha', 1)
('manor', 1)
('manslaughter', 1)
('manson', 1)
('manta', 1)
('mantis', 85)
('manual', 34)
('manually', 4)
('manuals', 4)
('manufacture', 5)
('manufactured', 4)
('manufacturer', 5)
('manufacturers', 9)
('manufactures', 3)
('manufacturing', 11)
('manupulation', 2)
('manus

('mudpuppy', 1)
('mueller', 1)
('muenchen', 18)
('muhammad', 3)
('muhammadi', 1)
('muhammed', 2)
('muhlestein', 1)
('muir', 9)
('muirm', 2)
('mukesh', 2)
('muller', 4)
('mullian', 1)
('multi', 34)
('multiband', 1)
('multicenter', 1)
('multidimensional', 2)
('multidisciplinary', 2)
('multimap', 1)
('multimedia', 19)
('multiplanar', 1)
('multiplayer', 2)
('multiple', 37)
('multiples', 1)
('multiplexer', 2)
('multiplication', 1)
('multiplicity', 1)
('multiplot', 1)
('multiply', 5)
('multiprocessing', 1)
('multiprocessor', 5)
('multispectral', 1)
('multistate', 3)
('multitude', 5)
('multitudes', 1)
('multiverse', 8)
('multiway', 1)
('mumble', 1)
('mumbled', 1)
('mumbo', 1)
('mumford', 2)
('mumod', 1)
('mumps', 1)
('mun', 3)
('munch', 5)
('mundane', 2)
('munerum', 1)
('municipal', 1)
('munnari', 2)
('munns', 6)
('munsch', 2)
('muratorian', 2)
('murdani', 2)
('murder', 87)
('murdered', 3)
('murderer', 16)
('murderers', 5)
('murdering', 6)
('murderous', 1)
('murders', 2)
('murdoch', 2)
('muri

('nwfocus', 2)
('nwnexus', 3)
('nwu', 14)
('nx10', 6)
('nx39', 1)
('ny', 60)
('nyc', 10)
('nye', 35)
('nyeda', 35)
('nym', 1)
('nynex', 2)
('nynexst', 2)
('nyongwa', 1)
('nyssa', 2)
('nystatin', 23)
('nyu', 11)
('nyx', 44)
('nz', 51)
('nz_______________________________', 1)
('o157', 27)
('o7t', 1)
('o92a', 3)
('oac', 11)
('oahu', 4)
('oak', 18)
('oakbrook', 1)
('oakland', 45)
('oaks', 2)
('oam', 5)
('oand', 1)
('oasis', 1)
('oasys', 18)
('oath', 7)
('oaths', 1)
('ob', 16)
('obe', 2)
('obedience', 32)
('obedient', 8)
('obediently', 2)
('obel11', 1)
('obelix', 5)
('oberto', 8)
('obese', 6)
('obesity', 20)
('obesssive', 1)
('obey', 35)
('obeyed', 5)
('obeying', 13)
('obeys', 2)
('obfusc', 1)
('obfuscate', 1)
('obfuscated', 1)
('obfuscation', 1)
('obfuscations', 1)
('obgyn', 3)
('obispo', 4)
('obiwan', 1)
('object', 111)
('objected', 2)
('objecting', 4)
('objection', 19)
('objectionable', 2)
('objections', 10)
('objective', 156)
('objectively', 32)
('objectives', 11)
('objectivism', 4)
('o

('paralpomenon', 1)
('paralysis', 1)
('paralyzed', 5)
('paramax', 3)
('paramedic', 2)
('parameter', 4)
('parameterized', 1)
('parameters', 20)
('parametric', 7)
('parametrically', 2)
('paramus', 1)
('paranoia', 1)
('paranoid', 4)
('paranormal', 3)
('paraphernalia', 1)
('paraphrase', 17)
('paraphrased', 10)
('paraphrasing', 2)
('paraplegia', 1)
('parapsychologists', 1)
('parasite', 1)
('parasites', 3)
('parasitic', 2)
('parc', 1)
('parcom', 1)
('pardon', 23)
('pardons', 1)
('pare', 1)
('pared', 1)
('parens', 1)
('parent', 15)
('parentage', 2)
('parenthetcal', 1)
('parenthetical', 1)
('parenthood', 3)
('parenting', 2)
('parents', 48)
('pariah', 4)
('paridise', 1)
('paris', 14)
('parish', 17)
('parishes', 8)
('parishioners', 2)
('parishoners', 1)
('parity', 3)
('park', 35)
('parke', 5)
('parkin', 10)
('parkinson', 1)
('parkinsonism', 1)
('parkinsons', 1)
('parks', 2)
('parkway', 3)
('parlamentarian', 1)
('parlance', 2)
('parlodel', 1)
('parody', 3)
('parole', 1)
('parowan', 1)
('paroxysma

('po3', 1)
('po5', 3)
('pob', 1)
('pobox', 1)
('pocatello', 1)
('pocket', 6)
('pockets', 2)
('pocock', 1)
('pocomoco', 2)
('pod', 2)
('podiatric', 2)
('podiatry', 2)
('pods', 1)
('poduska', 1)
('poem', 7)
('poems', 1)
('poenis', 1)
('poet', 3)
('poetic', 2)
('poetry', 3)
('pof', 1)
('pogo', 3)
('pogrom', 1)
('pogue', 1)
('point', 531)
('point_node', 1)
('point_struct', 5)
('pointed', 41)
('pointedly', 1)
('pointedto', 2)
('pointer', 10)
('pointers', 17)
('pointing', 24)
('pointless', 11)
('points', 182)
('poised', 1)
('poisened', 1)
('poison', 7)
('poisoned', 4)
('poisoner', 2)
('poisoning', 11)
('poisonings', 1)
('poisonous', 1)
('poisson', 1)
('poitiers', 3)
('poits', 1)
('poke', 2)
('poland', 6)
('polarities', 1)
('polarity', 2)
('polarized', 1)
('polder', 1)
('pole', 8)
('polemics', 2)
('poles', 3)
('poley', 4)
('polgyons', 1)
('polhemus', 2)
('police', 8)
('policemen', 10)
('policies', 7)
('policman', 1)
('policy', 28)
('polio', 27)
('polish', 6)
('polished', 2)
('politcal', 1)
('

('psuedo', 2)
('psuvm', 47)
('psy', 1)
('psych', 9)
('psychadelic', 1)
('psychiatric', 6)
('psychiatry', 4)
('psychic', 2)
('psychics', 2)
('psycho', 1)
('psychoactive', 7)
('psychoactives', 6)
('psychoanalytical', 1)
('psychogenic', 2)
('psycholgical', 4)
('psychological', 37)
('psychologicall', 3)
('psychologically', 3)
('psychologist', 1)
('psychologists', 4)
('psychology', 55)
('psychology_', 1)
('psychoneuroimmunology', 1)
('psychopaths', 1)
('psychosurgery', 3)
('psychotherapy', 1)
('psychotic', 3)
('psygnosis', 6)
('pt', 33)
('pthe', 1)
('ptheriau', 1)
('ptolemic', 2)
('ptolemy2', 3)
('ptrg', 2)
('pts', 11)
('pts102', 2)
('ptsd', 1)
('ptt', 7)
('pturner', 2)
('pty', 1)
('pu', 2)
('pub', 186)
('pubic', 3)
('pubinfo', 1)
('publ', 2)
('public', 203)
('publically', 9)
('publican', 1)
('publication', 21)
('publications', 9)
('publicity', 3)
('publicizing', 1)
('publicly', 12)
('publish', 13)
('publishe', 1)
('published', 61)
('publisher', 8)
('publishers', 4)
('publishes', 1)
('publi

('reigning', 1)
('reilly', 2)
('reimburse', 1)
('rein', 4)
('reincarnation', 9)
('reinfecting', 1)
('reinforce', 1)
('reinforced', 1)
('reinhard', 4)
('reinikainen', 1)
('reinnervated', 1)
('reinnervation', 2)
('reinnoculated', 1)
('reinnoculation', 1)
('reinscribed', 2)
('reinstate', 1)
('reinterpret', 1)
('reinterpretation', 1)
('reinvent', 2)
('reinventing', 3)
('reinvoke', 1)
('reiterate', 4)
('reiterated', 1)
('reject', 58)
('reject_authority_', 1)
('rejected', 21)
('rejecting', 17)
('rejection', 15)
('rejects', 2)
('rejoice', 1)
('rejoiced', 1)
('rejoices', 2)
('rejoicing', 1)
('rejoin', 1)
('rel', 3)
('relabeling', 3)
('relapse', 1)
('relapsing', 1)
('relate', 19)
('related', 130)
('relatedness', 1)
('relates', 22)
('relating', 12)
('relation', 13)
('relational', 2)
('relations', 16)
('relationship', 74)
('relationships', 12)
('relative', 19)
('relatively', 29)
('relatives', 9)
('relativism', 9)
('relativist', 6)
('relativistic', 4)
('relativists', 3)
('relativity', 7)
('relativ

('rubenfeld', 1)
('rubery', 2)
('rubin', 5)
('rubrics', 2)
('rude', 11)
('rudi', 1)
('rudimentry', 7)
('rudolf', 6)
('rue', 2)
('rued', 3)
('ruegg', 6)
('ruff', 5)
('ruffian', 1)
('rug', 3)
('ruin', 2)
('ruined', 1)
('ruinous', 1)
('ruins', 3)
('rule', 88)
('ruled', 3)
('ruler', 8)
('rulers', 8)
('rulership', 1)
('rules', 98)
('ruling', 12)
('rulings', 3)
('rum', 5)
('ruminant', 1)
('rummage', 1)
('rumor', 1)
('rumors', 3)
('rumoured', 1)
('rumouring', 1)
('rumours', 23)
('run', 134)
('runar', 2)
('runaway', 1)
('rund', 2)
('runme', 1)
('running', 78)
('runnining', 1)
('runny', 3)
('runs', 56)
('runtime', 2)
('ruocco', 4)
('rural', 2)
('rusch', 3)
('rush', 15)
('rushdie', 97)
('rushed', 3)
('rushes', 1)
('rushing', 7)
('ruskin', 1)
('rusnews', 22)
('ruspe', 1)
('russ', 3)
('russell', 59)
('russia', 10)
('russian', 6)
('russians', 3)
('rusty', 3)
('rutgers', 279)
('ruth', 14)
('rutherford', 2)
('ruthless', 4)
('rutin', 5)
('rutland', 2)
('ruu', 17)
('ruzak', 1)
('rv', 3)
('rw', 5)
('rws

('shamos', 1)
('shampoos', 2)
('shan', 1)
('shank', 2)
('shankley', 1)
('shannon', 1)
('shanti', 11)
('shao', 1)
('shapard', 2)
('shape', 30)
('shaped', 24)
('shapes', 7)
('shapiro', 1)
('shaprio', 1)
('shar', 6)
('sharan', 1)
('shards', 1)
('share', 65)
('shared', 27)
('sharen', 5)
('shares', 4)
('shareware', 57)
('sharing', 12)
('sharnoff', 2)
('sharon', 21)
('sharp', 9)
('sharpen', 1)
('sharpening', 2)
('sharpimage', 1)
('sharply', 3)
('sharrar', 1)
('sharynk', 1)
('shatim', 8)
('shatters', 2)
('shattuck', 1)
('shaving', 1)
('shavlik', 6)
('shaw', 4)
('shawn', 2)
('shayne', 1)
('shaz', 2)
('shazad', 2)
('shazam', 1)
('shd2001', 3)
('she', 462)
('sheared', 1)
('shearers', 1)
('shearson', 5)
('sheath', 3)
('sheaths', 1)
('shed', 8)
('shedding', 2)
('sheep', 12)
('sheepishly', 1)
('sheer', 1)
('sheet', 11)
('sheets', 2)
('sheffner', 2)
('sheiks', 1)
('sheila', 6)
('shekels', 2)
('shelf', 7)
('shell', 30)
('shelley', 16)
('shellgate', 9)
('shells', 1)
('shelter', 4)
('sheltered', 4)
('s

('spenser', 9)
('spent', 38)
('sperm', 13)
('spher', 1)
('sphere', 68)
('spheres', 3)
('spherical', 2)
('sphigs', 2)
('sphinx', 21)
('spica', 1)
('spices', 3)
('spicy', 2)
('spider', 1)
('spies', 4)
('spif', 1)
('spiffier', 1)
('spikes', 12)
('spilt', 1)
('spin', 3)
('spina', 3)
('spinach', 1)
('spinal', 5)
('spine', 6)
('spinner', 3)
('spinning', 3)
('spinoza', 3)
('spins', 2)
('spiralling', 1)
('spirit', 156)
('spirits', 3)
('spiritual', 77)
('spirituality', 10)
('spiritually', 12)
('spirochetal', 2)
('spirochete', 1)
('spit', 5)
('spite', 14)
('spits', 1)
('spitting', 1)
('spk', 2)
('spl', 11)
('splainin', 1)
('splat', 4)
('spleen', 3)
('splendor', 1)
('splenomegaly', 1)
('splice', 1)
('splicing', 2)
('spline', 17)
('splines', 8)
('splinter', 2)
('splints', 1)
('split', 56)
('splits', 1)
('splitting', 7)
('spm2d', 1)
('spock', 3)
('spoiled', 6)
('spoils', 1)
('spoke', 24)
('spoke_', 2)
('spoken', 23)
('spokesman', 2)
('spokesperson', 1)
('sponge', 2)
('sponsor', 2)
('sponsored', 9)


('surgeons', 6)
('surgery', 45)
('surgical', 9)
('surgically', 4)
('surly', 2)
('surmise', 1)
('surname', 1)
('surpassed', 1)
('surpress', 1)
('surprise', 16)
('surprised', 33)
('surprises', 5)
('surprising', 22)
('surprisingly', 4)
('surrender', 81)
('surrendering', 1)
('surrey', 3)
('surrogate', 2)
('surrounded', 3)
('surrounding', 15)
('surroundings', 5)
('surv', 3)
('surveilence', 1)
('surveillance', 30)
('survey', 25)
('surveys', 4)
('surviac', 1)
('survivability', 2)
('survival', 31)
('survivalist', 1)
('survive', 31)
('survived', 4)
('survives', 2)
('surviving', 3)
('survivor', 1)
('survivors', 2)
('surya', 1)
('susan', 7)
('susanne', 1)
('suscept', 1)
('susceptible', 7)
('susie', 2)
('suspect', 86)
('suspected', 10)
('suspecting', 2)
('suspend', 2)
('suspended', 2)
('suspension', 1)
('suspicion', 4)
('suspicions', 1)
('suspicious', 4)
('suspiciously', 3)
('sussex', 2)
('sustain', 3)
('sustainable', 1)
('sustained', 1)
('sustaining', 5)
('sutherland', 7)
('suttles', 1)
('suttor'

('till', 13)
('tilt', 5)
('tilting', 1)
('tilton', 1)
('tilts', 1)
('tim', 37)
('tima', 1)
('timber', 1)
('time', 959)
('time_', 1)
('timecode', 1)
('timed', 1)
('timely', 2)
('timeout', 1)
('timeouts', 1)
('timer', 3)
('timers', 3)
('times', 178)
('times_', 2)
('timidly', 2)
('timing', 6)
('timmbake', 16)
('timmens', 2)
('timmons', 24)
('timor', 13)
('timorese', 1)
('timothy', 42)
('timothy_freeman', 1)
('tims', 4)
('tin', 105)
('tina', 1)
('tinea', 3)
('ting', 2)
('tingled', 1)
('tingles', 2)
('tingling', 6)
('tinnitus', 12)
('tiny', 9)
('tion', 5)
('tions', 1)
('tip', 9)
('tipoff', 1)
('tipped', 1)
('tipple', 1)
('tips', 5)
('tirades', 1)
('tire', 1)
('tired', 16)
('tiredness', 1)
('tiresome', 2)
('tirf', 1)
('tiring', 3)
('tis', 4)
('tissue', 34)
('tissues', 4)
('titan', 5)
('tithe', 4)
('titipu', 1)
('title', 48)
('title_', 1)
('titled', 6)
('titles', 16)
('tittle', 1)
('titus', 3)
('tius', 1)
('tive', 2)
('tj', 1)
('tjf', 2)
('tjl', 1)
('tjp', 1)
('tk', 2)
('tkelder', 2)
('tlak',

('undelete', 1)
('undeniable', 3)
('under', 291)
('undercooked', 1)
('undercooking', 1)
('undercorrection', 2)
('undercover', 1)
('undercurrent', 1)
('undercuts', 1)
('underdeveloped', 1)
('underdiagnosis', 1)
('underestimate', 1)
('underfoot', 1)
('undergirded', 1)
('undergirding', 1)
('undergo', 3)
('undergoes', 2)
('undergoing', 3)
('undergone', 3)
('undergrad', 2)
('undergrads', 2)
('undergraduate', 2)
('underground', 1)
('underlaying', 1)
('underlies', 2)
('underlines', 1)
('underlying', 17)
('undermine', 1)
('undermined', 1)
('undermining', 1)
('underneath', 7)
('undernet', 2)
('underproduce', 1)
('underscan', 1)
('underscore', 1)
('underscores', 2)
('underserved', 1)
('underside', 1)
('understand', 257)
('understandable', 7)
('understandably', 2)
('understandeth', 1)
('understanding', 123)
('understands', 2)
('understnding', 1)
('understood', 53)
('understoood', 2)
('undertake', 5)
('undertaken', 2)
('undertook', 2)
('underware', 1)
('underway', 4)
('underwear', 2)
('underwent',

('visualize', 2)
('visualized', 1)
('visualizer', 4)
('visuallib', 4)
('visually', 1)
('visuals', 4)
('vit', 2)
('vital', 3)
('vitalism', 3)
('vitamin', 63)
('vitamins', 6)
('vitiello', 5)
('vito', 2)
('vitro', 8)
('vittorio', 1)
('vituperation', 1)
('vituperousness', 5)
('viva', 3)
('viveiros', 1)
('vivid', 12)
('vivid2', 2)
('vivo', 1)
('vjpeg', 1)
('vl', 3)
('vla', 1)
('vladimir', 1)
('vlb', 3)
('vlbi', 1)
('vm', 9)
('vm1', 3)
('vm_pray', 4)
('vma', 4)
('vma7o9', 1)
('vmars', 1)
('vmcbrt', 2)
('vmd', 2)
('vmode', 4)
('vmoper', 1)
('vms', 71)
('vmsb', 1)
('vnet', 23)
('vnet3', 1)
('vnews', 14)
('vni', 1)
('vnon4', 1)
('voc', 1)
('vocabularies', 1)
('vocabulary', 2)
('vocal', 5)
('voce', 1)
('vociferously', 1)
('vodka', 1)
('voelkerding', 3)
('vogl', 4)
('vogle', 7)
('voice', 51)
('voice_', 1)
('voiced', 1)
('voicenet', 2)
('voices', 3)
('voicetype', 1)
('void', 3)
('vol', 21)
('volans', 2)
('volatilise', 1)
('volcanic', 3)
('volcano', 5)
('volcanoes', 1)
('volcanos', 1)
('volition', 

('x11', 26)
('x114', 1)
('x11r3', 1)
('x11r4', 8)
('x11r5', 3)
('x2002', 1)
('x3026', 1)
('x3252', 3)
('x354', 2)
('x3769', 1)
('x386', 1)
('x3d', 3)
('x4', 4)
('x400', 1)
('x5543', 4)
('x6127', 2)
('x92lee22', 1)
('x_c', 1)
('xa', 2)
('xakellis', 2)
('xamiga', 1)
('xanadu', 1)
('xanax', 3)
('xanim', 1)
('xanthorrhoea', 1)
('xarchie', 4)
('xas', 2)
('xball', 1)
('xbm', 5)
('xc', 1)
('xdart', 3)
('xdpyinfo', 1)
('xds', 1)
('xdvorak', 2)
('xeg', 3)
('xenix', 5)
('xenoglossolalia', 1)
('xenografts', 3)
('xenon', 3)
('xenophobia', 3)
('xenos', 1)
('xenotransplant', 1)
('xerox', 17)
('xerxes', 1)
('xfig', 8)
('xflick', 1)
('xforms', 1)
('xga', 4)
('xgif', 2)
('xgks', 1)
('xgobi', 3)
('xgopher', 2)
('xgraph', 1)
('xgrasp', 1)
('xi', 4)
('xia', 4)
('xian', 4)
('xianity', 1)
('xians', 3)
('xiantiy', 1)
('xiaopin', 1)
('xidle', 1)
('xii', 2)
('xiii', 1)
('ximage', 1)
('ximenez', 17)
('xiv', 1)
('xl', 1)
('xlh', 2)
('xli', 7)
('xlib', 6)
('xln', 1)
('xloadimage', 13)
('xm', 1)
('xmas', 1)
('xmem

In [54]:
# downscale the term frequencies for words that appear many times
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2257, 35788)

In [59]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [64]:
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

In [66]:
for doc,category in zip(docs_new, predicted):
    print("%r => %s" %(doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


# create Naive Bayes pipeline 


In [98]:
print(twenty_train.target)
for idx, name in zip(twenty_train.target[0:2], twenty_train.target_names[0:2]):
    print ("{} {}".format(idx, name))

[1 1 3 ..., 2 2 2]
1 alt.atheism
1 comp.graphics


In [90]:

from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect',  CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

# Evaluate Model

In [76]:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test', 
                                 categories=categories, 
                                 shuffle=True, 
                                 random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted==twenty_test.target)

0.83488681757656458

# create SVM pipeline

In [137]:
len(twenty_test.data)

1502

In [139]:
from sklearn.linear_model import SGDClassifier
train_count= 100
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfid', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty = 'l2', 
                              alpha=1e-3, n_iter=5, random_state=42))
])
_ = text_clf_svm.fit(twenty_train.data[:train_count], twenty_train.target[:train_count])
predicted_svm = text_clf_svm.predict(twenty_test.data)
np.mean(predicted_svm == twenty_test.target)

0.73501997336884151

In [140]:
# Tune Classifiers using Grid Search
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
            }

In [142]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs = -1)
gs_clf = gs_clf.fit(twenty_train.data[:train_count], twenty_train.target[:train_count])

In [143]:
print("best score: {}\n".format(gs_clf.best_score_))
print("best score params: {} ".format(gs_clf.best_params_))

best score: 0.8

best score params: {'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)} 


# find words with greatest mutual information

In [144]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer

categories = ['talk.religion.misc',
              'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=categories)

X, Y = newsgroups_train.data, newsgroups_train.target
cv = CountVectorizer(max_df=0.95, min_df=2,
                                     max_features=10000,
                                     stop_words='english')
X_vec = cv.fit_transform(X)

res = dict(zip(cv.get_feature_names(),
               mutual_info_classif(X_vec, Y, discrete_features=True)
               ))


In [145]:
for k,v in res.items():
    if v > 0.05:
        print("{}\t\t {}".format(k,v))

bible		 0.07232747959557144
christ		 0.05729373368021909
christian		 0.12862867565281702
christians		 0.06851132861181007
god		 0.12252523919766867
gov		 0.05354727448578558
graphics		 0.13044709565039875
jesus		 0.09245436105573257
launch		 0.05988217938744486
moon		 0.06497778107255724
morality		 0.05023510439412315
nasa		 0.11146392824624819
orbit		 0.087254803670583
people		 0.06811837023435494
religion		 0.06769561709612532
shuttle		 0.05344097661835926
space		 0.20115901737978983
thanks		 0.060202010019767334
