In [1]:
# All code from Tutorial:
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

# Part 1 : gensim LDA based on NLTK & SpaCy

# Run in python console
# Fetch the NLTK stop-word corpus into the local nltk_data directory
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tharsen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Optional: route log records (gensim included) through a timestamped format,
# showing only ERROR and above
import logging
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Silence DeprecationWarning noise for the rest of the session
import warnings
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [3]:
# Stop-word list: NLTK's English list plus a few extra tokens to drop
from nltk.corpus import stopwords

stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']


In [4]:
# Import Dataset
# The JSON is read straight from GitHub, so this cell needs network access.
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())
# The bare `df.head()` that used to follow was dropped: a notebook only renders
# a cell's final expression, so in the middle of the cell it displayed nothing.

# Convert to list
data = df.content.values.tolist()

# The regex patterns below are raw strings so that `\S` and `\s` reach the
# regex engine verbatim instead of being parsed as (invalid) string escapes,
# which newer Python versions warn about.

# Remove Emails
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

# Collapse every whitespace run (newlines included) into a single space
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("'", "", sent) for sent in data]

pprint(data[:1])

['rec.autos' 'comp.sys.mac.hardware' 'rec.motorcycles' 'misc.forsale'
 'comp.os.ms-windows.misc' 'alt.atheism' 'comp.graphics'
 'rec.sport.baseball' 'rec.sport.hockey' 'sci.electronics' 'sci.space'
 'talk.politics.misc' 'sci.med' 'talk.politics.mideast'
 'soc.religion.christian' 'comp.windows.x' 'comp.sys.ibm.pc.hardware'
 'talk.politics.guns' 'talk.religion.misc' 'sci.crypt']
['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 't

In [5]:
%%time

def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim.

    Every item is coerced to ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (per the gensim
    docs this strips accent marks from tokens; punctuation is dropped by the
    tokenizer itself).

    Yields:
        One token list per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(doc), deacc=True)
        for doc in sentences
    )

# Materialise the generator so the token lists can be indexed and reused
data_words = [tokens for tokens in sent_to_words(data)]

# Preview the first tokenized document, followed by blank lines
print(data_words[:1])
print('\n')

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


CPU times: user 9.01 s, sys

In [6]:
%%time

# Bigram model: learned from the raw token lists (higher threshold => fewer phrases),
# then wrapped in a Phraser, which is what the helper functions apply to documents
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Trigram model: learned from the bigram-transformed corpus; min_count is not
# passed here, so the library default applies
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Example: first document after bigram merging and then trigram merging
print(
    trigram_mod[bigram_mod[data_words[0]]]
)
print('\n')

['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


CPU times: user 43.1 s, sys: 168 ms, total: 43.3 s
Wa

In [7]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Return ``texts`` with stop words filtered out of every document.

    Each document is stringified and re-tokenized with ``simple_preprocess``
    (unchanged from the original behaviour), then any token found in the
    module-level ``stop_words`` collection is discarded.

    Args:
        texts: iterable of documents (token lists in this notebook).

    Returns:
        A list with one list of kept tokens per input document.
    """
    # Build the lookup once per call: set membership is O(1), whereas testing
    # against the stop-word list rescans it for every single token.
    stop_set = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every document in ``texts``.

    Returns a list holding ``bigram_mod[doc]`` for each document, in order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Returns a list holding ``trigram_mod[bigram_mod[doc]]`` for each document,
    in order.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize documents, keeping only tokens with an allowed POS tag.

    Each document's tokens are joined with single spaces and run through the
    module-level spaCy pipeline ``nlp`` (which must be loaded before calling).
    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: iterable of documents, each an iterable of token strings.
        allowed_postags: collection of coarse POS tags (``token.pos_`` values)
            to keep. The default is an immutable tuple rather than a list so
            the shared default object can never be mutated between calls.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per input document.
    """
    return [
        [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in allowed_postags]
        for sent in texts
    ]

In [8]:
%%time
# This cell takes 2-3 minutes to run on my machine.  -j

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Load spaCy's small English pipeline by its full package name. The bare 'en'
# shortcut only resolves on spaCy 2.x (shortcut links were removed in 3.0);
# the full name works once the model package is installed:
#   python3 -m spacy download en_core_web_sm
# Parser and NER are disabled (for efficiency): lemmatization() only reads
# token.pos_ and token.lemma_.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])
print('\n')

[['where', 's', 'thing', 'car', 'nntp_post', 'host', 'rac_wam', 'umd', 'organization', 'university', 'maryland_college', 'park', 'line', 'wonder', 'anyone', 'could', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'bricklin', 'door', 'really', 'small', 'addition', 'front_bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'make', 'history', 'whatev', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]


CPU times: user 7min 15s, sys: 1min 21s, total: 8min 36s
Wall time: 2min 23s


In [9]:
# Dictionary: assigns an integer id to each token seen in the lemmatized documents
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents serve as the corpus texts
texts = data_lemmatized

# Bag-of-words encoding: one list of (token_id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))

# Peek at the encoding of the first document
print(corpus[:1])

[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 5), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 2), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1)]]


In [10]:
# Collect every token containing "_" (the joined phrases), one entry per
# document in which it occurs, so repeats across documents are expected.
mywords = []
for cp in corpus:
    # `token_id` rather than `id`: at cell (global) scope the old name
    # overwrote the builtin id() for the rest of the kernel session.
    for token_id, freq in cp:
        word = id2word[token_id]  # look the token up once
        if "_" in word:
            mywords.append(word)
print(len(mywords))
print('\n')
# NOTE(review): this prints the whole list (70k+ entries in the saved run);
# consider slicing, e.g. mywords[:50], if only a preview is needed.
pprint(mywords)

70469


['front_bumper',
 'maryland_college',
 'nntp_post',
 'rac_wam',
 'nntp_post',
 'carson_washington',
 'floppy_disk',
 'guy_kuo',
 'heat_sink',
 'oil_leak',
 'richardson_tx',
 'nntp_post',
 'developers_toolkit',
 'distribution_usa',
 'host_magnusug',
 'magnus_ac',
 'ohio_state',
 'delivered_gateway',
 'diamond_ss',
 'latest_driver',
 'mouse_cursor',
 'nntp_post',
 'automatic_transmission',
 'chris_silvester',
 'james_callison',
 'texas_instrument',
 'nntp_post',
 'ohio_state',
 'acs_ohio',
 'asking_shipping',
 'bottom_magnus',
 'nntp_post',
 'evanston_illinois',
 'northwestern_university',
 'bob_beauchaine',
 'bronx_away',
 'death_penalty',
 'innocents_die',
 'queens_could',
 'robert_beauchaine',
 'sank_manhattan',
 'success_rate',
 'nntp_post',
 'distribution_world',
 'frank_odwyer',
 'host_solntze',
 'jon_livesey',
 'wpd_sgi',
 'nntp_post',
 'dsto_gov',
 'tv_station',
 'west_coast',
 'years_ago',
 'bob_gaj',
 'bob_gajarsky',
 'curtis_jackson',
 'flame_war',
 'lotsa_point',
 'mo

 'eleven_nigel',
 'handson_experience',
 'megatek_corporation',
 'pls_kindly',
 'randy_davis',
 'ucsd_megatek',
 'zx_pilot',
 'nntp_post',
 'po_andrew',
 'engineering_carnegie',
 'mellon_pittsburgh',
 'original_packag',
 'doug_dolven',
 'mel_hall',
 'pavel_bure',
 'panix_public',
 'jesus_christ',
 'oregon_health',
 'holy_spirit',
 'bible_teach',
 'portland_oregon',
 'spiritual_gift',
 'virgin_mary',
 'nntp_post',
 'floppy_disk',
 'distribution_world',
 'new_york',
 'jet_propulsion',
 '_',
 'plus_shipp',
 'comp_graphic',
 'public_domain',
 'ames_arc',
 'nasa_gov',
 'map_projection',
 'frequently_ask',
 'archive_name',
 'center_moffett',
 'chapel_hill',
 'customer_service',
 'geological_survey',
 'host_mahler',
 'ibm_pc',
 'jon_leech',
 'last_modifi',
 'nasa_ames',
 'naval_observatory',
 'north_carolina',
 'planetary_position',
 'three_dimensional',
 'turbo_pascal',
 'willmann_bell',
 'nntp_post',
 'latest_driver',
 'ftp_site',
 'nntp_post',
 'bush_administration',
 'clinton_administrati

 'host_localhost',
 'mcsnet_contributor',
 'nntp_post',
 'nntp_post',
 'handson_experience',
 'pls_kindly',
 'usr_lib',
 'nntp_post',
 'last_night',
 'rec_sport',
 'red_wing',
 'adams_division',
 'adirondack_gm',
 'adirondack_r',
 'ahl_mail',
 'baltimore_skipjack',
 'binghamton_gm',
 'binghamton_ranger',
 'binghamton_utica',
 'boston_bruin',
 'bri_farenell',
 'calder_cup',
 'cape_breton',
 'cdi_gm',
 'champions_phoenix',
 'champs_ahl',
 'champs_clarkson',
 'division_champion',
 'ecac_contact',
 'final_standing',
 'fredericton_canadien',
 'glens_fall',
 'gm_providence',
 'hockey_adirondack',
 'hockey_ecac',
 'list_congrat',
 'logic_clarkson',
 'maple_leafs',
 'moncton_gm',
 'moncton_hawk',
 'providence_bruin',
 'providence_gm',
 'springfield_indian',
 'springfield_providence',
 'suns_pacific',
 'tournament_champ',
 'utica_gm',
 'around_jupiter',
 'jupiter_radii',
 'mark_brader',
 'temporary_orbit',
 'arizona_tucson',
 'compton_gamma',
 'laboratory_tucson',
 'lunar_planetary',
 'neil_geh

 'finance_guy',
 'nissan_sentra',
 'service_scam',
 'david_veal',
 'health_scienc',
 'individual_liberty',
 'paul_prescod',
 'cochrane_jame',
 'attitude_toward',
 'south_florida',
 'nntp_post',
 'distribution_world',
 'aaron_ray',
 'special_investor',
 'stratus_computer',
 'wtc_bomb',
 'nntp_post',
 'distribution_world',
 'tim_ciceran',
 'st_catharine',
 'massachusetts_institute',
 'mark_ashley',
 'aaron_bryce',
 'daily_basis',
 'paul_conditt',
 'socially_unacceptable',
 'dale_cook',
 'hypocrisy_t',
 'neighbor_political',
 'ryan_scharfy',
 'smoke_pot',
 'trouble_aim',
 'thomas_wri',
 'mohit_goyal',
 'line_communication',
 'services_gu',
 'buslogic_card',
 'toshiba_cdrom',
 'nntp_post',
 'case_western',
 'cleveland_oh',
 'reserve_university',
 'cleveland_ohio',
 'thor_in',
 'robert_novitskey',
 'grad_student',
 'josh_hopkin',
 'recently_bought',
 'distribution_world',
 'greatly_appreciat',
 'opposite_direction',
 'simple_minded',
 'georgia_athen',
 'paul_hudson',
 'nntp_posting',
 'rada

 'moink_nmsu',
 'escrow_hous',
 'executive_branch',
 'richard_nixon',
 'nntp_post',
 'apr_gmt',
 'krakatoa_mailer',
 'kupajava_east',
 'psilink_do',
 'masud_khan',
 'robert_knowl',
 'nntp_post',
 'newsreader_tin',
 'western_australia',
 'serial_port',
 'loopback_connector',
 'iowa_state',
 'second_amendment',
 'nuclear_weapon',
 'dan_sorenson',
 'depends_upon',
 'dod_isu',
 'exciting_unusual',
 'exotic_distant',
 'foxvog_dougla',
 'machines_meet',
 'mass_destruction',
 'portal_system',
 'press_conference',
 'barbecued_food',
 'health_risk',
 'robert_thorson',
 'duncan_hine',
 'earl_grey',
 'nntp_post',
 'thou_shalt',
 'news_gateway',
 'jim_meritt',
 'years_ago',
 'last_night',
 'boeing_comput',
 'keywords_brick',
 'neil_william',
 'toyota_pickup',
 'brian_larose',
 'apostle_paul',
 'georgia_athen',
 'ted_kalivoda',
 'temporary_orbit',
 'ancient_mayan',
 'phil_fraer',
 'phil_fraering',
 'southwestern_louisiana',
 'televison_repo',
 'moon_land',
 'lunar_orbit',
 'distribution_na',
 'red_

 'eastern_anatolia',
 'million_muslim',
 'anti_semitic',
 'cs_dept',
 'second_amendment',
 'inalienable_right',
 'vast_majority',
 'self_defense',
 'get_rid',
 'bristol_myer',
 'equally_effective',
 'scientific_scrutiny',
 'nntp_post',
 'new_york',
 'gordon_bank',
 'ive_seen',
 'posters_view',
 'almost_exclusively',
 'watson_ibm',
 'nntp_post',
 'alaska_fairbank',
 'aurora_apr',
 'michael_adam',
 'space_station',
 'nntp_post',
 'brand_new',
 'tape_deck',
 'lictor_acsu',
 'toronto_ontario',
 'proper_channel',
 'ron_rosereader',
 'ron_roth',
 'rose_media',
 'rosemail_usenet',
 'usenet_rosemail',
 'nntp_post',
 'years_ago',
 'sci_m',
 'sci_med',
 'amino_acid',
 'grams_per',
 'kidney_ston',
 'kidney_stone',
 'osteopathic_medicine',
 'osu_college',
 'oxalic_acid',
 'nntp_post',
 'massachusetts_institute',
 'taurus_sho',
 'public_domain',
 'gamma_ray',
 'around_jupiter',
 'mark_brader',
 'temporary_orbit',
 'ancient_mayan',
 'phil_fraer',
 'phil_fraering',
 'southwestern_louisiana',
 'televi

 'proposed_newsgroup',
 'split_personally',
 'michael_nerone',
 'nntp_post',
 'newsreader_tin',
 'good_luck',
 'ford_motor',
 'gordon_lang',
 'host_slee',
 'srl_ford',
 'mb_ram',
 'memory_serf',
 'adaptec_scsi',
 'aspi_do',
 'nntp_post',
 'host_bolero',
 'cview_answer',
 'gif_viewer',
 'bryan_woodworth',
 'word_processor',
 'kenneth_gilbert',
 'pittsburgh_pa',
 'long_term',
 'standard_disclaimer',
 'coca_cola',
 'nntp_post',
 'new_york',
 'ten_year',
 'middle_east',
 'george_bush',
 'american_occupi',
 'failed_president',
 'jake_livni',
 'replaced_jimmy',
 'grenoble_france',
 'jewish_tribe',
 'oded_maler',
 'nntp_posting',
 'nntp_post',
 'ive_seen',
 'urbana_il',
 'monthian_buntan',
 'nntp_post',
 'comp_graphic',
 'ray_tracer',
 'pub_rtrace',
 'nntp_post',
 'lamont_down',
 'nntp_post',
 'distribution_world',
 'kind_soul',
 'massachusetts_amherst',
 'major_league',
 'adobe_system',
 'sherri_nichol',
 'blue_jay',
 'last_night',
 'regular_season',
 'playoff_pool',
 'gary_robert',
 'public

 'waco_today',
 'host_chopin',
 'houston_texas',
 'daniel_mccoy',
 'distribution_usa',
 'iowa_state',
 'gary_korenek',
 'mb_ram',
 'isa_eisa',
 'brian_schaufenbuel',
 'eisa_dma',
 'ftp_site',
 'anywhere_near',
 'copy_protect',
 'copy_protection',
 'encryption_schem',
 'highest_regard',
 'portland_oregon',
 'reed_college',
 'nntp_post',
 'distribution_usa',
 'computer_science',
 'plus_shipp',
 'radio_shack',
 'chapel_hill',
 'north_carolina',
 'stereo_vcr',
 'panasonic_kx',
 'oit_unc',
 'nntp_post',
 'berkeley_kstar',
 'sean_garrison',
 'james_sledd',
 'practical_purpos',
 'sun_microsystem',
 'dan_johnson',
 'jayne_kulikauskas',
 'eternal_life',
 'internet_fidonet',
 'policy_research',
 'national_capital',
 'depend_upon',
 'steve_birnbaum',
 'rejoinder_question',
 'new_york',
 'havent_seen',
 'deepak_chhabra',
 'nntp_post',
 'colorado_boulder',
 'please_respond',
 'include_stdio',
 'david_rex',
 'include_xm',
 'null_null',
 'xtappcontext_app',
 'nntp_posting',
 'shaft_drif',
 'albert_ei

 'california_berkeley',
 'distribution_inet',
 'phone_conversation',
 'brad_yearwood',
 'considered_harmful',
 'ken_shirriff',
 'probable_cause',
 'frank_odwyer',
 'san_jose',
 'christian_morality',
 'dangerous_enemie',
 'lies_friedrich',
 'ray_fischer',
 'absolute_truth',
 'clock_rate',
 'addressing_mod',
 'circuit_complex',
 'dear_friend',
 'instruction_set',
 'linux_risc',
 'reduced_instruction',
 'risc_cpus',
 'risc_instruction',
 'nntp_post',
 '_',
 'wisconsin_milwaukee',
 'dumbest_automotive',
 'years_ago',
 'united_state',
 'clipper_chip',
 'distribution_na',
 'white_house',
 'stanley_cup',
 'patrick_division',
 'california_institute',
 'catholic_church',
 'henling_lawrence',
 'getting_rid',
 'wide_range',
 'nntp_post',
 'white_house',
 'strong_cryptography',
 'hal_finney',
 'dorothy_dennings',
 'national_capital',
 'mark_baker',
 'michael_covington',
 'terrance_heath',
 'irrigate_desert',
 'pleasant_yankee',
 'allegheny_college',
 'edward_t',
 'wishful_think',
 'brigham_young',

 'ive_seen',
 'biblical_rape',
 'pseudo_random',
 'digital_telephony',
 'known_plaintext',
 'escrow_agent',
 '_',
 'self_defense',
 'georgia_athen',
 'michael_covington',
 'someone_els',
 'associate_research',
 'georgia_amateur',
 'intelligence_program',
 'phone_athen',
 'radio_tmi',
 'scientist_artificial',
 'norman_hamer',
 'shoei_rf',
 'centerstand_chipp',
 'grf_dropp',
 'passenger_helmet',
 'fast_polygon',
 'lucas_adamski',
 'wuarchive_wustl',
 'msdos_upload',
 'vga_mode',
 'bis_modem',
 'graham_toal',
 'corresponding_sampl',
 'distribution_usa',
 'wisconsin_madison',
 'darius_lecointe',
 'paul_harvey',
 'duck_pond',
 'ten_commandment',
 'unix_log',
 'old_testament',
 'nhlpa_poll',
 'nntp_post',
 'youve_got',
 'ini_fil',
 'original_poster',
 'mb_ram',
 'encore_computer',
 'sysgem_encore',
 'nntp_post',
 'distribution_world',
 'makes_sense',
 'news_reader',
 'administrative_computing',
 'emmet_gil',
 'levine_triumph',
 'man_rik',
 'student_billing',
 'atf_burn',
 'dividian_ranch',
 

 'brian_kendig',
 'gods_promise',
 'san_francisco',
 'brian_ceccarelli',
 'feet_tall',
 'comme_aucun',
 'croire_netre',
 'ends_rousseau',
 'pas_mieux',
 'que_jai',
 'suis_fait',
 'vus_jose',
 'starfleet_headquarter',
 'tax_dollar',
 'pgp_public',
 'lewis_glendenn',
 'estimating_wiretap',
 'iowa_iowa',
 'james_holthaus',
 'james_holthau',
 'robin_hanson',
 'wiretap_clipper',
 'nntp_post',
 'california_berkeley',
 'gun_owner',
 'criminals_ala',
 'federal_martial',
 'firearms_deter',
 'nd_amendment',
 'near_future',
 'unnesessary_opponent',
 'concealed_carry',
 'nntp_post',
 'colorado_spring',
 'abs_equipp',
 'cs_itc',
 'new_brunswick',
 'theodore_kaldis',
 'rutgers_remus',
 'rutgers_univ',
 'views_express',
 'loren_petrich',
 'light_bulb',
 'computer_science',
 'gordon_bank',
 'jxp_skepticism',
 'someone_els',
 'front_bumper',
 'distribution_usa',
 'host_magnusug',
 'magnus_ac',
 'ohio_state',
 'nntp_posting',
 'steering_wheel',
 'nntp_post',
 'distribution_world',
 'apr_gmt',
 'useragen

 'ottawa_senator',
 'nntp_post',
 'go_ahead',
 'janet_reno',
 'atf_burn',
 'dividian_ranch',
 'feb_th',
 'plains_nodak',
 'newsreader_tin',
 'hard_disk',
 'hard_drive',
 'ms_do',
 'low_level',
 'last_resort',
 'xxxx_xxxx',
 'ive_seen',
 'brandeis_university',
 'nntp_post',
 'ohio_state',
 'robert_beauchaine',
 'host_solntze',
 'jon_livesey',
 'wpd_sgi',
 'keith_allan',
 'political_atheist',
 'sgi_com',
 'uunet_olivea',
 'zaphod_mp',
 'capital_punishment',
 'unusual_punishment',
 'nntp_post',
 'houston_tx',
 'dan_sorenson',
 'alt_cosuard',
 'bailey_bb',
 'bis_bis',
 'houston_texas',
 'jim_wray',
 'ye_olde',
 'yob_sccsi',
 'computer_science',
 'apr_gmt',
 'department_stanford',
 'nntp_post',
 'distribution_world',
 'new_york',
 'white_sox',
 'thu_apr',
 'los_angele',
 'colorado_rockie',
 'san_diego',
 'major_league',
 'red_sox',
 'san_francisco',
 'thursday_april',
 'alphabetical_order',
 'atlanta_brav',
 'baltimore_oriole',
 'blue_jay',
 'california_angel',
 'cincinnati_red',
 'clevelan

 'nikolaos_foti',
 'nikolaos_fotis',
 'phoenix_oulu',
 'planetary_prob',
 'radiosity_code',
 'radiosity_package',
 'resource_list',
 'sgi_rad',
 'subscription_request',
 'texture_maps',
 'texture_temp',
 'unizh_ch',
 'utah_rast',
 'utah_raster',
 'uucp_mcsun',
 'volume_renderer',
 'nntp_post',
 'richardson_tx',
 'opinions_express',
 'archive_name',
 'convex_computer',
 'convex_com',
 'computer_science',
 'gordon_bank',
 'jxp_skepticism',
 'health_care',
 'iowa_state',
 'computer_science',
 'edinburgh_eh',
 'computer_science',
 'bear_arm',
 'kennedy_brew',
 'kennedy_jame',
 'gary_coffman',
 'spider_man',
 'cardinal_ximenez',
 'sun_microsystem',
 'dan_johnson',
 'eternal_death',
 'nntp_post',
 'winnipeg_jet',
 'internet_fidonet',
 'cup_essensa',
 'daryl_turner',
 'manitoba_canada',
 'manitoba_winnipeg',
 'norris_sel',
 'sel_nne',
 'ccu_umanitoba',
 'gimme_break',
 'ini_file',
 'svein_pedersen',
 'updating_delet',
 'updating_win',
 'sysedit_exe',
 'urbana_il',
 'cell_church',
 'jim_elliot

 'years_ago',
 'massachusetts_institute',
 'wrigley_field',
 'dave_kingman',
 'donald_boell',
 'best_homerun',
 'roger_clemen',
 'charles_kozierok',
 'nntp_post',
 'power_consumption',
 'david_lesher',
 'nntp_post',
 'distribution_usa',
 'fist_include',
 'foxtrot_iscp',
 'jeans_jacket',
 'malls_buy',
 'navy_submarine',
 'richard_pierson',
 'std_disclaimer',
 'uunet_bcr',
 'vnet_internet',
 'dave_tharp',
 'peter_tattam',
 'nntp_post',
 'abolish_selective',
 'mississippi_state',
 'ra_msstate',
 'marc_mueller',
 'pork_happy',
 'aviation_week',
 'fletcher_adam',
 'distribution_usa',
 'microsoft_corp',
 'fort_collin',
 'tt_font',
 'clipper_chip',
 'serial_number',
 'session_key',
 'law_enforcement',
 'hash_function',
 'nntp_post',
 'america_online',
 'good_luck',
 'new_jersey',
 'baby_bike',
 'chris_behanna',
 'dod_fxwg',
 'jubilees_r',
 'nec_zx',
 'wide_glide',
 'wild_corn',
 'saturn_sl',
 'cant_afford',
 'nntp_post',
 'ohio_state',
 'window_manager',
 'public_domain',
 'default_colormap',

 'middle_east',
 'harry_mamaysky',
 'backed_force',
 'brad_hernlem',
 'lebanese_territory',
 'lebanese_village',
 'lebanese_resistance',
 'israels_occupation',
 'reckless_disregard',
 'ncr_corp',
 'san_jose',
 'los_angele',
 'lindros_recchi',
 'new_brunswick',
 'new_jersey',
 'los_angel',
 'power_play',
 'tampa_bay',
 'period_hartford',
 'sanderson_cassel',
 'third_period',
 'cook_charlie',
 'ny_islander',
 'scorer_pt',
 'summary_parse',
 'ciccarelli_coffey',
 'lemieux_tocchet',
 'murphy_chelio',
 'neely_oat',
 'total_scorer',
 'image_process',
 'portal_system',
 'david_sternlight',
 'pgp_public',
 'bontchev_virus',
 'fax_fachbereich',
 'hamburg_tel',
 'informatik_agn',
 'koelln_strasse',
 'mail_hamburg',
 'regards_vesselin',
 'request_vogt',
 'vesselin_bontchev',
 'vesselin_vladimirov',
 'virus_t',
 'anymore_weird',
 'cont_education',
 'david_veal',
 'elevator_shaft',
 'vice_versa',
 'continuing_education',
 'self_defense',
 'tennessee_division',
 'colorado_boulder',
 'mere_presence',

 'sun_microsystem',
 'stand_alone',
 'lcd_display',
 'parallel_port',
 'east_coast',
 'jorge_lach',
 'public_access',
 'unix_brookline',
 'seventh_century',
 'nntp_posting',
 'case_western',
 'cleveland_oh',
 'host_hela',
 'ins_cwru',
 'reserve_university',
 'swap_file',
 'arizona_tucson',
 'config_sys',
 'virtual_memory',
 'permanent_swap',
 'martin_linsenbigler',
 'emm_exe',
 'nntp_post',
 'distribution_world',
 'vms_vnews',
 'judge_denied',
 'ruling_appear',
 'daniel_reitman',
 'oregon_uoregon',
 'massachusetts_institute',
 'aaron_bryce',
 'virtual_memory',
 'swiss_federal',
 'nntp_post',
 'distribution_world',
 'host_kelvin',
 'jet_propulsion',
 'jpl_nasa',
 'lab_telo',
 'new_zealand',
 'part_caterpillar',
 'part_vegetable',
 'ron_baalke',
 'vms_vnew',
 'thursday_april',
 'command_loss',
 'galileo_update',
 'gain_antenna',
 'op_command',
 'round_trip',
 'extreme_ultraviolet',
 'dn_volt',
 'loss_timer',
 'nntp_post',
 'carson_washington',
 'washington_seattle',
 'good_luck',
 'monda

 'uucp_uunet',
 'news_gateway',
 'angmar_cosmo',
 'frank_benson',
 'proline_internet',
 'distribution_usa',
 'computer_science',
 'semi_auto',
 'andy_freeman',
 'jason_kratz',
 'department_stanford',
 'texas_instrument',
 'fred_mccall',
 'allen_sherzer',
 'mary_shafer',
 'nasa_ame',
 'warrent_develop',
 'working_ssto',
 'moon_resident',
 'nntp_post',
 'radar_detector',
 'law_enforcement',
 'police_department',
 'nntp_post',
 'host_enterpoop',
 'ms_window',
 'works_fine',
 'boulder_co',
 'necessarily_reflect',
 'hard_disk',
 'floppy_drive',
 'hard_drive',
 'floppy_drif',
 'disk_driv',
 'nntp_post',
 'johns_hopkin',
 'homewood_academic',
 'summer_sublet',
 'nntp_post',
 'nntp_post',
 'texas_instrument',
 'andrew_molitor',
 'captain_crunch',
 'decoder_ring',
 'domino_theory',
 'mining_salt',
 'nntp_post',
 'research_scientist',
 'southern_california',
 'usc_isi',
 'video_card',
 'pasadena_ca',
 'los_angel',
 'law_enforcement',
 'sheriffs_department',
 'distribution_world',
 'video_card',


 'american_express',
 'self_defense',
 'law_abid',
 'andy_freeman',
 'gang_member',
 'jason_kratz',
 'department_stanford',
 'practicing_shoot',
 'shooting_range',
 'georgia_athen',
 'michael_covington',
 'terrance_heath',
 'often_wonder',
 'calgary_alberta',
 'man_retard',
 'sooooo_stuppid',
 'teams_ahve',
 'tie_breaker',
 'mount_royal',
 'nntp_post',
 'heavy_duty',
 'electronic_odometer',
 'caps_lock',
 'boulder_co',
 'weeks_ago',
 'public_access',
 'cs_dept',
 'access_unix',
 'denver_community',
 'denver_math',
 'nyx_public',
 'distribution_world',
 'new_york',
 'prime_minister',
 'united_state',
 'st_petersburg',
 'los_angele',
 'extermination_ohanus',
 'longer_exist',
 'might_serve',
 'mountain_pass',
 'serdar_argic',
 'single_turkish',
 'soul_sahak',
 'soviet_armenia',
 'human_right',
 'ottoman_empire',
 'eastern_anatolia',
 'armed_force',
 'international_agreement',
 'tartar_villag',
 'armenian_collaboration',
 'invading_russian',
 'muslim_population',
 'ohanus_appressian',
 'gu

 'merely_point',
 'intelligent_machine',
 'mcrcim_mcgill',
 'occupied_territorie',
 'jordan_river',
 'undeclared_war',
 'jonas_flygare',
 'joseph_weitz',
 'rabbi_shoham',
 'wb_revoke',
 'zionist_code',
 'distribution_usa',
 'new_york',
 'american_express',
 'ive_seen',
 'semi_auto',
 'gang_member',
 'carrying_revolver',
 'practicing_shoot',
 'regular_patrolman',
 'shooting_range',
 'peterborough_ontario',
 'grant_totten',
 'programmer_analyst',
 'nntp_post',
 'distribution_world',
 'southern_california',
 'los_angele',
 'caspian_usc',
 'los_angel',
 'six_month',
 'zhenghao_yeh',
 'nntp_post',
 'case_western',
 'reserve_university',
 'cleveland_ohio',
 'thor_in',
 'apple_ergo',
 'host_cunixb',
 'nntp_posting',
 'elias_davidsson',
 'middle_east',
 'peter_garfiel',
 'nazi_eugenic',
 'theories_circulat',
 'chris_metcalfe',
 'unconventional_proposal',
 'box_reykjavik',
 'catholic_church',
 'martin_luther',
 'alec_lee',
 'ftp_site',
 'speaker_sound',
 'denver_dept',
 'math_comp',
 'computer_

 'low_level',
 'isc_rit',
 'mfm_rll',
 'rely_upon',
 'houston_tx',
 'larry_overack',
 'absolute_truth',
 'velasco_jr',
 'virgilio_dean',
 'self_contradictory',
 'carol_alvin',
 'nntp_post',
 'oklahoma_norman',
 'ecn_uoknor',
 'manual_transmission',
 'boise_idaho',
 'nntp_post',
 'years_ago',
 'digital_equipment',
 'sean_mcmain',
 'burkhard_neidecker',
 'cec_karlsruhe',
 'nntp_post',
 'science_fiction',
 'space_billboard',
 'launch_vehicle',
 'robert_heinlein',
 'marvin_batty',
 'future_observer',
 'hideous_vision',
 'schwarzenegger_paint',
 'nntp_post',
 'hardy_washington',
 'seattle_mariner',
 'jim_lefebvre',
 'worst_manager',
 'nntp_post',
 'works_fine',
 'ms_do',
 'nntp_post',
 'gods_promise',
 'news_gateway',
 'mon_apr',
 'new_york',
 'white_sox',
 'cornell_univ',
 'cs_dept',
 'edward_ted',
 'ithaca_ny',
 'get_rid',
 'nl_east',
 'rec_sport',
 'major_league',
 'tsk_tsk',
 'red_sox',
 'pinch_runner',
 'date_thu',
 'baltimore_oriole',
 'cleveland_indian',
 'kansas_city',
 'minor_leagu

 'orville_wright',
 'distribution_world',
 'extermination_ohanus',
 'longer_exist',
 'might_serve',
 'mountain_pass',
 'serdar_argic',
 'single_turkish',
 'soul_sahak',
 'soviet_armenia',
 'cold_blood',
 'million_muslim',
 'turkish_genocide',
 'adolf_hitler',
 'peace_lov',
 'netcom_online',
 'msg_sensitivity',
 'alt_psychology',
 'carlton_place',
 'charter_member',
 'chips_world',
 'communications_service',
 'gourmet_chocolate',
 'infj_club',
 'infj_mean',
 'jkn_international',
 'jon_nor',
 'jon_noring',
 'livermore_ca',
 'login_gu',
 'anecdotal_evidence',
 'clinical_trial',
 'yeast_hypothesis',
 'xlib_xt',
 'stand_alone',
 'doug_dolven',
 'mel_hall',
 'nntp_post',
 '_',
 'gordon_bank',
 'osteopathic_medicine',
 'homeopathy_respectable',
 'ive_seen',
 'line_communication',
 'services_gu',
 'pc_geo',
 'pc_geos',
 'gerald_olchowy',
 'toronto_chemistry',
 'regular_season',
 'doug_gilmour',
 'tampa_bay',
 'pat_burn',
 'nntp_post',
 'newsreader_tin',
 'second_amendment',
 'distribution_na',

 'braindead_driver',
 'herschel_mayo',
 'nntp_post',
 'host_carina',
 'nanci_ann',
 'horrible_death',
 'distribution_world',
 'clipper_chip',
 'white_house',
 'pgp_public',
 'graham_toal',
 'mantis_consultant',
 'tony_lezard',
 'sore_thumb',
 'nntp_post',
 'motif_widget',
 'bugsbunny_synoptic',
 'ken_lee',
 'synoptics_communication',
 'man_retard',
 'sooooo_stuppid',
 'teams_ahve',
 'tie_breaker',
 'waterloo_ontario',
 'mortice_kern',
 'nntp_post',
 'new_york',
 'power_supply',
 'original_packag',
 'isc_rit',
 'worlds_larg',
 'indians_serie',
 'jays_vs',
 'runs_scor',
 'david_tate',
 'bruce_klopfenstein',
 'distribution_usa',
 'white_house',
 'bill_clinton',
 'aint_charity',
 'mark_wilson',
 'mob_call',
 'mob_rule',
 'money_wilson',
 'ncr_engineer',
 'prettier_merely',
 'someone_els',
 'block_grant',
 'years_ago',
 'anywhere_near',
 'new_york',
 'new_zealand',
 'jesus_christ',
 'kept_secret',
 'united_state',
 'great_britain',
 'soviet_union',
 'nuclear_weapon',
 'united_stat',
 'washi

 'netcom_online',
 'communications_service',
 'login_gu',
 'feb_th',
 'search_warrant',
 'arrest_warrant',
 'distribution_world',
 'extermination_ohanus',
 'longer_exist',
 'might_serve',
 'mountain_pass',
 'serdar_argic',
 'single_turkish',
 'soul_sahak',
 'soviet_armenia',
 'panos_tamamidi',
 'clipper_chip',
 'key_escrow',
 'secret_algorithm',
 'encryption_wiretap',
 'brad_templeton',
 'clarinet_communication',
 'corp_sunnyvale',
 'georgia_institute',
 'capitals_mike',
 'emotional_music',
 'friedman_hrivnak',
 'go_hornet',
 'go_skin',
 'mike_patton',
 'kevin_dineen',
 'miami_colon',
 '_',
 'cs_dept',
 'new_testament',
 'salt_lake',
 'ibm_rs',
 'alan_terlep',
 'cardinal_ximenez',
 'rochester_mi',
 'new_testament',
 'mike_cobb',
 'white_house',
 'wiretap_chip',
 'law_enforcement',
 'nasa_ame',
 'attorney_general',
 'encryption_device',
 'curt_howland',
 'electronic_surveillance',
 'forfeiture_super',
 'justice_asset',
 'shall_utilize',
 'surplus_fund',
 'big_bubba',
 'prime_minister',


 'andrew_infante',
 'dod_joan',
 'north_acpub',
 'youve_got',
 'richard_pierson',
 'plus_shipp',
 'line_communication',
 'services_gu',
 'best_offer',
 'distribution_world',
 'benedikt_rosenau',
 'pgp_public',
 'mats_andtbacka',
 'weak_atheism',
 'last_modifi',
 'last_resort',
 'mantis_consultant',
 'tony_lezard',
 'austin_texas',
 'unisql_inc',
 'dale_adam',
 'brian_hughe',
 'ns_simm',
 'ns_simms',
 'arts_comic',
 'dartmouth_college',
 'moderator_rec',
 'nntp_post',
 'west_bank',
 'research_centre',
 'islam_border',
 'absentee_landlord',
 'ilyess_bdira',
 'palestine_mandate',
 'united_nation',
 'intelligent_machine',
 'mcrcim_mcgill',
 'netcom_online',
 'weeks_ago',
 'communications_service',
 'login_gu',
 'distribution_na',
 'bell_laboratorie',
 'robert_nichol',
 'host_cunixb',
 'nntp_posting',
 'hewlett_packard',
 'nntp_post',
 'flame_war',
 'jet_propulsion',
 'clinton_administration',
 'space_station',
 'white_house',
 'thursday_april',
 'budget_cut',
 'decades_ago',
 'remote_sens'

 'darren_gibbon',
 'opel_gt',
 'opel_owner',
 'sporty_look',
 'hard_drive',
 'software_engineer',
 'space_telescope',
 'baltimore_md',
 'nntp_post',
 'case_western',
 'cleveland_oh',
 'ins_cwru',
 'reserve_university',
 'host_slc',
 'andrew_spencer',
 'balance_shaft',
 'distribution_usa',
 'years_ago',
 'side_effect',
 'avoid_recurrence',
 'crohns_disease',
 'crohns_ibd',
 'fresh_vegetable',
 'intestinal_lin',
 'intestinal_lining',
 'john_eyle',
 'lipoxygenase_inhibitor',
 'physical_therapist',
 'nntp_post',
 'youve_got',
 'mail_sa',
 'pennsylvania_school',
 'alan_sepinwall',
 'sports_radio',
 'kenneth_gilbert',
 'medicine_dammit',
 'donald_mackie',
 'nntp_post',
 'works_fine',
 'config_sy',
 'human_stupidity',
 'swap_file',
 'win_ini',
 'princeton_planetary',
 'carlosn_carlo',
 'phoenix_princeton',
 'carlos_niederstrass',
 'nntp_posting',
 'cellular_infrastructure',
 'motorola_inc',
 'speedy_mercer',
 'david_svoboda',
 'ama_dod',
 'cog_chicago',
 'concours_mmmmmmmmmm',
 'dave_svoboda'

 'go_ahead',
 'line_communication',
 'services_gu',
 'black_market',
 'crypto_anarchy',
 'digital_pseudonyms',
 'knowledge_reputation',
 'mailsafe_available',
 'markets_apto',
 'melbourne_australia',
 'monash_university',
 'ms_window',
 'michael_panayiotaki',
 'melb_australia',
 'grp_file',
 'exit_code',
 'my_int_var',
 'works_fine',
 'penn_state',
 'frank_crary',
 'internal_passport',
 'six_month',
 'emergency_medical',
 'paul_havemann',
 'newsreader_tin',
 'opel_owner',
 'matthew_macintyre',
 'serial_number',
 'sales_tax',
 'voice_fax',
 'caveat_emptor',
 'nntp_post',
 'science_fiction',
 'santa_cruz',
 'marcus_lindroo',
 'plus_shipping',
 'nntp_post',
 'lance_colostate',
 'tim_clock',
 'basil_hamdan',
 'tv_station',
 'flame_thrower',
 'carlos_carrion',
 'inflaming_passion',
 'costa_mesa',
 'candida_albican',
 'steve_dyer',
 'fair_amount',
 'consulting_cambridge',
 'harvard_rayssd',
 'linus_spdcc',
 'daily_basis',
 'anal_retentive',
 'nntp_posting',
 'nick_pettefar',
 'tin_version',


 'richard_warner',
 'ftp_cica',
 'math_cs',
 'jorge_lach',
 'al_devilbiss',
 'computing_center',
 'hi_r',
 'hi_re',
 'jeff_hite',
 'service_provider',
 'nntp_post',
 'wants_convertible',
 'keith_nuetzman',
 'host_cunixb',
 'nntp_posting',
 'peter_garfiel',
 'distribution_usa',
 'please_respond',
 'vms_vnews',
 'flight_center',
 'shuttle_launch',
 'nntp_post',
 'cs_dept',
 'distribution_inet',
 'homeopathy_tradition',
 'avoiding_mistak',
 'lee_lady',
 'sci_med',
 'sci_psychology',
 'russell_turpin',
 'errors_turpin',
 'nntp_post',
 'vesa_local',
 'video_card',
 'local_bu',
 'oklahoma_norman',
 'diamond_stealth',
 'mb_ram',
 'ecn_uoknor',
 'ati_graphic',
 'stealth_vlb',
 'years_ago',
 'policy_research',
 'united_state',
 'cdp_nf',
 'cdp_uucp',
 'cpr_apr',
 'human_right',
 'nf_id',
 'human_being',
 'elias_davidsson',
 'middle_east',
 'unconventional_peace',
 'peaceful_solution',
 'mixed_marriage',
 'mixed_marriag',
 'mixed_stock',
 'box_reykjavik',
 'proposal_unconventional',
 'affirmativ

 'art_microcircuit',
 'cabinet_official',
 'consider_incorporat',
 'frequent_consultation',
 'global_marketplace',
 'images_hdtv',
 'information_superhighway',
 'mat_heyman',
 'permitting_wider',
 'presidents_directive',
 'wireless_communication',
 'totally_unorganized',
 'nntp_post',
 'distribution_world',
 'works_fine',
 'vms_vnew',
 'rosie_uh',
 'data_strobe',
 'motorola_xc',
 'nntp_post',
 'newsreader_tin',
 '_',
 'athena_widget',
 'xlib_xt',
 'comp_sy',
 'ibm_rs',
 'mix_gl',
 'suresh_thennarangam',
 'larry_pyeatt',
 'nntp_post',
 'keith_allan',
 'political_atheist',
 'case_western',
 'reserve_university',
 'bobby_mozumder',
 'keith_ryan',
 'student_cwru',
 'virile_man',
 'amateur_radio',
 'apr_gmt',
 'radio_shack',
 'douglas_rand',
 'osf_motif',
 'ham_radio',
 'randall_rhea',
 'radio_operator',
 'signal_surge',
 'nntp_post',
 'distribution_world',
 'nntp_posting',
 'robert_hite',
 'philadelphia_phillie',
 'pitching_staff',
 'spring_train',
 'nntp_post',
 'latest_driver',
 'new_yor

 'dave_duff',
 'nntp_post',
 'host_enterpoop',
 'windows_nt',
 'adam_adam',
 'distribution_usa',
 'digital_equipment',
 'opinions_expressed',
 'sb_pro',
 'henry_spencer',
 'allen_sherzer',
 'coffee_churchill',
 'evil_genius',
 'cost_estimate',
 'warrent_develop',
 'working_ssto',
 'moon_resident',
 'allen_lady',
 'brian_kendig',
 'holy_spirit',
 'gods_promise',
 'san_francisco',
 'brian_ceccarelli',
 'comme_aucun',
 'croire_netre',
 'ends_rousseau',
 'pas_mieux',
 'que_jai',
 'suis_fait',
 'vus_jose',
 'starfleet_headquarter',
 'sampling_rate',
 'nntp_post',
 'okcforum_osrhe',
 'okcforum_unix',
 'islamic_authority',
 'bill_conner',
 'computer_science',
 '_',
 'wiretap_chip',
 'graham_toal',
 'corporate_acceptance',
 'nntp_post',
 'distribution_world',
 'los_angel',
 'dev_null',
 'user_interface',
 'event_handler',
 'virtual_reality',
 'held_responsible',
 'floppy_disk',
 'distribution_world',
 'clipper_chip',
 'serial_number',
 'session_key',
 'key_escrow',
 'dorothy_denn',
 'escrow_ag

 'hp_hut',
 'makes_sense',
 'gerald_olchowy',
 'toronto_chemistry',
 'press_conference',
 'nelson_lu',
 'mike_keenan',
 'mark_messier',
 'rangers_messier',
 'nntp_post',
 'internet_uucp',
 'isc_rit',
 'vancouver_canada',
 'link_british',
 'tammy_healy',
 'walla_walla',
 'apr_gmt',
 'benedikt_rosenau',
 'roman_catholic',
 'generally_accept',
 'middle_ag',
 'msg_sensitivity',
 'flight_center',
 'nasa_goddard',
 'nntp_post',
 'nsw_australia',
 'ray_traced',
 'nntp_post',
 'federal_government',
 'new_york',
 'united_state',
 'los_angele',
 'talk_politic',
 'po_box',
 'reagan_bush',
 'thirty_year',
 'adolf_hitler',
 'mein_kampf',
 'rush_limbaugh',
 'socialized_medicine',
 'gulf_war',
 'aryan_race',
 'judeo_christian',
 'divinely_inspir',
 'matt_freivald',
 'case_western',
 'practical_purpos',
 'cleveland_ohio',
 'human_being',
 'strongly_suspect',
 'applied_physic',
 'graduate_student',
 'absolute_truth',
 'electrical_engg',
 'mans_intimidat',
 'velasco_jr',
 'virgilio_dean',
 'wannabee_bul

In [11]:
# Look up the token stored under integer id 0 in id2word (the id -> token mapping used by the corpus cells below)
id2word[0]

'addition'

In [12]:
# Decode the first document's bag-of-words into readable (token, count) pairs
[
    [(id2word[token_id], count) for token_id, count in bow]
    for bow in corpus[:1]
]

[[('addition', 1),
  ('anyone', 2),
  ('body', 1),
  ('bricklin', 1),
  ('bring', 1),
  ('call', 1),
  ('car', 5),
  ('could', 1),
  ('day', 1),
  ('door', 2),
  ('early', 1),
  ('engine', 1),
  ('enlighten', 1),
  ('front_bumper', 1),
  ('funky', 1),
  ('history', 1),
  ('host', 1),
  ('info', 1),
  ('know', 1),
  ('late', 1),
  ('lerxst', 1),
  ('line', 1),
  ('look', 2),
  ('mail', 1),
  ('make', 1),
  ('maryland_college', 1),
  ('model', 1),
  ('name', 1),
  ('neighborhood', 1),
  ('nntp_post', 1),
  ('organization', 1),
  ('park', 1),
  ('production', 1),
  ('rac_wam', 1),
  ('really', 1),
  ('rest', 1),
  ('s', 1),
  ('see', 1),
  ('separate', 1),
  ('small', 1),
  ('spec', 1),
  ('sport', 1),
  ('tellme', 1),
  ('thank', 1),
  ('thing', 1),
  ('umd', 1),
  ('university', 1),
  ('whatev', 1),
  ('where', 1),
  ('wonder', 1),
  ('year', 1)]]

In [13]:
%%time
# This cell takes about 4 minutes to run on my machine.  -j

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

CPU times: user 8min 20s, sys: 1min 22s, total: 9min 42s
Wall time: 4min 8s


In [14]:
# Print the top keywords (10 per topic, the print_topics default) for every topic.
# num_topics=-1 asks gensim for all topics instead of relying on the default cap
# of 20, which only happens to equal this model's num_topics=20 and would show
# just a subset if the model were retrained with more topics.
pprint(lda_model.print_topics(num_topics=-1))

# Apply the trained model to the whole corpus (per-document topic assignments).
doc_lda = lda_model[corpus]

[(0,
  '0.094*"space" + 0.044*"image" + 0.025*"display" + 0.021*"blue" + '
  '0.019*"earth" + 0.018*"print" + 0.017*"research" + 0.016*"project" + '
  '0.016*"satellite" + 0.015*"tank"'),
 (1,
  '0.050*"food" + 0.045*"bus" + 0.044*"specifically" + 0.036*"msg" + '
  '0.030*"eat" + 0.030*"motorcycle" + 0.025*"ride" + 0.018*"confuse" + '
  '0.016*"entry" + 0.016*"shift"'),
 (2,
  '0.029*"use" + 0.027*"system" + 0.021*"window" + 0.018*"card" + 0.016*"file" '
  '+ 0.016*"run" + 0.014*"program" + 0.013*"problem" + 0.012*"also" + '
  '0.011*"need"'),
 (3,
  '0.080*"game" + 0.076*"team" + 0.045*"play" + 0.045*"win" + 0.044*"player" + '
  '0.036*"hockey" + 0.030*"season" + 0.026*"contact" + 0.020*"goal" + '
  '0.020*"fan"'),
 (4,
  '0.112*"circuit" + 0.018*"stable" + 0.012*"cam" + 0.011*"chemistry" + '
  '0.010*"scout" + 0.007*"tran" + 0.006*"arabic" + 0.005*"randomly" + '
  '0.004*"axis" + 0.000*"tablet"'),
 (5,
  '0.129*"gun" + 0.052*"_" + 0.036*"weapon" + 0.025*"discipline" + '
  '0.023*"cri

In [15]:
%%time

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
print('\n')


Perplexity:  -14.792846126242049

Coherence Score:  0.5025181203779839


CPU times: user 27.2 s, sys: 5.3 s, total: 32.5 s
Wall time: 28 s


In [16]:
%%time

# Visualize the topics
#
# If you get an error like this: "pyLDAvis/_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. 
# A future version of pandas will change to not sort by default."
#
# then from the command line do: "pip install pandas==0.21.0"

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)  # sort=False ? sort=True
vis
print('\n')



CPU times: user 9.56 s, sys: 2.29 s, total: 11.8 s
Wall time: 10.9 s


In [17]:
# Disabled debugging aid: presumably used to check the installed pandas version
# when chasing the pyLDAvis FutureWarning mentioned in the previous cell.
#import pandas as pd
#pd.__version__

# Bare final expression so the notebook displays the pyLDAvis object prepared in the previous cell.
vis