In [1]:
import pandas as pd
import numpy as np

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

# Seed NumPy's global random generator so later stochastic steps start from a fixed state.
np.random.seed(400)

import nltk
# Make sure the WordNet corpus is present; the WordNetLemmatizer used during
# preprocessing looks words up in it.
nltk.download('wordnet')

from sklearn.datasets import fetch_20newsgroups

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\RaviVerma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Training split of the 20 Newsgroups corpus, with document order shuffled.
train = fetch_20newsgroups(subset="train", shuffle=True)

In [3]:
# Test split of the 20 Newsgroups corpus, with document order shuffled.
test = fetch_20newsgroups(subset="test", shuffle=True)

In [4]:
# Display the category names of the dataset.
list(train.target_names)

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
# Display one raw training document (index 5) before any preprocessing.
train.data[5:6]

['From: dfo@vttoulu.tko.vtt.fi (Foxvog Douglas)\nSubject: Re: Rewording the Second Amendment (ideas)\nOrganization: VTT\nLines: 58\n\nIn article <1r1eu1$4t@transfer.stratus.com> cdt@sw.stratus.com (C. D. Tavares) writes:\n>In article <1993Apr20.083057.16899@ousrvr.oulu.fi>, dfo@vttoulu.tko.vtt.fi (Foxvog Douglas) writes:\n>> In article <1qv87v$4j3@transfer.stratus.com> cdt@sw.stratus.com (C. D. Tavares) writes:\n>> >In article <C5n3GI.F8F@ulowell.ulowell.edu>, jrutledg@cs.ulowell.edu (John Lawrence Rutledge) writes:\n>\n>> >> The massive destructive power of many modern weapons, makes the\n>> >> cost of an accidental or crimial usage of these weapons to great.\n>> >> The weapons of mass destruction need to be in the control of\n>> >> the government only.  Individual access would result in the\n>> >> needless deaths of millions.  This makes the right of the people\n>> >> to keep and bear many modern weapons non-existant.\n\n>> >Thanks for stating where you\'re coming from.  Needless to 

In [6]:
# English Snowball stemmer; used as a notebook-level global by grammer().
stemmer = SnowballStemmer("english")

In [7]:
# Shared lemmatizer instance: constructing a new WordNetLemmatizer for every
# token is loop-invariant work, so build it once and reuse it.
_lemmatizer = WordNetLemmatizer()


def grammer(text):
    """Reduce a single token to its stemmed, verb-lemmatized form.

    The token is first lemmatized as a verb (``pos='v'``) with WordNet, and
    the result is then stemmed with the notebook-level Snowball ``stemmer``.

    Parameters
    ----------
    text : str
        One token (word) to normalize.

    Returns
    -------
    str
        The stem of the verb lemma of ``text``.
    """
    return stemmer.stem(_lemmatizer.lemmatize(text, pos='v'))

def new_doc_process(text):
    """Tokenize ``text`` and return its normalized tokens as a list.

    Tokens come from gensim's ``simple_preprocess``. Stopwords and tokens of
    three characters or fewer are discarded; every remaining token is passed
    through ``grammer``.
    """
    return [
        grammer(word)
        for word in simple_preprocess(text)
        if len(word) > 3 and word not in STOPWORDS
    ]

In [8]:
# Preprocess every training document into its list of normalized tokens.
new_docs = [new_doc_process(raw_text) for raw_text in train.data]

In [11]:
# Display the processed tokens of the training document at index 5.
new_docs[5:6]

[['vttoulu',
  'foxvog',
  'dougla',
  'subject',
  'reword',
  'second',
  'amend',
  'idea',
  'organ',
  'line',
  'articl',
  'transfer',
  'stratus',
  'stratus',
  'tavar',
  'write',
  'articl',
  'ousrvr',
  'oulu',
  'vttoulu',
  'foxvog',
  'dougla',
  'write',
  'articl',
  'transfer',
  'stratus',
  'stratus',
  'tavar',
  'write',
  'articl',
  'ulowel',
  'ulowel',
  'jrutledg',
  'ulowel',
  'john',
  'lawrenc',
  'rutledg',
  'write',
  'massiv',
  'destruct',
  'power',
  'modern',
  'weapon',
  'make',
  'cost',
  'accident',
  'crimial',
  'usag',
  'weapon',
  'great',
  'weapon',
  'mass',
  'destruct',
  'need',
  'control',
  'govern',
  'individu',
  'access',
  'result',
  'needl',
  'death',
  'million',
  'make',
  'right',
  'peopl',
  'bear',
  'modern',
  'weapon',
  'exist',
  'thank',
  'state',
  'come',
  'needl',
  'disagre',
  'count',
  'believ',
  'individu',
  'right',
  'weapon',
  'mass',
  'destruct',
  'hard',
  'believ',
  'support',
  'neigh

In [12]:
# Build the gensim Dictionary (integer id <-> token mapping) from the tokenized training documents.
dictionary = gensim.corpora.Dictionary(new_docs)

In [14]:
# Peek at the first six (id, token) pairs stored in the dictionary.
count = 0
for count, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if count > 5:
        break

0 addit
1 bodi
2 bricklin
3 bring
4 bumper
5 call


In [15]:
# Prune the vocabulary in place: drop tokens that appear in fewer than 10
# documents or in more than 10% of all documents, then keep at most the
# 100,000 most frequent remaining tokens.
dictionary.filter_extremes(
    no_below=10,
    no_above=0.1,
    keep_n=100_000,
)

In [16]:
# Convert each tokenized document into its bag-of-words representation.
bow_corpus = [dictionary.doc2bow(tokens) for tokens in new_docs]

In [17]:
# Train an 8-topic LDA model on the bag-of-words corpus.
# random_state pins the model's own RNG (same value as the NumPy seed set in
# the first cell), so re-running this cell no longer depends on how much of
# the global random stream earlier cells have consumed. Results can still
# vary slightly because the worker processes are scheduled by the OS.
model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=8,
    id2word=dictionary,
    passes=15,
    workers=2,
    random_state=400,
)

In [18]:
# Print the top words of every topic in the trained model.
for topic_id, top_words in model.print_topics(num_topics=-1):
    print(f"Topic: {topic_id} \nWords: {top_words}")
    print("\n")

Topic: 0 
Words: 0.012*"christian" + 0.008*"jesus" + 0.006*"exist" + 0.005*"moral" + 0.005*"bibl" + 0.005*"word" + 0.005*"religion" + 0.004*"church" + 0.004*"life" + 0.004*"claim"


Topic: 1 
Words: 0.014*"window" + 0.014*"file" + 0.009*"program" + 0.006*"chip" + 0.006*"encrypt" + 0.006*"version" + 0.005*"imag" + 0.005*"avail" + 0.005*"softwar" + 0.005*"graphic"


Topic: 2 
Words: 0.018*"game" + 0.015*"team" + 0.011*"play" + 0.010*"player" + 0.007*"hockey" + 0.006*"season" + 0.005*"leagu" + 0.005*"score" + 0.004*"basebal" + 0.003*"divis"


Topic: 3 
Words: 0.009*"sale" + 0.007*"presid" + 0.007*"price" + 0.006*"sell" + 0.005*"money" + 0.004*"list" + 0.004*"offer" + 0.004*"bike" + 0.004*"program" + 0.004*"clinton"


Topic: 4 
Words: 0.007*"ohio" + 0.006*"pitt" + 0.006*"cleveland" + 0.006*"cwru" + 0.005*"food" + 0.005*"bank" + 0.005*"freenet" + 0.005*"colorado" + 0.005*"gordon" + 0.005*"scienc"


Topic: 5 
Words: 0.010*"govern" + 0.007*"armenian" + 0.006*"israel" + 0.005*"kill" + 0.005*"i

In [19]:
# Pick one document from the test split (index 50) and show its raw text.
num = 50
test_document = test.data[num]
print(test_document)

From: carter@ecf.toronto.edu (CARTER EDWARD A)
Subject: Re: Good Reasons to Wave at each other
Organization: University of Toronto, Engineering Computing Facility
Lines: 19

jlevine@rd.hydro.on.ca (Jody Levine) writes:
>Has anyone, while driving a cage, ever waved at bikers? I get the urge,
>but I've never actually done it.

Oh yeah, all the time.  On a nice spring/summer day, I roll down the window
and drive around looking for bikes.  When a bike motors by in the opposite
direction, I stick my arm out and hi5'em.  My arm feels like a million 
bucks when I'm doing this a 60km/h.  I do the same thing with cyclists.
The only problem with hi5ing a cyclist is their always in the right hand lane.
I hafta roll down the other window and hi5 them on the back.  Oh well, I 
think they appreciate the thought. 

Regards, Ted.

---
University of Toronto Computer Engineering               
PowerUsersGroupChairman
'89 FZR600: I'm taking a ride with my best friend.                  DoD#:886699




In [26]:
# Infer the topic mixture of the test document and list the topics from the
# highest score to the lowest, each with its five strongest words.
bow_vector = dictionary.doc2bow(new_doc_process(test_document))
ranked_topics = sorted(model[bow_vector], key=lambda pair: pair[1], reverse=True)
for topic_id, topic_score in ranked_topics:
    print(f"Score: {topic_score}\n Topic: {model.print_topic(topic_id, 5)}\n")

Score: 0.39201977849006653
 Topic: 0.017*"drive" + 0.007*"scsi" + 0.006*"control" + 0.006*"power" + 0.006*"speed"

Score: 0.3341057002544403
 Topic: 0.009*"sale" + 0.007*"presid" + 0.007*"price" + 0.006*"sell" + 0.005*"money"

Score: 0.14700612425804138
 Topic: 0.007*"ohio" + 0.006*"pitt" + 0.006*"cleveland" + 0.006*"cwru" + 0.005*"food"

Score: 0.11621055006980896
 Topic: 0.014*"window" + 0.014*"file" + 0.009*"program" + 0.006*"chip" + 0.006*"encrypt"

