In [1]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset="train",shuffle=True)
newsgroups_test = fetch_20newsgroups(subset="test",shuffle=True)

In [2]:
print(list(newsgroups_train.target_names))

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


## Now Data preprocessing to feed to LDA model
The data will be be processed in the following steps
- Tokenization : Split the text into sentence and sentences into words,also all words are lowered and punctuation is removed
- All stopwords are removed, i.e helping verbs(mostly)
- Words are lemmatized : words in third person are changed to first person and words in past and future tenses are changed to present
- Words are stemmed : words are converted to there base class like dancing is converted to danc which is root of the word

In [5]:
!pip install gensim



In [26]:
import numpy as np 
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer,SnowballStemmer
from nltk.stem.porter import *

In [28]:
simple_preprocess("I tried so hard ")

['tried', 'so', 'hard']

In [8]:
np.random.seed(42)

In [14]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/pranavprajapati/nltk_data...


True

### Lemmatizer example

In [15]:
print(WordNetLemmatizer().lemmatize('ran',pos='v'))

run


### Stemmer example

In [18]:
import pandas as pd
stemmer = SnowballStemmer('english')
original_words = ["dancing", "loving" , "horses","ran", "sensational", "beautifully", "flies", "dies"]
base_words = [stemmer.stem(words) for words in original_words]
pd.DataFrame(data={"orginal_words":original_words,"base_words":base_words})

Unnamed: 0,orginal_words,base_words
0,dancing,danc
1,loving,love
2,horses,hors
3,ran,ran
4,sensational,sensat
5,beautifully,beauti
6,flies,fli
7,dies,die


In [31]:
"""
    Function to preprocess data in the entire dataset
"""
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text,pos='v'))

def preprocessor(text):
    result =[]
    for token in simple_preprocess(text):
        if token is not STOPWORDS and len(token)>3:
            result.append(lemmatize_stemming(token))
    return result

In [32]:
"""
    Test the function before pre-processing on the data 
"""
sent_sample = "I tried so hard and got so far, in the end it doesn't even matter"
words = []
for word in sent_sample.split(' '):
    words.append(word)
    
print(words)
#Tokenizing and running the preprocessing function
print(preprocessor(sent_sample))

['I', 'tried', 'so', 'hard', 'and', 'got', 'so', 'far,', 'in', 'the', 'end', 'it', "doesn't", 'even', 'matter']
['tri', 'hard', 'doesn', 'even', 'matter']


In [33]:
#Now preprocess you main data using the function
prepro_data = []
for doc in newsgroups_train.data:
    prepro_data.append(preprocessor(doc))

## Create a bag of words with the preprocessed dataset

In [36]:
"""
    Create a dictionary from preprocessed data containing the number of times word appears in the training dataset 
"""
from gensim.corpora import Dictionary
dictionary = Dictionary(prepro_data)

In [43]:
dictionary.filter_extremes(no_below=15,no_above=0.1,keep_n=10000)

In [48]:
"""
    Create a bag of words for each document 
"""
bow_corpus = [dictionary.doc2bow(doc) for doc in prepro_data]

In [49]:
'''
Preview BOW for our sample preprocessed document
'''
document_num = 20
bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0], 
                                                     dictionary[bow_doc_x[i][0]], 
                                                     bow_doc_x[i][1]))

Word 15 ("name") appears 1 time.
Word 19 ("rest") appears 1 time.
Word 116 ("rather") appears 1 time.
Word 173 ("clear") appears 1 time.
Word 352 ("refer") appears 1 time.
Word 366 ("true") appears 1 time.
Word 380 ("abov") appears 1 time.
Word 410 ("technolog") appears 1 time.
Word 463 ("christian") appears 1 time.
Word 479 ("exampl") appears 1 time.
Word 502 ("jew") appears 1 time.
Word 506 ("lead") appears 1 time.
Word 508 ("littl") appears 3 time.
Word 549 ("wors") appears 2 time.
Word 756 ("keith") appears 3 time.
Word 766 ("punish") appears 1 time.
Word 843 ("california") appears 1 time.
Word 905 ("institut") appears 1 time.
Word 964 ("similar") appears 1 time.
Word 1037 ("allan") appears 1 time.
Word 1038 ("anti") appears 1 time.
Word 1039 ("arriv") appears 1 time.
Word 1040 ("austria") appears 1 time.
Word 1041 ("caltech") appears 2 time.
Word 1042 ("distinguish") appears 1 time.
Word 1043 ("german") appears 1 time.
Word 1044 ("germani") appears 3 time.
Word 1045 ("hitler") app

In [51]:
from gensim.models import LdaMulticore

In [52]:
lda_model = LdaMulticore(bow_corpus,num_topics=8,
                        id2word=dictionary,
                        passes=10,
                        workers=2)

In [53]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")


Topic: 0 
Words: 0.009*"govern" + 0.006*"encrypt" + 0.005*"israel" + 0.005*"public" + 0.005*"secur" + 0.004*"isra" + 0.004*"chip" + 0.004*"presid" + 0.004*"clipper" + 0.004*"american"


Topic: 1 
Words: 0.014*"armenian" + 0.008*"turkish" + 0.007*"kill" + 0.005*"live" + 0.004*"greek" + 0.004*"turk" + 0.004*"down" + 0.004*"armenia" + 0.004*"turkey" + 0.004*"leav"


Topic: 2 
Words: 0.016*"window" + 0.012*"file" + 0.008*"program" + 0.007*"card" + 0.006*"version" + 0.006*"softwar" + 0.005*"imag" + 0.005*"graphic" + 0.005*"avail" + 0.005*"color"


Topic: 3 
Words: 0.017*"space" + 0.014*"drive" + 0.013*"nasa" + 0.009*"scsi" + 0.007*"control" + 0.007*"data" + 0.006*"orbit" + 0.006*"disk" + 0.005*"launch" + 0.004*"hard"


Topic: 4 
Words: 0.006*"wire" + 0.004*"power" + 0.004*"food" + 0.004*"caus" + 0.004*"pitt" + 0.004*"ohio" + 0.004*"medic" + 0.004*"effect" + 0.003*"water" + 0.003*"health"


Topic: 5 
Words: 0.010*"christian" + 0.007*"jesus" + 0.006*"exist" + 0.004*"moral" + 0.004*"bibl" + 0.

## Check the model on unseen data

In [57]:
num = 100
unseen_document = newsgroups_test.data[num]
print(unseen_document)

Subject: help
From: C..Doelle@p26.f3333.n106.z1.fidonet.org (C. Doelle)
Lines: 13

Hello All!

    It is my understanding that all True-Type fonts in Windows are loaded in
prior to starting Windows - this makes getting into Windows quite slow if you
have hundreds of them as I do.  First off, am I correct in this thinking -
secondly, if that is the case - can you get Windows to ignore them on boot and
maybe make something like a PIF file to load them only when you enter the
applications that need fonts?  Any ideas?


Chris

 * Origin: chris.doelle.@f3333.n106.z1.fidonet.org (1:106/3333.26)



In [59]:
# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(preprocessor(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.9717214107513428	 Topic: 0.016*"window" + 0.012*"file" + 0.008*"program" + 0.007*"card" + 0.006*"version"
