In [2]:
# import necessary packages
import pandas as pd
import re
import spacy
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
import nltk.corpus as nltkcorpus
from gensim.models import CoherenceModel


In [3]:
# Load the newsgroups dataset and preview its first rows.
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.head())

# The `content` column is full of e-mail addresses, newlines and runs of
# whitespace, so every post is cleaned with three regular-expression passes,
# applied in this order: drop e-mail-like tokens, collapse whitespace to a
# single space, then remove single quotes.
content = []
for raw_post in df.content.values.tolist():
    without_emails = re.sub(r'\S*@\S*\s?', '', raw_post)
    single_spaced = re.sub(r'\s+', ' ', without_emails)
    content.append(re.sub(r"\'", "", single_spaced))




                                                content  target  \
0     From: lerxst@wam.umd.edu (where's my thing)\nS...       7   
1     From: guykuo@carson.u.washington.edu (Guy Kuo)...       4   
10    From: irwin@cmptrc.lonestar.org (Irwin Arnstei...       8   
100   From: tchen@magnus.acs.ohio-state.edu (Tsung-K...       6   
1000  From: dabl2@nlm.nih.gov (Don A.B. Lindbergh)\n...       2   

                 target_names  
0                   rec.autos  
1       comp.sys.mac.hardware  
10            rec.motorcycles  
100              misc.forsale  
1000  comp.os.ms-windows.misc  


In [4]:
# further pre-processing (tokenization and related clean-up) using gensim
def tokenize(sentences):
    """Lazily tokenize each document with gensim's ``simple_preprocess``.

    Every item of ``sentences`` is converted with ``str()`` and handed to
    ``simple_preprocess``, which yields one list of tokens per document.
    ``deacc=True`` asks gensim to strip accent marks from the tokens (it is
    the tokenization itself, not ``deacc``, that discards punctuation).

    Yields:
        list[str]: the tokens of the next document, in input order.
    """
    yield from (simple_preprocess(str(document), deacc=True) for document in sentences)


# Tokenize every cleaned post (there were 11314 rows) and print the first
# tokenized row to see how the data has been processed.
process_content = [tokens for tokens in tokenize(content)]
print(process_content[0:1])

# Next step: build the bigram and trigram models using gensim.
# A bigram is 2 words frequently occurring together in the documents; a
# trigram is 3 words frequently occurring together. The trigram model is
# trained on the corpus after the bigram model has been applied to it.
bigram = gensim.models.Phrases(
    process_content,
    min_count=5,
    threshold=100,
)
trigram = gensim.models.Phrases(
    bigram[process_content],
    threshold=100,
)

# Wrap the trained models in Phraser objects: the faster way to get a
# sentence clubbed as a bigram/trigram.
bigram_mode = gensim.models.phrases.Phraser(bigram)
trigram_mode = gensim.models.phrases.Phraser(trigram)


[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]




In [5]:
# Further pre-processing setup: stop-word removal, bigram creation and
# lemmatization all rely on the objects created here.

# Stop-word list: NLTK's English stop words plus a few extra tokens that
# should also be dropped from the posts.
stop_words = nltkcorpus.stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Only `token.pos_` and `token.lemma_` are read from the spaCy documents in
# this notebook, so the dependency parser and the named-entity recognizer are
# disabled to avoid running pipeline components whose output is never used.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def stopwords_remove(text):
    """Remove stop words from every document.

    Args:
        text: iterable of documents. Each document is converted with ``str()``
            and re-tokenized by ``simple_preprocess`` before filtering (the
            notebook passes already-tokenized lists of words).

    Returns:
        list[list[str]]: for each input document, in order, the tokens that
        are not present in the module-level ``stop_words`` list.
    """
    # `stop_words` is a list, so `in` would scan it for every single token.
    # Snapshot it into a set once per call for constant-time lookups; the
    # filtering result is the same.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in text
    ]

def make_bigrams(text):
    """Apply the trained bigram phraser (``bigram_mode``) to each document.

    Returns a list with one transformed document per item of ``text``, in the
    same order.
    """
    phrased_docs = []
    for tokens in text:
        phrased_docs.append(bigram_mode[tokens])
    return phrased_docs

def make_trigrams(text):
    """Apply the bigram phraser and then the trigram phraser to each document.

    Returns a list with one transformed document per item of ``text``, in the
    same order.
    """
    phrased_docs = []
    for tokens in text:
        with_bigrams = bigram_mode[tokens]
        phrased_docs.append(trigram_mode[with_bigrams])
    return phrased_docs

def do_lemmatization(text, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize documents, keeping only tokens with an allowed POS tag.

    Args:
        text: iterable of token lists. Each list is joined with single spaces
            and processed by the module-level spaCy pipeline ``nlp``.
        allowed_postags: collection of spaCy part-of-speech tags, compared
            against ``token.pos_``. Tokens whose tag is not in this collection
            are dropped. The default is an immutable tuple so that it cannot
            be modified by accident and shared between calls.

    Returns:
        list[list[str]]: for each input document, in order, the lemmas
        (``token.lemma_``) of the tokens that were kept.
    """
    joined_docs = (" ".join(sent) for sent in text)
    # nlp.pipe streams the texts through the pipeline in batches and yields
    # the processed documents in input order; this avoids the per-call
    # overhead of invoking nlp() once for every document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]


In [6]:
# Run the pre-processing helpers: drop the stop words first, then merge
# frequent word pairs into bigrams.
content_withoutStopwords = stopwords_remove(process_content)
content_bigram = make_bigrams(content_withoutStopwords)

# Lemmatize the text, allowing only nouns, adjectives, verbs and adverbs.
kept_pos_tags = ['NOUN', 'ADJ', 'VERB', 'ADV']
content_lemmatize = do_lemmatization(content_bigram, allowed_postags=kept_pos_tags)

print(content_lemmatize[0:1])

[['where', 's', 'thing', 'car', 'nntp_poste', 'host', 'umd', 'organization', 'university', 'maryland_college', 'park', 'line', 'wonder', 'anyone', 'could', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'bricklin', 'door', 'really', 'small', 'addition', 'front_bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'specs', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]


In [7]:
# Finally, create the two inputs needed for LDA topic modelling: the
# dictionary built from the lemmatized documents and the corpus obtained by
# converting every document with the dictionary's doc2bow.
id2word = corpora.Dictionary(content_lemmatize)
_corpus = list(map(id2word.doc2bow, content_lemmatize))

# Next step: train the LDA model from gensim. The settings are collected in
# one place so they are easy to read and tune.
lda_settings = dict(
    corpus=_corpus,
    id2word=id2word,
    num_topics=20,
    random_state=50,
    update_every=1,
    chunksize=150,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
_ldamodel = gensim.models.ldamodel.LdaModel(**lda_settings)



In [9]:
# Compute the 'c_v' coherence score to judge how good the trained topic
# model is.
_coherence_model_lda = CoherenceModel(
    model=_ldamodel,
    texts=content_lemmatize,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = _coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.5397608545029745



After this, we can tackle further tasks such as
visualizing the topic words,
improving the coherence score,
finding the most representative document for each topic,
finding the dominant topic in each sentence,
and many more complex NLP modeling tasks.

