In [1]:
import pandas as pd
import re
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation
from gensim.corpora import Dictionary
from nltk.tokenize import word_tokenize
from gensim.models.ldamodel import LdaModel,CoherenceModel
#import pyLDAvis
#import pyLDAvis.gensim
from sklearn.decomposition import LatentDirichletAllocation as LDA

import matplotlib.pyplot as plt
%matplotlib inline



In [2]:
# Newsgroups sample stored as JSON on GitHub; it is fetched over the network
# each time this cell runs.
NEWSGROUPS_URL = 'https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json'
df = pd.read_json(NEWSGROUPS_URL)

In [3]:
# Preview the first rows of the freshly loaded frame (columns: content,
# target, target_names).
df.head()

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
10,From: irwin@cmptrc.lonestar.org (Irwin Arnstei...,8,rec.motorcycles
100,From: tchen@magnus.acs.ohio-state.edu (Tsung-K...,6,misc.forsale
1000,From: dabl2@nlm.nih.gov (Don A.B. Lindbergh)\n...,2,comp.os.ms-windows.misc


In [4]:
def removing_email(text):
    """Replace every e-mail-like token in ``text`` with a single space.

    A token is any run of non-whitespace characters containing ``@``. The
    whitespace character that follows it (if any) is consumed as well, so an
    address at the very end of the text is also removed.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        ``text`` with each matching token replaced by ``' '``.
    """
    # Raw string: '\S' / '\s' are invalid escapes in a normal string literal.
    # The trailing whitespace is optional so end-of-text addresses match too.
    return re.sub(r'\S*@\S*\s?', ' ', text)
def only_words(text):
    """Collapse each run of non-word characters in ``text`` into one space.

    Word characters are those matched by the regex class ``\\w`` (letters,
    digits and underscore); everything else, including punctuation and
    newlines, is treated as a separator.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        ``text`` with every run of non-word characters replaced by ``' '``.
    """
    # Raw string avoids the invalid-escape warning that '\W' triggers.
    return re.sub(r'\W+', ' ', text)

In [5]:
# Tokens discarded during cleaning: NLTK's English stop words (de-duplicated),
# every character in string.punctuation, and a few newline/dash artefacts.
# Requires the NLTK 'stopwords' corpus to be available locally.
stop_words = [
    *set(stopwords.words('english')),
    *punctuation,
    '\n', '----', '----\n\n\n\n\n',
]

# Single WordNet lemmatizer instance reused for every document.
lem = WordNetLemmatizer()

In [6]:

def cleaning(text):
    """Turn one document into a list of normalised tokens.

    The text is lower-cased and split with NLTK's ``word_tokenize``. Tokens
    that appear in the module-level ``stop_words`` collection or are shorter
    than three characters are dropped, and the survivors are lemmatized as
    verbs (``pos='v'``) with the module-level ``lem`` lemmatizer.

    Parameters
    ----------
    text : str
        Document text.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    # Build a set once per document so each token costs a hash lookup instead
    # of a linear scan over the stop_words list.
    blocked = set(stop_words)
    tokens = word_tokenize(text.lower())
    # Filtering happens before lemmatization, so stop words are matched on the
    # surface form of the token.
    return [
        lem.lemmatize(tok, 'v')
        for tok in tokens
        if len(tok) >= 3 and tok not in blocked
    ]

In [7]:
# Three-stage text pipeline; every stage is kept in its own column so the
# intermediate results can be inspected:
#   content -> e-mail tokens removed -> non-word characters collapsed -> tokens
df['without email'] = df['content'].map(removing_email)
df['only words'] = df['without email'].map(only_words)
df['clean content'] = df['only words'].map(cleaning)

In [8]:
# Inspect the frame again: it now also carries the 'without email',
# 'only words' and 'clean content' columns produced by the cleaning steps.
df.head()

Unnamed: 0,content,target,target_names,without email,only words,clean content
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos,From: (where's my thing)\nSubject: WHAT car i...,From where s my thing Subject WHAT car is this...,"[thing, subject, car, nntp, post, host, rac3, ..."
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware,From: (Guy Kuo)\nSubject: SI Clock Poll - Fin...,From Guy Kuo Subject SI Clock Poll Final Call ...,"[guy, kuo, subject, clock, poll, final, call, ..."
10,From: irwin@cmptrc.lonestar.org (Irwin Arnstei...,8,rec.motorcycles,From: (Irwin Arnstein)\nSubject: Re: Recommen...,From Irwin Arnstein Subject Re Recommendation ...,"[irwin, arnstein, subject, recommendation, duc..."
100,From: tchen@magnus.acs.ohio-state.edu (Tsung-K...,6,misc.forsale,From: (Tsung-Kun Chen)\nSubject: ** Software ...,From Tsung Kun Chen Subject Software forsale l...,"[tsung, kun, chen, subject, software, forsale,..."
1000,From: dabl2@nlm.nih.gov (Don A.B. Lindbergh)\n...,2,comp.os.ms-windows.misc,From: (Don A.B. Lindbergh)\nSubject: Diamond ...,From Don A B Lindbergh Subject Diamond SS24X W...,"[lindbergh, subject, diamond, ss24x, win, mous..."


In [9]:
# One token list per document, in row order, as a plain Python list.
clean_doc = df['clean content'].tolist()

In [10]:

# Map every distinct token in the cleaned documents to an integer id.
# NOTE(review): no vocabulary pruning is applied here; the fitted topics show
# header words ("subject", "line", "organization") and junk tokens ("max",
# "g9v") -- consider dictionary.filter_extremes(...) before building the
# corpus. Thresholds would need tuning.
dictionary = Dictionary(documents=clean_doc)

In [11]:
# Bag-of-words representation: one doc2bow result per cleaned document.
corpus = list(map(dictionary.doc2bow, clean_doc))

In [12]:
# Fit a 5-topic LDA model on the bag-of-words corpus. random_state is fixed
# so repeated runs start from the same seed.
ldamodel = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=50,
    chunksize=100,
    update_every=1,
    random_state=42,
)

In [13]:
# Print every topic as a (topic_id, 'weight*"term" + ...') pair listing its
# highest-weighted terms.
print(ldamodel.print_topics())

[(0, '0.013*"use" + 0.009*"line" + 0.008*"subject" + 0.007*"file" + 0.006*"card" + 0.006*"organization" + 0.006*"windows" + 0.005*"program" + 0.005*"drive" + 0.005*"system"'), (1, '0.009*"say" + 0.008*"would" + 0.007*"people" + 0.007*"one" + 0.006*"write" + 0.005*"know" + 0.005*"think" + 0.005*"make" + 0.005*"god" + 0.004*"jesus"'), (2, '0.101*"max" + 0.048*"g9v" + 0.012*"b8f" + 0.011*"a86" + 0.007*"bhj" + 0.007*"1d9" + 0.006*"giz" + 0.005*"2tm" + 0.005*"145" + 0.005*"75u"'), (3, '0.011*"line" + 0.011*"subject" + 0.011*"organization" + 0.008*"write" + 0.008*"article" + 0.008*"get" + 0.007*"post" + 0.007*"one" + 0.007*"would" + 0.006*"like"'), (4, '0.010*"game" + 0.010*"team" + 0.007*"year" + 0.006*"play" + 0.006*"hockey" + 0.006*"win" + 0.006*"line" + 0.005*"organization" + 0.005*"subject" + 0.005*"university"')]


In [14]:
# Per-word likelihood bound reported by gensim, evaluated here on the same
# corpus the model was trained on (so it is not a held-out estimate).
print(ldamodel.log_perplexity(corpus))

-8.81730806873834


In [15]:
# Set up topic-coherence evaluation with the 'c_v' measure, computed from the
# tokenised texts.
coherence = CoherenceModel(
    model=ldamodel,
    texts=clean_doc,
    dictionary=dictionary,
    coherence='c_v',
)

In [16]:
# Evaluate and display the score of the coherence model built in the
# previous cell.
coherence.get_coherence()

0.6560721238777083

In [17]:
# Rebind `coherence` to a second evaluator that uses the 'u_mass' measure;
# this replaces the 'c_v' evaluator created earlier.
coherence = CoherenceModel(
    model=ldamodel,
    texts=clean_doc,
    dictionary=dictionary,
    coherence='u_mass',
)

In [18]:
# Evaluate and display the score of whichever coherence model `coherence`
# currently refers to (the 'u_mass' one when cells are run in order).
coherence.get_coherence()

-1.222459724954589