In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)


#### Prepare Stopwords

In [3]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

  signature = inspect.formatargspec(
  from collections import Sequence, defaultdict


#### Import Newsgroups Data

In [4]:
# Import Dataset
df = pd.read_json('/home/sandeep/Desktop/samudsan/Machine learning/newsgroups.json')
print(df.target_names.unique())
df.head()

['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


#### Remove emails and newline characters

In [7]:
# Convert to list
data = df.content.values.tolist()

# Remove Emails
data = [re.sub(r'S*@S*s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub(r's+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub(r"\'", "", sent) for sent in data]

pprint(data[:1])

['From: lerx twam.umd.edu (where  my thing)\n'
 'Subject: WHAT car i  thi !?\n'
 'Nntp-Po ting-Ho t: rac3.wam.umd.edu\n'
 'Organization: Univer ity of Maryland, College Park\n'
 'Line : 15\n'
 '\n'
 ' I wa  wondering if anyone out there could enlighten me on thi  car I  aw\n'
 'the other day. It wa  a 2-door  port  car, looked to be from the late 60 /\n'
 'early 70 . It wa  called a Bricklin. The door  were really  mall. In '
 'addition,\n'
 'the front bumper wa   eparate from the re t of the body. Thi  i  \n'
 'all I know. If anyone can tellme a model name, engine  pec , year \n'
 'of production, where thi  car i  made, hi tory, or whatever info you\n'
 'have on thi  funky looking car, plea e e-mail.\n'
 '\n'
 'Thank ,\n'
 '- IL\n'
 '   ---- brought to you by your neighborhood Lerx t ----\n'
 '\n'
 '\n'
 '\n'
 '\n']


####  Tokenize words and Clean-up text

In [11]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
data_words = list(sent_to_words(data))
print(data_words[:1])

[['from', 'lerx', 'twam', 'umd', 'edu', 'where', 'my', 'thing', 'subject', 'what', 'car', 'thi', 'nntp', 'po', 'ting', 'ho', 'rac', 'wam', 'umd', 'edu', 'organization', 'univer', 'ity', 'of', 'maryland', 'college', 'park', 'line', 'wa', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'thi', 'car', 'aw', 'the', 'other', 'day', 'it', 'wa', 'door', 'port', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'wa', 'called', 'bricklin', 'the', 'door', 'were', 'really', 'mall', 'in', 'addition', 'the', 'front', 'bumper', 'wa', 'eparate', 'from', 'the', 're', 'of', 'the', 'body', 'thi', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'pec', 'year', 'of', 'production', 'where', 'thi', 'car', 'made', 'hi', 'tory', 'or', 'whatever', 'info', 'you', 'have', 'on', 'thi', 'funky', 'looking', 'car', 'plea', 'mail', 'thank', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerx']]


#### Creating Bigram and Trigram Models

In [12]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['from', 'lerx', 'twam', 'umd', 'edu', 'where', 'my', 'thing', 'subject', 'what', 'car', 'thi', 'nntp', 'po', 'ting_ho', 'rac_wam_umd', 'edu', 'organization', 'univer_ity', 'of', 'maryland_college_park', 'line', 'wa', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'thi', 'car', 'aw', 'the', 'other', 'day', 'it', 'wa', 'door', 'port', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'wa', 'called', 'bricklin', 'the', 'door', 'were', 'really', 'mall', 'in', 'addition', 'the', 'front_bumper', 'wa', 'eparate', 'from', 'the', 're', 'of', 'the', 'body', 'thi', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'pec', 'year', 'of', 'production', 'where', 'thi', 'car', 'made', 'hi_tory', 'or', 'whatever', 'info', 'you', 'have', 'on', 'thi', 'funky', 'looking', 'car', 'plea', 'mail', 'thank', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerx']


#### Remove Stopwords, Make Bigrams and Lemmatize


In [14]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['thi', 'nntp', 'park', 'line', 'wonder', 'could', 'enlighten', 'thi', 'car', 'day', 'door', 'port', 'car', 'look', 'late', 'early', 'call', 'really', 'mall', 'addition', 'know', 'model', 'name', 'year', 'production', 'thi', 'car', 'make', 'info', 'thi', 'funky', 'look', 'car', 'plea', 'mail', 'thank', 'bring', 'neighborhood', 'lerx']]


#### Create the Dictionary and Corpus needed for Topic Modeling

The two main inputs to the LDA topic model are the dictionary(id2word) and the corpus. Let’s create them.



In [15]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])


# Gensim creates a unique id for each word in the document. The produced corpus shown above is a mapping of (word_id, word_frequency).

[[(0, 1), (1, 1), (2, 1), (3, 4), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 4), (30, 1), (31, 1)]]


ref: 
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
    