<a href="https://colab.research.google.com/github/shraddha-an/Dataset-Randomizer/blob/master/topic_modelling_lda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Topic Modelling with LDA**

### **Skills**: Topic Modelling, Visualization, NLP, Data Preprocessing.

In this project, I use Gensim's implementation of Latent Dirichlet Allocation (LDA) to discover topics in the 20 Newsgroups dataset.

### **1) Downloading Data**

In [49]:
# Topic Modelling with Gensim's Implementation of LDA

# Importing libraries
import re
import numpy as np, pandas as pd
import spacy # lemmatization
nlp = spacy.load("en_core_web_sm")

import matplotlib.pyplot as plt, seaborn as sns
from pprint import pprint

# gensim modules
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


# Stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
# English stop words from NLTK plus a few extra tokens to drop
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

# Downloading non standard libraries
# NOTE(review): this install is unpinned and runs after the imports above;
# pyLDAvis itself is only imported later, in the topic visualization cell.
!pip install pyLDAvis


In [40]:
# Load the newsgroups posts (JSON file hosted on GitHub) into a DataFrame
NEWSGROUPS_URL = 'https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json'
df = pd.read_json(NEWSGROUPS_URL)

# Show the distinct newsgroup labels, then preview the last rows
newsgroup_labels = df['target_names'].unique()
print(newsgroup_labels)
df.tail()

['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
11309,From: jim.zisfein@factory.com (Jim Zisfein) \n...,13,sci.med
11310,From: ebodin@pearl.tufts.edu\nSubject: Screen ...,4,comp.sys.mac.hardware
11311,From: westes@netcom.com (Will Estes)\nSubject:...,3,comp.sys.ibm.pc.hardware
11312,From: steve@hcrlgw (Steven Collins)\nSubject: ...,1,comp.graphics
11313,From: gunning@cco.caltech.edu (Kevin J. Gunnin...,8,rec.motorcycles


### **2) Data Preprocessing**

#### **I) Removing unwanted characters**

In [None]:
# Removing emails, new line characters and random single quotes
data = df.content.values.tolist()

# The patterns are raw strings so that regex escapes such as \S and \s reach
# the `re` module untouched instead of being read as (invalid) Python string
# escapes, which newer interpreters warn about.
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data] # Removing emails
data = [re.sub(r'\s+', ' ', sent) for sent in data] # Removing new line characters
data = [re.sub(r"'", "", sent) for sent in data] # Removing random single quotes

pprint(data[:6])


#### **II) Tokenization**

In [43]:
# Tokenize sentences & removing punctuation
def token_sentences(sentences):
    """Lazily yield the `simple_preprocess` tokens of each sentence.

    Every sentence is tokenized with `deacc=True`; nothing is computed until
    the returned generator is iterated.
    """
    yield from (simple_preprocess(text, deacc = True) for text in sentences)

# Materialise the generator so the tokenized posts can be reused by later cells
tokens = [doc_tokens for doc_tokens in token_sentences(data)]
print(tokens[:9])



#### **III) Bigrams & Trigrams**

In [46]:
# Building Bigrams & Trigrams
from gensim.models.phrases import Phrases, Phraser

# Learn two-word collocations from the tokenized posts, then train a second
# model on the bigram-merged stream so it can join up to three words
biphrases = Phrases(tokens, min_count = 5, threshold = 100)
bigram_stream = biphrases[tokens]
triphrases = Phrases(bigram_stream, threshold = 100)

# Wrap the trained models in Phraser objects; these are what later cells apply
bigrams = Phraser(biphrases)
trigrams = Phraser(triphrases)

# Show the effect of both models on one sample post
sample_tokens = tokens[10]
sample_bigrams = bigrams[sample_tokens]
print('Bigram example: ', sample_bigrams)
print('Trigram example: ', trigrams[sample_bigrams])




Bigram example:  ['from', 'irwin', 'arnstein', 'subject', 're', 'recommendation', 'on', 'duc', 'summary', 'whats', 'it', 'worth', 'distribution_usa', 'expires_sat', 'may', 'gmt', 'organization', 'computrac_inc', 'richardson_tx', 'keywords', 'ducati', 'gts', 'how', 'much', 'lines', 'have', 'line', 'on', 'ducati', 'gts', 'model', 'with', 'on', 'the', 'clock', 'runs', 'very', 'well', 'paint', 'is', 'the', 'bronze', 'brown', 'orange', 'faded', 'out', 'leaks', 'bit', 'of', 'oil', 'and', 'pops', 'out', 'of', 'st', 'with', 'hard', 'accel', 'the', 'shop', 'will', 'fix', 'trans', 'and', 'oil_leak', 'they', 'sold', 'the', 'bike', 'to', 'the', 'and', 'only', 'owner', 'they', 'want', 'and', 'am', 'thinking', 'more', 'like', 'any', 'opinions', 'out', 'there', 'please', 'email', 'me', 'thanks', 'it', 'would', 'be', 'nice', 'stable', 'mate', 'to', 'the', 'beemer', 'then', 'ill', 'get', 'jap', 'bike', 'and', 'call', 'myself', 'axis', 'motors', 'tuba', 'irwin', 'honk', 'therefore', 'am', 'computrac', '

#### **IV) Stopwords & Lemmatization**

In [50]:
# Further Cleaning up the texts
# Functions for removing stopwords, bigram- & trigram-making & lemmatization

def stopword_removal(texts):
    """Re-tokenize each document and drop every token found in `stop_words`.

    Args:
        texts: iterable of documents; each one is converted with `str()` and
            passed through `simple_preprocess` before filtering.

    Returns:
        A list holding one list of kept tokens per input document.
    """
    # Build the lookup once: set membership is constant-time, whereas testing
    # against the `stop_words` list scans it for every single token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def bigrammer(texts):
    """Apply the module-level `bigrams` phrase model to every document."""
    merged_docs = []
    for doc_tokens in texts:
        merged_docs.append(bigrams[doc_tokens])
    return merged_docs

def trigrammer(texts):
    """Apply `bigrams` and then `trigrams` to every document in `texts`."""
    return list(map(lambda doc_tokens: trigrams[bigrams[doc_tokens]], texts))

def lemmatizer(texts, allowed_postags = ('NOUN', 'ADV', 'ADJ', 'VERB')):
    """Lemmatize documents, keeping only tokens whose POS tag is allowed.

    Args:
        texts: iterable of token lists; each list is joined with spaces and
            processed by the module-level spaCy pipeline `nlp`.
        allowed_postags: POS tags (compared against `token.pos_`) to keep.
            Any iterable of strings works; the default is an immutable tuple
            so it cannot be altered between calls.

    Returns:
        One list of lemmas per input document, in input order.
    """
    allowed = frozenset(allowed_postags)
    joined_docs = (' '.join(sentence) for sentence in texts)
    # nlp.pipe consumes the texts as a stream and processes them in batches,
    # instead of invoking the full pipeline separately for every document.
    return [[token.lemma_ for token in doc if token.pos_ in allowed]
            for doc in nlp.pipe(joined_docs)]

# Calling the functions
cleaned_tokens = stopword_removal(tokens)
tokens_bigrams = bigrammer(cleaned_tokens)

# Load the model by its package name, as the import cell does: the 'en'
# shortcut is not accepted by spaCy v3 and later. The parser and NER components
# are disabled because `lemmatizer` only reads lemmas and POS tags.
nlp = spacy.load('en_core_web_sm', disable = ['parser', 'ner'])
lemmatized_tokens = lemmatizer(tokens_bigrams, allowed_postags = ['NOUN', 'ADV', 'ADJ', 'VERB'])

print(lemmatized_tokens[:1])

[['where', 'thing', 'car', 'nntp_poste', 'host', 'park', 'line', 'wonder', 'could', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'bricklin', 'door', 'really', 'small', 'addition', 'separate', 'rest', 'body', 'know', 'tellme', 'model', 'name', 'engine', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]


#### **V) Dictionary & Corpus**

In [77]:
# Creating the dictionary & corpus for LDA
dict_ = corpora.Dictionary(lemmatized_tokens)

# Bag-of-words corpus: one list of (token id, count) pairs per document
corpus = list(map(dict_.doc2bow, lemmatized_tokens))

# Human-readable view of one document: swap the ids back to their words
readable_sample = [
    [(dict_[token_id], count) for token_id, count in bow]
    for bow in corpus[5:6]
]
pprint(readable_sample)


[[('make', 2),
  ('thank', 1),
  ('year', 1),
  ('number', 1),
  ('usage', 1),
  ('access', 1),
  ('back', 1),
  ('be', 1),
  ('give', 1),
  ('great', 1),
  ('people', 2),
  ('use', 2),
  ('would', 2),
  ('article', 5),
  ('point', 1),
  ('write', 5),
  ('check', 1),
  ('right', 3),
  ('accidental', 1),
  ('agree', 1),
  ('allegedly', 1),
  ('allow', 1),
  ('analysis', 1),
  ('argument', 1),
  ('bear', 1),
  ('believe', 3),
  ('biological', 1),
  ('blank', 1),
  ('can', 1),
  ('class', 1),
  ('come', 1),
  ('commonly', 1),
  ('company', 1),
  ('consider', 1),
  ('control', 1),
  ('cost', 1),
  ('count', 1),
  ('course', 1),
  ('crimial', 1),
  ('death', 1),
  ('define', 1),
  ('destructive', 1),
  ('disagree', 1),
  ('doubt', 1),
  ('easily', 1),
  ('even', 1),
  ('evidently', 1),
  ('find', 1),
  ('first', 2),
  ('follow', 1),
  ('government', 1),
  ('hand', 1),
  ('handgun', 1),
  ('hard', 1),
  ('hope', 1),
  ('idea', 1),
  ('immediately', 1),
  ('individual', 3),
  ('keep', 3),
  (

### **3) LDA Model**

#### **I) Training the Model**

In [58]:
# Multiprocessing
import multiprocessing as mp
cores = mp.cpu_count()
print(cores)

# Building the LDA Model
from gensim.models import LdaMulticore

# Hyper-parameters gathered in one place so they are easy to scan and tweak.
# NOTE(review): every reported core becomes a worker process here; check
# whether one should be left free for the main process.
lda_params = dict(
    num_topics = 20,
    random_state = 10,
    passes = 10,
    workers = cores,
    chunksize = 100,
    per_word_topics = True,
)
model = LdaMulticore(corpus = corpus, id2word = dict_, **lda_params)

# Viewing the topics in the LDA Model
pprint(model.print_topics())

2
[(0,
  '0.056*"drive" + 0.023*"scsi" + 0.021*"patient" + 0.013*"test" + '
  '0.012*"brain" + 0.012*"disease" + 0.010*"pain" + 0.009*"ide" + '
  '0.009*"slave" + 0.009*"headache"'),
 (1,
  '0.017*"government" + 0.016*"gun" + 0.014*"people" + 0.014*"would" + '
  '0.014*"right" + 0.013*"state" + 0.009*"law" + 0.009*"write" + '
  '0.008*"country" + 0.007*"article"'),
 (2,
  '0.037*"key" + 0.013*"use" + 0.012*"encryption" + 0.010*"bit" + 0.010*"chip" '
  '+ 0.010*"system" + 0.008*"public" + 0.008*"security" + 0.008*"clipper" + '
  '0.007*"technology"'),
 (3,
  '0.015*"would" + 0.014*"say" + 0.013*"people" + 0.012*"may" + 0.012*"write" '
  '+ 0.010*"reason" + 0.010*"think" + 0.009*"evidence" + 0.009*"know" + '
  '0.009*"make"'),
 (4,
  '0.018*"kill" + 0.017*"soldier" + 0.016*"israeli" + 0.015*"attack" + '
  '0.015*"lebanese" + 0.015*"village" + 0.013*"death" + 0.011*"man" + '
  '0.010*"armenian" + 0.009*"terrorist"'),
 (5,
  '0.017*"church" + 0.012*"law" + 0.012*"sin" + 0.011*"word" + 0.00

#### **II) Topic Visualization**

In [62]:
# Visualizing topics & topic separation
import pyLDAvis

# pyLDAvis 3.x renamed its gensim helper module to `gensim_models`; fall back
# to the old `pyLDAvis.gensim` name so the cell also runs on older releases.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

vis = gensimvis.prepare(model, corpus, dict_)
pyLDAvis.display(vis)




#### **III)  Evaluating Model Performance**

In [75]:
# Coherence Score
# Build the evaluator first, then ask it for the 'c_v' coherence value
coherence_model = CoherenceModel(
    model = model,
    texts = lemmatized_tokens,
    dictionary = dict_,
    coherence = 'c_v',
)
coherence_score = coherence_model.get_coherence()

print('Coherence Score: ', coherence_score)

Coherence Score:  0.48884438116396717


### **4) Alternate: Mallet's LDA**
--- 
Prerequisites: Install Java, Mallet. [Tutorial](https://github.com/polsci/colab-gensim-mallet/blob/master/topic-modeling-with-colab-gensim-mallet.ipynb)

### **Install Java**

In [65]:
import os       #importing os to set environment variable
def install_java():
  """Install OpenJDK 8 with apt-get, set JAVA_HOME and print the Java version.

  Takes no arguments and returns nothing; it runs two shell commands and
  writes JAVA_HOME into this process's environment.

  NOTE(review): the recorded output of this cell reports OpenJDK 11, so the
  `java` found on PATH is apparently not the JDK 8 that JAVA_HOME points to;
  confirm which one Mallet actually uses.
  """
  # -qq plus the redirect to /dev/null silence apt-get's normal output
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !java -version       #check java version
install_java()

openjdk version "11.0.8" 2020-07-14
OpenJDK Runtime Environment (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1)
OpenJDK 64-Bit Server VM (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1, mixed mode, sharing)


### **Install Mallet**

In [None]:
!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
!unzip mallet-2.0.8.zip

In [76]:
# Setting path to Mallet binary
# Both paths assume the archive was unzipped into /content — TODO confirm this
# matches the working directory of the download cell.
import os

os.environ['MALLET_HOME'] = '/content/mallet-2.0.8'
mallet_path = '/content/mallet-2.0.8/bin/mallet'

# Mallet's LDA Implementation
# NOTE(review): the recorded output of this cell ends with a "See the migration
# notes for details" warning line — presumably a deprecation notice related to
# this wrapper; confirm the installed gensim version still provides
# gensim.models.wrappers.
from gensim.models.wrappers import LdaMallet

# Same corpus, dictionary and topic count as the LdaMulticore model trained earlier
lda_mallet_model = LdaMallet(mallet_path = mallet_path,
                             corpus = corpus,
                             num_topics = 20,
                             id2word = dict_)

# Looking at the topics
# formatted = False is passed so the topics are pretty-printed as nested
# (word, weight) pairs, as seen in the recorded output
pprint(lda_mallet_model.show_topics(formatted = False))

# Evaluating Model Performance
# Despite its name, this variable holds the CoherenceModel object; the numeric
# score is only computed by get_coherence() inside the print call below.
mallet_coherence_score = CoherenceModel(model = lda_mallet_model,
                                        texts = lemmatized_tokens,
                                        dictionary = dict_,
                                        coherence = 'c_v')

print("\nMallet LDA Model's Coherence Score: ", mallet_coherence_score.get_coherence())


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


[(15,
  [('ax', 0.18934863747642172),
   ('max', 0.15258658747040174),
   ('line', 0.10856042059637998),
   ('host', 0.033631657101577235),
   ('bike', 0.025444475659188507),
   ('ride', 0.017859292852269535),
   ('ca', 0.017618493398081633),
   ('distribution_usa', 0.017498093670987678),
   ('keyword', 0.01003331059116266),
   ('organization', 0.009672111409880804)]),
 (5,
  [('drug', 0.013009106374462124),
   ('study', 0.01120784549184429),
   ('food', 0.009006304413089163),
   ('doctor', 0.008455919143400381),
   ('effect', 0.008080656459521666),
   ('problem', 0.007655358751125788),
   ('patient', 0.007330131091764235),
   ('eat', 0.007155008505954168),
   ('find', 0.006754728309816872),
   ('case', 0.006679675773041129)]),
 (19,
  [('write', 0.1982137794159342),
   ('line', 0.15409696626027786),
   ('article', 0.14193365466402041),
   ('host', 0.06453076268783668),
   ('organization', 0.05049617238446272),
   ('reply', 0.029345052452509214),
   ('nntp_poste', 0.023958037992628297)