# LDA loop over n_topics for labelled documents from 20 NewsGroups dataset to check perplexity

Imports

In [None]:
import numpy as np; import pandas as pd; import matplotlib.pyplot as plt
%matplotlib inline
import codecs 
from glob import glob
import os
import pickle
import copy
import pyorient
import ast

In [None]:
from __future__ import print_function
from time import time
import string
import re
# random
from random import shuffle

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
%matplotlib inline
from pylab import *

In [None]:
from gensim import corpora, models, similarities

In [None]:
# Number of highest-weighted words printed per topic (passed to print_top_words)
n_top_words = 20

## 1. IMPORTING DOCS FROM 20 NEWSGROUPS DATASET

In [None]:
from sklearn.datasets import fetch_20newsgroups
# Newsgroup labels to load; handed to the dataset loader as `categories`.
categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

# Smallest and largest number of topics tried in the LDA sweep
min_n_topics, max_n_topics = 20, 26

In [None]:
# Load the training split for the selected newsgroups, asking the loader to
# remove headers, footers and quoted text from each post.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

In [None]:
# Current working directory; used later as the base of the figure output path
cwd_path = os.getcwd()

#### TOTAL NUMBER OF DOCS

In [None]:
# Number of loaded documents, taken as the number of entries in `filenames`
n_docs = len(newsgroups_train.filenames)
n_docs

# 2. LDA to find the topic most-associated with each word

## 2.1 From Strings to Vectors

### Vectorization WITHOUT Lemmatization

In [None]:
# Build the term-count (bag-of-words) matrix for the training posts.
# Tokens are runs of two or more ASCII letters; English stop words are dropped,
# as are terms found in more than 95% of the documents or in fewer than 50.
t0 = time()
tf_vectorizer = CountVectorizer(encoding='utf-8', analyzer='word', stop_words='english',
                                ngram_range=(1, 1), max_df=0.95, min_df=50,
                                token_pattern='[a-zA-Z]{2,}')
# fit_transform() learns the vocabulary and returns the document-term matrix in
# a single pass; fit() followed by transform() tokenizes the whole corpus twice
# for the same result.
tf_docs = tf_vectorizer.fit_transform(newsgroups_train.data)
print("fit vectorizer without lemmatization done in %0.3fs." % (time() - t0))

### WITH TFIDF (activate/deactivate the following cell to perform/skip TFIDF)

In [None]:
# Vocabulary size, i.e. the number of columns of the term-count matrix.
# Read from the fitted `vocabulary_` mapping (term -> column index) instead of
# get_feature_names(): that method was removed in scikit-learn 1.2 and builds
# the complete list of names only for it to be counted.
n_features = len(tf_vectorizer.vocabulary_)

## 2.2 LDA implementation

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of each topic, then a blank line.

    model: object exposing ``components_``, iterated one row per topic;
        each row must support ``argsort()``.
    feature_names: indexable that maps a column index to its word.
    n_top_words: number of words listed for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort is ascending: reverse it, then keep the leading n indices
        best_first = weights.argsort()[::-1][:n_top_words]
        print(" ".join(feature_names[col] for col in best_first))
    print()

In [None]:
# Topic counts to sweep: every integer from min_n_topics up to and including max_n_topics
n_topics_loop = range(min_n_topics, max_n_topics+1)

In [None]:
print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_docs, n_features))

# The feature names do not depend on the number of topics, so resolve them once
# instead of on every iteration. get_feature_names() was removed in
# scikit-learn 1.2; use its replacement when present, else the older method.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

# One entry per value of n_topics_loop, in the same order: the perplexity of
# the fitted model evaluated on the documents it was trained on.
perplexity = []

for i_topics in n_topics_loop:
    # The topic count is passed as `n_components`: the former `n_topics`
    # keyword was deprecated in scikit-learn 0.19 and removed in 0.21.
    lda = LatentDirichletAllocation(n_components=i_topics, max_iter=10,
                                    learning_method='batch', learning_offset=50.,
                                    evaluate_every=1, n_jobs=-1, random_state=1)
    t0 = time()
    lda.fit(tf_docs)
    print("done in %0.3fs." % (time() - t0))
    # printing the vocabularies
    print("\nTopics in LDA model:")
    print_top_words(lda, tf_feature_names, n_top_words)
    # perplexity model
    perplexity.append(lda.perplexity(tf_docs))

### Perplexity plot

In [None]:
# Draw perplexity against the number of topics on a single set of axes
fig = plt.figure()
axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])
axes.plot(n_topics_loop, perplexity, 'rd-')

# Axis labels and title in one call; the trailing semicolon keeps the notebook
# from echoing the return value.
axes.set(xlabel='# Topics', ylabel='Perplexity', title='LDA');

In [None]:
# Save the perplexity curve as <cwd_path>/results/LDA_perpl_vs_topic.png.
# savefig() does not create missing folders, so make sure the output directory
# exists first (written without exist_ok so it also runs on Python 2).
results_dir = os.path.join(cwd_path, 'results')
if not os.path.isdir(results_dir):
    os.makedirs(results_dir)
fig.savefig(os.path.join(results_dir, 'LDA_perpl_vs_topic.png'), dpi=200)