## References

In [None]:
# https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730
# http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
# https://github.com/derekgreene/topic-model-tutorial/blob/master/3%20-%20Parameter%20Selection%20for%20NMF.ipynb

## Notebook Setup

In [None]:
# Import libraries
import gensim.models
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
from gensim import corpora
from gensim.models import Word2Vec
from itertools import combinations
from sklearn import decomposition
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Display graphs in jupyter
%matplotlib inline

## Load Vectorized Corpora and Dictionaries

In [None]:
# Load the vectorized no-pooling corpus and its dictionary.
# Only the presence of the dictionary file is checked; the corpus file is
# loaded from the same output directory alongside it.
if not os.path.exists("../outputs/tourism_no_pooling.dict"):
    print("Please run preprocessing script first!")
else:
    dictionary_no_pooling = corpora.Dictionary.load("../outputs/tourism_no_pooling.dict")
    corpus_no_pooling = corpora.MmCorpus("../outputs/tourism_no_pooling.mm")
    print("Vectorized no pooling corpus loaded!")

## Load Documents and Unpreprocessed Tokenized Documents

In [None]:
# Read the pickled no-pooling documents that feed the TF-IDF vectorizer.
# NOTE(review): pickle.load executes arbitrary code from the file, so only
# load pickles produced by this project's own preprocessing step.
with open("../outputs/nmf_documents_no_pooling.p", "rb") as fp:
    documents_no_pooling = pickle.load(fp)

# Read the pickled tokenized documents that were not preprocessed
# (used further down to train the word2vec model).
with open("../outputs/tokenized_documents_no_pooling_unpp.p", "rb") as fp:
    tokenized_documents_no_pooling = pickle.load(fp)

## Apply TFIDF

In [None]:
# Tfidf is usually used before running NMF
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english')
tfidf_no_pooling = tfidf_vectorizer.fit_transform(documents_no_pooling)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on older releases. The result is
# converted to a plain list of str so downstream code sees the same type
# either way.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names_no_pooling = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    tfidf_feature_names_no_pooling = tfidf_vectorizer.get_feature_names()

## Run NMF Model and Determine Number of Topics

In [None]:
# Smallest and largest number of topics to try (both bounds are included
# by the model-fitting loop)
kmin = 4
kmax = 8

In [None]:
# Fit one NMF model per candidate topic count and keep its factor matrices
topic_models = []
for k in range(kmin, kmax + 1):
    print("Applying NMF for k=%d ..." % k)
    # Factorize the TF-IDF matrix with NNDSVD initialisation
    model = NMF(n_components=k, init="nndsvd")
    W = model.fit_transform(tfidf_no_pooling)
    H = model.components_
    # Remember (k, W, H) so the best model can be picked later without refitting
    topic_models.append((k, W, H))

In [None]:
# Remove every "#" character from each token so hashtags become plain words
tokenized_documents_no_pooling = [
    [token.replace("#", "") for token in tokens]
    for tokens in tokenized_documents_no_pooling
]

In [None]:
# Display a sample of the tokenized documents.
# Only the first few are shown: echoing the whole corpus floods the cell
# output and bloats the saved notebook.
tokenized_documents_no_pooling[:5]

In [None]:
# Build a word2vec model
# gensim 4.0 renamed the `size` keyword to `vector_size`; passing the old name
# raises a TypeError there. Pick the keyword that matches the installed
# version so the cell runs on both gensim 3.x and 4.x.
_w2v_size_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"
w2v_model = gensim.models.Word2Vec(
    tokenized_documents_no_pooling,
    min_count=2,
    **{_w2v_size_kwarg: 500}
)

In [None]:
# Define function to calculate coherence
def calculate_coherence(w2v_model, term_rankings):
    """Return the mean pairwise word-vector similarity of the topics' top terms.

    For every topic, the similarity of each unordered pair of its terms is
    averaged; the per-topic means are then averaged across all topics.

    Parameters
    ----------
    w2v_model : object
        Either a trained gensim ``Word2Vec`` model (its ``.wv`` keyed vectors
        are used) or any object that exposes ``similarity(term_a, term_b)``
        directly.
    term_rankings : sequence of sequences of str
        One sequence of top terms per topic.

    Returns
    -------
    float
        Mean coherence over all topics. ``0.0`` when ``term_rankings`` is
        empty. A topic with fewer than two terms has no pairs and contributes
        ``0.0`` to the mean.

    Notes
    -----
    Whatever ``similarity`` raises (for example for a term missing from the
    model) is propagated unchanged.
    """
    # gensim 4 removed Word2Vec.similarity; the method lives on the keyed
    # vectors in `.wv`. Objects without `.wv` are used as-is.
    vectors = getattr(w2v_model, "wv", w2v_model)

    topic_count = len(term_rankings)
    if topic_count == 0:
        # Nothing to average; avoid dividing by zero.
        return 0.0

    overall_coherence = 0.0
    for topic_terms in term_rankings:
        # Score each unordered pair of terms within this topic
        pair_scores = [
            vectors.similarity(first, second)
            for first, second in combinations(topic_terms, 2)
        ]
        if pair_scores:
            # Mean similarity over all pairs in this topic
            overall_coherence += sum(pair_scores) / len(pair_scores)
    # Mean score across all topics
    return overall_coherence / topic_count

In [None]:
# Define function to get top terms
def get_descriptor(all_terms, H, topic_index, top):
    """Return the ``top`` highest-weighted terms of one topic.

    Parameters
    ----------
    all_terms : sequence
        Terms, indexed by the column position used in ``H``.
    H : 2-D array
        Weight matrix; row ``topic_index`` holds the weights of that topic.
    topic_index : int
        Row of ``H`` to describe.
    top : int
        Maximum number of terms to return.

    Returns
    -------
    list
        Terms ordered from highest to lowest weight.
    """
    # argsort is ascending, so reverse it to rank the heaviest terms first
    ranked = np.argsort(H[topic_index, :])[::-1]
    return [all_terms[position] for position in ranked[:top]]

In [None]:
# Score every fitted model with the word2vec-based coherence measure
k_values = []
coherences = []
for (k, W, H) in topic_models:
    # Describe each of the k topics by its 10 highest-weighted terms
    term_rankings = [
        get_descriptor(tfidf_feature_names_no_pooling, H, topic_index, 10)
        for topic_index in range(k)
    ]
    # Record k together with the coherence of this model's descriptors
    k_values.append(k)
    coherences.append(calculate_coherence(w2v_model, term_rankings))
    print("K=%02d: Coherence=%.4f" % (k, coherences[-1]))

In [None]:
# Use the ggplot look and a larger default font for all following figures
plt.style.use("ggplot")
matplotlib.rcParams["font.size"] = 14

In [None]:
# Create graph: mean coherence against the number of topics.
# The explicit figure/axes interface is used so `ax` really is the Axes
# object (plt.plot returns a list of lines, not an Axes).
fig, ax = plt.subplots(figsize=(13, 7))

# Create the line plot
ax.plot(k_values, coherences)
ax.set_xticks(k_values)
ax.set_xlabel("Number of Topics")
ax.set_ylabel("Mean Coherence")

# Add the points
ax.scatter(k_values, coherences, s=120)

# Find and annotate the maximum point on the plot
ymax = max(coherences)
xpos = coherences.index(ymax)
best_k = k_values[xpos]
# With textcoords="offset points", xytext is an offset (in points) from xy,
# not a data coordinate. A fixed offset keeps the label next to the marker
# regardless of the values of best_k and ymax.
ax.annotate(
    "k=%d" % best_k,
    xy=(best_k, ymax),
    xytext=(10, 10),
    textcoords="offset points",
    fontsize=16,
)
fig.savefig("coherence_scores_nmf")

# Show the plot
plt.show()

In [None]:
# Use the topic count that scored highest on the tc-w2v coherence measure
k = best_k

# Fetch the factor matrices fitted earlier for that k. topic_models holds
# (k, W, H) tuples in increasing k starting at kmin, so k - kmin is the index.
W, H = topic_models[k - kmin][1:]

In [None]:
# List the 20 highest-weighted terms of every topic in the selected model
for topic_index in range(k):
    descriptor = get_descriptor(
        all_terms=tfidf_feature_names_no_pooling,
        H=H,
        topic_index=topic_index,
        top=20,
    )
    str_descriptor = ", ".join(descriptor)
    # Topics are numbered from 1 in the printed output
    print("Topic %02d: %s" % (topic_index + 1, str_descriptor))

Upon inspection, NMF works very well and seems to be more interpretable than the LDA no-pooling model. However, the topics are very repetitive and not as clearly interpretable and distinguishable as the topics detected by the LDA model with hashtag pooling. The repetition of keywords across different topics complicates the matter.