In [1]:
# Goal: Topic models with NMF and LDA (Scikit-learn)
# Result: implementations of topic models in NMF and LDA (a comparison)
# Output: document-word matrices
# Resource: https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda
# Resource: https://github.com/derekgreene/topic-model-tutorial/
# TO DO: improve text pre-processing; determine "best" number of topics for each model w/parameter selection

In [73]:
# Load libraries
from pprint import pprint
import pandas as pd
import numpy as np
from itertools import combinations

import gensim
from gensim.models import FastText

from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV

import pyLDAvis
import pyLDAvis.sklearn

In [13]:
# DATA Read in docs (corpus) from csv; use column 'combined'
print("Loading dataset...")
docs = pd.read_csv("data/ERI-combined-2009-2019.csv")
docs['combined'] = docs['title'].astype(str) + ' ' + docs['abstract'].astype(str)
data = docs['combined']
print(data[:1])
data_samples = data[:n_samples]
len(data_samples)

Loading dataset...
0    Streams and Urbanization Urbanization encompas...
Name: combined, dtype: object


3700

In [74]:
# DATA text processing
raw_documents = []
snippets = []
for line in data:
    text = line.strip()
    raw_documents.append( line.strip().lower() ) # Read the documents from the input file again
    snippets.append( text[0:min(len(text),100)] )
print("Read %d raw text documents" % len(raw_documents))

custom_stop_words = []
with open( "mallet-2.0.8/stoplists/en.txt", "r" ) as fin:
    for line in fin.readlines():
        custom_stop_words.append( line.strip().lower() )
# note that we need to make it hashable
print("Stopword list has %d entries" % len(custom_stop_words) )

print(raw_documents[:1])

Read 3770 raw text documents
Stopword list has 524 entries
['streams and urbanization urbanization encompasses a diverse array of watershed alterations that influence the physical, chemical, and biological characteristics of streams. in this chapter, we summarize lessons learned from the last half century of research on urban streams and provide a critique of various mitigation strategies, including recent approaches that explicitly address geomorphic processes. we focus first on the abiotic conditions (primarily hydrologic and geomorphic) and their changes in streams that accompany urbanization, recognizing that these changes may vary with geomorphic context and climatic region. we then discuss technical approaches and limitations to (1) mitigating water-quantity and water-quality degradation through site design, riparian protection, and structural stormwater-management strategies; and (2) restoring urban streams in those watersheds where the economic, social, and political contexts c

In [56]:
# NLP Create document-word matrix from documents (tf-idf features)
# Load the dataset and vectorize it
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(strip_accents = 'unicode',
                                   stop_words='english',
                                   lowercase = True,
                                   token_pattern = r'\b[a-zA-Z]{3,}\b',
                                   max_df=0.5, # words occur in at least half of documents
                                   min_df=2, # words occur in more than one document
                                   max_features=n_features)
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print(tfidf.shape)

Extracting tf-idf features for NMF...
(3700, 1000)


In [76]:
# NLP, Use term frequency (raw term count) features for LDA
# Convert documents into document-term matrix, possibly as raw counts or in TF-IDF form
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words='english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df=0.5, # words occur in at least half of documents
                                min_df=2, # words occur in more than one document
                                max_features=n_features) # copy same paramters from tfidf_vectorizer
tf = tf_vectorizer.fit_transform(data_samples)
print(tf.shape)
A = tf_vectorizer.fit_transform(raw_documents)
print( "Created %d X %d document-term matrix" % (A.shape[0], A.shape[1]) )
terms = tf_vectorizer.get_feature_names()
print("Vocabulary has %d distinct terms" % len(terms))
print(terms[100:120])

Extracting tf features for LDA...
(3700, 1000)
Created 3770 X 1000 document-term matrix
Vocabulary has 1000 distinct terms
['century', 'challenge', 'challenges', 'change', 'changes', 'changing', 'channel', 'characteristics', 'characterization', 'characterize', 'characterized', 'chemical', 'china', 'chlorophyll', 'circulation', 'clear', 'climate', 'climatic', 'cloud', 'coast']


In [58]:
# NLP save this document-term matrix, terms, and snippets for later use
from sklearn.externals import joblib
joblib.dump((A,terms,snippets), "articles-raw.pkl")

['articles-raw.pkl']

In [59]:
# TOPIC MODEL: Parameter Selection for NMF
# Create the Topic Models
kmin, kmax = 4, 15 # specify an initial range of "sensible" values
from sklearn import decomposition
# try each value of k
for k in range(kmin,kmax+1):
    print("Applying NMF for k=%d ..." % k )
    # run NMF
    model = decomposition.NMF( init="nndsvd", n_components=k ) 
    W = model.fit_transform( A )
    H = model.components_    
    # store for later
    topic_models.append( (k,W,H) )

Applying NMF for k=4 ...
Applying NMF for k=5 ...
Applying NMF for k=6 ...
Applying NMF for k=7 ...
Applying NMF for k=8 ...
Applying NMF for k=9 ...
Applying NMF for k=10 ...
Applying NMF for k=11 ...
Applying NMF for k=12 ...
Applying NMF for k=13 ...
Applying NMF for k=14 ...
Applying NMF for k=15 ...


In [82]:
# Parameter Selection for NMF
# Build a Word Embedding (with Word2Vec | FastText) in Gensim

# Define a class that will generate documents in a form that can be consumed by Gensim's Word2Vec implementation
import re
class TokenGenerator:
    def __init__( self, documents, stopwords ):
        self.documents = documents
        self.stopwords = stopwords
        self.tokenizer = re.compile( r"(?u)\b\w\w+\b" )

    def __iter__( self ):
        print("Building Word2Vec model ...")
        for doc in self.documents:
            tokens = []
            for tok in self.tokenizer.findall( doc ):
                if tok in self.stopwords:
                    tokens.append( "<stopword>" )
                elif len(tok) >= 2:
                    tokens.append( tok )
            yield tokens
            
# Build a Skipgram Word2Vec model from all documents in the input file using Gensim
#docgen = TokenGenerator( raw_documents, custom_stop_words )
# the model has 500 dimensions, the minimum document-term frequency is 2
#w2v_model = gensim.models.Word2Vec(docgen, size=500, min_count=2, sg=1)
#print( "Model has %d terms" % len(w2v_model.wv.vocab) )
#w2v_model.save("w2v-model.bin")
# To re-load this model, run
#w2v_model = gensim.models.Word2Vec.load("w2v-model.bin")

# Build a FastText model as an alternative approach to word embedding
# ft_model = FastText(raw_documents, size=500, window=5, min_count=2, workers=4, sg=1)
# print( "Model has %d terms" % len(ft_model.wv.vocab) )
sim = ft_model.wv.most_similar("climate")
print(sim)

[('z', 0.0695747435092926), ('(', 0.06237316504120827), ('\xa0', 0.06135527044534683), ('}', 0.061038702726364136), ('9', 0.0584850087761879), ('œ', 0.058113835752010345), ('8', 0.05796496942639351), ('7', 0.057263102382421494), ('ã', 0.05616030842065811), ('ƒ', 0.05508892238140106)]


In [79]:
# Parameter Selection for NMF
# Use Word2Vec model to calculate coherence scores for each of these models

# coherence score
def calculate_coherence( w2v_model, term_rankings ):
    overall_coherence = 0.0
    for topic_index in range(len(term_rankings)):
        # check each pair of terms
        pair_scores = []
        for pair in combinations( term_rankings[topic_index], 2 ):
            pair_scores.append( w2v_model.similarity(pair[0], pair[1]) )
        # get the mean for all pairs in this topic
        topic_score = sum(pair_scores) / len(pair_scores)
        overall_coherence += topic_score
    # get the mean score across all topics
    return overall_coherence / len(term_rankings)

# get the topic descriptor (i.e. list of top terms) for each topic
def get_descriptor( all_terms, H, topic_index, top ):
    # reverse sort the values to sort the indices
    top_indices = np.argsort( H[topic_index,:] )[::-1]
    # now get the terms corresponding to the top-ranked indices
    top_terms = []
    for term_index in top_indices[0:top]:
        top_terms.append( all_terms[term_index] )
    return top_terms

# process each of the models for different values of k
k_values = []
coherences = []
for (k,W,H) in topic_models:
    # Get all of the topic descriptors - the term_rankings, based on top 10 terms
    term_rankings = []
    for topic_index in range(k):
        term_rankings.append( get_descriptor( terms, H, topic_index, 10 ) )
    # Now calculate the coherence based on our Word2vec model
    k_values.append( k )
    coherences.append( calculate_coherence( w2v_model, term_rankings ) )
    print("K=%02d: Coherence=%.4f" % ( k, coherences[-1] ) )

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


KeyError: "word 'using' not in vocabulary"

In [11]:
# Define parameters for models
n_samples = 3700 # number of data points (3700 in set)
n_features = 1000 # time complexity is polynomial in NMF
n_components = 10 # number of topics
n_top_words = 20 # N significant words returned

In [12]:
# Return top words from each model
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [54]:
# Topic Modeling (NMF) Fit the model (Frobenius norm)
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))

nmf_fm = NMF(n_components=n_components, 
          random_state=1,
          alpha=.1, 
          l1_ratio=.5).fit(tfidf)

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf_fm, tfidf_feature_names, n_top_words)

# Log likelihood: Higher the better
print("Log Likelihood: ", nmf_fm.score(tfidf))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", nmf_fm.perplexity(tfidf))

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=3700 and n_features=1000...

Topics in NMF model (Frobenius norm):
Topic #0: data water use model environmental based using research analysis land spatial energy study results used models project approach information high
Topic #1: mantle crust rocks crustal zircon subduction metamorphism lavas arc metamorphic continental isotopic exhumation monazite deformation high ages dates melting upper
Topic #2: snow swe cover water snowmelt snowpack equivalent model modis microwave covered mountain sierra grain resolution scattering nevada sensing depth remote
Topic #3: soil moisture microbial soils vegetation plant microwave surface dry water organic drought biomass carbon plants algorithm band season nitrogen temperature
Topic #4: ice sea sheet glacial antarctic holocene level arctic shelf ages rise bay past circulation maximum warming early west margin age
Topic #5: species plant diversity native habitat richness biomass e

AttributeError: 'NMF' object has no attribute 'score'

In [51]:
# Topic Modeling (NMF) Fit the model (generalized Kullback-Leibler divergence ~ Probabilistic Latent Semantic Indexing)
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))

nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)
tfidf_vectorizer.get_params()

# Log likelihood: Higher the better
print("Log Likelihood: ", lda_tf.score(tfidf))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_tf.perplexity(tfidf))

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=3700 and n_features=1000...

Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: based results use analysis high used environmental different studies including potential model study using models important effects systems data approach
Topic #1: data new high evolution crust age ages crustal rocks deformation lower record mantle isotope continental early history low upper consistent
Topic #2: data based using used algorithm model measurements surface accuracy water accurate remote resolution analysis observations area results method sensing band
Topic #3: carbon high concentrations higher organic activity growth microbial increased low production significantly nanoparticles results soil composition effects concentration exposed different
Topic #4: climate changes change annual water precipitation associated increased america climatic variability nevada patterns central li

In [78]:
# Topic model (LDA) grid search to find best number of topics
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model

# Define Search Param
search_params = {'n_components': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100], 
                 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(tf)

# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
#print("Model Perplexity: ", best_lda_model.perplexity(tf))



Best Model's Params:  {'learning_decay': 0.5, 'n_components': 10}
Best Log Likelihood Score:  -667324.5189372872


NameError: name 'data_vectorized' is not defined

In [7]:
# Topic Modeling (LDA) Fit the model with found parameters

print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))

# for TF LDA
lda_tf = LatentDirichletAllocation(n_components=10, 
                                   #n_components=n_components,
                                   max_iter=5,
                                   learning_method='online',
                                   learning_offset=50.,
                                   learning_decay=0.5,
                                   random_state=0)


lda_tf.fit(tf)

# Log likelihood: Higher the better
print("Log Likelihood: ", lda_tf.score(tf))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_tf.perplexity(tf))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda_tf, tf_feature_names, n_top_words)

# See model parameters
pprint(lda.get_params())

Fitting LDA models with tf features, n_samples=3700 and n_features=1000...
Log Likelihood:  -1944449.5032006903
Perplexity:  567.3650123490625

Topics in LDA model:
Topic #0: research project change climate management environmental program conservation systems development ucsb data global california restoration natural use ecosystem provide understanding
Topic #1: data using model based snow surface vegetation used resolution method satellite spatial analysis remote cover observations measurements spectral models time
Topic #2: soil plant plants nanoparticles effects water soils species leaf high results increased native nitrogen litter effect exposure uptake decomposition stress
Topic #3: carbon production biomass microbial community environmental organic phytoplankton diversity effects processes oil cycle emissions biogeochemical marine communities life global primary
Topic #4: ocean mantle wave source high isotopic data samples isotope seismic lavas element waves oceanic surface tra

NameError: name 'pprint' is not defined

In [8]:
# VISUALIZATION pyLDAvis for LDA model from scikit-learn
pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda_tf, tf, tf_vectorizer)
pyLDAvis.save_html(vis, 'pyLDAvis/lda-scikitlearn-10.html') #saves pyLDAvis graphs as standalone webpage
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
