In [64]:
# Goal: Data preprocessing, NLP, topic models with hLDA with tomotopy
# Result: implementations of topic model with low perplexity score
# Output: tables with topic, keywords, children, parent, level (for visualization)
# Resource: https://bab2min.github.io/tomotopy/v0.6.2/en/
# TO DO: refine text processing (stopwords; n-grams, lemmatization); re-iterate w/beta-loss

import sys
import re
import pandas as pd
from pprint import pprint
import tomotopy as tp

In [2]:
# DATA Read in docs (corpus) from csv; concatenate title and abstract to new column for topic model
docs = pd.read_csv("data/ERI-combined-2009-2019.csv")
docs['combined'] = docs['title'].astype(str) + ' ' + docs['abstract'].astype(str)
data = docs['combined'].values.tolist()
data = [re.sub('\s+', ' ', sent) for sent in data] #remove new line characters
data = [re.sub("\'", "", sent) for sent in data] #remove single quotes
pprint(data[:1])

['Streams and Urbanization Urbanization encompasses a diverse array of '
 'watershed alterations that influence the physical, chemical, and biological '
 'characteristics of streams. In this chapter, we summarize lessons learned '
 'from the last half century of research on urban streams and provide a '
 'critique of various mitigation strategies, including recent approaches that '
 'explicitly address geomorphic processes. We focus first on the abiotic '
 'conditions (primarily hydrologic and geomorphic) and their changes in '
 'streams that accompany urbanization, recognizing that these changes may vary '
 'with geomorphic context and climatic region. We then discuss technical '
 'approaches and limitations to (1) mitigating water-quantity and '
 'water-quality degradation through site design, riparian protection, and '
 'structural stormwater-management strategies; and (2) restoring urban streams '
 'in those watersheds where the economic, social, and political contexts can '
 'supp

In [4]:
# NLP Lemmatize and remove stopwords; generate corpus
# To do: Generate ngrams (NLTK) - https://snippets.aktagon.com/snippets/619-how-to-generate-n-grams-with-python-and-nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

stemmer = WordNetLemmatizer() # both reduce words to same form; stemming is algorithmic, lemmatization uses corpus only
stopwords = stopwords.words('english')
stopwords.extend(['data', 'study', 'project', 'research', 'collaborative', 'use', 'include', 'result', 'increase', 'high', 'low', 'large', 'include', 'based']) # extends defaults with custom words

corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.lemmatize), 
                         stopwords=lambda x: len(x) <= 2 or x in stopwords)
corpus.process(data)

3770

In [122]:
# TOPIC MODELING (hLDA) - testing hierarchical LDA topic model with tomotopy
h_mdl = tp.HLDAModel(depth=9,corpus=corpus,seed=1)
    
for i in range(0, 100, 10): #Train the model using Gibbs-sampling
    h_mdl.train(10)
    print('Iteration: {}\tLog-likelihood: {}'.format(i, h_mdl.ll_per_word))

print("Total number of topics: ", h_mdl.k)
print("Total number of live topics: ", h_mdl.live_k)
print("Depth: ", h_mdl.depth)
print("Perplexity: ", h_mdl.perplexity)

Iteration: 0	Log-likelihood: -9.156869662799986
Iteration: 10	Log-likelihood: -8.776159236498593
Iteration: 20	Log-likelihood: -8.596020156789908
Iteration: 30	Log-likelihood: -8.491875142965466
Iteration: 40	Log-likelihood: -8.421475270606024
Iteration: 50	Log-likelihood: -8.376578551009924
Iteration: 60	Log-likelihood: -8.34440432527948
Iteration: 70	Log-likelihood: -8.321666336599915
Iteration: 80	Log-likelihood: -8.305640856965953
Iteration: 90	Log-likelihood: -8.2931085972526
Total number of topics:  1256
Total number of live topics:  1077
Depth:  9
Perplexity:  3996.237599170139


In [123]:
# TOPIC MODELING (hLDA) - explore the topics (children, parents, depth, number of topics per level) as table
rows = []
for k in range(h_mdl.k):
    topic = k,
    keyword = h_mdl.get_topic_words(k, top_n=10),
    docs = h_mdl.num_docs_of_topic(k),
    children = h_mdl.children_topics(k),
    parent = h_mdl.parent_topic(k),
    level = h_mdl.level(k),
    rows.append([topic, keyword, docs, children, parent, level])

topics_df = pd.DataFrame(rows, columns=["Topic", "Keywords", "Docs", "Children", "Parent", "Level"])
topics_df.to_csv('outputs/hLDA-9-level.csv')
topics_df.head()

Unnamed: 0,Topic,Keywords,Docs,Children,Parent,Level
0,"(0,)","([(model, 0.015386220999062061), (system, 0.00...","(3770,)","([12, 11, 10, 9, 8],)","(-1,)","(0,)"
1,"(1,)","([(model, 2.9001477741985582e-05), (change, 2....","(0,)","([],)","(-1,)","(-1,)"
2,"(2,)","([(model, 2.9001477741985582e-05), (change, 2....","(0,)","([],)","(-1,)","(-1,)"
3,"(3,)","([(model, 2.9001477741985582e-05), (change, 2....","(0,)","([],)","(-1,)","(-1,)"
4,"(4,)","([(model, 2.9001477741985582e-05), (change, 2....","(0,)","([],)","(-1,)","(-1,)"
