In [154]:
# Goal: Data preprocessing, NLP, topic models and perplexity scores with hLDA (tomotopy)
# Output: tables per level (topic, keywords, children, parent, level)
# Resource: https://bab2min.github.io/tomotopy/v0.6.2/en/
# To do: refine text processing; extract document-topic distributions at each level; pivot table

import sys
import re
import pandas as pd
from pprint import pprint
import tomotopy as tp

In [2]:
# DATA Read in docs (corpus) from csv; concatenate title and abstract to new column for topic model
docs = pd.read_csv("data/ERI-combined-2009-2019.csv")
# Fill missing values before casting: astype(str) on its own turns a missing title/abstract
# into the literal word "nan", which is longer than 2 characters and would therefore survive
# the stop-word filter and be fed to the topic model as a real token.
docs['combined'] = docs['title'].fillna('').astype(str) + ' ' + docs['abstract'].fillna('').astype(str)
data = docs['combined'].values.tolist()
# Raw string: '\s' in a normal string literal is an invalid escape sequence (warning on modern Python).
data = [re.sub(r'\s+', ' ', sent) for sent in data]  # collapse whitespace runs (incl. new lines) to one space
data = [re.sub("'", "", sent) for sent in data]  # remove single quotes
pprint(data[:1])

['Streams and Urbanization Urbanization encompasses a diverse array of '
 'watershed alterations that influence the physical, chemical, and biological '
 'characteristics of streams. In this chapter, we summarize lessons learned '
 'from the last half century of research on urban streams and provide a '
 'critique of various mitigation strategies, including recent approaches that '
 'explicitly address geomorphic processes. We focus first on the abiotic '
 'conditions (primarily hydrologic and geomorphic) and their changes in '
 'streams that accompany urbanization, recognizing that these changes may vary '
 'with geomorphic context and climatic region. We then discuss technical '
 'approaches and limitations to (1) mitigating water-quantity and '
 'water-quality degradation through site design, riparian protection, and '
 'structural stormwater-management strategies; and (2) restoring urban streams '
 'in those watersheds where the economic, social, and political contexts can '
 'supp

In [3]:
# NLP Lemmatize and remove stopwords; generate corpus
# TO DO: refine text processing (https://snippets.aktagon.com/snippets/619-how-to-generate-n-grams-with-python-and-nltk)
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

# Both stemming and lemmatization reduce inflected words to a common form: stemming strips
# suffixes by rule, lemmatization looks the word up in WordNet. The object below is a
# lemmatizer; it keeps the name `stemmer` because it is passed to SimpleTokenizer's
# `stemmer` argument.
stemmer = WordNetLemmatizer()
# NOTE: this rebinds the imported `stopwords` corpus reader to a plain list of words.
stopwords = stopwords.words('english')
# Extend the NLTK defaults with domain-specific words that carry no topical signal.
stopwords.extend(['data', 'study', 'project', 'research', 'collaborative', 'use', 'include',
                  'result', 'increase', 'high', 'low', 'large', 'based'])
# Membership is tested once per token, so use a set (O(1)) instead of scanning the list.
stopword_set = frozenset(stopwords)

# A token is discarded when the callable returns True: tokens of length <= 2, or stop words.
corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.lemmatize),
                         stopwords=lambda x: len(x) <= 2 or x in stopword_set)
corpus.process(data)

3770

In [141]:
# TOPIC MODELING (hLDA) - testing hierarchical LDA topic model with tomotopy
h_mdl = tp.HLDAModel(depth=4, corpus=corpus, seed=1)

# Train with Gibbs sampling in rounds, reporting the per-word log-likelihood after each round.
# The printed iteration label is the count at the START of the round (0, 10, ..., 90).
iters_per_round = 10
total_iters = 100
for round_start in range(0, total_iters, iters_per_round):
    h_mdl.train(iters_per_round)
    ll_now = h_mdl.ll_per_word
    print('Iteration: {}\tLog-likelihood: {}'.format(round_start, ll_now))

# Summary statistics of the trained model.
model_summary = [
    ("Number of topics: ", h_mdl.k),
    ("Number of live topics: ", h_mdl.live_k),
    ("Number of documents: ", len(h_mdl.docs)),
    ("Model perplexity: ", h_mdl.perplexity),
]
for label, value in model_summary:
    print(label, value)

Iteration: 0	Log-likelihood: -8.874829779208273
Iteration: 10	Log-likelihood: -8.549331638287489
Iteration: 20	Log-likelihood: -8.420679203788414
Iteration: 30	Log-likelihood: -8.34197303038161
Iteration: 40	Log-likelihood: -8.289698554923566
Iteration: 50	Log-likelihood: -8.249507336026864
Iteration: 60	Log-likelihood: -8.226158093088047
Iteration: 70	Log-likelihood: -8.212455045505013
Iteration: 80	Log-likelihood: -8.20045246715981
Iteration: 90	Log-likelihood: -8.195003170291228
Number of topics:  560
Number of live topics:  522
Number of documents:  3770
Model perplexity:  3622.8024772368335


In [176]:
# OUTPUT (hLDA) - Explore the topics (children, parents, depth, number of topics per level) as csv
# Build one row per topic id. Every field must be a plain value: a trailing comma after an
# assignment (e.g. `topic = k,`) wraps it in a 1-tuple, so the DataFrame would hold `(0,)`
# instead of `0` and comparisons such as `Parent == -1` or a pivot on Level would not work.
# NOTE(review): h_mdl.k can be larger than h_mdl.live_k, so some rows may describe topics
# that are no longer live - consider skipping them with h_mdl.is_live_topic(k); confirm.
rows = []
for k in range(h_mdl.k):
    rows.append([
        k,                                    # Topic id
        h_mdl.get_topic_words(k, top_n=10),   # Keywords: top 10 words of the topic
        h_mdl.num_docs_of_topic(k),           # Num_Docs
        h_mdl.children_topics(k),             # Children topic ids
        h_mdl.parent_topic(k),                # Parent topic id
        h_mdl.level(k),                       # Level in the hierarchy
    ])

topics_df = pd.DataFrame(rows, columns=["Topic", "Keywords", "Num_Docs", "Children", "Parent", "Level"])
#topics_df.to_csv('outputs/hLDA/hLDA-4-level.csv')

In [206]:
# OUTPUT (hLDA) - Explore the documents as csv
# TO DO: extract document-topic distributions (https://github.com/bab2min/tomotopy/issues/44)
#len(h_mdl.docs)
#doc_ins = h_mdl.docs[89] #document instance
#print(doc_ins.weight)
#print(doc_ins.words)
#print(doc_ins.get_topic_dist()) #returning levels
#print(doc_ins.get_topics(top_n=100))
#print(doc_ins.get_words(top_n=-1))

In [205]:
# VISUALIZATION - Transform topic table to hierarchical data structure for treemap
# TO DO: pivot table
h_data = pd.read_csv("outputs/hLDA/hLDA-4-level.csv")

# Rows whose Parent is -1 are the top-level topics; locate their index labels and drop them.
# Dropping by label leaves the remaining rows (and their Topic numbers) untouched.
is_top_level = h_data['Parent'] == -1
level_index = h_data.loc[is_top_level].index
h_data = h_data.drop(index=level_index)

# One row per topic, one column per level; each cell holds that topic's parent id.
pivoted_h = h_data.pivot(index='Topic', columns='Level', values='Parent')
pivoted_h.head(30)

Level,1,2,3
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,0.0,,
9,0.0,,
10,0.0,,
11,0.0,,
12,0.0,,
13,0.0,,
14,0.0,,
15,0.0,,
16,,8.0,
17,,10.0,
