## Text Processing Utilities for City-Data Corpus

The functions below were used to normalize the City-Data.com Corpus texts and to convert its posts and quoted posts into thread graphs and embeddings:

In [10]:
from __future__ import division
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans, SpectralClustering, kmeans_plusplus
from sklearn.manifold import SpectralEmbedding
from nltk.corpus import stopwords
import nltk
import re
from scipy.cluster.hierarchy import fclusterdata
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2
import pickle
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
import pandas as pd
import os
import re
import gensim
from collections import Counter
import plotly.express as px
from plotly.offline import plot
from sklearn.cluster import MiniBatchKMeans
import pandas as pd
import itertools
from bs4 import BeautifulSoup as soup
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_short,strip_non_alphanum, strip_tags,strip_multiple_whitespaces, preprocess_documents, preprocess_string, strip_numeric, remove_stopwords, strip_tags, strip_punctuation, stem_text
from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, Phraser
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx


def flatten_list(somelist):
    """
    Flatten one level of nesting in a list.

    Args:
        somelist: List whose elements may themselves be lists.

    Returns:
        The input object itself when none of its elements is a list;
        otherwise a new list in which every list element is replaced by its
        items (one level only). Elements that are not lists are kept as-is.
    """
    if not any(isinstance(el, list) for el in somelist):
        return somelist
    # Wrap non-list elements so chaining neither splits strings into
    # characters nor fails on scalars in a mixed list.
    return list(itertools.chain.from_iterable(
        el if isinstance(el, list) else [el] for el in somelist
    ))


# NLTK's English stopword list extended with corpus-specific additions.
stops = stopwords.words('english') + [
    'said', 'know', 'maybe', 'post', 'advertisements', 'advertisement',
    'posted', 'thread', 'like', 'could', 'should', 'would', 'thing',
]
# Module-level instances shared by the normalizers below. `stemmer` is not
# referenced by any function visible in this section.
wn = WordNetLemmatizer()
stemmer = nltk.PorterStemmer()
def text_process(text):
    """
    Normalize a text into a single string of lemmas.

    The text is lowercased and tokenized with nltk.word_tokenize; surviving
    tokens are lemmatized with the module-level WordNet lemmatizer and
    joined with single spaces.

    Args:
        text: string

    Returns:
        string
    """
    lowered = text.lower()
    lemmas = []
    for token in nltk.word_tokenize(lowered):
        # Skip tokens containing '.com', a hyphen or an underscore. The
        # 'Advertisements' alternative cannot match here because the text
        # was lowercased above.
        if re.search(r'\.com|___|Advertisements|-|_', token):
            continue
        # Skip short tokens (3 characters or fewer) and stopwords.
        if len(token) <= 3 or token in stops:
            continue
        # Skip tokens containing a digit or the fragment 'htt'.
        if re.search(r'[0-9]', token) or re.search(r'htt', token):
            continue
        lemmas.append(wn.lemmatize(token))
    return ' '.join(lemmas)

def make_graph_citydata(dataframe):
    """
    Generates networkx graph of City-Data.com Corpus Forum Data

    Every post becomes a node carrying its text in the ``text`` attribute,
    and every row contributes a directed edge post_id -> quote_id (from the
    quoting post to the post it quotes). Rows whose quote_id is the empty
    string are treated as "quotes nothing": the '' node created by their
    edges is removed before returning.

    NOTE(review): edges are keyed by ``str(id)`` while nodes are added with
    the raw id values, so edge endpoints and attributed nodes coincide only
    when the ids are already strings -- confirm against the corpus.

    Args:
        dataframe: City-Data.com Corpus Dataframe with ``post_id``,
            ``quote_id`` and ``post`` columns; ``quote`` is optional.

    Returns:
        networkx MultiDiGraph
    """
    post_ids = dataframe.post_id.tolist()
    quote_ids = dataframe.quote_id.tolist()
    posts = dataframe['post'].tolist()
    # A frame without a 'quote' column is tolerated: quoted-only nodes then
    # get text=None (this replaces a bare `except:` around the lookup).
    if 'quote' in dataframe.columns:
        quotes = dataframe['quote'].tolist()
    else:
        quotes = [None] * len(quote_ids)

    G = nx.MultiDiGraph()
    for post_id, post in zip(post_ids, posts):
        G.add_node(post_id, text=post)

    # Add quoted posts that never appear as a post of their own.
    for quote_id, quote in zip(quote_ids, quotes):
        if quote_id != '' and quote_id not in G:
            G.add_node(quote_id, text=quote)

    G.add_edges_from([(str(x), str(y)) for x, y in zip(post_ids, quote_ids)])
    # The '' node only exists when at least one row had an empty quote_id.
    if '' in G:
        G.remove_node('')
    return G

def get_paths_city_data(dataframe):
    """
    Extract threaded posts from a City-Data.com Corpus DataFrame.

    The frame is turned into a graph with make_graph_citydata, and every
    simple path from a node with in-degree 0 to a node with out-degree 0 is
    collected as one thread.

    Args:
        dataframe: City-Data.com Corpus DataFrame

    Returns:
        networkx graph, list of threads (each thread is a list of node ids)
    """
    G = make_graph_citydata(dataframe)
    out_degrees = dict(G.out_degree(G.nodes()))
    in_degrees = dict(G.in_degree(G.nodes()))
    sink_nodes = [node for node, degree in out_degrees.items() if degree == 0]
    source_nodes = [node for node, degree in in_degrees.items() if degree == 0]

    paths = []
    # Sinks vary in the outer loop and sources in the inner one, so paths
    # are grouped by the sink they end at.
    for sink in sink_nodes:
        for source in source_nodes:
            paths.extend(nx.all_simple_paths(G, source=source, target=sink))
    return G, paths

def make_thread_embeddings(dataframe, model):
    """
    Function to convert City-Data.com Corpus posts and quoted posts into a network graph and embeddings.

    Side effect: missing values in ``dataframe`` are replaced by '' IN
    PLACE, so the caller's frame is modified (kept for backward
    compatibility with code that relies on the filled frame).

    Args:
        dataframe: City-Data.com Corpus Dataframe with ``post_id``,
            ``post``, ``quote_id`` and ``quote`` columns
        model: sentence-transformer model (only ``model.encode`` is called)

    Returns:
        tuple of
        G: networkx graph built by get_paths_city_data
        paths: list of threads, each a list of node ids
        joint_chains: one string per thread (texts of its nodes joined by spaces)
        embeddings: ``model.encode(joint_chains)``
        singletons: graph nodes that appear in no thread
        singleton_embeddings: ``model.encode`` of the singleton texts
        singleton_texts: texts of the singleton nodes
    """
    dataframe.fillna('', inplace=True)

    # Map every id to its text; within a row the quote is stored first and
    # the post second, and later rows overwrite earlier ones.
    id_text = {}
    rows = zip(dataframe['quote_id'].tolist(), dataframe['quote'].tolist(),
               dataframe['post_id'].tolist(), dataframe['post'].tolist())
    for quote_id, quote, post_id, post in rows:
        id_text[quote_id] = quote
        id_text[post_id] = post

    G, paths = get_paths_city_data(dataframe)

    # Nodes without a known text contribute an empty string to the chain.
    joint_chains = [' '.join(id_text.get(node, '') for node in path) for path in paths]
    embeddings = model.encode(joint_chains)

    # Build the membership set once instead of re-flattening `paths` for
    # every node of the graph.
    threaded_nodes = {node for path in paths for node in path}
    singletons = [node for node in G.nodes() if node not in threaded_nodes]
    singleton_texts = [id_text[s] for s in singletons]
    singleton_embeddings = model.encode(singleton_texts)
    return G, paths, joint_chains, embeddings, singletons, singleton_embeddings,singleton_texts
    

In [None]:
# Build the post/quote graph for the whole corpus and collect its threads.
# `citydata` is the DataFrame loaded in the "City-Data.com Corpus" cell.
G, paths = get_paths_city_data(citydata)

## Gensim LDA Modeling Scripts

In [3]:
from gensim import models
def gensim_lda(texts, topic_num=5, topic_word_priors=None,numwords=25, eta_=None, tfidf=False):
    """
    Gensim lda wrapper with guided topic modeling.

    Texts are tokenized with text_tokenize, turned into a gensim dictionary
    and bag-of-words corpus (optionally TF-IDF weighted) and fitted with
    LdaModel. When both ``topic_word_priors`` and ``eta_`` are given, a
    topic-term prior matrix is passed as ``eta`` to guide the topics.

    Args:
        texts: list of strings
        topic_num: (int) number of topics
        topic_word_priors: one collection of seed words per topic; the prior
            matrix gets one row per entry, so presumably its length must
            equal ``topic_num`` -- confirm against gensim's ``eta`` shape
        numwords: (int) number of topical terms
        eta_: None, or the prior weight given to seed words (all other
            terms get 1/topic_num). NOTE(review): each value fills a single
            matrix cell, so a scalar is expected here.
        tfidf: (bool) if True, then use gensim tfidf term weighting (default is False)

    Returns:
        model: fitted LdaModel
        topical_terms: per topic, the quoted terms of its show_topics string
        transformed_corpus: the corpus mapped through the model
        processed_texts: tokenized texts
    """
    # `!= False` instead of plain truthiness preserves the original test
    # (e.g. tfidf=None still enables TF-IDF weighting).
    use_tfidf = tfidf != False  # noqa: E712

    processed_texts = [text_tokenize(text) for text in texts]
    dictionary = gensim.corpora.Dictionary(processed_texts)

    # Count the same tokens the dictionary was built from. Counting
    # `text.split()` of the raw text would silently drop every token whose
    # raw form differs from its normalized form.
    bow = [dictionary.doc2bow(tokens) for tokens in processed_texts]
    corpus = gensim.models.TfidfModel(bow)[bow] if use_tfidf else bow

    # Guided LDA: seed words get eta_, every other term gets 1/topic_num.
    eta = None
    if topic_word_priors and eta_ is not None:
        default_prior = 1.0 / topic_num
        vocabulary = [dictionary[i] for i in range(len(dictionary))]
        eta = np.array([
            [eta_ if term in seed_words else default_prior for term in vocabulary]
            for seed_words in topic_word_priors
        ])

    model = gensim.models.ldamodel.LdaModel(
        corpus=corpus, id2word=dictionary, num_topics=topic_num,
        random_state=42, chunksize=100, eta=eta,
        eval_every=-1, update_every=1,
        passes=150, alpha='auto', per_word_topics=True)

    #transform corpus into topics
    transformed_corpus = model[corpus]

    #extract topical terms
    # num_topics=-1 returns every topic; the default of 10 would return only
    # a subset for larger models. Fragments containing digits (the weights)
    # are dropped from each formatted topic string.
    topical_terms = [
        [token for token in topic.split('"') if not re.search(r'\d', token)]
        for _, topic in model.show_topics(num_topics=-1, num_words=numwords)
    ]
    return model, topical_terms, transformed_corpus, processed_texts
        

## City-Data.com Corpus

In [4]:
#load City-Data.com Corpus from Zenodo
# The CSV is read directly from the URL. The graph and embedding helpers in
# this file read its post_id, quote_id, post and quote columns.
citydata = pd.read_csv("https://zenodo.org/records/10086354/files/citydata.csv?download=1")

## Sentence Embedding-Based Topic Modeler

In [6]:
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans, SpectralClustering, kmeans_plusplus
from sklearn.manifold import SpectralEmbedding
from nltk.corpus import stopwords
import nltk
import re
from scipy.cluster.hierarchy import fclusterdata
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2

def topic_model(texts, transformer, clusters=3,vectorizer='cv',mindf=5, ngrams=(1,1),kbest=5000,init_='k-means++'):
    """
    Function to SE-Topic Model City-Data.com Corpus.

    Texts are embedded with ``transformer``, clustered with KMeans, and each
    cluster is described by the weights of chi2-selected terms in the
    concatenation of its normalized texts. The u_mass coherence returned by
    get_topical_coherence is printed.

    get_topical_terms and get_topical_coherence are not defined in this
    section and must be available in the calling scope.

    Args:
        texts: list of strings
        transformer: sentence-transformer model (only ``encode`` is called)
        clusters: (int) number of topics to derive
        vectorizer: (string) 'cv' or 'tfidf'
        mindf: (int) minimum threshold for token inclusion
        ngrams: (tuple) ngram range for tokens
        kbest: max textual features for topic modeling; capped at the
            vocabulary size ('all' is passed through unchanged)
        init_: string or array of cluster centers used as KMeans ``init``
    Returns:
        dictionary with keys 'texts' (dataframe of normalized texts and
        cluster labels), 'term_weights' (per-cluster term weights),
        'vectorizer', 'clusterer' and 'topics'
    Raises:
        ValueError: if ``vectorizer`` is neither 'cv' nor 'tfidf'
    """
    vectorizer_classes = {'cv': CountVectorizer, 'tfidf': TfidfVectorizer}
    # Fail before the expensive embedding step instead of silently
    # returning None at the end.
    if vectorizer not in vectorizer_classes:
        raise ValueError("vectorizer must be 'cv' or 'tfidf', got %r" % (vectorizer,))

    embeddings = transformer.encode(texts)
    #if cluster centers provided:
    if isinstance(init_, np.ndarray):
        print('setting cluster centers')

    #instantiate kmeans clusterer
    km = KMeans(n_clusters=clusters,init=init_, random_state=0)
    km.fit(embeddings)

    #process texts for topical word extraction
    processed_threads = [text_process(text) for text in texts]

    #group text by kmeans cluster label
    df = pd.DataFrame({'text':processed_threads,'label':km.labels_})
    # The frame only has 'text' and 'label' columns; grouping is done once
    # and reused for every cluster.
    texts_by_label = df.groupby('label')['text'].apply(list)

    vec = vectorizer_classes[vectorizer](stop_words='english', min_df=mindf, ngram_range=ngrams)
    CX = vec.fit_transform(df.text)

    #select kbest features
    # Never ask for more features than the vocabulary holds.
    k = kbest if isinstance(kbest, str) else min(kbest, CX.shape[1])
    kbc = SelectKBest(chi2, k=k).fit(CX, km.labels_)

    ff = pd.DataFrame()
    ff['term'] = np.asarray(vec.get_feature_names_out())[kbc.get_support()]
    #extract topical terms
    # One column per cluster: weights of the selected terms in the
    # concatenation of that cluster's texts. The single loop also covers
    # clusters <= 2.
    for i in range(clusters):
        cluster_doc = ' '.join(texts_by_label.get(i, []))
        ff['coef_'+str(i)] = kbc.transform(vec.transform([cluster_doc])).toarray().tolist()[0]

    # Keep only terms longer than three characters.
    ff = ff[ff['term'].map(len) > 3].copy()
    topics = get_topical_terms(ff,term_count=25)
    print(get_topical_coherence(processed_threads, topics,metric='u_mass'))
    return {'texts':df,'term_weights':ff,'vectorizer':vec, 'clusterer':km, 'topics':topics}

from sentence_transformers import SentenceTransformer

#instantiate transformer model to create embeddings
# `smodel` is the model argument passed to make_thread_embeddings and
# topic_model in the cells below.
smodel = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

### Default Topic Modeling

In [None]:
# Graph, threads, thread texts/embeddings and singleton posts/embeddings/texts
# for the whole corpus, in the order returned by make_thread_embeddings.
(
    CITY,
    city_paths,
    city_chains,
    city_embeddings,
    city_singletons,
    city_singleton_embeddings,
    city_singleton_texts,
) = make_thread_embeddings(citydata, smodel)


In [None]:
# NOTE(review): `texts` is not defined in the visible cells -- presumably the
# thread texts (and singleton texts) returned by make_thread_embeddings;
# confirm before running.
topics = topic_model(texts, smodel, clusters=4,vectorizer='cv',mindf=5, ngrams=(1,1),kbest=5000,init_='k-means++')

## Gensim Topic Modeling Functions

The following functions were used to derive LDA and guided LDA topic models in gensim for comparison:

In [None]:
from gensim import models

#custom text normalizer for City-Data Corpus
def text_tokenize(text):
    """
    Normalize a text into a list of lemmas.

    The text is lowercased and split with nltk.wordpunct_tokenize; tokens
    longer than two characters that are not stopwords are lemmatized, and
    lemmas containing a digit or the fragment 'htt' are discarded.

    Args:
        text: string

    Returns:
        list of strings
    """
    lowered = text.lower()
    lemmas = []
    for token in nltk.wordpunct_tokenize(lowered):
        # Skip tokens containing '.com', a hyphen or an underscore. The
        # 'Advertisements' alternative cannot match here because the text
        # was lowercased above.
        if re.search(r'\.com|___|Advertisements|-|_', token):
            continue
        if len(token) > 2 and token not in stops:
            lemmas.append(wn.lemmatize(token))
    return [
        lemma for lemma in lemmas
        if not (re.search(r'[0-9]', lemma) or re.search(r'htt', lemma))
    ]
    
def gensim_lda(texts, topic_num=5, topic_word_priors=None,numwords=25, eta_=None, tfidf=False):
    """
    Gensim lda wrapper with guided topic modeling.

    Texts are tokenized with text_tokenize, turned into a gensim dictionary
    and bag-of-words corpus (optionally TF-IDF weighted) and fitted with
    LdaModel. When both ``topic_word_priors`` and ``eta_`` are given, a
    topic-term prior matrix is passed as ``eta`` to guide the topics.

    Args:
        texts: list of strings
        topic_num: (int) number of topics
        topic_word_priors: one collection of seed words per topic; the prior
            matrix gets one row per entry, so presumably its length must
            equal ``topic_num`` -- confirm against gensim's ``eta`` shape
        numwords: (int) number of topical terms
        eta_: None, or the prior weight given to seed words (all other
            terms get 1/topic_num). NOTE(review): each value fills a single
            matrix cell, so a scalar is expected here.
        tfidf: (bool) if True, use gensim tfidf term weighting (default is False)

    Returns:
        model: fitted LdaModel
        topical_terms: per topic, the quoted terms of its show_topics string
        transformed_corpus: the corpus mapped through the model
        processed_texts: tokenized texts
    """
    # `!= False` instead of plain truthiness preserves the original test
    # (e.g. tfidf=None still enables TF-IDF weighting).
    use_tfidf = tfidf != False  # noqa: E712

    processed_texts = [text_tokenize(text) for text in texts]
    dictionary = gensim.corpora.Dictionary(processed_texts)

    # Count the same tokens the dictionary was built from. Counting
    # `text.split()` of the raw text would silently drop every token whose
    # raw form differs from its normalized form.
    bow = [dictionary.doc2bow(tokens) for tokens in processed_texts]
    corpus = gensim.models.TfidfModel(bow)[bow] if use_tfidf else bow

    # Guided LDA: seed words get eta_, every other term gets 1/topic_num.
    eta = None
    if topic_word_priors and eta_ is not None:
        default_prior = 1.0 / topic_num
        vocabulary = [dictionary[i] for i in range(len(dictionary))]
        eta = np.array([
            [eta_ if term in seed_words else default_prior for term in vocabulary]
            for seed_words in topic_word_priors
        ])

    model = gensim.models.ldamodel.LdaModel(
        corpus=corpus, id2word=dictionary, num_topics=topic_num,
        random_state=42, chunksize=100, eta=eta,
        eval_every=-1, update_every=1,
        passes=150, alpha='auto', per_word_topics=True)

    #transform corpus into topics
    transformed_corpus = model[corpus]

    #extract topical terms
    # num_topics=-1 returns every topic; the default of 10 would return only
    # a subset for larger models. Fragments containing digits (the weights)
    # are dropped from each formatted topic string.
    topical_terms = [
        [token for token in topic.split('"') if not re.search(r'\d', token)]
        for _, topic in model.show_topics(num_topics=-1, num_words=numwords)
    ]
    return model, topical_terms, transformed_corpus, processed_texts
        

## Network Graph Functions

The following functions were used to convert City-Data.com Corpus forum data into network graphs of posts and replies.


In [None]:
import networkx as nx
def make_graph_citydata(dataframe):
    """
    Generates networkx graph of City-Data.com Corpus Forum Data

    Every post becomes a node carrying its text in the ``text`` attribute,
    and every row contributes a directed edge post_id -> quote_id (from the
    quoting post to the post it quotes). Rows whose quote_id is the empty
    string are treated as "quotes nothing": the '' node created by their
    edges is removed before returning.

    NOTE(review): edges are keyed by ``str(id)`` while nodes are added with
    the raw id values, so edge endpoints and attributed nodes coincide only
    when the ids are already strings -- confirm against the corpus.

    Args:
        dataframe: City-Data.com Corpus Dataframe with ``post_id``,
            ``quote_id`` and ``post`` columns; ``quote`` is optional.

    Returns:
        networkx MultiDiGraph
    """
    post_ids = dataframe.post_id.tolist()
    quote_ids = dataframe.quote_id.tolist()
    posts = dataframe['post'].tolist()
    # A frame without a 'quote' column is tolerated: quoted-only nodes then
    # get text=None (this replaces a bare `except:` around the lookup).
    if 'quote' in dataframe.columns:
        quotes = dataframe['quote'].tolist()
    else:
        quotes = [None] * len(quote_ids)

    G = nx.MultiDiGraph()
    for post_id, post in zip(post_ids, posts):
        G.add_node(post_id, text=post)

    # Add quoted posts that never appear as a post of their own.
    for quote_id, quote in zip(quote_ids, quotes):
        if quote_id != '' and quote_id not in G:
            G.add_node(quote_id, text=quote)

    G.add_edges_from([(str(x), str(y)) for x, y in zip(post_ids, quote_ids)])
    # The '' node only exists when at least one row had an empty quote_id;
    # removing it unconditionally raises when every post quotes another.
    if '' in G:
        G.remove_node('')

    return G

def get_paths_city_data(dataframe):
    """
    Extract threaded posts from a City-Data.com Corpus DataFrame.

    The frame is turned into a graph with make_graph_citydata, and every
    simple path from a node with in-degree 0 to a node with out-degree 0 is
    collected as one thread.

    Args:
        dataframe: City-Data.com Corpus DataFrame

    Returns:
        networkx graph, list of threads (each thread is a list of node ids)
    """
    G = make_graph_citydata(dataframe)
    out_degrees = dict(G.out_degree(G.nodes()))
    in_degrees = dict(G.in_degree(G.nodes()))
    sink_nodes = [node for node, degree in out_degrees.items() if degree == 0]
    source_nodes = [node for node, degree in in_degrees.items() if degree == 0]

    paths = []
    # Sinks vary in the outer loop and sources in the inner one, so paths
    # are grouped by the sink they end at.
    for sink in sink_nodes:
        for source in source_nodes:
            paths.extend(nx.all_simple_paths(G, source=source, target=sink))
    return G, paths

def make_thread_embeddings(dataframe, model):
    """
    Convert City-Data.com Corpus posts and quoted posts into a network graph and embeddings.

    Missing values are replaced by '' on a copy of the frame (the caller's
    frame is left untouched), so rows without a quote are recognised by the
    graph builder's empty-string check.

    Args:
        dataframe: City-Data.com Corpus Dataframe with ``post_id``,
            ``post``, ``quote_id`` and ``quote`` columns
        model: sentence-transformer model (only ``model.encode`` is called)

    Returns:
        tuple of
        G: networkx graph built by get_paths_city_data
        paths: list of threads, each a list of node ids
        joint_chains: one string per thread (texts of its nodes joined by spaces)
        embeddings: ``model.encode(joint_chains)``
        singletons: graph nodes that appear in no thread
        singleton_embeddings: ``model.encode`` of the singleton texts
        singleton_texts: texts of the singleton nodes
    """
    dataframe = dataframe.fillna('')

    # Map every id to its text; within a row the quote is stored first and
    # the post second, and later rows overwrite earlier ones.
    id_text = {}
    rows = zip(dataframe['quote_id'].tolist(), dataframe['quote'].tolist(),
               dataframe['post_id'].tolist(), dataframe['post'].tolist())
    for quote_id, quote, post_id, post in rows:
        id_text[quote_id] = quote
        id_text[post_id] = post

    G, paths = get_paths_city_data(dataframe)

    # Nodes without a known text contribute an empty string to the chain.
    joint_chains = [' '.join(id_text.get(node, '') for node in path) for path in paths]
    embeddings = model.encode(joint_chains)

    # Build the membership set once instead of re-flattening `paths` for
    # every node of the graph.
    threaded_nodes = {node for path in paths for node in path}
    singletons = [node for node in G.nodes() if node not in threaded_nodes]
    singleton_texts = [id_text[s] for s in singletons]
    singleton_embeddings = model.encode(singleton_texts)
    return G, paths, joint_chains, embeddings, singletons, singleton_embeddings,singleton_texts