## Text Processing Utilities for City-Data Corpus

The functions below were used to normalize the City-Data Corpus texts:

In [90]:
from __future__ import division
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans, SpectralClustering, kmeans_plusplus
from sklearn.manifold import SpectralEmbedding
from nltk.corpus import stopwords
import nltk
import re
from scipy.cluster.hierarchy import fclusterdata
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2
import pickle
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
import pandas as pd
import os
import re
import gensim
from collections import Counter
import plotly.express as px
from plotly.offline import plot
from sklearn.cluster import MiniBatchKMeans
import pandas as pd
import itertools
from bs4 import BeautifulSoup as soup
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_short,strip_non_alphanum, strip_tags,strip_multiple_whitespaces, preprocess_documents, preprocess_string, strip_numeric, remove_stopwords, strip_tags, strip_punctuation, stem_text
from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, Phraser
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

def open_pickle(fname):
    with open(fname,'rb') as s:
        data = pickle.load(s)
    return data

def save_pickle(f, filename):
    with open(filename, 'wb') as h:
        pickle.dump(f,h,2)

def flatten_list(somelist):
    """"
    Function to flatten a list of lists.

    Args:
        somelist: List of lists.

    Returns:
        Merged list
    """
    if any(isinstance(el, list) for el in somelist) == False:
        return somelist
    flat_list = list(itertools.chain(*somelist))
    return flat_list


stops = stopwords.words('english') + ['said','know','maybe','post','advertisements','advertisement','posted','thread','like','could','should','would','thing']
wn = WordNetLemmatizer()
stemmer = nltk.PorterStemmer()
def text_process(text):
    """
    Function to normalize text.

    Args:
        texts: string

    Returns:
        string
    """
    
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if not re.findall(r'\.com|___|Advertisements|-|_',token)]

    return ' '.join([wn.lemmatize(token) for token in tokens if len(token) > 3 and token not in stops and not re.findall(r'[0-9]',token) and not re.findall(r'htt',token)])

def make_graph_citydata(dataframe):
    """
    Generates networkx graph of City-Data.com Corpus Forum Data

    Args:
        dataframe: City-Data.com Corpus Dataframe

    Returns:
        networkx Graph
    """
    
    
    
    edges = [(str(x),str(y)) for (x,y) in list(zip(dataframe.post_id.tolist(),dataframe.quote_id.tolist()))]
    
    G = nx.MultiDiGraph()
    G.add_edges_from(edges)
    try:
        G.remove_node('')
    except:
        pass
        
    for i in range(len(dataframe)):
        G.add_node(str(dataframe.iloc[i]['post_id']),text=dataframe.iloc[i]['post'])
                 

    for i in range(len(dataframe)):
        if dataframe.iloc[i]['quote_id'] != '' and dataframe.iloc[i]['quote_id'] not in G.nodes():
            try:
                G.add_node(str(dataframe.iloc[i]['quote_id']),text=dataframe.iloc[i]['quote'])
            except:
                G.add_node(str(dataframe.iloc[i]['quote_id']),text=None)
        else:
            G.add_node(str(dataframe.iloc[i]['quote_id']),text=None)
        
    
    return G

def get_paths_city_data(dataframe):
    """
    Function to extract threaded posts from a Pandas DataFrame.
    
    Args:
        dataframe: City-Data.com Corpus DataFrame
 
    Returns:
        networkx graph, list of threads
    """
    G = make_graph_citydata(dataframe)
    sink_nodes = [node for node, outdegree in dict(G.out_degree(G.nodes())).items() if outdegree == 0]
    source_nodes = [node for node, indegree in dict(G.in_degree(G.nodes())).items() if indegree == 0]
    ss_nodes = [(source, sink) for sink in sink_nodes for source in source_nodes]
    paths = []
    for (source,sink) in ss_nodes:
        for path in nx.all_simple_paths(G, source=source, target=sink):
            paths.append(path)
    return G, paths

def make_thread_embeddings(dataframe, model):
    """
    Function to convert City-Data.com Corpus posts and quoted posts into a network graph and embeddings.

    Args:
        dataframe: City-Data.com Corpus Dataframe
        model: sentence-transformer model

    Returns:
        networkx graph
        City-Data.com Corpus threads and singleton posts
        City-Data.com Corpus thread and post embeddings
    """
    dataframe.fillna('',inplace=True)
    id_text = {}
    for i in range(len(dataframe)):
        
        id_text[dataframe.iloc[i]['quote_id']] = dataframe.iloc[i]['quote']
        id_text[dataframe.iloc[i]['post_id']] = dataframe.iloc[i]['post']

    G, paths = get_paths_city_data(dataframe)
    chains = []
    for path in paths:
        p = []
        for x in path:
            try:
                p.append(id_text[x])
            except:
                p.append('')
        chains.append(p)
    joint_chains = [' '.join(chain) for chain in chains]
    embeddings = model.encode(joint_chains)
    singletons = [node for node in G.nodes() if node not in flatten_list(paths)]
    singleton_embeddings = model.encode([id_text[s] for s in singletons])
    singleton_texts = [id_text[s] for s in singletons]
    return G, paths, joint_chains, embeddings, singletons, singleton_embeddings,singleton_texts
    

## Topical Coherence and Diversity Functions

In [165]:
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, Phraser
from scipy.spatial import distance
from itertools import combinations

#Topical Diversity functions are from Terragni (2023)
#https://github.com/silviatti/topic-model-diversity
def proportion_unique_words(topics, topk=10):
    """
    compute the proportion of unique words

    Parameters
    ----------
    topics: a list of lists of words
    topk: top k words on which the topic diversity will be computed
    """
    if topk > len(topics[0]):
        raise Exception('Words in topics are less than '+str(topk))
    else:
        unique_words = set()
        for topic in topics:
            unique_words = unique_words.union(set(topic[:topk]))
        puw = len(unique_words) / (topk * len(topics))
        return puw


def irbo(topics, weight=0.9, topk=10):
    """
    compute the inverted rank-biased overlap

    Parameters
    ----------
    topics: a list of lists of words
    weight: p (float), default 1.0: Weight of each
        agreement at depth d:p**(d-1). When set
        to 1.0, there is no weight, the rbo returns
        to average overlap.
    topk: top k words on which the topic diversity
          will be computed

    Returns
    -------
    irbo : score of the rank biased overlap over the topics
    """
    if topk > len(topics[0]):
        raise Exception('Words in topics are less than topk')
    else:
        collect = []
        for list1, list2 in combinations(topics, 2):
            word2index = get_word2index(list1, list2)
            indexed_list1 = [word2index[word] for word in list1]
            indexed_list2 = [word2index[word] for word in list2]
            rbo_val = rbo(indexed_list1[:topk], indexed_list2[:topk], p=weight)[2]
            collect.append(rbo_val)
        return 1 - np.mean(collect)


def word_embedding_irbo(topics, word_embedding_model, weight=0.9, topk=10):
    '''
    compute the word embedding-based inverted rank-biased overlap

    Parameters
    ----------
    topics: a list of lists of words
    weight: p (float), default 1.0: Weight of each agreement at depth d:
    p**(d-1). When set to 1.0, there is no weight, the rbo returns to average overlap.
    
    Returns
    -------
    weirbo: word embedding-based inverted rank_biased_overlap over the topics
    '''
    if topk > len(topics[0]):
        raise Exception('Words in topics are less than topk')
    else:
        collect = []
        for list1, list2 in combinations(topics, 2):
            word2index = get_word2index(list1, list2)
            index2word = {v: k for k, v in word2index.items()}
            indexed_list1 = [word2index[word] for word in list1]
            indexed_list2 = [word2index[word] for word in list2]
            rbo_val = word_embeddings_rbo(indexed_list1[:topk], indexed_list2[:topk], p=weight,
                                          index2word=index2word, word2vec=word_embedding_model)[2]
            collect.append(rbo_val)
        return 1 - np.mean(collect)


def pairwise_jaccard_diversity(topics, topk=10):
    '''
    compute the average pairwise jaccard distance between the topics 
  
    Parameters
    ----------
    topics: a list of lists of words
    topk: top k words on which the topic diversity
          will be computed
    
    Returns
    -------
    pjd: average pairwise jaccard distance
    '''
    dist = 0
    count = 0
    for list1, list2 in combinations(topics, 2):
        js = 1 - len(set(list1).intersection(set(list2)))/len(set(list1).union(set(list2)))
        dist = dist + js
        count = count + 1
    return dist/count


def pairwise_word_embedding_distance(topics, word_embedding_model, topk=10):
    """
    :param topk: how many most likely words to consider in the evaluation
    :return: topic coherence computed on the word embeddings similarities
    """
    if topk > len(topics[0]):
        raise Exception('Words in topics are less than topk')
    else:
        count = 0
        sum_dist = 0
        for list1, list2 in combinations(topics, 2):
            count = count+1
            word_counts = 0
            dist = 0
            for word1 in list1[:topk]:
                for word2 in list2[:topk]:
                    dist = dist + distance.cosine(word_embedding_model.wv[word1], word_embedding_model.wv[word2])
                    word_counts = word_counts + 1

            dist = dist/word_counts
            sum_dist = sum_dist + dist
        return sum_dist/count


def centroid_distance(topics, word_embedding_model, topk=10):
    """
    :param topk: how many most likely words to consider in the evaluation
    :return: topic coherence computed on the word embeddings similarities
    """
    if topk > len(topics[0]):
        raise Exception('Words in topics are less than topk')
    else:
        count = 0
        for list1, list2 in combinations(topics, 2):
            count = count + 1
            centroid1 = np.zeros(word_embedding_model.vector_size)
            centroid2 = np.zeros(word_embedding_model.vector_size)
            for word1 in list1[:topk]:
                centroid1 = centroid1 + word_embedding_model[word1]
            for word2 in list2[:topk]:
                centroid2 = centroid2 + word_embedding_model[word2]
            centroid1 = centroid1 / len(list1[:topk])
            centroid2 = centroid2 / len(list2[:topk])
        return distance.cosine(centroid1, centroid2)


def get_word2index(list1, list2):
    words = set(list1)
    words = words.union(set(list2))
    word2index = {w: i for i, w in enumerate(words)}
    return word2index
"""Rank-biased overlap, a ragged sorted list similarity measure.
See http://doi.acm.org/10.1145/1852102.1852106 for details. All functions
directly corresponding to concepts from the paper are named so that they can be
clearly cross-identified.
The definition of overlap has been modified to account for ties. Without this,
results for lists with tied items were being inflated. The modification itself
is not mentioned in the paper but seems to be reasonable, see function
``overlap()``. Places in the code which diverge from the spec in the paper
because of this are highlighted with comments.
The two main functions for performing an RBO analysis are ``rbo()`` and
``rbo_dict()``; see their respective docstrings for how to use them.
The following doctest just checks that equivalent specifications of a
problem yield the same result using both functions:
    >>> lst1 = [{"c", "a"}, "b", "d"]
    >>> lst2 = ["a", {"c", "b"}, "d"]
    >>> ans_rbo = _round(rbo(lst1, lst2, p=.9))
    >>> dct1 = dict(a=1, b=2, c=1, d=3)
    >>> dct2 = dict(a=1, b=2, c=2, d=3)
    >>> ans_rbo_dict = _round(rbo_dict(dct1, dct2, p=.9, sort_ascending=True))
    >>> ans_rbo == ans_rbo_dict
    True
"""

import math
from bisect import bisect_left
from collections import namedtuple


RBO = namedtuple("RBO", "min res ext")
RBO.__doc__ += ": Result of full RBO analysis"
RBO.min.__doc__ = "Lower bound estimate"
RBO.res.__doc__ = "Residual corresponding to min; min + res is an upper bound estimate"
RBO.ext.__doc__ = "Extrapolated point estimate"


def _round(obj):
    if isinstance(obj, RBO):
        return RBO(_round(obj.min), _round(obj.res), _round(obj.ext))
    else:
        return round(obj, 3)


def set_at_depth(lst, depth):
    ans = set()
    for v in lst[:depth]:
        if isinstance(v, set):
            ans.update(v)
        else:
            ans.add(v)
    return ans


def raw_overlap(list1, list2, depth):
    """Overlap as defined in the article.
    """
    set1, set2 = set_at_depth(list1, depth), set_at_depth(list2, depth)
    return len(set1.intersection(set2)), len(set1), len(set2)


def overlap(list1, list2, depth):
    """Overlap which accounts for possible ties.
    This isn't mentioned in the paper but should be used in the ``rbo*()``
    functions below, otherwise overlap at a given depth might be > depth which
    inflates the result.
    There are no guidelines in the paper as to what's a good way to calculate
    this, but a good guess is agreement scaled by the minimum between the
    requested depth and the lengths of the considered lists (overlap shouldn't
    be larger than the number of ranks in the shorter list, otherwise results
    are conspicuously wrong when the lists are of unequal lengths -- rbo_ext is
    not between rbo_min and rbo_min + rbo_res.
    >>> overlap("abcd", "abcd", 3)
    3.0
    >>> overlap("abcd", "abcd", 5)
    4.0
    >>> overlap(["a", {"b", "c"}, "d"], ["a", {"b", "c"}, "d"], 2)
    2.0
    >>> overlap(["a", {"b", "c"}, "d"], ["a", {"b", "c"}, "d"], 3)
    3.0
    """
    ov = agreement(list1, list2, depth) * min(depth, len(list1), len(list2))
    return ov
    # NOTE: comment the preceding and uncomment the following line if you want
    # to stick to the algorithm as defined by the paper
    # return raw_overlap(list1, list2, depth)[0]


def agreement(list1, list2, depth):
    """Proportion of shared values between two sorted lists at given depth.
    >>> _round(agreement("abcde", "abdcf", 1))
    1.0
    >>> _round(agreement("abcde", "abdcf", 3))
    0.667
    >>> _round(agreement("abcde", "abdcf", 4))
    1.0
    >>> _round(agreement("abcde", "abdcf", 5))
    0.8
    >>> _round(agreement([{1, 2}, 3], [1, {2, 3}], 1))
    0.667
    >>> _round(agreement([{1, 2}, 3], [1, {2, 3}], 2))
    1.0
    """
    len_intersection, len_set1, len_set2 = raw_overlap(list1, list2, depth)
    return 2 * len_intersection / (len_set1 + len_set2)


def cumulative_agreement(list1, list2, depth):
    return (agreement(list1, list2, d) for d in range(1, depth + 1))


def average_overlap(list1, list2, depth=None):
    """Calculate average overlap between ``list1`` and ``list2``.
    >>> _round(average_overlap("abcdefg", "zcavwxy", 1))
    0.0
    >>> _round(average_overlap("abcdefg", "zcavwxy", 2))
    0.0
    >>> _round(average_overlap("abcdefg", "zcavwxy", 3))
    0.222
    >>> _round(average_overlap("abcdefg", "zcavwxy", 4))
    0.292
    >>> _round(average_overlap("abcdefg", "zcavwxy", 5))
    0.313
    >>> _round(average_overlap("abcdefg", "zcavwxy", 6))
    0.317
    >>> _round(average_overlap("abcdefg", "zcavwxy", 7))
    0.312
    """
    depth = min(len(list1), len(list2)) if depth is None else depth
    return sum(cumulative_agreement(list1, list2, depth)) / depth


def rbo_at_k(list1, list2, p, depth=None):
    # ``p**d`` here instead of ``p**(d - 1)`` because enumerate starts at
    # 0
    depth = min(len(list1), len(list2)) if depth is None else depth
    d_a = enumerate(cumulative_agreement(list1, list2, depth))
    return (1 - p) * sum(p ** d * a for (d, a) in d_a)


def rbo_min(list1, list2, p, depth=None):
    """Tight lower bound on RBO.
    See equation (11) in paper.
    >>> _round(rbo_min("abcdefg", "abcdefg", .9))
    0.767
    >>> _round(rbo_min("abcdefgh", "abcdefg", .9))
    0.767
    """
    depth = min(len(list1), len(list2)) if depth is None else depth
    x_k = overlap(list1, list2, depth)
    log_term = x_k * math.log(1 - p)
    sum_term = sum(
        p ** d / d * (overlap(list1, list2, d) - x_k) for d in range(1, depth + 1)
    )
    return (1 - p) / p * (sum_term - log_term)


def rbo_res(list1, list2, p):
    """Upper bound on residual overlap beyond evaluated depth.
    See equation (30) in paper.
    NOTE: The doctests weren't verified against manual computations but seem
    plausible. In particular, for identical lists, ``rbo_min()`` and
    ``rbo_res()`` should add up to 1, which is the case.
    >>> _round(rbo_res("abcdefg", "abcdefg", .9))
    0.233
    >>> _round(rbo_res("abcdefg", "abcdefghijklmnopqrstuvwxyz", .9))
    0.239
    """
    S, L = sorted((list1, list2), key=len)
    s, l = len(S), len(L)
    x_l = overlap(list1, list2, l)
    # since overlap(...) can be fractional in the general case of ties and f
    # must be an integer --> math.ceil()
    f = int(math.ceil(l + s - x_l))
    # upper bound of range() is non-inclusive, therefore + 1 is needed
    term1 = s * sum(p ** d / d for d in range(s + 1, f + 1))
    term2 = l * sum(p ** d / d for d in range(l + 1, f + 1))
    term3 = x_l * (math.log(1 / (1 - p)) - sum(p ** d / d for d in range(1, f + 1)))
    return p ** s + p ** l - p ** f - (1 - p) / p * (term1 + term2 + term3)


def rbo_ext(list1, list2, p):
    """RBO point estimate based on extrapolating observed overlap.
    See equation (32) in paper.
    NOTE: The doctests weren't verified against manual computations but seem
    plausible.
    >>> _round(rbo_ext("abcdefg", "abcdefg", .9))
    1.0
    >>> _round(rbo_ext("abcdefg", "bacdefg", .9))
    0.9
    """
    S, L = sorted((list1, list2), key=len)
    s, l = len(S), len(L)
    x_l = overlap(list1, list2, l)
    x_s = overlap(list1, list2, s)
    # the paper says overlap(..., d) / d, but it should be replaced by
    # agreement(..., d) defined as per equation (28) so that ties are handled
    # properly (otherwise values > 1 will be returned)
    # sum1 = sum(p**d * overlap(list1, list2, d)[0] / d for d in range(1, l + 1))
    sum1 = sum(p ** d * agreement(list1, list2, d) for d in range(1, l + 1))
    sum2 = sum(p ** d * x_s * (d - s) / s / d for d in range(s + 1, l + 1))
    term1 = (1 - p) / p * (sum1 + sum2)
    term2 = p ** l * ((x_l - x_s) / l + x_s / s)
    return term1 + term2


def rbo(list1, list2, p):
    """Complete RBO analysis (lower bound, residual, point estimate).
    ``list`` arguments should be already correctly sorted iterables and each
    item should either be an atomic value or a set of values tied for that
    rank. ``p`` is the probability of looking for overlap at rank k + 1 after
    having examined rank k.
    >>> lst1 = [{"c", "a"}, "b", "d"]
    >>> lst2 = ["a", {"c", "b"}, "d"]
    >>> _round(rbo(lst1, lst2, p=.9))
    RBO(min=0.489, res=0.477, ext=0.967)
    """
    if not 0 <= p <= 1:
        raise ValueError("The ``p`` parameter must be between 0 and 1.")
    args = (list1, list2, p)
    return RBO(rbo_min(*args), rbo_res(*args), rbo_ext(*args))


def sort_dict(dct, *, ascending=False):
    """Sort keys in ``dct`` according to their corresponding values.
    Sorts in descending order by default, because the values are
    typically scores, i.e. the higher the better. Specify
    ``ascending=True`` if the values are ranks, or some sort of score
    where lower values are better.
    Ties are handled by creating sets of tied keys at the given position
    in the sorted list.
    >>> dct = dict(a=1, b=2, c=1, d=3)
    >>> list(sort_dict(dct)) == ['d', 'b', {'a', 'c'}]
    True
    >>> list(sort_dict(dct, ascending=True)) == [{'a', 'c'}, 'b', 'd']
    True
    """
    scores = []
    items = []
    # items should be unique, scores don't have to
    for item, score in dct.items():
        if not ascending:
            score *= -1
        i = bisect_left(scores, score)
        if i == len(scores):
            scores.append(score)
            items.append(item)
        elif scores[i] == score:
            existing_item = items[i]
            if isinstance(existing_item, set):
                existing_item.add(item)
            else:
                items[i] = {existing_item, item}
        else:
            scores.insert(i, score)
            items.insert(i, item)
    return items


def rbo_dict(dict1, dict2, p, *, sort_ascending=False):
    """Wrapper around ``rbo()`` for dict input.
    Each dict maps items to be sorted to the score according to which
    they should be sorted. The RBO analysis is then performed on the
    resulting sorted lists.
    The sort is descending by default, because scores are typically the
    higher the better, but this can be overridden by specifying
    ``sort_ascending=True``.
    >>> dct1 = dict(a=1, b=2, c=1, d=3)
    >>> dct2 = dict(a=1, b=2, c=2, d=3)
    >>> _round(rbo_dict(dct1, dct2, p=.9, sort_ascending=True))
    RBO(min=0.489, res=0.477, ext=0.967)
    """
    list1, list2 = (
        sort_dict(dict1, ascending=sort_ascending),
        sort_dict(dict2, ascending=sort_ascending),
    )
    return rbo(list1, list2, p)

import math
from bisect import bisect_left
from collections import namedtuple
from collections import OrderedDict

RBO = namedtuple("RBO", "min res ext")
RBO.__doc__ += ": Result of full RBO analysis"
RBO.min.__doc__ = "Lower bound estimate"
RBO.res.__doc__ = "Residual corresponding to min; min + res is an upper bound estimate"
RBO.ext.__doc__ = "Extrapolated point estimate"

def _round(obj):
    if isinstance(obj, RBO):
        return RBO(_round(obj.min), _round(obj.res), _round(obj.ext))
    else:
        return round(obj, 3)


def set_at_depth(lst, depth):
    ans = set()
    for v in lst[:depth]:
        if isinstance(v, set):
            ans.update(v)
        else:
            ans.add(v)
    return ans


def embeddings_overlap(list1, list2, depth, index2word, word2vec):
    #set1, set2 = set_at_depth(list1, depth), set_at_depth(list2, depth)
    #return len(set1.intersection(set2)), len(set1), len(set2)

    set1, set2 = set_at_depth(list1, depth), set_at_depth(list2, depth)
    word_list1 = [index2word[index] for index in list1]
    word_list2 = [index2word[index] for index in list2]

    similarities = {}
    for w1 in word_list1[:depth]:
        for w2 in word_list2[:depth]:
            similarities[(w1,w2)] = word2vec.similarity(w1, w2)

    similarities = OrderedDict(sorted(similarities.items(), key=lambda x: -x[1]))

    e_ov = 0
    key_list = list(similarities.keys())
    for k in key_list:
        if k in similarities.keys():
            #print(k, similarities[k])
            e_ov = e_ov + similarities[k]
            similarities = {save_k: v for save_k, v in similarities.items()
                            if save_k[0] != k[0] and save_k[1] != k[1]}
    #e_ov = 1
    #print("****")
    return e_ov, len(set1), len(set2)


def overlap(list1, list2, depth, index2word, word2vec):
    #return agreement(list1, list2, depth) * min(depth, len(list1), len(list2))
    # NOTE: comment the preceding and uncomment the following line if you want
    # to stick to the algorithm as defined by the paper
    ov = embeddings_overlap(list1, list2, depth, index2word, word2vec)[0]
    return ov


def agreement(list1, list2, depth, index2word, word2vec):
    """Proportion of shared values between two sorted lists at given depth."""
    len_intersection, len_set1, len_set2 = embeddings_overlap(list1, list2, depth, index2word, word2vec)
    return 2 * len_intersection / (len_set1 + len_set2)


def cumulative_agreement(list1, list2, depth, index2word, word2vec):
    return (agreement(list1, list2, d, index2word, word2vec) for d in range(1, depth + 1))


def average_overlap(list1, list2, index2word, word2vec, depth=None):
    """Calculate average overlap between ``list1`` and ``list2``.
    """
    depth = min(len(list1), len(list2)) if depth is None else depth
    return sum(cumulative_agreement(list1, list2, depth, index2word=index2word, word2vec=word2vec)) / depth


def rbo_at_k(list1, list2, p, index2word, word2vec, depth=None):
    # ``p**d`` here instead of ``p**(d - 1)`` because enumerate starts at
    # 0
    depth = min(len(list1), len(list2)) if depth is None else depth
    d_a = enumerate(cumulative_agreement(list1, list2, depth, index2word=index2word, word2vec=word2vec))
    return (1 - p) * sum(p ** d * a for (d, a) in d_a)


def rbo_min(list1, list2, p, index2word, word2vec, depth=None):
    """Tight lower bound on RBO.
    See equation (11) in paper.
    """
    depth = min(len(list1), len(list2)) if depth is None else depth
    x_k = overlap(list1, list2, depth, index2word, word2vec)
    log_term = x_k * math.log(1 - p)
    sum_term = sum(
        p ** d / d * (overlap(list1, list2, d, index2word, word2vec=word2vec) - x_k) for d in range(1, depth + 1)
    )
    return (1 - p) / p * (sum_term - log_term)


def rbo_res(list1, list2, p, index2word, word2vec):
    """Upper bound on residual overlap beyond evaluated depth.
    See equation (30) in paper.
    NOTE: The doctests weren't verified against manual computations but seem
    plausible. In particular, for identical lists, ``rbo_min()`` and
    ``rbo_res()`` should add up to 1, which is the case.
    """
    S, L = sorted((list1, list2), key=len)
    s, l = len(S), len(L)
    x_l = overlap(list1, list2, l, index2word, word2vec)
    # since overlap(...) can be fractional in the general case of ties and f
    # must be an integer --> math.ceil()
    f = int(math.ceil(l + s - x_l))
    # upper bound of range() is non-inclusive, therefore + 1 is needed
    term1 = s * sum(p ** d / d for d in range(s + 1, f + 1))
    term2 = l * sum(p ** d / d for d in range(l + 1, f + 1))
    term3 = x_l * (math.log(1 / (1 - p)) - sum(p ** d / d for d in range(1, f + 1)))
    return p ** s + p ** l - p ** f - (1 - p) / p * (term1 + term2 + term3)


def rbo_ext(list1, list2, p, index2word, word2vec):
    """RBO point estimate based on extrapolating observed overlap.
    See equation (32) in paper.
    NOTE: The doctests weren't verified against manual computations but seem
    plausible.
    >>> _round(rbo_ext("abcdefg", "abcdefg", .9))
    1.0
    >>> _round(rbo_ext("abcdefg", "bacdefg", .9))
    0.9
    """
    S, L = sorted((list1, list2), key=len)
    s, l = len(S), len(L)
    x_l = overlap(list1, list2, l, index2word, word2vec)
    x_s = overlap(list1, list2, s, index2word, word2vec)
    # the paper says overlap(..., d) / d, but it should be replaced by
    # agreement(..., d) defined as per equation (28) so that ties are handled
    # properly (otherwise values > 1 will be returned)
    # sum1 = sum(p**d * overlap(list1, list2, d)[0] / d for d in range(1, l + 1))
    sum1 = sum(p ** d * agreement(list1, list2, d, index2word=index2word, word2vec=word2vec)
               for d in range(1, l + 1))
    sum2 = sum(p ** d * x_s * (d - s) / s / d for d in range(s + 1, l + 1))
    term1 = (1 - p) / p * (sum1 + sum2)
    term2 = p ** l * ((x_l - x_s) / l + x_s / s)
    return term1 + term2


def word_embeddings_rbo(list1, list2, p, index2word, word2vec):
    """Complete RBO analysis (lower bound, residual, point estimate).
    ``list`` arguments should be already correctly sorted iterables and each
    item should either be an atomic value or a set of values tied for that
    rank. ``p`` is the probability of looking for overlap at rank k + 1 after
    having examined rank k.
    >>> lst1 = [{"c", "a"}, "b", "d"]
    >>> lst2 = ["a", {"c", "b"}, "d"]
    >>> _round(rbo(lst1, lst2, p=.9))
    RBO(min=0.489, res=0.477, ext=0.967)
    """
    if not 0 <= p <= 1:
        raise ValueError("The ``p`` parameter must be between 0 and 1.")
    args = (list1, list2, p, index2word, word2vec)

    return RBO(rbo_min(*args), rbo_res(*args), rbo_ext(*args))


def sort_dict(dct, *, ascending=False):
    """Sort keys in ``dct`` according to their corresponding values.
    Sorts in descending order by default, because the values are
    typically scores, i.e. the higher the better. Specify
    ``ascending=True`` if the values are ranks, or some sort of score
    where lower values are better.
    Ties are handled by creating sets of tied keys at the given position
    in the sorted list.
    >>> dct = dict(a=1, b=2, c=1, d=3)
    >>> list(sort_dict(dct)) == ['d', 'b', {'a', 'c'}]
    True
    >>> list(sort_dict(dct, ascending=True)) == [{'a', 'c'}, 'b', 'd']
    True
    """
    scores = []
    items = []
    # items should be unique, scores don't have to
    for item, score in dct.items():
        if not ascending:
            score *= -1
        i = bisect_left(scores, score)
        if i == len(scores):
            scores.append(score)
            items.append(item)
        elif scores[i] == score:
            existing_item = items[i]
            if isinstance(existing_item, set):
                existing_item.add(item)
            else:
                items[i] = {existing_item, item}
        else:
            scores.insert(i, score)
            items.insert(i, item)
    return items


def rbo_dict(dict1, dict2, p, index2word, word2vec, *, sort_ascending=False):
    """Wrapper around ``rbo()`` for dict input.
    Each dict maps items to be sorted to the score according to which
    they should be sorted. The RBO analysis is then performed on the
    resulting sorted lists.
    The sort is descending by default, because scores are typically the
    higher the better, but this can be overridden by specifying
    ``sort_ascending=True``.
    """
    list1, list2 = (
        sort_dict(dict1, ascending=sort_ascending),
        sort_dict(dict2, ascending=sort_ascending),
    )
    return word_embeddings_rbo(list1, list2, p, index2word, word2vec)


In [166]:
#Wrapper code for Terragni's (2023) Topical Diversity Measures


def get_topical_coherence(texts,topics, metric='u_mass'):
    if type(texts[0]) == str:
        #train dictionary on tokenized documents (corpus)
        doc_tokens = [text_tokenize(text) for text in texts]
        phrases = Phrases(doc_tokens, min_count=1, threshold=1)
        bigrams = phrases[doc_tokens]
        dct = Dictionary(bigrams) 
        
        
        #topics = list of topical words
        #texts = tokenized documents
        #corpus = doc2bow
        #dictionary = gensim dictionary
        cm = CoherenceModel(topics=topics,texts=texts, corpus=[dct.doc2bow(c) for c in doc_tokens], dictionary=dct, coherence=metric)
        return cm.get_coherence()
    elif type(texts[0]) == list:
        doc_tokens = texts
        phrases = Phrases(doc_tokens, min_count=1, threshold=1)
        bigrams = phrases[doc_tokens]
        dct = Dictionary(bigrams) 
        
        
        #topics = list of topical words
        #texts = tokenized documents
        #corpus = doc2bow
        #dictionary = gensim dictionary
        cm = CoherenceModel(topics=topics,texts=texts, corpus=[dct.doc2bow(c) for c in doc_tokens], dictionary=dct, coherence=metric)
        return cm.get_coherence()
        

def get_topical_terms(dataframe, term_count=25):
    topics = []
    for i in range(len(dataframe.columns)-1):
        
        terms = dataframe.sort_values(by='coef_'+str(i),ascending=False)[:term_count]['term'].tolist()
        topics.append(terms)
    filtered = []
    for topic in topics:
        t = []
        for term in topic:
            if len(term.split()) > 1:
                t.append('_'.join(term.split()))
            else:
                t.append(term)
        filtered.append(t)
    return filtered

def get_diversity_scores(topics, model, topn=10, topic_type=''):
    df = pd.DataFrame({"puw:":proportion_unique_words(topics, topk=topn),
        "jd:": pairwise_jaccard_diversity(topics, topk=topn),
        "we-pd:": pairwise_word_embedding_distance(topics, model, topk=topn),
        "we-cd:": centroid_distance(topics, model.wv, topk=topn),
        "we-irbo p=0.5:":word_embedding_irbo(topics,model.wv, weight=0.5, topk=topn),
        "we-irbo p=0.9:":word_embedding_irbo(topics,model.wv, weight=0.9, topk=topn)},index=[topic_type])
    return df

def get_coherence_diversity_scores(texts, topics, model, topn=10, topic_type='',metric='u_mass'):
    df = pd.DataFrame({'u_mass_coherence':get_topical_coherence(texts,topics,metric=metric),
        "puw:":proportion_unique_words(topics, topk=topn),
        "jd:": pairwise_jaccard_diversity(topics, topk=topn),
        "we-pd:": pairwise_word_embedding_distance(topics, model, topk=topn),
        "we-cd:": centroid_distance(topics, model.wv, topk=topn),
        "we-irbo p=0.5:":word_embedding_irbo(topics,model.wv, weight=0.5, topk=topn),
        "we-irbo p=0.9:":word_embedding_irbo(topics,model.wv, weight=0.9, topk=topn)},index=[topic_type])
    return df


## Gensim LDA Modeling Functions

In [3]:
from gensim import models
def gensim_lda(texts, topic_num=5, topic_word_priors=None,numwords=25, eta_=None, tfidf=False):
    """
    Gensim lda wrapper with guided topic modeling.

    Args:
        texts: list of strings
        topic_num: (int) number of topics
        topic_word_priors: list of words (string) to guide modeling
        numwords: (int) number of topical terms
        eta_: None or list of ints
        tfidf: (bool) if True, then use gensim tfidf term weighting (default is False)
        
    """
    
    if tfidf != False:
        #process texts
        #build gensim dictionary
        processed_texts = [text_tokenize(text) for text in texts]
        dictionary = gensim.corpora.Dictionary(processed_texts)
    
        #build bag-of-words representation
        bow = [dictionary.doc2bow(text.split()) for text in texts]
        tfidf = models.TfidfModel(bow)
        corpus_tfidf = tfidf[bow]
        #guided lda with eta
        if topic_word_priors and eta_ != None:
            etas = []
        
            for r in range(len(topic_word_priors)):
                eta = []
                for i in range(len(dictionary)):
                    
                    if dictionary[i] in topic_word_priors[r]:
                        eta.append(np.array(eta_))
                    else:
                        eta.append(np.array(1/topic_num))
                etas.append(eta)
    
            model = gensim.models.ldamodel.LdaModel(
                corpus=corpus_tfidf, id2word=dictionary, num_topics=topic_num,
                random_state=42, chunksize=100, eta=np.array(etas),
                eval_every=-1, update_every=1,
                passes=150, alpha='auto', per_word_topics=True)
    
            #transform corpus into topics
            transformed_corpus = model[corpus_tfidf]
    
            #extract topical terms
            topical_terms = [[token for token in m[1].split('"') if not re.findall(r'\d',token)] for m in  model.show_topics(num_words=numwords)]
            return model, topical_terms, transformed_corpus, processed_texts
            
        else:
            #standard lda
            model = gensim.models.ldamodel.LdaModel(
                corpus=corpus_tfidf, id2word=dictionary, num_topics=topic_num,
                random_state=42, chunksize=100, eta=None,
                eval_every=-1, update_every=1,
                passes=150, alpha='auto', per_word_topics=True)
            
            #transform corpus into topics
            transformed_corpus = model[corpus_tfidf]
            topical_terms = [[token for token in m[1].split('"') if not re.findall(r'\d',token)] for m in  model.show_topics(num_words=numwords)]
            return model, topical_terms, transformed_corpus, processed_texts
    else:
        #process texts
        #build gensim dictionary
        processed_texts = [text_tokenize(text) for text in texts]
        dictionary = gensim.corpora.Dictionary(processed_texts)
        
        #build bag-of-words representation
        bow = [dictionary.doc2bow(text.split()) for text in texts]
        #tfidf = models.TfidfModel(bow)
        #corpus_tfidf = tfidf[bow]
    
    
        #guided lda with eta
        if topic_word_priors and eta_ != None:
            etas = []
        
            for r in range(len(topic_word_priors)):
                eta = []
                for i in range(len(dictionary)):
                    
                    if dictionary[i] in topic_word_priors[r]:
                        eta.append(np.array(eta_))
                    else:
                        eta.append(np.array(1/topic_num))
                etas.append(eta)
    
            model = gensim.models.ldamodel.LdaModel(
                corpus=bow, id2word=dictionary, num_topics=topic_num,
                random_state=42, chunksize=100, eta=np.array(etas),
                eval_every=-1, update_every=1,
                passes=150, alpha='auto', per_word_topics=True)
    
            #transform corpus into topics
            transformed_corpus = model[bow]
    
            #extract topical terms
            topical_terms = [[token for token in m[1].split('"') if not re.findall(r'\d',token)] for m in  model.show_topics(num_words=numwords)]
            return model, topical_terms, transformed_corpus, processed_texts
        
        else:
        #standard lda
            model = gensim.models.ldamodel.LdaModel(
                corpus=bow, id2word=dictionary, num_topics=topic_num,
                random_state=42, chunksize=100, eta=None,
                eval_every=-1, update_every=1,
                passes=150, alpha='auto', per_word_topics=True)
            
            #transform corpus into topics
            transformed_corpus = model[bow]
            
            #extract topical terms
            topical_terms = [[token for token in m[1].split('"') if not re.findall(r'\d',token)] for m in  model.show_topics(num_words=25)]
            return model, topical_terms, transformed_corpus, processed_texts
        

## City-Data.com Corpus

In [72]:
#load City-Data.com Corpus from Zenodo
citydata = pd.read_csv("https://zenodo.org/records/10086354/files/citydata.csv?download=1")

In [84]:
#extract all City-Data.com Corpus posts

citydata.fillna('',inplace=True) #remove NaN
posts = citydata.post.tolist() + [quote for quote in citydata.quote.tolist() if quote not in citydata.post.tolist()]


In [22]:
posts[:5]

[" So just about everything is shutting down slowly but surely.  My job has shut down for the rest of the month so I'll be home.  lol still haven't gotten to the grocery store.  I'm in PT after a knee replacement and PT is considered an essential operation so still going to that.Stay well guys",
 "I am so lucky to be able to work remotely through something like this.  I feel bad for all the service workers and business people who could lose their jobs.  My family has been taking it easy.  We cook/bake most meals from home and have a huge pantry so we're comfortable and operating mostly normal.  Go out for walks, relax, kids nap, watch movies.  Talking to neighbors and keeping calm/reasonable.  Good luck on your knee rehab.  Good luck to everyone!",
 'I agree with really feeling for those in the service and medical industries, as well as the many people who canâ€™t stay home. I hope companies with employees that canâ€™t work remotely have a plan for anyone 60+ to receive paid time off u

In [None]:
#extract threaded posts and quoted posts
#convert to embeddings
CITY, city_paths, city_chains, city_embeddings, city_singleton_ids,city_singleton_embeddings, city_singleton_texts = make_thread_embeddings(citydata,smodel)


In [14]:
#Or load preprocessed City-Data.com Corpus data
city = open_pickle('citydata_processed.pkl')

CITY = city[0] #networkx graph
city_paths = city[1] #post and quoted posts by post_id
city_chains = city[2] #treaded texts
city_embeddings = city[3] #threaded embeddings
city_singleton_ids = city[4] #singleton post_ids
city_singleton_embeddings = city[5] #singleton post embeddings
city_singleton_texts = city[6] #singleton posts

In [105]:
threads = city_chains + city_singleton_texts

## Sentence Embedding-Based Topic Modeler

In [36]:
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans, SpectralClustering, kmeans_plusplus
from sklearn.manifold import SpectralEmbedding
from nltk.corpus import stopwords
import nltk
import re
from scipy.cluster.hierarchy import fclusterdata
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2

def topic_model(texts, transformer, clusters=3,vectorizer='cv',mindf=5, ngrams=(1,1),kbest=5000,init_='k-means++'):
    """
    Function to SE-Topic Model City-Data.com Corpus.

    Args:
        texts: list of strings
        transformer: sentence-transformer model 
        clusters: (int) number of topics to derive
        vectorizer: (string) 'cv' or 'tfidf'
        mindf: (int) minimum threshold for token inclusion
        ngrams: (tuple) ngram range for tokens
        kbest: max textual features for topic modeling
        init_: string or list of topic priors
    Returns:
        dictionary: dataframe of texts, topical term weights, vectroizer, clusterer, topics
    """
        
    embeddings = transformer.encode(texts)
    #if cluster centers provided:
    if type(init_) == np.ndarray:
        print('setting cluster centers')
    
    #instantiate kmeans clusterer 
    km = KMeans(n_clusters=clusters,init=init_, random_state=0)
    km.fit(embeddings)

    #process texts for topical word extraction
    processed_threads = [text_process(text) for text in texts]

    #group text by kmeans cluster label
    df = pd.DataFrame({'text':processed_threads,'label':km.labels_})    
    df_grouped = df.groupby('label')['text'].apply(list)

    if vectorizer == 'cv': #sklearn CountVectorizer
        cv = CountVectorizer(stop_words='english',min_df=mindf,ngram_range=ngrams)
        CX = cv.fit_transform(df.text)
        
        #select kbest features
        kbc = SelectKBest(chi2, k=kbest).fit(CX, km.labels_)

        ff = pd.DataFrame()
        ff['term'] = np.asarray(cv.get_feature_names_out())[kbc.get_support()]
        #extract topical terms
       
        for i in range(clusters):
            ff['topic_'+str(i)] = kbc.transform(cv.transform([' '.join(df.groupby('label')['text'].apply(list)[i])])).toarray().tolist()[0]
                                                 
        ff = pd.DataFrame([ff.iloc[i] for i in range(len(ff)) if len(ff.iloc[i]['term']) > 3])
        topics = get_topical_terms(ff,term_count=25)
        return {'texts':df,'term_weights':ff,'vectorizer':cv, 'clusterer':km, 'topics':topics}
   
    elif vectorizer == 'tfidf':
        tfidf = TfidfVectorizer(stop_words='english',min_df=mindf, ngram_range=ngrams)
        CX = tfidf.fit_transform(df.text)
        kbc = SelectKBest(chi2, k=kbest).fit(CX, km.labels_)

        ff = pd.DataFrame()
        ff['term'] = np.asarray(tfidf.get_feature_names_out())[kbc.get_support()]
       
        for i in range(clusters):
            ff['topic_'+str(i)] = kbc.transform(tfidf.transform([' '.join(df.groupby('label')['text'].apply(list)[i])])).toarray().tolist()[0]
   
        ff = pd.DataFrame([ff.iloc[i] for i in range(len(ff)) if len(ff.iloc[i]['term']) > 3])
        topics = get_topical_terms(ff,term_count=25)
       

        return {'texts':df,'term_weights':ff,'vectorizer':tfidf, 'clusterer':km, 'topics':topics}

def get_topical_terms(dataframe, term_count=25):
    topics = []
    for i in range(len(dataframe.columns)-1):
        
        terms = dataframe.sort_values(by='coef_'+str(i),ascending=False)[:term_count]['term'].tolist()
        topics.append(terms)
    filtered = []
    for topic in topics:
        t = []
        for term in topic:
            if len(term.split()) > 1:
                t.append('_'.join(term.split()))
            else:
                t.append(term)
        filtered.append(t)
    return filtered
import numpy as np

from sentence_transformers import SentenceTransformer

#instantiate transformer model to create embeddings
smodel = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

### Default SE-Topics Modeling Method

In [59]:
#SE-Topic Model City-Data.com Corpus Threads

se_thread_topics = topic_model(threads, smodel, clusters=4,vectorizer='cv',mindf=10, ngrams=(1,3),kbest=7500,init_='k-means++')





In [60]:
#SE-Topic Model City-Data.com Corpus Threads with top-25 terms
pd.DataFrame(se_thread_topics['topics'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,city,philadelphia,philly,people,think,street,year,area,center,building,...,need,make,really,place,going,want,retail,development,better,market
1,store,city,building,market,street,think,retail,center,walnut,going,...,look,really,project,time,good,center_city,philadelphia,chestnut,location,retailer
2,good,finally,abandon,politics,poorest_neighborhood,poorest,poorer,poor_white,poor_people,poor,...,politician,poorly_executed,politically,political_landscape,political_culture,political,politely,polite,policy,policing
3,people,city,crime,time,year,white,think,philly,going,black,...,good,need,police,want,news,area,philadelphia,thing,life,sure


In [61]:
#SE-Topic Model City-Data.com Corpus Posts
se_post_topics = topic_model(posts, smodel, clusters=4,vectorizer='cv',mindf=10, ngrams=(1,3),kbest=7500,init_='k-means++')





In [62]:
#SE-Topic Model City-Data.com Corpus Posts with top-25 terms
pd.DataFrame(se_post_topics['topics'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,people,crime,city,think,white,year,time,going,black,make,...,street,need,police,want,news,life,area,thing,look,sure
1,store,city,building,street,think,market,center,retail,people,going,...,center_city,time,really,project,good,look,philadelphia,chestnut,location,retailer
2,abandoned,plain,planted,plant,planning_commission,planning,planner,planned,plan_really,plan_philly,...,preserved,place_taking,place_start,place_philly,place_philadelphia,place_people,place_outside,place_live,place_food_court,place_food
3,city,philadelphia,philly,people,think,street,year,area,center,neighborhood,...,make,really,need,place,crime,want,going,development,retail,better


In [92]:
citydata.fillna('',inplace=True)
coronavirus = pd.DataFrame([citydata.iloc[i] for i in range(len(citydata)) if citydata.iloc[i]['forum'] == 'coronavirus'])
crime = pd.DataFrame([citydata.iloc[i] for i in range(len(citydata)) if citydata.iloc[i]['forum'] == 'crime'])
metro = pd.DataFrame([citydata.iloc[i] for i in range(len(citydata)) if citydata.iloc[i]['forum'] == 'metro'])
plan = pd.DataFrame([citydata.iloc[i] for i in range(len(citydata)) if citydata.iloc[i]['forum'] == 'plan'])
retail = pd.DataFrame([citydata.iloc[i] for i in range(len(citydata)) if citydata.iloc[i]['forum'] == 'retail'])


### Guided SE-Topics Post (High Degree Posts)

In [98]:
def extract_high_degree_nodes(graph):
    high_degree_node_text =[]
    for (k,v) in sorted(dict(graph.degree()).items(), key=lambda x:x[1],reverse=True): 
        if graph.nodes()[k]['text'] != None:
            high_degree_node_text.append(graph.nodes()[k]['text'])
    return high_degree_node_text


In [102]:
#make networkx graph of forum data
CO = make_graph_citydata(coronavirus)
CR = make_graph_citydata(crime)
ME = make_graph_citydata(metro)
PL = make_graph_citydata(plan)
R = make_graph_citydata(retail)

#extract high degree posts/nodes
coronavirus_nodes = extract_high_degree_nodes(CO)[0]
crime_nodes = extract_high_degree_nodes(CR)[0]
metro_nodes = extract_high_degree_nodes(ME)[0]
plan_nodes = extract_high_degree_nodes(PL)[0]
retail_nodes = extract_high_degree_nodes(R)[0]

#convert high degree posts into sentence embeddings
degree_seed_embeddings = smodel.encode([coronavirus_nodes,crime_nodes + metro_nodes, plan_nodes, retail_nodes])

se_post_degree_topics = topic_model(posts, smodel, clusters=4,vectorizer='cv',mindf=10, ngrams=(1,3),kbest=7500,init_=degree_seed_embeddings)

setting cluster centers





Explicit initial center position passed: performing only one init in KMeans instead of n_init=10.



In [181]:
pd.DataFrame(se_post_degree_topics['topics'])[[0,1,2,3,4,5,6,7,8,8]]

Unnamed: 0,0,1,2,3,4,5,6,7,8,8.1
0,people,think,year,time,going,good,look,really,project,project
1,city,people,crime,philly,year,street,think,time,white,white
2,city,store,philadelphia,think,building,philly,people,street,center,center
3,abandon,powelton,prediction,preceding,precaution,praying_rosary,praying,prank,prada,prada


### Guided SE-Topics Threads (High Degree Posts)

In [None]:
se_threads_degree_topics = topic_model(threads, smodel, clusters=4,vectorizer='cv',mindf=10, ngrams=(1,3),kbest=7500,init_=degree_seed_embeddings)

In [None]:
pd.DataFrame(se_threads_degree_topics['topics'])

### Guided SE-Topics Posts (Initial Posts)

In [112]:
#Extract initial posts from each forum
first_posts =[
    coronavirus.post.tolist()[0],
    crime.post.tolist()[0] +
    metro.post.tolist()[0],
    plan.post.tolist()[0],
    retail.post.tolist()[0]
]

#Convert posts to sentence-embeddings
first_posts_embeddings = smodel.encode(first_posts)

In [None]:
se_posts_init_posts_topics = topic_model(posts, smodel, clusters=4,vectorizer='cv',mindf=10, ngrams=(1,3),kbest=7500,init_=first_posts_embeddings)

In [183]:
pd.DataFrame(se_posts_init_posts_topics['topics'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,people,crime,city,white,think,year,time,going,black,make,...,good,need,police,want,news,life,thing,area,actually,look
1,city,philly,philadelphia,people,think,street,year,area,center,time,...,really,make,place,store,want,building,going,north,better,retail
2,city,building,street,think,project,people,center,market,tower,going,...,line,really,time,space,parking,make,plan,good,center_city,need
3,store,city,retail,market,think,mall,center,retailer,walnut,philadelphia,...,chestnut,year,shopping,going,area,time,brand,shop,really,space


### Guided SE-Topics Threads (Initial Posts)

In [None]:
se_threads_init_posts_topics = topic_model(threads, smodel, clusters=4,vectorizer='cv',mindf=10, ngrams=(1,3),kbest=7500,init_=first_posts_embeddings)

In [117]:
pd.DataFrame(se_threads_init_posts_topics['topics'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,people,city,crime,time,year,white,think,philly,going,black,...,good,need,police,want,news,area,philadelphia,thing,life,sure
1,city,philadelphia,philly,people,think,street,year,area,center,building,...,need,make,really,place,going,want,retail,development,better,market
2,store,city,building,market,street,think,retail,center,walnut,going,...,look,really,project,time,good,center_city,philadelphia,chestnut,location,retailer
3,good,finally,abandon,politics,poorest_neighborhood,poorest,poorer,poor_white,poor_people,poor,...,politician,poorly_executed,politically,political_landscape,political_culture,political,politely,polite,policy,policing


## Guided SE-Topics Posts (Forum Titles)

In [119]:
titles= """How's everyone doing amongst the Coronavirus shut down?
Official Greater Philadelphia Area Crime Thread (Chester, New Castle: 2013, middle school, university) Official Philadelphia Metro Crime Thread (York, Chester: apartment complexes, houses, unemployment) 
Philadelphia 2035 (Houston: foreclosure, neighborhoods, wage) 
Retail coming to Philadelphia (Penn, Burlington: real estate, house, buying)"""

title_embeddings = smodel.encode(titles.split('\n'))

In [None]:
se_posts_title_topics = topic_model(posts, smodel, clusters=4,vectorizer='cv',mindf=10, ngrams=(1,3),kbest=7500,init_=title_embeddings)

In [121]:
pd.DataFrame(se_posts_title_topics['topics'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,people,think,year,time,going,good,really,right,look,project,...,actually,article,want,news,sure,probably,thing,long,philly,hope
1,people,crime,city,white,year,philly,murder,time,think,black,...,right,really,drug,need,area,good,neighborhood,violent,case,shooting
2,city,philadelphia,philly,people,street,building,think,area,year,center,...,need,really,going,make,center_city,place,want,look,line,tower
3,store,city,retail,market,center,think,street,mall,philadelphia,walnut,...,building,area,chestnut,space,philly,going,year,good,time,really


## Guided SE-Topics Threads (Forum Titles)

In [None]:
se_threads_title_topics = topic_model(threads, smodel, clusters=4,vectorizer='cv',mindf=10, ngrams=(1,3),kbest=7500,init_=title_embeddings)

In [123]:
pd.DataFrame(se_threads_title_topics['topics'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,people,city,crime,year,time,think,white,philly,going,black,...,good,need,philadelphia,police,want,area,news,life,look,thing
1,finally,abandon,pottery,powerhouse,power,powelton_village,powelton,poverty_rate,poverty_problem,poverty_crime,...,potentially,prestige,potential_customer,potential_city,potential,posting,poster,post,possibly,possible
2,city,philadelphia,philly,people,building,street,think,year,area,center,...,need,really,going,make,center_city,look,place,want,tower,line
3,store,city,retail,market,think,center,street,mall,philadelphia,walnut,...,chestnut,going,space,shopping,year,area,good,philly,really,time


## SE-Topics Coherence and Diversity Scores

Note: topical diversity scores require: [FastText](https://fasttext.cc/docs/en/english-vectors.html).

- Download a model
- Load model with gensim

In [146]:
#load fasttext model locally+
fast = gensim.models.fasttext.load_facebook_model('crawl-300d-2M-subword/crawl-300d-2M-subword.bin')

In [170]:
se_topics_eval_df = pd.concat([
        get_coherence_diversity_scores(se_post_topics['texts'].text, se_post_topics['topics'], fast,topn=25,topic_type='posts'),
        get_coherence_diversity_scores(se_thread_topics['texts'].text, se_thread_topics['topics'], fast,topn=25,topic_type='threads'),
        get_coherence_diversity_scores(se_post_degree_topics['texts'].text, se_threads_degree_topics['topics'], fast,topn=25,topic_type='guided posts (high degree)' ),
        get_coherence_diversity_scores(se_threads_degree_topics['texts'].text, se_threads_degree_topics['topics'], fast,topn=25,topic_type='guided threads (high degree)' ),
        get_coherence_diversity_scores(se_posts_init_posts_topics['texts'].text, se_posts_init_posts_topics['topics'], fast,topn=25,topic_type='guided posts (initial posts)' ),
        get_coherence_diversity_scores(se_threads_init_posts_topics['texts'].text, se_threads_init_posts_topics['topics'], fast,topn=25,topic_type='guided threads (initial posts)' ),
        get_coherence_diversity_scores(se_posts_title_topics['texts'].text, se_posts_title_topics['topics'], fast,topn=25,topic_type='guided post (forum titles)' ),
        get_coherence_diversity_scores(se_threads_title_topics['texts'].text, se_threads_title_topics['topics'], fast,topn=25,topic_type='guided threads (forum titles)' )])

In [178]:
se_topics_eval_df[['u_mass_coherence','puw:','jd:','we-cd:']]

Unnamed: 0,u_mass_coherence,puw:,jd:,we-cd:
posts,-5.382926,0.68,0.803132,0.221566
threads,-4.572197,0.67,0.7855,0.184801
guided posts (high degree),-4.62701,0.67,0.7855,0.213113
guided threads (high degree),-4.534778,0.67,0.7855,0.213113
guided posts (initial posts),-2.561766,0.52,0.620674,0.071358
guided threads (initial posts),-4.572197,0.67,0.7855,0.218687
guided post (forum titles),-2.761955,0.54,0.638784,0.065617
guided threads (forum titles),-4.842382,0.69,0.797508,0.065617


In [179]:
se_topics_eval_df[['u_mass_coherence','puw:','jd:','we-cd:']].describe()

Unnamed: 0,u_mass_coherence,puw:,jd:,we-cd:
count,8.0,8.0,8.0,8.0
mean,-4.231901,0.63875,0.750262,0.156734
std,1.009037,0.067705,0.074847,0.074719
min,-5.382926,0.52,0.620674,0.065617
25%,-4.680853,0.6375,0.748821,0.069923
50%,-4.572197,0.67,0.7855,0.198957
75%,-4.091572,0.6725,0.788502,0.214506
max,-2.561766,0.69,0.803132,0.221566


## Gensim Topic Modeling Functions

The following functions were used to derive LDA and guided LDA topic models in gensim for comparison:

In [209]:
from gensim import models
stops = stopwords.words('english') + ['said','know','maybe','post','advertisements','advertisement','posted','thread','like','could','should','would','thing']
wn = WordNetLemmatizer()
stemmer = nltk.PorterStemmer()
#custom text normalizer for City-Data Corpus
def text_tokenize(text):
    stops = stopwords.words('english') + ['said','know','maybe','post','advertisements','advertisement','posted','thread','like','could','should','would','thing']

    text = text.lower()
    tokens = nltk.wordpunct_tokenize(text)
    tokens = [token for token in tokens if not re.findall(r'\.com|___|Advertisements|-|_',token)]
    lemmas = [wn.lemmatize(token) for token in tokens if len(token) > 2 and token not in stops]
    filtered = [lemma for lemma in lemmas if not re.findall(r'[0-9]',lemma) and not re.findall(r'htt',lemma)]
    return filtered
    
def gensim_lda(texts, topic_num=5, topic_word_priors=None,numwords=25, eta_=None, tfidf=False):
    """gensim lda wrapper with guided topic modeling"""

    if tfidf != False:
        #process texts
        #build gensim dictionary
        processed_texts = [text_tokenize(text) for text in texts]
        dictionary = gensim.corpora.Dictionary(processed_texts)
    
        #build bag-of-words representation
        bow = [dictionary.doc2bow(text) for text in processed_texts]
        tfidf = models.TfidfModel(bow)
        corpus_tfidf = tfidf[bow]
        #guided lda with eta
        if topic_word_priors and eta_ != None:
            etas = []
        
            for r in range(len(topic_word_priors)):
                eta = []
                for i in range(len(dictionary)):
                    
                    if dictionary[i] in topic_word_priors[r]:
                        eta.append(np.array(eta_))
                    else:
                        eta.append(np.array(1/topic_num))
                etas.append(eta)
    
            model = gensim.models.ldamodel.LdaModel(
                corpus=corpus_tfidf, id2word=dictionary, num_topics=topic_num,
                random_state=42, chunksize=100, eta=np.array(etas),
                eval_every=-1, update_every=1,
                passes=150, alpha='auto', per_word_topics=True)
    
            #transform corpus into topics
            transformed_corpus = model[corpus_tfidf]
    
            #extract topical terms
            topical_terms = [[token for token in m[1].split('"') if not re.findall(r'\d',token)] for m in  model.show_topics(num_words=numwords)]
            return model, topical_terms, transformed_corpus, processed_texts
            
        else:
            #standard lda
            model = gensim.models.ldamodel.LdaModel(
                corpus=corpus_tfidf, id2word=dictionary, num_topics=topic_num,
                random_state=42, chunksize=100, eta=None,
                eval_every=-1, update_every=1,
                passes=150, alpha='auto', per_word_topics=True)
            
            #transform corpus into topics
            transformed_corpus = model[corpus_tfidf]
            topical_terms = [[token for token in m[1].split('"') if not re.findall(r'\d',token)] for m in  model.show_topics(num_words=numwords)]
            return model, topical_terms, transformed_corpus, processed_texts
    else:
        #process texts
        #build gensim dictionary
        processed_texts = [text_tokenize(text) for text in texts]
        dictionary = gensim.corpora.Dictionary(processed_texts)
        
        #build bag-of-words representation
        bow = [dictionary.doc2bow(text) for text in processed_texts]
        #tfidf = models.TfidfModel(bow)
        #corpus_tfidf = tfidf[bow]
    
    
        #guided lda with eta
        if topic_word_priors and eta_ != None:
            etas = []
        
            for r in range(len(topic_word_priors)):
                eta = []
                for i in range(len(dictionary)):
                    
                    if dictionary[i] in topic_word_priors[r]:
                        eta.append(np.array(eta_))
                    else:
                        eta.append(np.array(1/topic_num))
                etas.append(eta)
    
            model = gensim.models.ldamodel.LdaModel(
                corpus=bow, id2word=dictionary, num_topics=topic_num,
                random_state=42, chunksize=100, eta=np.array(etas),
                eval_every=-1, update_every=1,
                passes=150, alpha='auto', per_word_topics=True)
    
            #transform corpus into topics
            transformed_corpus = model[bow]
    
            #extract topical terms
            topical_terms = [[token for token in m[1].split('"') if not re.findall(r'\d',token)] for m in  model.show_topics(num_words=numwords)]
            return model, topical_terms, transformed_corpus, processed_texts
        
        else:
        #standard lda
            model = gensim.models.ldamodel.LdaModel(
                corpus=bow, id2word=dictionary, num_topics=topic_num,
                random_state=42, chunksize=100, eta=None,
                eval_every=-1, update_every=1,
                passes=150, alpha='auto', per_word_topics=True)
            
            #transform corpus into topics
            transformed_corpus = model[bow]
            
            #extract topical terms
            topical_terms = [[token for token in m[1].split('"') if not re.findall(r'\d',token)] for m in  model.show_topics(num_words=25)]
            return model, topical_terms, transformed_corpus, processed_texts
        

## Gensim Default Topic Modeling

### Gensim LDA Posts

In [210]:
gensim_post_model, gensim_post_terms, gensim_post_corpus, gensim_processed_posts = gensim_lda(posts, topic_num=4)

### Gensim LDA Threads

In [212]:
gensim_thread_model, gensim_thread_terms, gensim_thread_corpus, gensim_processed_threads = gensim_lda(threads, topic_num=4)

### Gensim Guided Post LDA High Degree

In [213]:
#gensim requires a list of tokens to set topic priors
degree_priors = [text_tokenize(node) for node in [coronavirus_nodes, crime_nodes + metro_nodes, plan_nodes,retail_nodes]]

In [214]:
gensim_degree_model, gensim_degree_terms, gensim_degree_corpus, gensim_degree_processed_posts = gensim_lda(posts, topic_num=4, topic_word_priors=degree_priors, eta_=1.0)

### Gensim Guided Threads LDA High Degree

In [215]:
gensim_thread_degree_model, gensim__thread_degree_terms, gensim_thread_degree_corpus, gensim_thread_degree_processed_posts = gensim_lda(threads, topic_num=4, topic_word_priors=degree_priors, eta_=1.0)

### Gensim Guided Posts LDA Initial Posts

In [216]:
first_posts_tokens = [text_tokenize(post) for post in first_posts]

In [217]:
gensim_init_post_model, gensim_init_posts_terms, gensim_init_posts_corpus, processed_posts_init_posts = gensim_lda(posts, topic_num=4, topic_word_priors=first_posts_tokens, eta_=1.0)


### Gensim Guided Threads LDA Initial Posts

In [218]:
gensim_thread_init_post_model, gensim_thread_init_posts_terms, gensim__thread_init_posts_corpus, gensim_thread_processed_posts_init_posts = gensim_lda(threads, topic_num=4, topic_word_priors=first_posts_tokens, eta_=1.0)


### Gensim Guided Post LDA Forum Titles

In [219]:
title_terms = [text_tokenize(title) for title in titles.split('\n')]

gensim_titles_model, gensim_titles_terms, gensim_titles_corpus, gensim_titles_processed_posts = gensim_lda(posts, topic_num=4, topic_word_priors=title_terms, eta_=1.0)

### Gensim Guided Threads LDA Forum Titles

In [220]:
gensim_thread_titles_model, gensim_thread_titles_terms, gensim_thread_titles_corpus, gensim_thread_titles_processed_posts = gensim_lda(threads, topic_num=4, topic_word_priors=title_terms, eta_=1.0)

In [221]:
gensim_eval_df = pd.concat([
    get_coherence_diversity_scores(gensim_processed_posts, gensim_post_terms,fast,metric='u_mass',topic_type='lda posts'),
    get_coherence_diversity_scores(gensim_titles_processed_posts, gensim_titles_terms,fast,metric='u_mass',topic_type='guided lda (post titles)'),
    get_coherence_diversity_scores(gensim_degree_processed_posts, gensim_degree_terms, fast, metric='u_mass',topic_type='guided lda (high degree)'),
    get_coherence_diversity_scores(processed_posts_init_posts, gensim_init_posts_terms,fast, metric="u_mass",topic_type='guided lda (initial posts)'),
    get_coherence_diversity_scores(gensim_processed_threads, gensim_thread_terms,fast,metric='u_mass',topic_type='lda threads'),
    get_coherence_diversity_scores(gensim_thread_degree_processed_posts, gensim__thread_degree_terms,fast,metric='u_mass',topic_type='lda threads (high degree)'),
    get_coherence_diversity_scores(gensim_thread_titles_processed_posts, gensim_thread_titles_terms,fast,metric='u_mass',topic_type='lda threads (post titles)'),
    get_coherence_diversity_scores(gensim_thread_processed_posts_init_posts, gensim_thread_init_posts_terms,fast,metric='u_mass',topic_type='lda threads (initial post)'),
])

In [222]:
gensim_eval_df[['u_mass_coherence','puw:','jd:','we-cd:']]

Unnamed: 0,u_mass_coherence,puw:,jd:,we-cd:
lda posts,-2.049662,0.95,0.929901,0.234451
guided lda (post titles),-2.007422,1.0,0.956038,0.25634
guided lda (high degree),-2.175875,1.0,0.959436,0.262369
guided lda (initial posts),-2.126475,0.975,0.970196,0.273824
lda threads,-2.126444,0.9,0.94467,0.344369
lda threads (high degree),-1.922054,0.875,0.910089,0.221785
lda threads (post titles),-1.910564,0.875,0.937734,0.307599
lda threads (initial post),-1.84281,0.925,0.930816,0.318339


In [223]:
gensim_eval_df[['u_mass_coherence','puw:','jd:','we-cd:']].describe()

Unnamed: 0,u_mass_coherence,puw:,jd:,we-cd:
count,8.0,8.0,8.0,8.0
mean,-2.020163,0.9375,0.94236,0.277384
std,0.1201,0.051755,0.01931,0.042582
min,-2.175875,0.875,0.910089,0.221785
25%,-2.126452,0.89375,0.930587,0.250868
50%,-2.028542,0.9375,0.941202,0.268096
75%,-1.919181,0.98125,0.956887,0.310284
max,-1.84281,1.0,0.970196,0.344369
