In [1]:
import network_creation
import preprocessing
import community_utils
import tomotopy as tp
import networkx as nx
import igraph as ig
from gensim.models.coherencemodel import CoherenceModel
import numpy as np

In [2]:
with open("./text_datasets/20newsgroups_train.txt", "r") as f:
    bbc_train = f.read().split("\n")
with open("./text_datasets/20newsgroups_test.txt", "r") as f:
    bbc_test = f.read().split("\n")

In [3]:
# create filter configuration dict
filter_dict = {"filter_short": True,
              "filter_stopwords": True,
              "filter_numbers": True,
              "filter_punct": True,
              "filter_websites": True,
              "filter_emails": True,
              "filter_not_wordlike": True,
              "pos_filters": ["NOUN", "PROPN"]}

# create preprocessing pipeline
nlp = preprocessing.create_pipeline(detect_sentences=True,
                                    detect_entities=True,
                                    entity_types=["EVENT", "FAC", "GPE", "LOC", "ORG", "PERSON", "PRODUCT", "WORK_OF_ART"],
                                    filter_config=filter_dict)

In [4]:
bbc_train_docs = list(nlp.pipe(bbc_train))

In [5]:
bbc_test_docs = list(nlp.pipe(bbc_test))

In [6]:
tokenized_bbc_train_docs = list(preprocessing.tokenize_docs(bbc_train_docs, lowercase=True, sentences=False))
tokenized_bbc_train_sents = list(preprocessing.tokenize_docs(bbc_train_docs, lowercase=True, sentences=True))
tokenized_bbc_test_docs = list(preprocessing.tokenize_docs(bbc_test_docs, lowercase=True, sentences=False))

In [7]:
bbc_phrases, tokenized_bbc_train_docs, bbc_phrase_models = preprocessing.detect_phrases(tokenized_bbc_train_docs,
                                                      num_iterations=2,
                                                      scoring_method='npmi',
                                                      threshold=0.35,
                                                      min_count=None)

In [8]:
for model in bbc_phrase_models:
    tokenized_bbc_train_sents = model[tokenized_bbc_train_sents]
tokenized_bbc_train_sents = [[token.replace(" ", "_") for token in sent] for sent in tokenized_bbc_train_sents]

for model in bbc_phrase_models:
    tokenized_bbc_test_docs = model[tokenized_bbc_test_docs]
tokenized_bbc_test_docs = [[token.replace(" ", "_") for token in doc] for doc in tokenized_bbc_test_docs]

In [9]:
bbc_vocab, bbc_dictionary = preprocessing.create_vocabulary_and_dictionary(tokenized_bbc_train_docs, min_threshold=None)
tokenized_bbc_train_sents = preprocessing.filter_tokenized_docs_with_vocab(tokenized_bbc_train_sents, bbc_vocab)
tokenized_bbc_train_docs = preprocessing.filter_tokenized_docs_with_vocab(tokenized_bbc_train_docs, bbc_vocab)
tokenized_bbc_test_docs = preprocessing.filter_tokenized_docs_with_vocab(tokenized_bbc_test_docs, bbc_vocab)
test_vocab = set()
for doc in tokenized_bbc_test_docs:
    for token in doc:
        test_vocab.add(token)
tokenized_bbc_train_docs = [[token for token in doc if token in test_vocab] for doc in tokenized_bbc_train_docs]
tokenized_bbc_train_sents = [[token for token in sent if token in test_vocab] for sent in tokenized_bbc_train_sents]

In [10]:
tokenized_bbc_train_sents = [sent for sent in tokenized_bbc_train_sents if len(sent) > 0]
tokenized_bbc_train_docs = [doc for doc in tokenized_bbc_train_docs if len(doc) > 0]
tokenized_bbc_test_docs = [doc for doc in tokenized_bbc_test_docs if len(doc) > 0]

In [11]:
bbc_sentence_nb = network_creation.SentenceNetworkBuilder(tokenized_bbc_train_sents, 
                                                         bbc_dictionary)

In [12]:
bbc_sentence_nb.save_network(f"./hierarchy_networks/bbc_sentence_npmi_0.txt", type="npmi", threshold=0)

In [13]:
nx_g = nx.read_weighted_edgelist(f"./hierarchy_networks/bbc_sentence_npmi_0.txt")
ig_g = ig.Graph.from_networkx(nx_g)

In [14]:
clustering = ig_g.community_leiden(objective_function='modularity', weights='weight', resolution_parameter=1)

  clustering = ig_g.community_leiden(objective_function='modularity', weights='weight', resolution_parameter=1)


In [15]:
len(clustering)

5

In [16]:
topics = [[int(ig_g.vs[node]["_nx_name"]) for node in comm] for comm in clustering if len(comm) > 2]
for comm in topics:
    c = [str(node) for node in comm]
    comm.sort(key=lambda node: community_utils.get_internal_weighted_degree(node, c, nx_g), reverse=True)

In [17]:
topics_test = [[bbc_dictionary[node] for node in comm] for comm in clustering]

with open("./topics.txt", "w") as f:
    lines = []
    for topic in topics_test:
        line = " ".join(topic)
        lines.append(line)
    f.write("\n".join(lines))

print("Full topics saved to topics.txt")
for topic in topics_test:
    print(topic[:10])

Full topics saved to topics.txt
['addition', 'car', 'engine', 'model', 'production', 'floppies', 'functionality', 'computer', 'display', 'folks']
['body', 'info', 'rest', 'base', 'cpu', 'procedure', 'speed', 'usage', 'advance', 'machine']
['day', 'doors', 'thanks', 'access', 'dirt', 'duo', 'line', 'people', 'round', 'rumors']
['door', 'history', 'mail', 'specs', 'sports', 'years', 'cards', 'clock', 'days', 'disk']
['poll', 'finals', 'errors', 'tom', 'course', 'controller', 'sheet', 'example', 'references', 'air_force']


In [18]:
def get_topic_phi(topic, nx_g, level_0):
    c = [str(node) for node in topic]
    phi = np.zeros((len(level_0),))
    for i, v in enumerate(level_0):
        if v in topic:
            phi[i] = community_utils.get_internal_weighted_degree(v, c, nx_g)

    return phi

In [19]:
level_0 = [int(ig_g.vs[v.index]["_nx_name"]) for v in ig_g.vs]

In [20]:
len(level_0)

2918

In [21]:
phi_norm = np.array([bbc_sentence_nb.token_freqs[v] for v in level_0])

In [22]:
phi_norm = phi_norm / phi_norm.sum()

In [23]:
phis = []
for cluster in topics:
    phi = get_topic_phi(cluster, nx_g, level_0)
    phi = phi / phi.sum()
    cos = np.dot(phi, phi_norm) / (np.linalg.norm(phi) * np.linalg.norm(phi_norm))
    phis.append(1 - cos)

print()
print(sum(phis) / len(phis))


0.7027301415617562


In [24]:
subclusterings = []
subtopics = []
for cluster in clustering:
    ig_sg = ig_g.subgraph(cluster)
    subclusters = ig_sg.community_leiden(objective_function='modularity', resolution_parameter=1.0, weights='weight')
    print(f'{len(subclusters)} sub clusters found')
    subclusterings.append(subclusters)
    stopics = [[int(ig_sg.vs[node]["_nx_name"]) for node in comm] for comm in subclusters if len(comm) > 2]
    for comm in stopics:
        c = [str(node) for node in comm]
        comm.sort(key=lambda node: community_utils.get_internal_weighted_degree(node, c, nx_g), reverse=True)
    subtopics.append(stopics)


6 sub clusters found
5 sub clusters found


  subclusters = ig_sg.community_leiden(objective_function='modularity', resolution_parameter=1.0, weights='weight')


5 sub clusters found
4 sub clusters found
5 sub clusters found


In [25]:
phis = []

for stopics in subtopics:
    for topic in stopics:
        phi = get_topic_phi(topic, nx_g, level_0)
        phi = phi / phi.sum()
        cos = np.dot(phi, phi_norm) / (np.linalg.norm(phi) * np.linalg.norm(phi_norm))
        phis.append(1 - cos)

print('Avg specialization')
print(sum(phis) / len(phis))
    

Avg specialization
0.8823317325837333


In [26]:
child_sims = []
nonchild_sims = []

for i, supertopic in enumerate(topics):
    super_phi = get_topic_phi(supertopic, nx_g, level_0)
    for j, stopics in enumerate(subtopics):
        for subtopic in stopics:
            sub_phi = get_topic_phi(subtopic, nx_g, level_0)
            cos = np.dot(super_phi, sub_phi) / (np.linalg.norm(super_phi) * np.linalg.norm(sub_phi))
            if i == j:
                child_sims.append(cos)
            else:
                nonchild_sims.append(cos)

avg_child_sims = sum(child_sims) / len(child_sims)
avg_nonchild_sims = sum(nonchild_sims) / len(nonchild_sims)

print(avg_child_sims)
print(avg_nonchild_sims)

0.42574841928636964
0.0


In [27]:
eval_topics = []
for super_topic in topics:
    eval_topics.append([bbc_dictionary[node] for node in super_topic])
for stopic in subtopics:
    for sub_topic in stopic:
        eval_topics.append([bbc_dictionary[node] for node in sub_topic])

for coherence in ["c_v", "c_npmi"]:
    cm = CoherenceModel(topics=eval_topics, texts=tokenized_bbc_test_docs, dictionary=bbc_dictionary, topn=5, coherence=coherence)
    score = cm.get_coherence()
    print(f'{coherence}: {score}')

c_v: 0.6733591663950255
c_npmi: 0.06175667902927657


In [28]:
hlda_model = tp.HLDAModel(depth=4)

for doc in tokenized_bbc_train_docs:
    hlda_model.add_doc(doc)

In [29]:
iterations = 10
for i in range(0, 100, iterations):
    hlda_model.train(iterations)
    print('Iteration: #{}\tLog-likelihood: {}'.format(i, hlda_model.ll_per_word))

Iteration: #0	Log-likelihood: -7.226701854306841
Iteration: #10	Log-likelihood: -7.101672536347698
Iteration: #20	Log-likelihood: -7.056578853720364
Iteration: #30	Log-likelihood: -7.028760030832465
Iteration: #40	Log-likelihood: -7.012713281833828
Iteration: #50	Log-likelihood: -6.995792854880585
Iteration: #60	Log-likelihood: -6.984036350502312
Iteration: #70	Log-likelihood: -6.975818179868433
Iteration: #80	Log-likelihood: -6.965583308008099
Iteration: #90	Log-likelihood: -6.9623869592894225


In [30]:
total_topics = 0

for k in range(hlda_model.k):
    if hlda_model.is_live_topic(k):
        total_topics += 1

print(total_topics)

2207


In [31]:
total_topics = 0

for k in range(hlda_model.k):
    if hlda_model.is_live_topic(k):
        if (hlda_model.num_docs_of_topic(k) > 2):
            total_topics += 1

print(total_topics)

1794


In [32]:
level_0_count = 0
level_1_count = 0
level_2_count = 0
level_3_count = 0

for k in range(hlda_model.k):
    if hlda_model.is_live_topic(k):
        if (hlda_model.level(k) == 0):
            level_0_count += 1
        elif hlda_model.level(k) == 1:
            level_1_count += 1
        elif hlda_model.level(k) == 2:
            level_2_count += 1
        else:
            level_3_count += 1

print(level_0_count)
print(level_1_count)
print(level_2_count)
print(level_3_count)

1
256
686
1264


In [33]:
level_0_count = 0
level_1_count = 0
level_2_count = 0
level_3_count = 0

for k in range(hlda_model.k):
    if hlda_model.is_live_topic(k):
        if hlda_model.num_docs_of_topic(k) > 2:
            if (hlda_model.level(k) == 0):
                level_0_count += 1
            elif hlda_model.level(k) == 1:
                level_1_count += 1
            elif hlda_model.level(k) == 2:
                level_2_count += 1
            else:
                level_3_count += 1

print(level_0_count)
print(level_1_count)
print(level_2_count)
print(level_3_count)

1
249
588
956


In [34]:
weird_count = 0

for k in range(hlda_model.k):
    if hlda_model.is_live_topic(k):
        if hlda_model.level(k) == 2:
            num_docs = hlda_model.num_docs_of_topic(k)
            parent = hlda_model.parent_topic(k)
            num_parent_docs = hlda_model.num_docs_of_topic(parent)
            if num_docs >= num_parent_docs:
                weird_count += 1

print(weird_count)

90


In [35]:
def get_hlda_phi(mdl, topic, dictionary, level_0):
    phi = np.zeros((len(level_0),))
    for term, prob in mdl.get_topic_words(topic, top_n=len(level_0)):
        try:
            term_id = dictionary.token2id[term]
            phi_idx = level_0.index(term_id)
            phi[phi_idx] = prob
        except:
            continue
    return phi

In [36]:
level_1_specs = []
level_2_specs = []
level_3_specs = []

for k in range(hlda_model.k):
    if hlda_model.is_live_topic(k):
        if hlda_model.level(k) == 1:
            phi = get_hlda_phi(hlda_model, k, bbc_dictionary, level_0)
            cos = np.dot(phi_norm, phi) / (np.linalg.norm(phi_norm) * np.linalg.norm(phi))
            level_1_specs.append(1 - cos)
        elif hlda_model.level(k) == 2:
            phi = get_hlda_phi(hlda_model, k, bbc_dictionary, level_0)
            cos = np.dot(phi_norm, phi) / (np.linalg.norm(phi_norm) * np.linalg.norm(phi))
            level_2_specs.append(1 - cos)
        elif hlda_model.level(k) == 3:
            phi = get_hlda_phi(hlda_model, k, bbc_dictionary, level_0)
            cos = np.dot(phi_norm, phi) / (np.linalg.norm(phi_norm) * np.linalg.norm(phi))
            level_3_specs.append(1 - cos)

print("level 1 specialization: ", sum(level_1_specs) / len(level_1_specs))
print("level 2 specialization: ", sum(level_2_specs) / len(level_2_specs))
print("level 3 specialization: ", sum(level_3_specs) / len(level_3_specs))

level 1 specialization:  0.8816888150781839
level 2 specialization:  0.8806667016460439
level 3 specialization:  0.8731021859184818


In [37]:
from collections import defaultdict
super_phis = dict()
sub_phis = defaultdict(list)

for k in range(hlda_model.k):
    if hlda_model.is_live_topic(k):
        if hlda_model.level(k) == 1:
            phi = get_hlda_phi(hlda_model, k, bbc_dictionary, level_0)
            super_phis[k] = phi
        elif hlda_model.level(k) == 2:
            phi = get_hlda_phi(hlda_model, k, bbc_dictionary, level_0)
            parent = hlda_model.parent_topic(k)
            sub_phis[parent].append(phi)

In [38]:
child_sims = []
nonchild_sims = []

for parent, parent_phi in super_phis.items():
    for p, cs in sub_phis.items():
        for c_phi in cs:
            cos = np.dot(parent_phi, c_phi) / (np.linalg.norm(parent_phi) * np.linalg.norm(c_phi))
            if parent == p:
                child_sims.append(cos)
            else:
                nonchild_sims.append(cos)
                
print("Hierarchical Affinity level 1 and 2")
print(sum(child_sims) / len(child_sims), "(child)")
print(sum(nonchild_sims) / len(nonchild_sims), "non-child")

Hierarchical Affinity level 1 and 2
0.04405323992527675 (child)
0.019209528441067655 non-child


In [39]:
eval_topics = []
for k in range(hlda_model.k):
    if hlda_model.is_live_topic(k):
        eval_topics.append([w for w, _ in hlda_model.get_topic_words(k, top_n=5)])

In [40]:
for coherence in ["c_v", "c_npmi"]:
    cm = CoherenceModel(topics=eval_topics, texts=tokenized_bbc_test_docs, dictionary=bbc_dictionary, topn=5, coherence=coherence)
    score = cm.get_coherence()
    print(f'{coherence}: {score}')

c_v: 0.4276809630300829
c_npmi: -0.14553906538077566


In [41]:
pam_model = tp.PAModel(k1=5, k2=30)

for doc in tokenized_bbc_train_docs:
    pam_model.add_doc(doc)

In [42]:
iterations = 10
for i in range(0, 100, iterations):
    pam_model.train(iterations)
    print('Iteration: #{}\tLog-likelihood: {}'.format(i, pam_model.ll_per_word))

Iteration: #0	Log-likelihood: -10.551583517064913
Iteration: #10	Log-likelihood: -10.00039035928291
Iteration: #20	Log-likelihood: -9.888507715387012
Iteration: #30	Log-likelihood: -9.829247080263059
Iteration: #40	Log-likelihood: -9.807981821209532
Iteration: #50	Log-likelihood: -9.785678914264812
Iteration: #60	Log-likelihood: -9.779550593345256
Iteration: #70	Log-likelihood: -9.765294201453706
Iteration: #80	Log-likelihood: -9.762189594318853
Iteration: #90	Log-likelihood: -9.74867736840671


In [43]:
super_topic_phis = []
sub_topic_phis = defaultdict(list)
for k in range(pam_model.k1):
    super_phi = np.zeros((len(level_0),))
    i = 0
    for subtopic, prob in pam_model.get_sub_topics(k, top_n=30):
        sub_phi = get_hlda_phi(pam_model, subtopic, bbc_dictionary, level_0)
        if i < 6:
            sub_topic_phis[k].append(sub_phi)
        super_phi += (prob * sub_phi)
        i += 1
    super_topic_phis.append(super_phi)

In [44]:
level_1_specs = []
level_2_specs = []

for k in range(pam_model.k1):
    super_phi = super_topic_phis[k]
    cos = np.dot(phi_norm, super_phi) / (np.linalg.norm(phi_norm) * np.linalg.norm(super_phi))
    level_1_specs.append(1 - cos)
    for sub_phi in sub_topic_phis[k]:
        cos = np.dot(phi_norm, sub_phi) / (np.linalg.norm(phi_norm) * np.linalg.norm(sub_phi))
        level_2_specs.append(1 - cos)

print(sum(level_1_specs) / len(level_1_specs))
print(sum(level_2_specs) / len(level_2_specs))

0.005602825395026411
0.559521097401565


In [45]:
child_sims = []
nonchild_sims = []

for parent, parent_phi in enumerate(super_topic_phis):
    for p, cs in sub_topic_phis.items():
        for c_phi in cs:
            cos = np.dot(parent_phi, c_phi) / (np.linalg.norm(parent_phi) * np.linalg.norm(c_phi))
            if parent == p:
                child_sims.append(cos)
            else:
                nonchild_sims.append(cos)

print(sum(child_sims) / len(child_sims))
print(sum(nonchild_sims) / len(nonchild_sims))

0.45112651482523425
0.4349652665672422


In [46]:
eval_topics = []
for k in range(pam_model.k1):
    t = []
    for idx in np.argpartition(super_topic_phis[0], -5)[-5:]:
        t.append(bbc_dictionary[level_0[idx]])
    eval_topics.append(t)
for k in range(pam_model.k1, pam_model.k2):
    eval_topics.append([w for w, _ in pam_model.get_topic_words(k, top_n=5)])

In [47]:
for coherence in ["c_v", "c_npmi"]:
    cm = CoherenceModel(topics=eval_topics, texts=tokenized_bbc_test_docs, dictionary=bbc_dictionary, topn=5, coherence=coherence)
    score = cm.get_coherence()
    print(f'{coherence}: {score}')

c_v: 0.6516657781153187
c_npmi: 0.11427878851283484


In [48]:
# Tried new one HAPModel

In [49]:
hpam_model = tp.HPAModel(k1=5, k2=30)

for doc in tokenized_bbc_train_docs:
    hpam_model.add_doc(doc)

In [50]:
iterations = 10
for i in range(0, 100, iterations):
    hpam_model.train(iterations)
    print('Iteration: #{}\tLog-likelihood: {}'.format(i, hpam_model.ll_per_word))

Iteration: #0	Log-likelihood: -9.728332805512178
Iteration: #10	Log-likelihood: -9.346376235485627
Iteration: #20	Log-likelihood: -9.28974521167038
Iteration: #30	Log-likelihood: -9.26071704907
Iteration: #40	Log-likelihood: -9.262398603937344
Iteration: #50	Log-likelihood: -9.266059680329985
Iteration: #60	Log-likelihood: -9.267216800737721
Iteration: #70	Log-likelihood: -9.267400697321865
Iteration: #80	Log-likelihood: -9.27514962583069
Iteration: #90	Log-likelihood: -9.283231945593016


In [51]:
super_topic_phis = []
sub_topic_phis = defaultdict(list)
for k in range(hpam_model.k1):
    super_phi = np.zeros((len(level_0),))
    i = 0
    for subtopic, prob in hpam_model.get_sub_topics(k, top_n=30):
        sub_phi = get_hlda_phi(hpam_model, subtopic, bbc_dictionary, level_0)
        if i < 6:
            sub_topic_phis[k].append(sub_phi)
        super_phi += (prob * sub_phi)
        i += 1
    super_topic_phis.append(super_phi)

In [52]:
level_1_specs = []
level_2_specs = []

for k in range(hpam_model.k1):
    super_phi = super_topic_phis[k]
    cos = np.dot(phi_norm, super_phi) / (np.linalg.norm(phi_norm) * np.linalg.norm(super_phi))
    level_1_specs.append(1 - cos)
    for sub_phi in sub_topic_phis[k]:
        cos = np.dot(phi_norm, sub_phi) / (np.linalg.norm(phi_norm) * np.linalg.norm(sub_phi))
        level_2_specs.append(1 - cos)

print(sum(level_1_specs) / len(level_1_specs))
print(sum(level_2_specs) / len(level_2_specs))

0.09631363903023363
0.7198176398359268


In [53]:
child_sims = []
nonchild_sims = []

for parent, parent_phi in enumerate(super_topic_phis):
    for p, cs in sub_topic_phis.items():
        for c_phi in cs:
            cos = np.dot(parent_phi, c_phi) / (np.linalg.norm(parent_phi) * np.linalg.norm(c_phi))
            if parent == p:
                child_sims.append(cos)
            else:
                nonchild_sims.append(cos)

print(sum(child_sims) / len(child_sims))
print(sum(nonchild_sims) / len(nonchild_sims))

0.3631621852597141
0.330285608273151


In [54]:
eval_topics = []
for k in range(hpam_model.k1):
    t = []
    for idx in np.argpartition(super_topic_phis[0], -5)[-5:]:
        t.append(bbc_dictionary[level_0[idx]])
    eval_topics.append(t)
for k in range(hpam_model.k1, hpam_model.k2):
    eval_topics.append([w for w, _ in hpam_model.get_topic_words(k, top_n=5)])

In [55]:
for coherence in ["c_v", "c_npmi"]:
    cm = CoherenceModel(topics=eval_topics, texts=tokenized_bbc_test_docs, dictionary=bbc_dictionary, topn=5, coherence=coherence)
    score = cm.get_coherence()
    print(f'{coherence}: {score}')

c_v: 0.6320137051245874
c_npmi: 0.08799161208109782
