### Comprehension Burden for Sequencing Documents

In [25]:
import requests
from collections import Counter

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import networkx as nx

# Lift pandas' display limits so DataFrames are shown with every column and
# every row instead of being truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [26]:
'''
Knowledge Graph: load the pickled graph and derive the concept labels.
'''
kg_path = "../graph_query/graphs/knowledge_graph.gpickle"
kg = nx.read_gpickle(kg_path)

# Every node except the first is turned into a string label.
# NOTE(review): the first node is skipped -- presumably a root/placeholder
# node; confirm against the code that builds the graph.
kg_labels = list(map(str, list(kg.nodes())[1:]))
n_labels = len(kg_labels)

In [27]:
def get_content():
    '''
    Get content from the set of URLs listed in sample_urls.txt.

    Each line of the file is used verbatim (trailing newline included) both
    as the address handed to requests.get and as the dictionary key.
    NOTE(review): confirm the request is meant to carry the trailing newline;
    stripping it here would also change the keys every later cell relies on.

    Returns:
        (docs, index): docs maps each URL line to the raw response body,
        index maps each URL line to its zero-based row number, assigned in
        first-seen order.
    '''
    # The context manager closes the file even when reading fails.
    with open('sample_urls.txt', 'r') as url_file:
        url_lines = url_file.readlines()

    docs = {}
    index = {}
    for url in url_lines:
        if url in index:
            # A repeated line must not be numbered again: that would produce
            # a row number >= len(index) and overflow the per-concept columns
            # that are sized with len(index). It also avoids a second fetch.
            continue
        body = requests.get(url).content
        index[url] = len(index)
        docs[url] = body
    return docs, index

In [28]:
def get_tfd(content):
    '''
    Term Frequency Array for a particular document.

    The array has one slot per entry of kg_labels, in the same order. A slot
    is non-zero when the label occurs in the document as a case-insensitive
    substring. The stored value is the number of times the label itself is
    listed in kg_labels (1 for unique labels) -- it is a presence marker, not
    a count of occurrences in the text.

    Args:
        content: the document text, as str or as bytes (bytes are decoded as
            UTF-8, with undecodable bytes replaced).

    Returns:
        A list of ints of length len(kg_labels).
    '''
    # A response body arrives as bytes; `str in bytes` raises TypeError, so
    # bring it to text first.
    if isinstance(content, bytes):
        content = content.decode('utf-8', errors='replace')

    # Lower-case the document once instead of once per label.
    haystack = content.lower()
    word_count_dict = Counter(label for label in kg_labels
                              if label.lower() in haystack)

    # Position of the first occurrence of each label; replaces the repeated
    # linear kg_labels.index() scan while selecting the same slot.
    first_position = {}
    for position, label in enumerate(kg_labels):
        first_position.setdefault(label, position)

    frequency_arr = [0] * len(kg_labels)
    for label, count in word_count_dict.items():
        frequency_arr[first_position[label]] = count
    return frequency_arr

In [29]:
# Fetch all sample documents once: `content` maps each URL line to its
# downloaded body, `index` maps each URL line to its row number.
content, index = get_content()

In [30]:
'''
Build word_data: a document (rows) by concept (columns) table,
stored column-wise as plain lists.
'''
# One array per fetched document, keyed by its URL line.
tfd_data = {url: get_tfd(cont) for url, cont in content.items()}

# Row labels of the table: the URL lines with their newline removed.
tfd_arr = [key.replace("\n", "") for key in index]

# 'TFD' holds the row labels; every concept starts as a column of
# placeholders with one cell per document.
word_data = {'TFD': tfd_arr}
word_data.update((label, [None] * len(index)) for label in kg_labels)

# Write each document's array into its row, one concept column at a time.
for url, words_in_doc in tfd_data.items():
    url_index = index[url]
    for word, value in zip(kg_labels, words_in_doc):
        word_data[word][url_index] = value

In [31]:
'''
(DTF)^T (DTF) = Co-occurrence Matrix
'''
# Index the table by document so that only the concept columns remain.
document_term_frequency = pd.DataFrame(data=word_data).set_index(keys='TFD')
dtf_asint = document_term_frequency.astype(int)

# Entry (i, j) sums, over all documents, the product of concept columns
# i and j.
coocc = dtf_asint.transpose().dot(dtf_asint)

### Calculating Relationship Score: S(i, j)

In [32]:
def get_relationship_between_concepts(concept_1, concept_2):
    '''
    Relationship score S(i, j): the co-occurrence matrix entry for a pair of
    concepts.

    Both concepts are located by their column position in
    document_term_frequency, and that pair of positions is read from coocc.
    A concept that is not a column raises KeyError.
    '''
    columns = document_term_frequency.columns
    row_position = columns.get_loc(concept_1)
    col_position = columns.get_loc(concept_2)
    return coocc.iloc[row_position, col_position]

### Significance of a concept in a document: $\lambda(c, i)$

In [33]:
def get_significance_score(concept, document):
    '''
    Significance lambda(c, i) of a concept in a document.

    The score is the concept's entry for that document in dtf_asint plus the
    number of non-zero entries in the concept's row of the co-occurrence
    matrix.

    Args:
        concept: a column label of document_term_frequency.
        document: a key of `index` (a URL line), or None.

    Returns:
        0 when document is None or when the concept's co-occurrence row sums
        to zero; otherwise the score described above.

    Raises:
        KeyError: if concept is not a column or document is not in `index`.
    '''
    if document is None:
        return 0
    concept_index = document_term_frequency.columns.get_loc(concept)
    # One positional (row, column) lookup. Chaining `.iloc[row][int]` would
    # apply an integer key to a string-labelled Series, which pandas only
    # treats as a position through a deprecated fallback.
    freq = dtf_asint.iloc[index[document], concept_index]
    r = coocc.iloc[concept_index, :].to_numpy()
    if r.sum() == 0:
        return 0
    return freq + np.count_nonzero(r)

### Key Sections $k_c$

In [34]:
# key_doc: concept -> the document in which that concept scores highest.
# doc_to_key: document -> the concept it is the key section for.
key_doc = {}
doc_to_key = {}

# An earlier per-document variant (pick the top-scoring concept for each
# document) was replaced by the per-concept search below.
for each_concept in kg_labels:
    key_max = 0
    for each_document in content:
        s = get_significance_score(each_concept, each_document)
        # Only scores above 4.0 qualify, and only a strictly higher score
        # replaces the best document found so far.
        if s <= 4.0 or s <= key_max:
            continue
        key_max = s
        key_doc[each_concept] = each_document

# Invert the mapping; when several concepts share a document, the one
# visited last wins.
for concept, document in key_doc.items():
    doc_to_key[document] = concept

# Documents that are nobody's key section fall back to 'Clustering'.
for each in content:
    if each:
        doc_to_key.setdefault(each, 'Clustering')

### Comprehension Burden

In [53]:
def f_cb(sig_score, key_sig_score, relationship):
    '''
    Burden contributed by one concept: the sum of that concept's significance,
    the key concept's significance and the relationship score between them.
    '''
    significance_total = sig_score + key_sig_score
    return significance_total + relationship

def get_cb_document(document, document_key_concept, visited, concepts=None):
    '''
    Comprehension burden of a single document.

    For every candidate concept that is neither already visited nor the
    document's own key concept, and whose significance in the document is
    positive, f_cb(...) of that concept is added to the burden.

    Args:
        document: key of the document (a URL line).
        document_key_concept: the concept this document is the key section
            for.
        visited: collection of concepts already covered; they are skipped.
        concepts: optional iterable of candidate concepts. Defaults to the
            five concepts this notebook works with.

    Returns:
        The accumulated burden as a float (0.0 when nothing contributes).
    '''
    if concepts is None:
        concepts = ('Computer Science', 'Operating Systems', 'Clustering',
                    'Mixture Models', 'Mixture of Gaussians')

    # The key concept's own significance is the same for every term.
    key_sig_score = get_significance_score(document_key_concept, document)
    document_burden = 0.0

    for other_concept in concepts:
        if other_concept in visited or other_concept == document_key_concept:
            continue
        sig_score = get_significance_score(other_concept, document)
        relationship = get_relationship_between_concepts(document_key_concept, other_concept)
        if sig_score > 0:
            document_burden += f_cb(sig_score, key_sig_score, relationship)
    return document_burden

In [54]:
# Total comprehension burden when the key concepts are presented in the
# sequence listed in `random`.
v = set()
tk = 0

random = ['Computer Science', 'Operating Systems','Clustering', 'Mixture Models', 'Mixture of Gaussians']
randomized = {}

# Use this cell's own sequence. The loops previously iterated `order`, which
# is only assigned in a later cell, so the result depended on stale kernel
# state (or raised NameError on a fresh top-to-bottom run).
for each in random:
    randomized[each] = []

# Group the documents under their key concept. A key concept that is missing
# from the sequence raises KeyError here.
for doc, kc in doc_to_key.items():
    randomized[kc].append(doc)

for kc in random:
    # A concept counts as visited from the moment its own documents are read.
    v.add(kc)
    docs = randomized[kc]
    for each in docs:
        tk += get_cb_document(each, kc, v)
print(tk)

1432.0


In [58]:
# Total comprehension burden for a second presentation sequence of the key
# concepts.
order = ['Mixture Models', 'Mixture of Gaussians', 'Computer Science','Clustering', 'Operating Systems']
visited = set()
total = 0

# Bucket the documents by key concept, one (initially empty) bucket per
# concept in the sequence.
ordered = {concept: [] for concept in order}
for doc, kc in doc_to_key.items():
    ordered[kc].append(doc)

# Walk the sequence; a concept is marked visited before its documents are
# scored.
for kc in order:
    visited.add(kc)
    for each in ordered[kc]:
        total += get_cb_document(each, kc, visited)
print(total)

546.0
