### Collective Burden for Sequencing Documents

In [1]:
import requests
from collections import Counter
import random as randomlib

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import networkx as nx

import itertools

# Lift pandas' display limits so that DataFrames shown in the notebook are
# printed with all of their columns and rows instead of being truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
'''
Knowledge Graph
'''
# Load the pickled graph once; `kg`, `kg_labels` and `n_labels` are
# module-level names read by the helper functions in this file.
kg_path = "../graph_query/graphs/weighted_knowledge_graph.gpickle"
kg = nx.read_gpickle(kg_path)
# Node labels as strings, in graph order.
# NOTE(review): the [1:] slice drops the first node returned by kg.nodes() --
# presumably a root/placeholder node; confirm this is intended.
kg_labels = [str(x) for x in list(kg.nodes())[1:]]
# Number of labels, i.e. the number of term columns built per document.
n_labels = len(kg_labels)

In [3]:
def get_queries_based_on_node(node_label):
    '''
    Return the graph labels to query for the node named node_label.

    A node that has no "NodeType" attribute is tagged as "ConceptNode"
    in the graph (a side effect on the module-level kg) and is then
    handled like any other ConceptNode.

    args - node_label: key of a node in kg
    returns [] - the node's neighbours for a TopicNode or ConceptNode,
                 [node_label] for a SubConceptNode,
                 an empty list for any other node type
    '''
    # kg.nodes[...] is the same per-node attribute dict the rest of the
    # file reads; writing to it updates the graph itself.
    node = kg.nodes[node_label]

    if "NodeType" not in node:
        node["NodeType"] = "ConceptNode"

    node_type = node["NodeType"]
    if node_type in ("TopicNode", "ConceptNode"):
        return list(kg.neighbors(node_label))
    if node_type == "SubConceptNode":
        return [node_label]

    # Unknown node types used to fall through and return None, which made
    # callers that iterate over the result fail; an empty list keeps the
    # return type uniform.
    return []

def query_formulator(query, label):
    '''
    Returns a list of queries depending on the type of the
    node closest to the query.
    args - query(str): accepted but not read by the function body
           label: graph label of the node closest to the query
    returns [] of str - label together with everything
                        get_queries_based_on_node(label) yields,
                        de-duplicated through a set (so the order of the
                        result is not meaningful)
    '''
    candidates = [label]
    candidates.extend(get_queries_based_on_node(label))
    return list(set(candidates))


In [4]:
def get_content(url):
    '''
    Get content from a given set of URLs.

    args - url: path of a text file holding one URL per line (the name is
                historical; it is a file path, not a URL)
    returns (docs, index, es_order)
        docs     - {line: raw response body} for every line that was fetched
        index    - {line: position}, positions counting successful fetches
                   only, starting at 0
        es_order - the fetched lines, in the order they appear in the file
    Every key is the line exactly as read from the file, so it still
    carries its trailing newline.
    '''
    docs = {}
    index = {}
    es_order = []

    # Read the whole list first so the file is closed before any network
    # traffic starts (the handle used to be left open).
    with open(url, 'r') as url_file:
        lines = url_file.readlines()

    for line in lines:
        try:
            body = requests.get(line).content
        except requests.exceptions.RequestException:
            # Best effort: a line that cannot be fetched (unreachable host,
            # malformed or blank URL, ...) is skipped. Unlike the previous
            # bare except, KeyboardInterrupt and programming errors are no
            # longer swallowed.
            continue
        docs[line] = body
        # len(es_order) is the number of documents fetched so far, which is
        # exactly the next free row position.
        index[line] = len(es_order)
        es_order.append(line)
    return docs, index, es_order

In [5]:
def get_tfd(content):
    '''
    Term Frequency Array for a particular document.

    args - content: text of one document; it must support .lower() and
                    substring tests against the lower-cased labels
    returns [] of int, one entry per position of kg_labels

    Despite the name, the values are presence counts, not occurrence
    counts: a label whose lower-cased form is a substring of the
    lower-cased content contributes 1 for each time that label appears in
    kg_labels, and the total is stored at the label's first position in
    kg_labels. Every other entry is 0.
    '''
    # Lower-case the document once instead of once per label.
    lowered = content.lower()

    frequency_arr = [0] * len(kg_labels)
    # label -> first position in kg_labels; replaces the per-label
    # kg_labels.index() scan.
    first_position = {}

    for position, label in enumerate(kg_labels):
        if label.lower() in lowered:
            slot = first_position.setdefault(label, position)
            frequency_arr[slot] += 1
    return frequency_arr

In [6]:
def get_matrices(content, index):
    '''
    Build the document (rows) by term frequency (columns) matrix and the
    co-occurrence matrix derived from it.

    args - content: {url: document text}
           index:   {url: row position of that document}
    returns (document_term_frequency, dtf_asint, coocc)
        document_term_frequency - DataFrame with one row per document
                                  (labelled by the url with newlines
                                  removed) and one column per kg label,
                                  filled with get_tfd() values
        dtf_asint               - the same frame cast to int
        coocc                   - dtf_asint.T.dot(dtf_asint)
    '''
    n_docs = len(index)

    # Rows are filled at index[url] below, so the row labels have to be laid
    # out by those positions as well. Relying on the dict's own iteration
    # order could attach a label to the wrong row.
    row_labels = [url.replace("\n", "") for url in sorted(index, key=index.get)]

    word_data = {'TFD': row_labels}
    for label in kg_labels:
        word_data[label] = [None] * n_docs

    for url, cont in content.items():
        row = index[url]
        # get_tfd returns one value per position of kg_labels.
        for label, value in zip(kg_labels, get_tfd(cont)):
            word_data[label][row] = value

    document_term_frequency = pd.DataFrame(word_data).set_index('TFD')
    dtf_asint = document_term_frequency.astype(int)

    # (DTF)^T (DTF) = co-occurrence matrix: entry (a, b) sums, over all
    # documents, the product of the values of terms a and b.
    coocc = dtf_asint.T.dot(dtf_asint)

    return document_term_frequency, dtf_asint, coocc

### Calculating Relationship Score: S(i, j)

In [7]:
def get_relationship_between_concepts(concept_1, concept_2, document_term_frequency):
    '''
    Relationship score S(i, j) of two concepts.

    The positions of both concepts among the columns of
    document_term_frequency are used to read one cell of the
    co-occurrence matrix. Note that coocc is not a parameter here: the
    module-level coocc is read.

    args - concept_1, concept_2: column labels of document_term_frequency
           document_term_frequency: frame whose column order matches coocc
    returns the co-occurrence entry for (concept_1, concept_2)
    '''
    columns = document_term_frequency.columns
    row = columns.get_loc(concept_1)
    col = columns.get_loc(concept_2)
    return coocc.iloc[row, col]

### Significance of a concept in a document: $\lambda(c, i)$

In [8]:
def get_significance_score(concept, index, document, document_term_frequency, dtf_asint, coocc):
    '''
    Significance of a concept in a document.

    The score is the concept's value in the document's row of dtf_asint
    plus the number of non-zero entries in the concept's row of the
    co-occurrence matrix. When that co-occurrence row sums to zero only
    the value from dtf_asint is returned.

    args - concept: column label of document_term_frequency
           index: {document: row position}
           document: key of index, or None
           document_term_frequency: frame used to locate the concept column
           dtf_asint: integer frame the frequency is read from
           coocc: co-occurrence matrix (same column order)
    returns 0 when document is None, otherwise the score described above
    '''
    if document is None:
        return 0

    concept_index = document_term_frequency.columns.get_loc(concept)
    # One positional lookup for (row, column). Chaining .iloc[row][col]
    # indexes the intermediate Series with an integer key, which pandas
    # resolves by label rather than by position on integer-labelled data.
    freq = dtf_asint.iloc[index[document], concept_index]

    related = np.asarray(coocc.iloc[concept_index, :])
    if related.sum() == 0:
        return freq
    return freq + np.count_nonzero(related)

In [9]:
def get_right(content, index, top_n, document_term_frequency, dtf_asint, coocc):
    '''
    Attach every concept to the document(s) in which it is most significant.

    For each label in kg_labels the significance score is computed in
    every document; the concept is recorded, together with its score, on
    each document whose score equals the maximum (the maximum starts at
    0.0, so a concept scoring 0 everywhere is recorded on every document).

    args - content: {document: text}; only the keys are used
           index, document_term_frequency, dtf_asint, coocc: passed
               through to get_significance_score
           top_n: how many concepts per document feed the returned set
    returns (doc_to_concepts_list, relevant_concepts)
        doc_to_concepts_list - {document: [(concept, score), ...]} sorted
                               by ascending score
        relevant_concepts    - set of the first top_n concept names of
                               every document
    '''
    doc_to_concepts_list = {}
    for each_document in content.keys():
        doc_to_concepts_list[each_document] = []
    print(doc_to_concepts_list)

    for each_concept in kg_labels:
        d_to_v = {}
        for each_document in content.keys():
            d_to_v[each_document] = get_significance_score(
                each_concept, index, each_document,
                document_term_frequency, dtf_asint, coocc)

        # Highest score of this concept over all documents, floored at 0.0.
        best = max([0.0] + list(d_to_v.values()))

        for d, v in d_to_v.items():
            if v == best:
                doc_to_concepts_list[d].append((each_concept, v))

    relevant_concepts = set()
    for scored in doc_to_concepts_list.values():
        # NOTE(review): ascending order means the first top_n entries are
        # the lowest-scored ones, whereas the other ranking helpers in this
        # file put the highest score first. Left as-is; confirm the intent.
        scored.sort(key=lambda x: x[1])
        for concept, _ in scored[:max(top_n, 0)]:
            # Add the whole concept name. The previous code indexed the
            # name with [0] and so collected only its first character.
            relevant_concepts.add(concept)
    return doc_to_concepts_list, relevant_concepts

### Key Sections $k_c$

In [23]:
def get_doc_to_concepts_list(content, index, top_n, document_term_frequency, dtf_asint, coocc):
    '''
    Pick up to top_n key concepts for every document.

    For each document, every label in kg_labels with a positive
    significance score is collected if its node is a ConceptNode or a
    TopicNode (nodes without a "NodeType", or of another type, are
    ignored). Key concepts are taken from the ConceptNodes by descending
    score; TopicNodes fill any remaining slots.

    args - content: {document: text}; only the keys are used
           index, document_term_frequency, dtf_asint, coocc: passed
               through to get_significance_score
           top_n: maximum number of key concepts per document
    returns (doc_to_concept_list, relevant_concepts_to_sequence)
        doc_to_concept_list           - {document: [key concept, ...]}
        relevant_concepts_to_sequence - set of every collected ConceptNode
                                        and TopicNode label, selected as a
                                        key concept or not
    Each selected ConceptNode is also printed with its score and document.
    '''
    doc_to_concept_list = {}
    relevant_concepts_to_sequence = set()
    limit = max(top_n, 0)

    for each_document in content.keys():
        topics = []
        concepts = []
        for each_concept in kg_labels:
            s = get_significance_score(each_concept, index, each_document,
                                       document_term_frequency, dtf_asint, coocc)
            if s <= 0:
                continue
            # kg.nodes matches the accessor used by the sequencing helpers;
            # .get() yields None for nodes without a NodeType, which then
            # match neither branch.
            node_type = kg.nodes[each_concept].get("NodeType")
            if node_type == "ConceptNode":
                concepts.append((each_concept, s))
            elif node_type == "TopicNode":
                topics.append((each_concept, s))

        # Ascending stable sort followed by a reversal: highest score first.
        topics = sorted(topics, key=lambda x: x[1])[::-1]
        concepts = sorted(concepts, key=lambda x: x[1])[::-1]

        key_concepts = []
        for name, score in concepts[:limit]:
            key_concepts.append(name)
            print(name, score, each_document)

        # Topics only fill the slots the concepts left empty.
        free_slots = limit - len(key_concepts)
        key_concepts.extend(name for name, _ in topics[:free_slots])

        # Every collected concept/topic is relevant for sequencing, whether
        # or not it made it into key_concepts.
        relevant_concepts_to_sequence.update(name for name, _ in concepts)
        relevant_concepts_to_sequence.update(name for name, _ in topics)

        doc_to_concept_list[each_document] = key_concepts
    return doc_to_concept_list, relevant_concepts_to_sequence

In [11]:
def get_relevant_concepts_for_lp(doc_to_concepts_list):
    '''
    Flatten a {document: [concept, ...]} mapping into one list.

    The concepts of each document are appended in the mapping's iteration
    order; duplicates are kept.
    '''
    return [concept
            for concepts in doc_to_concepts_list.values()
            for concept in concepts]

### Comprehension Burden

In [12]:
def f_cb(sig_score, key_sig_score, relationship):
    '''
    Combine two significance scores by adding them.

    relationship is accepted but does not take part in the result.
    '''
    combined = sig_score + key_sig_score
    return combined

def get_related_concepts(document, index, document_term_frequency):
    '''
    Concepts that occur in a document.

    args - document: key of index
           index: {document: row position in document_term_frequency}
           document_term_frequency: document-by-term frame
    returns [] of the column labels whose entry in the document's row is
            non-zero, in column order (empty when the row is all zeros)
    '''
    row = np.array(document_term_frequency.iloc[index[document]])
    positions = row.nonzero()[0]
    # A single positional take on the column Index. This also covers the
    # empty case, which previously needed a guard because np.nditer cannot
    # iterate a zero-sized array.
    return list(document_term_frequency.columns[positions])
    
def get_cb_document(document, dc, visited, relevant, 
                    document_term_frequency, dtf_asint, coocc, index):
    '''
    Comprehension burden of a single document.

    Every concept d that occurs in the document and is not in visited
    adds the average significance score of those key concepts c in dc
    whose relationship score with d is positive. Concepts in visited add
    nothing.

    args - document: key of index
           dc: key concepts of the document
           visited: set of concepts already covered
           relevant: accepted but not read by the function body
           document_term_frequency, dtf_asint, coocc, index: passed through
               to the scoring helpers (get_relationship_between_concepts
               itself reads the module-level coocc)
    returns the accumulated burden as a float
    '''
    document_burden = 0.0
    for d in get_related_concepts(document, index, document_term_frequency):
        # A visited concept always contributed 0 (its burden stayed 0.0
        # whatever the count was), so it is skipped before any lookup.
        if d in visited:
            continue

        related_scores = [
            get_significance_score(c, index, document,
                                   document_term_frequency, dtf_asint, coocc)
            for c in dc
            if get_relationship_between_concepts(d, c, document_term_frequency) > 0
        ]
        if related_scores:
            # Start the sum at 0.0 so the division is a float division on
            # Python 2 as well.
            document_burden += sum(related_scores, 0.0) / len(related_scores)
    return document_burden
    
    
    

### Sequence Generation

In [13]:
def get_linear(nodes):
    '''
    Order the given nodes topic by topic.

    Every TopicNode among nodes is emitted, followed by those of its
    neighbours that are in nodes and are ConceptNodes; a concept that is a
    neighbour of several topics therefore appears once per topic. Nodes
    not placed that way are appended at the end.

    args - nodes: collection of graph labels (membership tests are run
                  against it)
    returns [] of labels
    '''
    # .get() so that a node without a "NodeType" attribute is simply not a
    # topic/concept instead of raising KeyError.
    parents = [each for each in nodes
               if each in kg.nodes and kg.nodes[each].get("NodeType") == "TopicNode"]

    linear = []
    for p in parents:
        linear.append(p)
        for c in kg.neighbors(p):
            if c in nodes and kg.nodes[c].get("NodeType") == "ConceptNode":
                linear.append(c)

    # Track what has been placed in a set; scanning the growing list for
    # every node was quadratic.
    placed = set(linear)
    for each in nodes:
        if each not in placed:
            linear.append(each)
            placed.add(each)

    return linear

def get_weighted_sequences(nodes):
    '''
    Order the given nodes topic by topic, children by edge weight.

    Every TopicNode among nodes is emitted, followed by those of its
    neighbours that are in nodes, from the highest "weight" on the
    topic-neighbour edge to the lowest; an edge without a weight counts as
    0.0. Nodes that are neither a topic nor a neighbour of one are not
    part of the result.

    args - nodes: collection of graph labels (membership tests are run
                  against it)
    returns [] of labels
    '''
    weighted = []
    for p in nodes:
        # .get() so that a node without a "NodeType" attribute is skipped
        # instead of raising KeyError.
        if p not in kg.nodes or kg.nodes[p].get("NodeType") != "TopicNode":
            continue

        weighted.append(p)
        children = [(c, kg[p][c].get("weight", 0.0))
                    for c in kg.neighbors(p) if c in nodes]

        # Ascending stable sort followed by a reversal: heaviest edge first.
        for child, _ in sorted(children, key=lambda x: x[1])[::-1]:
            weighted.append(child)
    return weighted
            
    
def get_sequences(nodes):
    '''
    Build the three candidate orderings of nodes.

    returns (linear, reversed_linear, weighted)
        linear          - result of get_linear(nodes)
        reversed_linear - the same list back to front
        weighted        - result of get_weighted_sequences(nodes)
    '''
    linear = get_linear(nodes)
    reversed_linear = list(reversed(linear))
    weighted = get_weighted_sequences(nodes)
    return linear, reversed_linear, weighted

In [14]:
def get_concepts_to_document_list(doc_to_concepts_list, relevant_concepts):
    '''
    Invert {document: [concept, ...]} for the given concepts.

    returns {concept: [document, ...]} with one key per entry of
            relevant_concepts; the list holds every document whose concept
            list contains that concept and is empty when none does
    '''
    return {
        concept: [doc for doc, kcs in doc_to_concepts_list.items() if concept in kcs]
        for concept in relevant_concepts
    }
                

In [15]:
def get_burden_for_a_sequence(docs_sequence, doc_to_concepts_list, concepts_to_document_list,
                                   document_term_frequency, dft_asint, coocc, index, relevant_concepts_to_sequence):
    '''
    Collective burden of reading the documents in the given order.

    Documents are walked in sequence order. The key concepts of a document
    are marked as visited before its own burden is computed, and the
    per-document burdens are summed.

    args - docs_sequence: documents in reading order
           doc_to_concepts_list: {document: [key concept, ...]}
           concepts_to_document_list: accepted but not read here
           document_term_frequency, dft_asint, coocc, index,
           relevant_concepts_to_sequence: passed through to get_cb_document
    returns the summed burden as a float
    '''
    visited = set()
    collective_burden = 0.0
    for each_doc in docs_sequence:
        key_concepts = doc_to_concepts_list[each_doc]
        visited.update(key_concepts)

        # Pass the dft_asint argument on. The body used to spell it
        # dtf_asint, which silently picked up the module-level frame and
        # ignored whatever the caller supplied.
        collective_burden += get_cb_document(each_doc, key_concepts,
                                             visited, relevant_concepts_to_sequence,
                                             document_term_frequency, dft_asint, coocc, index)
    return collective_burden

def get_burden_for_all_permutations(doc_to_concepts_list, concepts_to_document_list,
                                   document_term_frequency, dft_asint, coocc, index, relevant_concepts_to_sequence):
    '''
    Print the collective burden of every ordering of the documents.

    The documents are the keys of doc_to_concepts_list; all other
    arguments are handed to get_burden_for_a_sequence unchanged. Nothing
    is returned. The number of orderings grows factorially with the
    number of documents.
    '''
    # list(mapping) yields the keys on both Python 2 and 3 (iterkeys() only
    # exists on Python 2).
    docs = list(doc_to_concepts_list)

    # Consume the permutations lazily rather than materialising all n! of
    # them in a list first.
    for each in itertools.permutations(docs):
        print(get_burden_for_a_sequence(each, doc_to_concepts_list, concepts_to_document_list,
                                        document_term_frequency, dft_asint, coocc, index,
                                        relevant_concepts_to_sequence))
    
    

In [16]:
def get_burden_for_sequence(sequence, doc_to_concepts_list, 
                            concepts_to_document_list, 
                            document_term_frequency, dtf_asint, coocc, index):
    '''
    Turn a concept sequence into a document sequence and score it.

    Concepts are taken in sequence order; the documents attached to a
    concept are ranked by that concept's significance score (highest
    first) and appended unless already present. Documents reached by no
    concept are appended last. The documents are then walked in that
    order: a document's key concepts are marked visited, then its burden
    is computed with get_cb_document.

    returns (collective_burden, burdens) - the sum of the per-document
            burdens and the list of them in document order
    '''
    docs_sequence = []

    for each_con in sequence:
        scored_docs = [
            (doc, get_significance_score(each_con, index, doc,
                                         document_term_frequency, dtf_asint, coocc))
            for doc in concepts_to_document_list[each_con]
        ]
        # Ascending stable sort followed by a reversal: highest score first.
        ranked = sorted(scored_docs, key=lambda pair: pair[1])[::-1]

        for doc, _ in ranked:
            if doc not in docs_sequence:
                docs_sequence.append(doc)

    # Documents no concept in the sequence pointed at go to the end.
    for doc in doc_to_concepts_list.keys():
        if doc not in docs_sequence:
            docs_sequence.append(doc)

    seen_concepts = set()
    burdens = []
    collective_burden = 0.0
    for each_doc in docs_sequence:
        key_concepts = doc_to_concepts_list[each_doc]
        seen_concepts.update(key_concepts)

        burden_per_doc = get_cb_document(each_doc, key_concepts,
                                         seen_concepts, sequence,
                                         document_term_frequency, dtf_asint, coocc, index)
        collective_burden += burden_per_doc
        burdens.append(burden_per_doc)

    return collective_burden, burdens

In [17]:
def get_score(content, index, top_n, document_term_frequency, dtf_asint, coocc):
    '''
    Score the documents for a given number of key concepts per document.

    The key concepts are selected with get_doc_to_concepts_list, the
    relevant concepts are ordered with get_sequences, and the burden is
    computed for the first ("linear") ordering only.

    returns (collective burden, {document: key concepts},
             largest per-document burden)
    The number of documents is printed as a side effect.
    '''
    doc_to_concepts_list, relevant = get_doc_to_concepts_list(
        content, index, top_n, document_term_frequency, dtf_asint, coocc)
    concepts_to_document_list = get_concepts_to_document_list(doc_to_concepts_list, relevant)

    # Only the first of the three orderings is scored.
    linear = get_sequences(relevant)[0]
    total, per_doc = get_burden_for_sequence(linear, doc_to_concepts_list,
                                             concepts_to_document_list,
                                             document_term_frequency, dtf_asint, coocc, index)
    print("len", len(doc_to_concepts_list))
    return (total, doc_to_concepts_list, max(per_doc))

In [18]:
def get_required(url):
    '''
    Fetch the documents listed in the file at url and build their matrices.

    returns (content, document_term_frequency, dtf_asint, coocc, index);
            the fetch order reported by get_content is not passed on
    '''
    content, index, _ = get_content(url)
    frequencies, frequencies_as_int, cooccurrence = get_matrices(content, index)
    return content, frequencies, frequencies_as_int, cooccurrence, index

In [19]:
# Fetch the documents listed in the graph-theory URL file and build the
# module-level frames used by the cells below (coocc is also the frame
# that get_relationship_between_concepts reads as a global).
content, document_term_frequency, dtf_asint, coocc, index = get_required("lps/engage/user_study_graph_theory_engage.txt")


In [24]:
# range(1, 2, 1) yields only i = 1, so the body runs exactly once; i itself
# is not used inside the loop.
for i in range(1, 2, 1):
    # Score the documents with up to 10 key concepts per document.
    s, doc_to_concepts_list, m = get_score(content, index, 10, document_term_frequency, dtf_asint, coocc)
    # Show every document next to the key concepts selected for it.
    for k, v in doc_to_concepts_list.items():
        print(k, v)

('Dfs', 60, 'https://jeremykun.com/2013/01/22/depth-and-breadth-first-search/\n')
('Shortest Path', 58, 'https://jeremykun.com/2013/01/22/depth-and-breadth-first-search/\n')
('Randomized algorithms', 56, 'https://jeremykun.com/2013/01/22/depth-and-breadth-first-search/\n')
('Topological Sort', 55, 'https://jeremykun.com/2013/01/22/depth-and-breadth-first-search/\n')
('Shortest Paths', 55, 'https://jeremykun.com/2013/01/22/depth-and-breadth-first-search/\n')
('Strongly Connected Components', 55, 'https://jeremykun.com/2013/01/22/depth-and-breadth-first-search/\n')
('Heaps', 54, 'https://jeremykun.com/2013/01/22/depth-and-breadth-first-search/\n')
('Scheduling', 53, 'https://jeremykun.com/2013/01/22/depth-and-breadth-first-search/\n')
('Huffman codes', 52, 'https://jeremykun.com/2013/01/22/depth-and-breadth-first-search/\n')
('Longest common subsequence', 50, 'https://jeremykun.com/2013/01/22/depth-and-breadth-first-search/\n')
('Dfs', 60, 'https://blog.oureducation.in/depth-first-search

In [126]:
import os
# For the first file of every directory under lps/engage/, compare the
# burden obtained with 2 key concepts per document against the burden with
# 1 key concept, and print both ratios as "&"-separated table cells.
for root, dirs, files in os.walk("lps/engage/"):  
    # files[:1] restricts the run to the first file of each directory.
    for filename in files[:1]:
        # Build the path from the directory os.walk is currently in, so a
        # file found in a sub-directory is opened from where it really is
        # (the path used to be hard-wired to the top-level directory).
        content, document_term_frequency, dtf_asint, coocc, index = get_required(os.path.join(root, filename))
        print(filename)
        result_str = ""
        max_str = ""
        # Baselines taken from the top_n = 1 run.
        og = 0.0
        og_max = 0.0
        for i in range(1, 3, 1):
            s, doc_to_concepts_list, max_burden_doc = get_score(content, index, i, document_term_frequency, dtf_asint, coocc)
            if(i == 1):
                # The baseline run is reported as 1.0 by definition.
                result_str += "& "+str(1.0)
                og = s
                max_str += "& "+str(1.0)
                og_max = max_burden_doc
            else:
                # A zero baseline cannot be used as a divisor; print the raw
                # burden instead of a ratio.
                if(og == 0): print(s)
                else:
                    result_str += "&"+"{0:.3f}".format(s/og)
                    max_str += "&"+"{0:.3f}".format(max_burden_doc/og_max)
        print("burden str", result_str)
        print("max str", max_str)
        print("\n")

user_study_clustering_engage.txt
('len', 30)
('len', 30)
('burden str', '& 1.0&0.867')
('max str', '& 1.0&0.880')




In [20]:
25.266666666666666
24.5

24.5