### Comprehension Burden for Sequencing Documents

In [327]:
import requests
from collections import Counter

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import networkx as nx

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [328]:
'''
Knowledge Graph
'''
kg_path = "../graph_query/graphs/knowledge_graph.gpickle"
kg = nx.read_gpickle(kg_path)
kg_labels = [str(x) for x in list(kg.nodes())[1:]]
n_labels = len(kg_labels)

In [329]:
'''
Get content from a given set of URLs.
'''
def get_content():
    f = open('sample_urls.txt', 'r')
    l = f.readlines()
    docs = {}
    index = {}
    
    counter = 0
    for url in l:
        docs[url] = requests.get(url).content
        index[url] = counter
        counter += 1
    return docs, index

### Calculating Relationship Score: S(i, j)

In [336]:
'''
Term Frequency Array for a particular document.
'''
def get_tfd(content):
    word_count_dict = Counter(w for w in kg_labels 
                              if w.lower() in content.lower())
    common = word_count_dict.most_common()
    
    frequency_arr = [0]*len(kg_labels)
    
    for common_word in common:
        common_word_index = kg_labels.index(common_word[0])
        frequency_arr[common_word_index] = common_word[1]
    return frequency_arr

In [331]:
content, index = get_content()

In [332]:
'''
Building word_data a document (rows) by term frequency (columns) matrix.
'''
tfd_data = {}
for url, cont in content.items():
    tfd_data[url] = get_tfd(cont)

tfd_arr = []
for key in index.keys():
    tfd_arr.append(key.replace("\n", ""))

word_data = {'TFD':tfd_arr}

for label in kg_labels:
    word_data[label] = [None]*len(index)

for url, words_in_doc in tfd_data.items():
    url_index = index[url]
    for i in range(0, n_labels, 1):
        word = kg_labels[i]
        word_data[word][url_index] = words_in_doc[i]

('Clustering', 1)
('Supervised Learning', 1)
('Unsupervised Learning', 1)
('K-Means', 1)
('Expectation Maximization', 1)
('Machine Learning', 1)
('Logistic Regression', 1)
('Pca', 1)
('Mixture Models', 1)
('Svms', 1)
('Computer Science', 1)
('Clustering', 1)
('K-means clustering', 1)
('Supervised Learning', 1)
('Unsupervised Learning', 1)
('K-Means', 1)
('Dfs', 1)
('Machine Learning', 1)
('Logistic Regression', 1)
('Pca', 1)
('Computer Science', 1)
('Clustering', 1)
('Sorting', 1)
('Unsupervised Learning', 1)
('K-Means', 1)
('Computer Science', 1)
('Supervised Learning', 1)
('Support Vector Machines', 1)
('Mixture Models', 1)
('Mixture of Gaussians', 1)
('Clustering', 1)
('K-means clustering', 1)
('Unsupervised Learning', 1)
('Deep Learning', 1)
('Machine Learning', 1)
('Bfs', 1)
('Optimization', 1)
('K-Means', 1)
('Supervised Learning', 1)
('Operating Systems', 1)


In [333]:
'''
(DTF)^T(DTF) = Coocurence Matrix
'''
document_term_frequency = pd.DataFrame(word_data).set_index('TFD')
dtf_asint = document_term_frequency.astype(int)
coocc = dtf_asint.T.dot(dtf_asint)

### Significance of a concept in a document: \lambda(c, i)

In [337]:
def get_significance_score(concept, document):
    concept_index = document_term_frequency.columns.get_loc(concept)
    freq = dtf_asint.iloc[index[document]][concept_index]
    coocc_row = coocc.iloc[concept_index,:] 
    r = np.array(coocc_row)
    return freq+0.3*np.count_nonzero(r)

### Key Sections

In [344]:
key_doc = {}

for each_document in content.keys():
    sig_score = 0
    for each_concept in kg_labels:
        s = get_significance_score(each_concept, each_document)
        if(s > sig_score):
            sig_score = s
            
            key_doc[each_document] = each_concept

for key, val in key_doc.items():
    if(val != None):
        print(key, val)

('http://www.cs.utah.edu/~piyush/teaching/cs5350.html\n', 'Clustering')
('https://shapeofdata.wordpress.com/2013/07/16/mixture-models/\n', 'Clustering')
('http://scikit-learn.org/stable/modules/mixture.html\n', 'Clustering')
('https://en.wikipedia.org/wiki/Mixture_models\n', 'Clustering')
('https://en.wikipedia.org/wiki/MLPACK_(C%2B%2B_library)\n', 'Clustering')
('https://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-867-machine-learning-fall-2006/readings\n', 'Clustering')
('https://msdn.microsoft.com/en-us/library/azure/dn905944.aspx', 'Clustering')
('https://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-867-machine-learning-fall-2006/lecture-notes\n', 'Clustering')
('https://stats.stackexchange.com/q/232500\n', 'Clustering')
('http://www.powershow.com/view/21510b-MDU0M/EM_Algorithm_and_Mixture_of_Gaussians_powerpoint_ppt_presentation\n', 'Clustering')
('https://stats.stackexchange.com/questions/198239/k-means-clustering-minimizes-conditional-v

In [190]:
a = "https://stats.stackexchange.com/questions/198239/k-means-clustering-minimizes-conditional-variance"
c = requests.get(a).content.lower()
for w in kg_labels:
    if(w.lower() in c):
        print(w)

K-means clustering
Computer Science
Clustering
Processes
K-Means
Machine Learning
Operating Systems


In [219]:
content.keys()

['http://www.powershow.com/view/21510b-MDU0M/EM_Algorithm_and_Mixture_of_Gaussians_powerpoint_ppt_presentation\n']