# 2. Indexing and searching papers with Elasticsearch

In [159]:
from elasticsearch import Elasticsearch
import json

class MySearch:
    """Small helper around an Elasticsearch client for one index of papers.

    Every document is stored as ``{'paper': {...}}`` and the index name is
    also used as the ``doc_type``. Document ids are consecutive integers
    handed out by :meth:`index_json`.
    """

    # Last document id handed out by index_json. The counter lives on the
    # class, so it is shared by every MySearch instance in the process.
    max_id = 0

    def __init__(self, index='paper_index', addr='http://localhost:9200'):
        """Connect to the Elasticsearch server at ``addr`` and remember ``index``."""
        self.es = Elasticsearch(addr)
        self.index = index

    def index_json(self, path):
        """Index every entry of type ``'paper'`` from the JSON file at ``path``.

        The file must contain a JSON array of objects. Entries whose ``type``
        is missing or is not ``'paper'`` are skipped. The ``type`` key itself
        is not stored.

        Returns:
            True once all papers were sent; failures surface as exceptions.
        """
        # Read the whole file first so the handle is closed before any
        # network calls are made.
        with open(path, 'r') as my_file:
            data = json.load(my_file)

        for paper in data:
            # .get() so that entries without a 'type' key are skipped instead
            # of aborting the whole import with a KeyError.
            if paper.get('type') != 'paper':
                continue
            doc = {key: value for key, value in paper.items() if key != 'type'}
            next_id = MySearch.max_id + 1
            self.es.index(index=self.index, doc_type=self.index,
                          id=next_id, body={'paper': doc})
            # Only advance the counter after the document was accepted.
            MySearch.max_id = next_id
        return True

    def delete_index(self):
        """Delete the index and restart id numbering at 1.

        Returns the client's response to the delete request.
        """
        # Keyword form: accepted by every client version, whereas newer
        # clients reject positional arguments.
        response = self.es.indices.delete(index=self.index)
        # Without this reset a re-import would continue from the old ids.
        # NOTE(review): the counter is class-wide, so this also resets it for
        # other MySearch instances in the same process.
        MySearch.max_id = 0
        return response

    def bare_search(self, query, size=10):
        """Run the raw query body ``query`` and return up to ``size`` hits."""
        return self.es.search(index=self.index, doc_type=self.index, body=query, size=size)

    def update(self, id, new_dict):
        """Partially update document ``id`` with the fields in ``new_dict``."""
        return self.es.update(index=self.index, doc_type=self.index, id=id, body={"doc": new_dict})

    def search(self, title_query, abstract_query, year, w_title=1.0,
               w_abstract=1.0, w_date=1.0, use_page_rank=False, size = 10):
        """Weighted search over title, abstract and publication date.

        Three ``should`` clauses are combined in one ``bool`` query:
        a match on ``paper.title`` (boost ``w_title``), a match on
        ``paper.abstract`` (boost ``w_abstract``) and a range clause
        ``paper.date >= str(year)`` (boost ``w_date``).

        Args:
            title_query: text matched against ``paper.title``.
            abstract_query: text matched against ``paper.abstract``.
            year: lower bound for ``paper.date``.
            w_title, w_abstract, w_date: boosts of the three clauses.
            use_page_rank: if true, add ``paper.page_rank`` to the text score.
            size: maximum number of hits to return.

        Returns:
            The search response returned by the client.
        """
        my_query = {"query": {
                "bool": {
                    "should": [
                        {"match" : {
                          "paper.title":{
                              "query":title_query,
                              "boost": w_title
                        }}},
                        {"match" : {
                          "paper.abstract":{
                              "query":abstract_query,
                              "boost": w_abstract
                        }}},
                        { "range": { "paper.date": { "gte": str(year) , "boost" : w_date}}}
                    ]
                }
            }}
        if not use_page_rank:
            return self.bare_search(my_query, size)

        # The script already yields "_score + page_rank". function_score
        # multiplies the script value with the query score by default, which
        # would give _score * (_score + page_rank); "replace" makes the
        # script's value the final score.
        # NOTE(review): the script reads paper.page_rank from every matching
        # document; confirm all indexed documents carry that field.
        new_query = {
                    "query": {
                        "function_score": {
                            "query": my_query["query"],
                            "script_score" : {
                                "script" : {
                                  "source": "_score + doc['paper.page_rank'].value"
                                }
                            },
                            "boost_mode": "replace"
                        }
                    }}
        return self.bare_search(new_query, size)

In [160]:
# Create the search helper with its default index name and server address.
ms = MySearch()
# Uncomment to index the crawled papers from the JSON dump:
# ms.index_json('semanticCrawler/data.json')

In [161]:
# ms.delete_index()

{'acknowledged': True}

In [62]:
# Smoke test: request a single hit whose title matches the term 'and'.
ms.bare_search({'query':{'match':{'paper.title':'and'}}}, 1)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [{'_id': '663',
    '_index': 'paper_index',
    '_score': 1.9333732,
    '_source': {'paper': {'abstract': 'Semantic Scholar extracted view of "Coherence and Coreference" by Jerry R. Hobbs',
      'authors': ['Jerry R. Hobbs'],
      'date': '1979',
      'id': 'Coherence-and-Coreference-Hobbs/e564391324ede7c9771e78b6d8c23bee5afff559',
      'page_rank': 0.00020358794857695006,
      'references': [],
      'title': 'Coherence and Coreference'}},
    '_type': 'paper_index'}],
  'max_score': 1.9333732,
  'total': 566},
 'timed_out': False,
 'took': 5}

# 3. PageRank over the citation graph

In [22]:
import numpy as np
from scipy.sparse import csc_matrix

def pageRank(G, s = .85, maxerr = .0001, verbose=True):
    """Compute PageRank scores for a link graph by power iteration.

    Args:
        G: square adjacency matrix (array-like or sparse); ``G[i, j]`` is the
            weight of the link from node ``i`` to node ``j``.
        s: damping factor, i.e. the probability of following a link; with
            probability ``1 - s`` the walk jumps to a uniformly chosen node.
        maxerr: iteration stops once the summed absolute change between two
            successive rank vectors is no larger than this value.
        verbose: print the current change before every iteration.

    Returns:
        1-D array of scores, rescaled to sum to 1.

    Rows without outgoing links are left all-zero (their rank is not
    redistributed); the final rescaling compensates for the lost mass.
    """
    # np.float was removed from NumPy; np.float64 is the same dtype.
    # copy=True keeps the in-place row scaling below from touching a sparse
    # matrix passed in by the caller.
    A = csc_matrix(G, dtype=np.float64, copy=True)
    n = A.shape[0]
    if n == 0:
        return np.zeros(0)

    # Scale every row to sum 1. After eliminate_zeros(), A.indices holds the
    # row index of each stored value, aligned one-to-one with A.data.
    A.eliminate_zeros()
    rsums = np.array(A.sum(1))[:, 0]
    A.data /= rsums[A.indices]

    AT = A.T
    teleport = (1.0 - s) / n

    ro, r = np.zeros(n), np.ones(n)
    err = np.abs(r - ro).sum()
    while err > maxerr:
        if verbose:
            print('ERR:', err)
        ro = r
        # Same update as r[i] = ro . (A[:, i] * s + (1 - s) / n) for every i,
        # done as a single sparse matrix-vector product.
        r = s * AT.dot(ro) + teleport * ro.sum()
        err = np.abs(r - ro).sum()

    return r / r.sum()

In [107]:
def create_graph(path):
    """Build the paper citation graph and the author citation graph.

    The JSON file at ``path`` must contain an array of objects; only entries
    whose ``type`` is ``'paper'`` are used. Each paper needs the keys ``id``,
    ``authors`` and ``references`` (a list of ids of the papers it cites).

    Returns:
        graph: list of rows; ``graph[i][j]`` is 1 if paper ``i`` lists the id
            of paper ``j`` in its references, else 0.
        ids: paper ids, in the order used for the rows/columns of ``graph``.
        authors_graph: list of rows; entry ``[a][b]`` is 1 if some paper by
            author ``a`` references some paper by author ``b``, else 0.
        authors: author names, in the order used for ``authors_graph``
            (order of first appearance in the file).
    """
    # Load everything up front so the file is closed before the heavy loops.
    with open(path, 'r') as my_file:
        data = json.load(my_file)

    # .get() so that entries without a 'type' key are skipped, not fatal.
    papers = [entry for entry in data if entry.get('type') == 'paper']
    ids = [paper['id'] for paper in papers]

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # author numbering is identical on every run (set order is not).
    authors = list(dict.fromkeys(
        author for paper in papers for author in paper['authors']))
    authors_index = {author: i for i, author in enumerate(authors)}
    authors_graph = [[0] * len(authors) for _ in authors]

    graph = []
    for citing in papers:
        # Set lookup instead of scanning the reference list for every pair.
        cited_ids = set(citing['references'])
        row = []
        for cited in papers:
            if cited['id'] in cited_ids:
                row.append(1)
                # Every author of the citing paper points at every author of
                # the cited paper.
                for author1 in citing['authors']:
                    for author2 in cited['authors']:
                        authors_graph[authors_index[author1]][authors_index[author2]] = 1
            else:
                row.append(0)
        graph.append(row)

    return graph, ids, authors_graph, authors

In [108]:
# Build both graphs from the crawled data and convert them to float arrays.
# np.float64 replaces the np.float alias, which no longer exists in current
# NumPy releases.
graph, ids, authors_graph, authors = create_graph('semanticCrawler/data.json')
graph = np.array(graph, dtype=np.float64)
authors_graph = np.array(authors_graph, dtype=np.float64)

In [109]:
# PageRank score for every paper, in the row order of the citation graph.
pr = pageRank(graph)

ERR: 2000.0
ERR: 1156.8520238095234
ERR: 476.46353660863895
ERR: 229.65400258650843
ERR: 112.62301365575024
ERR: 56.98554497768786
ERR: 29.032707209362307
ERR: 14.750073802979609
ERR: 7.461954984207983
ERR: 3.8024358769042843
ERR: 1.935596239438159
ERR: 0.9959575075906749
ERR: 0.5112930186440392
ERR: 0.2636425033725379
ERR: 0.1359713471133596
ERR: 0.07002016997793546
ERR: 0.0360416882318603
ERR: 0.018526308021111104
ERR: 0.009517107780170078
ERR: 0.0048848875597231715
ERR: 0.0025067513316527662
ERR: 0.001285984903271673
ERR: 0.0006597065558292028
ERR: 0.00033840719178822553
ERR: 0.00017359993900468228


In [26]:
# Write each score back to its document. The id used is the 1-based position
# of the score in `pr`.
# NOTE(review): this assumes the papers were indexed with ids 1..n in the same
# order in which the graph rows were built — confirm before running.
for doc_id, score in enumerate(pr, start=1):
    ms.update(doc_id, {'paper': {'page_rank': score}})

# 4. Weighted search over title, abstract, and date

In [106]:
# Example query: title weight 1.0, abstract weight 0.0, date weight 1.0,
# PageRank scoring off, one hit requested.
ms.search('coordinate', 'coordinate', 2018, 1.0, 0.0, 1.0, False, 1)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [{'_id': '5',
    '_index': 'paper_index',
    '_score': 1.0,
    '_source': {'paper': {'abstract': 'Big-data server applications frequently encounter data misses, and hence, lose significant performance potential. One way to reduce the number of data misses or their effect is data prefetching. As data accesses have high temporal correlations, temporal prefetching techniques are promising for them. While state-of-the-art temporal prefetching techniques are effective at reducing the number of data misses, we observe that there is a significant gap between what they offer and the opportunity. This work aims to improve the effectiveness of temporal prefetching techniques. We identify the lookup mechanism of existing temporal prefetchers responsible for the large gap between what they offer and the opportunity. Existing lookup mechanisms either not choose the right stream in the history, or unnecessaril

# 5. Author ranking with HITS

In [129]:
def normalize(v):
    """Scale ``v`` so that its entries sum to 1.

    When the entries sum to zero, ``v`` itself is handed back untouched
    rather than dividing by zero.
    """
    total = sum(v)
    return v if total == 0 else v / total

In [152]:
def HITS(graph, epochs=5):
    """Run ``epochs`` rounds of HITS on ``graph`` and return authority scores.

    Each round derives hub scores from the current authorities
    (``graph . a``), derives new authorities from those hubs (``h . graph``),
    and then rescales both vectors with ``normalize``. Only the authority
    vector is returned.
    """
    node_count = len(graph)
    authority = np.ones(node_count)
    hub = np.ones(node_count)

    for _ in range(epochs):
        hub = graph.dot(authority)
        # Authorities are computed from the hub scores before rescaling.
        authority = normalize(hub.dot(graph))
        hub = normalize(hub)

    return authority

In [156]:
def get_best_k(authors_graph, authors, k):
    """Return the ``k`` highest-scoring authors as ``(score, name)`` pairs.

    Scores are the authority values computed by ``HITS`` on
    ``authors_graph``; ``authors[i]`` names the node in row/column ``i``.
    Pairs are ordered from highest to lowest.
    """
    scores = HITS(authors_graph)
    ranked = sorted(zip(scores, authors), reverse=True)
    return ranked[:k]

In [158]:
get_best_k(authors_graph, authors, 10)

[(0.012438237419841238, 'Torsten N. Wiesel'),
 (0.011940380560591296, 'David H. Hubel'),
 (0.009335599102681308, 'Peter H. Schiller'),
 (0.008338461129592932, 'Max S. Cynader'),
 (0.008239153214949906, 'Nancy Berman'),
 (0.00785247232450169, 'Charles D. Gilbert'),
 (0.007708886458653803, 'Michael P. Stryker'),
 (0.007611779177375818, 'F. W. Campbell'),
 (0.00751187564514074, 'Raymond D. Lund'),
 (0.007457760850912143, 'Simon Levay')]

# User Interface

In [167]:
from IPython.display import display,clear_output
import ipywidgets as widgets

# Controls for the indexing UI: a text box for the JSON file path, one button
# per action, and an Output area that the click handlers print into.
path_txt = widgets.Text(description="File path", width=200)
index_btn = widgets.Button(description='Index')
delete_btn = widgets.Button(description='Delete index')
out = widgets.Output()

def index_btn_click(event):
    """Click handler: index the file named in ``path_txt`` and report the result.

    All messages are written to the ``out`` widget, which is cleared first.
    An empty path is rejected without touching the index.
    """
    with out:
        clear_output(wait=False)
        if not path_txt.value:
            print('Error: Empty path!')
            return

        print('Indexing...')
        try:
            succeeded = ms.index_json(path_txt.value)
        except (OSError, ValueError, KeyError) as exc:
            # index_json signals problems by raising, so without this the
            # 'Error!' message could never be shown. Covers an unreadable
            # file, invalid JSON, and entries with missing keys; anything
            # else (e.g. client/connection errors) still propagates.
            print('Error!', exc)
            return

        print('Done' if succeeded else 'Error!')
    
def delete_btn_click(event):
    """Click handler: delete the index and show the response in ``out``."""
    with out:
        clear_output(wait=False)
        response = ms.delete_index()
        print(response)

# Attach each handler to its button, then render the controls followed by the
# output area.
index_btn.on_click(index_btn_click)
delete_btn.on_click(delete_btn_click)
display(path_txt)
display(index_btn)
display(delete_btn)
display(out)