In [9]:
import numpy as np
import json

# Load the pre-processed corpus once; the cells below read from and
# write back into this same `super_data` dictionary.
SUPER_DATA_PATH = "data/super_data.json"
with open(SUPER_DATA_PATH, "r") as data_file:
    super_data = json.load(data_file)

# p_data: the paper records; index_phrase: phrase lookup keyed by the
# string form of a phrase index (it is indexed with str(...) further down).
p_data = super_data['papers']
index_phrase = super_data['index_phrase']

In [10]:
# build pagerank graph

def _fill_transition_column(graph, reserve, index, neighbours, similarities, originality):
    """Write the normalised link weights of paper `index` into column `index` of `graph`.

    The normaliser is ``z = originality + sum(similarities)``.  The diagonal
    entry and ``reserve[index]`` both receive ``originality / z``; every
    neighbour row receives ``similarity / z``.

    Parameters
    ----------
    graph : 2-D numpy array, modified in place.
    reserve : 1-D numpy array, modified in place.
    index : int, column to fill.
    neighbours : sequence of node ids (anything `int()` accepts).
    similarities : sequence of numbers, read position-by-position alongside
        `neighbours` (an IndexError is raised if it is shorter).
    originality : number added to the normaliser and used for the self-loop.
    """
    # dtype=float: a list that happens to hold only integers would otherwise
    # produce an integer array, which cannot be divided in place by a float.
    weights = np.array(similarities, dtype=float)
    # Sequential sum seeded with `originality`, same accumulation order as a
    # plain running total over the list.
    z = float(sum(similarities, originality))
    weights /= z
    reserve[index] = originality / z
    # The self-loop is written first, so a neighbour equal to `index`
    # overwrites it below.
    graph[index][index] = originality / z
    for pos, target in enumerate(neighbours):
        graph[int(target)][index] = weights[pos]


node_num = len(p_data)

bayes_graph = np.zeros((node_num, node_num))
bayes_reserve = np.zeros(node_num)

markov_graph = np.zeros((node_num, node_num))
markov_reserve = np.zeros(node_num)

# A node whose similarity sum is small gets a large self-loop share
# (originality / z), i.e. nodes with fewer citations have higher originality.
originality = 0.0000001

for p in p_data:

    index = int(p['index'])

    # undirected: uses the 'all_cite' links and their similarities
    _fill_transition_column(markov_graph, markov_reserve, index,
                            p['all_cite'], p['all_cite_sim'], originality)

    # directed: uses only the 'citations' links and their similarities
    _fill_transition_column(bayes_graph, bayes_reserve, index,
                            p['citations'], p['citations_sim'], originality)

# Initial rank vectors: all ones, minus each node's reserved share.
bayes_rank = np.ones(node_num) - bayes_reserve
markov_rank = np.ones(node_num) - markov_reserve

In [3]:
iterations = 500

# Rebuild the starting vectors here instead of continuing from whatever
# `bayes_rank` / `markov_rank` currently hold.  That way re-running this cell
# always yields the same result rather than iterating further and adding the
# reserve a second time.
bayes_rank = np.ones(node_num) - bayes_reserve
markov_rank = np.ones(node_num) - markov_reserve

# Repeated multiplication by each transition matrix.
for step in range(iterations):
    bayes_rank = np.dot(bayes_graph, bayes_rank)
    markov_rank = np.dot(markov_graph, markov_rank)

# Add the reserved share back (new arrays, nothing is mutated in place).
bayes_rank = bayes_rank + bayes_reserve
markov_rank = markov_rank + markov_reserve

bayes_rank[705]

2.796697653458339

In [4]:
#sanity check: show every paper whose bayes rank exceeds the threshold
RANK_THRESHOLD = 2.6

count = 0
for i in range(node_num):
    if bayes_rank[i] > RANK_THRESHOLD:
        count += 1
        paper = p_data[i]
        # Single-argument print(...) behaves the same on Python 2 and 3.
        # str() keeps the concatenation valid whether 'index' is stored as
        # text or as a number.
        print('#' + str(count) + ' ' + str(paper['index']))
        print(paper['title'])
        print('score: ' + str(bayes_rank[i]))
        print(' ')
        print(paper['abstract'])
        print(' ')
        print(' ')

#1 322
Time-Dependent Reliability Analysis by a Sampling Approach to Extreme Values of Stochastic Processes
score: 3.16610132847
 
Maintaining high accuracy and efficiency is a challenging issue in time-dependent reliability analysis. In this work, an accurate and efficient method is proposed for limit-state functions with the following features: The limit-state function is implicit with respect to time, and its input contains stochastic processes; the stochastic processes include only general strength and stress variables, or the limit-state function is monotonic to these stochastic processes. The new method employs random sampling approaches to estimate the distributions of the extreme values of the stochastic processes. The extreme values are then used to replace the corresponding stochastic processes, and consequently the time-dependent reliability analysis is converted into its time-invariant counterpart. The commonly used time-invariant reliability method, the First Order Reliabi

In [5]:
# Copy both scores onto each paper record, looked up by the paper's own index.
for paper in p_data:
    pos = int(paper['index'])
    paper['bayes_rank'] = bayes_rank[pos]
    paper['markov_rank'] = markov_rank[pos]

# Node indices ordered from the highest score to the lowest.
markov_order = np.argsort(markov_rank)
bayes_order = np.argsort(bayes_rank)
markov_ranks = markov_order[::-1]
bayes_ranks = bayes_order[::-1]

# Plain lists so the orderings can be stored in the JSON structure.
super_data['markov_ranks'] = markov_ranks.tolist()
super_data['bayes_ranks'] = bayes_ranks.tolist()

In [6]:
import networkx as nx
import community

# Undirected graph with one edge per (paper, cited paper) pair; the edge
# weight is the similarity stored at the same position in 'citations_sim'.
G = nx.Graph()
for paper in p_data:
    source = int(paper['index'])
    for pos, cited in enumerate(paper['citations']):
        G.add_edge(source, int(cited), weight=paper['citations_sim'][pos])

c_scores = nx.degree_centrality(G)

# One score per position in p_data; positions that are not graph nodes get 0.
centrality = np.array(
    [c_scores.get(pos, 0.0) for pos in range(len(p_data))],
    dtype=float,
)

# Positions ordered from the highest centrality to the lowest.
c_ranks = np.argsort(centrality)[::-1]
super_data['c_ranks'] = c_ranks.tolist()


In [7]:
from operator import itemgetter
# Community detection on the citation graph: maps graph node -> community label.
partition = community.best_partition(G)

# The distinct labels present in the partition.
check = set(partition.values())
label_num = len(check)

# group[label] lists the papers carrying that label.
# NOTE(review): indexing by label assumes labels run 0..label_num-1 - confirm
# against the python-louvain version in use.
group = [[] for label in range(label_num)]

for i, paper in enumerate(p_data):
    if i not in partition:
        # Paper is not a node of G, so it has no community.
        paper['louvain_index'] = -1
        continue
    label = partition[i]
    paper['louvain_index'] = label
    group[label].append(i)
    

# build group info: one record per community with its members, size and label
final_group_info = [
    {
        'nodes': group[label],
        'size': len(group[label]),
        'index': label,
    }
    for label in range(label_num)
]

    
#Top phrase for group
# Number of phrases kept per group; fewer are kept when the vocabulary is smaller.
TOP_PHRASE_COUNT = 30

for i in range(label_num):
    # Accumulated phrase weight over all papers of the group, one slot per phrase index.
    count = np.zeros(len(index_phrase))
    for j in group[i]:
        for key, weight in p_data[j]['phrases'].items():
            count[int(key)] += weight

    # Phrase indices from the heaviest to the lightest.
    b = np.argsort(count)[::-1]
    # Slicing (rather than indexing 0..29) cannot run past the end of `b`.
    top_phrase = [index_phrase[str(k)] for k in b[:TOP_PHRASE_COUNT]]
    final_group_info[i]['top_phrase'] = top_phrase

    # Group name from the three heaviest phrases; fall back to whatever is
    # available when there are fewer than three.
    if len(top_phrase) >= 3:
        name_str = (top_phrase[0]+', '+top_phrase[1]+' and '+top_phrase[2])
    else:
        name_str = ' and '.join(top_phrase)
    final_group_info[i]['name'] = name_str



#build connection
# For every group, collect the labels of the other groups that its papers
# are linked to through 'all_cite'.
for i in range(label_num):
    own_index = final_group_info[i]['index']
    connected_group = set()
    for node in final_group_info[i]['nodes']:
        for c in p_data[node]['all_cite']:
            out_index = p_data[int(c)]['louvain_index']
            # Compare against this group's own label.  The global `index`
            # is left over from a different loop and does not identify the
            # group being processed here.
            if out_index != own_index:
                connected_group.add(out_index)
    final_group_info[i]['connected_group'] = connected_group

# get importer, exporter and contribution score
for group in final_group_info:

    index = group['index']
    neighbours = group['connected_group']

    # other group -> set of this group's papers linked to it
    importer = {cg: set() for cg in neighbours}
    exporter = {cg: set() for cg in neighbours}
    # other group -> number of cross-group links
    import_score = dict.fromkeys(neighbours, 0)
    export_score = dict.fromkeys(neighbours, 0)
    exchange_score = dict.fromkeys(neighbours, 0)

    # 'citations' links feed the import side, 'cited_by' links the export
    # side; every cross-group link also counts towards the exchange score.
    directions = (
        ('citations', importer, import_score),
        ('cited_by', exporter, export_score),
    )
    for node in group['nodes']:
        for field, members, tally in directions:
            for c in p_data[node][field]:
                out_index = p_data[int(c)]['louvain_index']
                if out_index == index:
                    # link stays inside this group
                    continue
                members[out_index].add(node)
                tally[out_index] += 1
                exchange_score[out_index] += 1

    # sets become lists so the result can be stored as JSON
    for cg in neighbours:
        importer[cg] = list(importer[cg])
        exporter[cg] = list(exporter[cg])

    # [other group, score] pairs, highest score first
    by_score = itemgetter(1)
    group['import_list'] = sorted(
        [[k, v] for k, v in import_score.items()], key=by_score, reverse=True)
    group['export_list'] = sorted(
        [[k, v] for k, v in export_score.items()], key=by_score, reverse=True)
    group['exchange_list'] = sorted(
        [[k, v] for k, v in exchange_score.items()], key=by_score, reverse=True)

    group['importer'] = importer
    group['exporter'] = exporter


# turn set into list for storage (JSON has no set type)
for info in final_group_info:
    for field in ('nodes', 'connected_group'):
        info[field] = list(info[field])

super_data['louvain_group'] = final_group_info

In [8]:
import os

path = "data/super_data_2.json"

# Drop any previous export, then write the enriched structure out as JSON.
if os.path.isfile(path):
    os.remove(path)
with open(path, "w") as out_file:
    json.dump(super_data, out_file)