In [47]:
import pymongo
import networkx as nx
from itertools import combinations
import pandas as pd
from networkx.readwrite import json_graph
import json
import datetime
from fa2l import force_atlas2_layout


In [48]:
# !pip install fa2l

In [49]:
client = pymongo.MongoClient()
mdb = client.arxiv
db_papers = mdb.papers

In [50]:
papers = list(db_papers.find({'time_published': {'$gt': datetime.datetime(2018, 1, 1)}}))
# papers = papers[:5000]

Save list of fields to show in the menu

In [51]:
all_fields = json.load(open('all_arxiv_categories.json', 'r'))
optional_fields = {key: 0 for key, val in all_fields.items()} 
for p in papers:
#     r_fields.add(p['arxiv_primary_category']['term'])
    for t in p['tags']:
        if t['term'] in optional_fields:
            optional_fields[t['term']] += 1
        
fields_counter = {key: val for key, val in optional_fields.items() if val > 30}
FIELDS = {key: all_fields[key] for key in fields_counter}
fields_to_save = [{'key': key, 'value': all_fields[key]} for key in fields_counter]
json.dump(fields_to_save, open('relevant_arxiv_categories.json', 'w'))
print(f'Num fields {len(fields_to_save)}')

Num fields 48


In [63]:
def add_or_update_node(G, name, p):
    if G.has_node(name):
        G.node[name]['weight'] += 1
    else:
        G.add_node(name, weight=1, fields=set())
    for t in p['tags']:
        if t['term'] in FIELDS:
            G.node[name]['fields'].add(t['term'])
        
def add_or_update_edge(G, edge):
    node1, node2 = edge
    if G.has_edge(node1, node2):
        G[node1][node2]['weight'] += 1
    else:
        G.add_edge(node1, node2, weight=1)
        
    
def save_to_json_vis(G, filename):
    output = {'nodes': [], 'edges': []}
    json_G = json_graph.node_link_data(G)
    for n in json_G['nodes']:
        n['label'] = n['id']
        n['title'] = '{} ({})'.format(n['id'], n['weight'])
        n['value'] = n['weight'] 
        n['fields'] = list(n['fields'])
        output['nodes'].append(n)
    for l in json_G['links']:
        output['edges'].append({'from': l['source'], 'to': l['target'], 'width': l['weight']})
        
    json.dump(output, open(filename, 'w'))

def add_component_data(G):
    for c_id, cur_ns in enumerate(nx.connected_components(G)):
        for n in cur_ns:
            G.node[n]['component'] = c_id

def add_positions(G):
    pos = json.load(open('positions.json', 'r'))
    for n, cur_p in pos.items():
        G.node[n]['x'] = cur_p['x']
        G.node[n]['y'] = cur_p['y']
        
def add_positions_networkx(G):
    pos = force_atlas2_layout(G,
                                iterations=100,
                                pos_list=None,
                                node_masses=None,
                                outbound_attraction_distribution=False,
                                lin_log_mode=True,
                                prevent_overlapping=False,
                                edge_weight_influence=1.0,

                                jitter_tolerance=1.0,
                                barnes_hut_optimize=True,
                                barnes_hut_theta=1,

                                scaling_ratio=2.0,
                                strong_gravity_mode=False,
                                multithread=False,
                                gravity=1.0)
    for n, cur_p in pos.items():
        G.node[n]['x'] = cur_p[0] 
        G.node[n]['y'] = cur_p[1]

In [70]:
G = nx.Graph()

for p in papers:
    if len(p['authors']) > 1:
        names = [a['name'] for a in p['authors']]
        for n in names:
            add_or_update_node(G, n, p)
            
        edges = combinations(names, 2)
        for e in edges:
            add_or_update_edge(G, e)

print(f'Num nodes {len(G.nodes())}')

to_remove = []
for n in G.nodes(data=True):
    if n[1]['weight'] <= 1:
        to_remove.append(n[0])

# G.remove_nodes_from(to_remove)
# print(f'Num nodes {len(G.nodes())}')

# to_remove = []
# for n in G.nodes():
#     if len(G.edges(n)) == 0 and G.node[n]['weight'] < 3:
#         to_remove.append(n)
        
# G.remove_nodes_from(to_remove)
# print(f'Num nodes {len(G.nodes())}')

# add_component_data(G)
# add_positions(G)

Num nodes 29127


In [786]:
n = list(G.nodes(data=True))
max(n, key=lambda x: x[1]['weight'])

('Yang Liu',
 {'weight': 33,
  'fields': {'cs.AI',
   'cs.CL',
   'cs.CR',
   'cs.CV',
   'cs.CY',
   'cs.GR',
   'cs.GT',
   'cs.LG',
   'cs.MA',
   'cs.NE',
   'cs.SD',
   'cs.SE',
   'cs.SY',
   'eess.AS',
   'eess.IV',
   'math.OC',
   'stat.ML'},
  'component': 0,
  'x': -2077,
  'y': 176})

In [787]:
save_to_json_vis(G, 'static/network_data.json')

Old parts:

In [806]:
# a = next(nx.algorithms.community.girvan_newman(G))

In [333]:
nodes = []
for n in G.nodes(data=True):
    nodes.append({'name': n[0], 'num': n[1]['count']})
df = pd.DataFrame(nodes)
df[df.num > 5]



Unnamed: 0,name,num
26,Pascal Fua,6
69,Yang Liu,6
127,Larry S. Davis,8
325,Sergey Levine,7
486,Xin Wang,6
521,Yoshua Bengio,9
624,Dogancan Temel,8
625,Ghassan AlRegib,8


### Google Scholar citation search. Got banned after 10 requests :(

In [8]:
import scholar
querier = scholar.ScholarQuerier()
settings = scholar.ScholarSettings()
querier.apply_settings(settings)

# scholar.ScholarConf.COOKIE_JAR_FILE = 'scholar_cookie.txt'
# querier.save_cookies()

for p in papers[-20:-1]:
    query = scholar.SearchScholarQuery()
    url = f'https://arxiv.org/abs/{p["_id"]}'
    print(url)
    query.set_words(url)
    querier.send_query(query)
    break
    citations = {'num': 0}
    if querier.articles:
        attrs = querier.articles[0].attrs
        url = attrs['url'][0]
        if p['_id'] not in url:
            print(f'Wrong Paper - {url}')
        else:
            citations['num'] = attrs['num_citations'][0]
            citations['url'] = attrs['url_citations'][0]
    print(citations)
    db_papers.update_one({'_id': p['_id']}, {'$set': {'citations': citations}}, True)

    

https://arxiv.org/abs/1809.04696


In [29]:
# db_papers.update_one({'_id': '1802.05382'}, {'$set': {'test': 'test'}}, True)

In [74]:
# list(filter(lambda x: 'Hinton' in x, G.nodes()))

{'weight': 1, 'fields': {'cs.CR', 'cs.CV', 'cs.LG', 'stat.ML'}}