In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import json

In [5]:
from pathlib import Path
import sys
import os
import dask.bag as db

In [6]:
# Extract Only the AI , ML PAPERS
def extractArxivData(categories=['stat.ML','cs.AI'],year=None,raw_data_path="../data/raw/",save_extracted_filename="../data/processed/AI_ML.json"):
    """ This function extracts data for the given set of categories and save the data into the save_extracted_filename path """
    records=db.read_text(raw_data_path+"/*.json").map(lambda x:json.loads(x))
    docs = (records.filter(lambda x:any(ele in x['categories'] for ele in categories)==True))
    extract_latest_version=lambda x:x['versions'][-1]["created"]
    if year!=None:
        docs=docs.filter(lambda x:int(extract_latest_version(x).split(" ")[3])>=year)

    get_metadata = lambda x: {'id': x['id'],
                  'title': x['title'],
                  'category':x['categories'],
                  'abstract':x['abstract'],
                 'version':x['versions'][-1]['created'],
                         'doi':x["doi"],
                         'authors_parsed':x['authors_parsed']}
                        
    data=docs.map(get_metadata).to_dataframe().compute()

    ## Creating authors fields by joining first and last nmes in authors_parsed columns.
    data['authors']=data['authors_parsed'].apply(lambda authors:[(" ".join(author)).strip() for author in authors])

    print("Number of Records Extracted for Given Set of Categories ",data.shape[0])
    Path(os.path.dirname(save_extracted_filename)).mkdir(parents=True, exist_ok=True)
    data.to_json(save_extracted_filename,orient="records")
    return data

In [8]:
RAW_DATA_PATH="arxiv_data/"
## Collect data for Papers published in ['stat.ML','cs.AI'] since year 2015.
arxiv_df = extractArxivData(categories=['stat.ML','cs.AI'], year=2015, raw_data_path=RAW_DATA_PATH,save_extracted_filename="AI_ML_since2015.json")

Number of Records Extracted for Given Set of Categories  75996


In [9]:
arxiv_df

Unnamed: 0,id,title,category,abstract,version,doi,authors_parsed,authors
0,0709.1667,Solving Constraint Satisfaction Problems throu...,cs.AI cond-mat.dis-nn cond-mat.stat-mech cs.CC,Message passing algorithms have proved surpr...,"Tue, 4 Jun 2019 11:43:45 GMT",,"[[Montanari, Andrea, ], [Ricci-Tersenghi, Fede...","[Montanari Andrea, Ricci-Tersenghi Federico, S..."
1,0811.2551,Modeling Cultural Dynamics,cs.MA cs.AI q-bio.NC,EVOC (for EVOlution of Culture) is a compute...,"Tue, 9 Jul 2019 20:25:22 GMT",,"[[Gabora, Liane, ]]",[Gabora Liane]
2,0812.0885,Elementary epistemological features of machine...,cs.AI,Theoretical analysis of machine intelligence...,"Fri, 30 Jun 2017 14:10:03 GMT",,"[[Horvat, Marko, ]]",[Horvat Marko]
3,0812.4044,The Offset Tree for Learning with Partial Labels,cs.LG cs.AI,"We present an algorithm, called the Offset T...","Sun, 3 Apr 2016 21:41:38 GMT",,"[[Beygelzimer, Alina, ], [Langford, John, ]]","[Beygelzimer Alina, Langford John]"
4,0905.1424,Concept Stability for Constructing Taxonomies ...,cs.CY cs.AI cs.SI stat.ML,Owners of a web-site are often interested in...,"Thu, 24 Nov 2016 19:25:12 GMT",,"[[Kuznetsov, Sergei O., ], [Ignatov, Dmitry I....","[Kuznetsov Sergei O., Ignatov Dmitry I.]"
...,...,...,...,...,...,...,...,...
75991,2111.12705,MixSyn: Learning Composition and Style for Mul...,cs.CV cs.AI,Synthetic images created by generative model...,"Wed, 24 Nov 2021 18:58:34 GMT",,"[[Demir, Ilke, ], [Ciftci, Umur A., ]]","[Demir Ilke, Ciftci Umur A.]"
75992,2111.12707,MHFormer: Multi-Hypothesis Transformer for 3D ...,cs.CV cs.AI cs.LG,Estimating 3D human poses from monocular vid...,"Wed, 24 Nov 2021 18:59:02 GMT",,"[[Li, Wenhao, ], [Liu, Hong, ], [Tang, Hao, ],...","[Li Wenhao, Liu Hong, Tang Hao, Wang Pichao, V..."
75993,cs/0601132,A Study on the Global Convergence Time Complex...,cs.AI cs.NE,The Estimation of Distribution Algorithm is ...,"Tue, 2 Apr 2019 10:03:57 GMT",,"[[Rastegar, R., ], [Meybodi, M. R., ]]","[Rastegar R., Meybodi M. R.]"
75994,cs/0604010,Nearly optimal exploration-exploitation decisi...,cs.AI cs.LG,While in general trading off exploration and...,"Mon, 4 Jun 2018 18:17:32 GMT",,"[[Dimitrakakis, Christos, ]]",[Dimitrakakis Christos]


In [10]:
arxiv_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75996 entries, 0 to 75995
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              75996 non-null  object
 1   title           75996 non-null  object
 2   category        75996 non-null  object
 3   abstract        75996 non-null  object
 4   version         75996 non-null  object
 5   doi             9275 non-null   object
 6   authors_parsed  75996 non-null  object
 7   authors         75996 non-null  object
dtypes: object(8)
memory usage: 4.6+ MB


In [11]:
from itertools import combinations

In [12]:
arxiv_df['author_pairs'] = arxiv_df['authors'].apply(lambda x: list(combinations(x, 2)))
arxiv_df.head()

Unnamed: 0,id,title,category,abstract,version,doi,authors_parsed,authors,author_pairs
0,709.1667,Solving Constraint Satisfaction Problems throu...,cs.AI cond-mat.dis-nn cond-mat.stat-mech cs.CC,Message passing algorithms have proved surpr...,"Tue, 4 Jun 2019 11:43:45 GMT",,"[[Montanari, Andrea, ], [Ricci-Tersenghi, Fede...","[Montanari Andrea, Ricci-Tersenghi Federico, S...","[(Montanari Andrea, Ricci-Tersenghi Federico),..."
1,811.2551,Modeling Cultural Dynamics,cs.MA cs.AI q-bio.NC,EVOC (for EVOlution of Culture) is a compute...,"Tue, 9 Jul 2019 20:25:22 GMT",,"[[Gabora, Liane, ]]",[Gabora Liane],[]
2,812.0885,Elementary epistemological features of machine...,cs.AI,Theoretical analysis of machine intelligence...,"Fri, 30 Jun 2017 14:10:03 GMT",,"[[Horvat, Marko, ]]",[Horvat Marko],[]
3,812.4044,The Offset Tree for Learning with Partial Labels,cs.LG cs.AI,"We present an algorithm, called the Offset T...","Sun, 3 Apr 2016 21:41:38 GMT",,"[[Beygelzimer, Alina, ], [Langford, John, ]]","[Beygelzimer Alina, Langford John]","[(Beygelzimer Alina, Langford John)]"
4,905.1424,Concept Stability for Constructing Taxonomies ...,cs.CY cs.AI cs.SI stat.ML,Owners of a web-site are often interested in...,"Thu, 24 Nov 2016 19:25:12 GMT",,"[[Kuznetsov, Sergei O., ], [Ignatov, Dmitry I....","[Kuznetsov Sergei O., Ignatov Dmitry I.]","[(Kuznetsov Sergei O., Ignatov Dmitry I.)]"


In [13]:
ai_authors = [author for author_list in arxiv_df['authors'] for author in author_list ]
len(ai_authors)

286021

In [14]:
from collections import Counter

In [15]:
paper_count_by_author = Counter(ai_authors)

In [16]:
paper_count_by_author.most_common(5)

[('Bengio Yoshua', 216),
 ('Levine Sergey', 216),
 ('Abbeel Pieter', 162),
 ('Jordan Michael I.', 157),
 ('Schölkopf Bernhard', 139)]

In [17]:
len(paper_count_by_author.keys())

116477

In [60]:
nodes_to_keep = {key for key, value in paper_count_by_author.items() if value >= 10}
len(nodes_to_keep)

4297

## Build CoAuthor Graph

In [19]:
author_edge_list = [author_pair for author_pair_list in arxiv_df['author_pairs'] for author_pair in author_pair_list]
author_edge_list

[('Montanari Andrea', 'Ricci-Tersenghi Federico'),
 ('Montanari Andrea', 'Semerjian Guilhem'),
 ('Ricci-Tersenghi Federico', 'Semerjian Guilhem'),
 ('Beygelzimer Alina', 'Langford John'),
 ('Kuznetsov Sergei O.', 'Ignatov Dmitry I.'),
 ('Blume Lawrence', 'Easley David'),
 ('Blume Lawrence', 'Halpern Joseph Y.'),
 ('Easley David', 'Halpern Joseph Y.'),
 ('Kakade Sham M.', 'Shamir Ohad'),
 ('Kakade Sham M.', 'Sridharan Karthik'),
 ('Kakade Sham M.', 'Tewari Ambuj'),
 ('Shamir Ohad', 'Sridharan Karthik'),
 ('Shamir Ohad', 'Tewari Ambuj'),
 ('Sridharan Karthik', 'Tewari Ambuj'),
 ('Lunagómez Simón', 'Mukherjee Sayan'),
 ('Lunagómez Simón', 'Wolpert Robert L.'),
 ('Lunagómez Simón', 'Airoldi Edoardo M.'),
 ('Mukherjee Sayan', 'Wolpert Robert L.'),
 ('Mukherjee Sayan', 'Airoldi Edoardo M.'),
 ('Wolpert Robert L.', 'Airoldi Edoardo M.'),
 ('DiPaola Steve', 'Gabora Liane'),
 ('Mossel Elchanan', 'Olsman Noah'),
 ('Mossel Elchanan', 'Tamuz Omer'),
 ('Olsman Noah', 'Tamuz Omer'),
 ('Golovin Danie

In [20]:
weighted_author_edge_list = list(Counter(author_edge_list).items())
weighted_author_edge_list

[(('Montanari Andrea', 'Ricci-Tersenghi Federico'), 2),
 (('Montanari Andrea', 'Semerjian Guilhem'), 1),
 (('Ricci-Tersenghi Federico', 'Semerjian Guilhem'), 1),
 (('Beygelzimer Alina', 'Langford John'), 3),
 (('Kuznetsov Sergei O.', 'Ignatov Dmitry I.'), 1),
 (('Blume Lawrence', 'Easley David'), 1),
 (('Blume Lawrence', 'Halpern Joseph Y.'), 1),
 (('Easley David', 'Halpern Joseph Y.'), 1),
 (('Kakade Sham M.', 'Shamir Ohad'), 1),
 (('Kakade Sham M.', 'Sridharan Karthik'), 1),
 (('Kakade Sham M.', 'Tewari Ambuj'), 1),
 (('Shamir Ohad', 'Sridharan Karthik'), 2),
 (('Shamir Ohad', 'Tewari Ambuj'), 1),
 (('Sridharan Karthik', 'Tewari Ambuj'), 1),
 (('Lunagómez Simón', 'Mukherjee Sayan'), 1),
 (('Lunagómez Simón', 'Wolpert Robert L.'), 1),
 (('Lunagómez Simón', 'Airoldi Edoardo M.'), 1),
 (('Mukherjee Sayan', 'Wolpert Robert L.'), 1),
 (('Mukherjee Sayan', 'Airoldi Edoardo M.'), 1),
 (('Wolpert Robert L.', 'Airoldi Edoardo M.'), 1),
 (('DiPaola Steve', 'Gabora Liane'), 2),
 (('Mossel Elcha

In [21]:
weighted_author_edge_list = [(item[0][0], item[0][1], item[1]) for item in weighted_author_edge_list]
weighted_author_edge_list[:10]

[('Montanari Andrea', 'Ricci-Tersenghi Federico', 2),
 ('Montanari Andrea', 'Semerjian Guilhem', 1),
 ('Ricci-Tersenghi Federico', 'Semerjian Guilhem', 1),
 ('Beygelzimer Alina', 'Langford John', 3),
 ('Kuznetsov Sergei O.', 'Ignatov Dmitry I.', 1),
 ('Blume Lawrence', 'Easley David', 1),
 ('Blume Lawrence', 'Halpern Joseph Y.', 1),
 ('Easley David', 'Halpern Joseph Y.', 1),
 ('Kakade Sham M.', 'Shamir Ohad', 1),
 ('Kakade Sham M.', 'Sridharan Karthik', 1)]

In [22]:
G = nx.Graph()
G.add_weighted_edges_from(weighted_author_edge_list)

In [23]:
print(nx.info(G))

Name: 
Type: Graph
Number of nodes: 114628
Number of edges: 578484
Average degree:  10.0932


In [61]:
sub_G = nx.subgraph(G, nodes_to_keep)
G = nx.Graph(sub_G)


print(nx.info(G))

Name: 
Type: Graph
Number of nodes: 4287
Number of edges: 29241
Average degree:  13.6417


In [62]:
isolated_nodes = list(nx.isolates(G))
len(isolated_nodes)

114

In [63]:
G.remove_nodes_from(isolated_nodes)
print(nx.info(G))

Name: 
Type: Graph
Number of nodes: 4173
Number of edges: 29241
Average degree:  14.0144


In [83]:
pyvis_network = Network(notebook=True, height = '600px', width = '100%', heading = 'Whole Co-Author Network')
pyvis_network.from_nx(G)
pyvis_network.show("whole.html")

## DeepWalk Algorithm

In [64]:
import random

In [65]:
def RandomWalk(graph, start_node, length_of_random_walk):
    
    current_node = start_node
    random_walk = [start_node]
    
    for i in range(length_of_random_walk):
        
        current_node_neighbors = list(graph.neighbors(current_node))
        chosen_node = random.choice(current_node_neighbors)
        current_node = chosen_node
        random_walk.append(current_node)
        
    return random_walk

In [66]:
from tqdm import tqdm

In [67]:
num_of_sampling = 10
random_walk_list = []
length_of_random_walk = 10

for node in tqdm(G.nodes(), desc = "Iterating Nodes"):
    for i in range(0, num_of_sampling):
        random_walk_list.append(RandomWalk(G, node, length_of_random_walk))

Iterating Nodes: 100%|███████████████████████████████████████████████████████████| 4173/4173 [00:00<00:00, 6161.92it/s]


In [68]:
from gensim.models.word2vec import Word2Vec

In [69]:
deepwalk_model = Word2Vec(sentences=random_walk_list, window=5, sg=1, negative=5, size=128, compute_loss=True)

In [70]:
deepwalk_model.save("deepwalk_since2015_gt10.model")

## Similarity Analysis

In [71]:
deepwalk_model.wv.most_similar("Fei-Fei Li")

[('Niebles Juan Carlos', 0.8413130640983582),
 ('Martín-Martín Roberto', 0.825070321559906),
 ('Soto Alvaro', 0.817757248878479),
 ('Piech Chris', 0.8129456639289856),
 ('Xu Danfei', 0.7855753302574158),
 ('Nair Suraj', 0.7797989845275879),
 ('Bohg Jeannette', 0.7712190747261047),
 ('Ren Hongyu', 0.761582612991333),
 ('Manning Christopher D.', 0.7494522929191589),
 ('Qureshi Ahmed H.', 0.7484649419784546)]

In [72]:
def getSimilarNodes(model, node):
    similarity = model.wv.most_similar(node)
    similar_nodes_df = pd.DataFrame()
    similar_nodes_df['similar_authors'] = [item[0] for item in similarity]
    similar_nodes_df['similarity_score'] = [item[1] for item in similarity]
    similar_nodes_df['source_author'] = node
    
    return similar_nodes_df

In [73]:
getSimilarNodes(deepwalk_model, "Fei-Fei Li")

Unnamed: 0,similar_authors,similarity_score,source_author
0,Niebles Juan Carlos,0.841313,Fei-Fei Li
1,Martín-Martín Roberto,0.82507,Fei-Fei Li
2,Soto Alvaro,0.817757,Fei-Fei Li
3,Piech Chris,0.812946,Fei-Fei Li
4,Xu Danfei,0.785575,Fei-Fei Li
5,Nair Suraj,0.779799,Fei-Fei Li
6,Bohg Jeannette,0.771219,Fei-Fei Li
7,Ren Hongyu,0.761583,Fei-Fei Li
8,Manning Christopher D.,0.749452,Fei-Fei Li
9,Qureshi Ahmed H.,0.748465,Fei-Fei Li


## CoAuthor Network Visualization

In [74]:
def getCoAuthorshipNetwork(graph, initial_nodes):
    count = 0
    nodes_list = [node for node in initial_nodes]
    
    for node in initial_nodes:
        neighbors = list(graph.neighbors(node))
        count += len(neighbors)
        nodes_list += neighbors
        
    return nodes_list

In [80]:
nodes_to_keep

{'Touati Ahmed',
 'Kollias Dimitrios',
 'Maheswaranathan Niru',
 'Simeone Osvaldo',
 'Yurochkin Mikhail',
 'Liu Jixue',
 'Choo Jaegul',
 'Sheth Amit',
 'Li Tianrui',
 'Chen Yixin',
 'Srivastava Biplav',
 'Nagai Takayuki',
 'Wang Qing',
 'van Gerven Marcel',
 'Javidi Tara',
 "D'Amour Alexander",
 'Caselles-Dupré Hugo',
 'Ahn Sungjin',
 'Russo Daniel',
 'Liu Dong',
 'Yip Michael C.',
 'Li Jinyu',
 'Liu Ming-Yu',
 'Liu Lin',
 'Chen Lei',
 'Passerini Andrea',
 'Yang Shuang',
 'Chi Yuejie',
 'Santos-Rodriguez Raul',
 'Xu Jie',
 'Yang Xin',
 'Milli Smitha',
 'Menon Aditya Krishna',
 'Johansson Fredrik D.',
 'Allen Genevera I.',
 'Ling Qing',
 'Saremi Saeed',
 'Duan Yan',
 'Wang Wenlin',
 'Chandar Sarath',
 'Benhamou Eric',
 'Neykov Matey',
 'Kumar Vikash',
 'Marchisio Alberto',
 'Kuang Kun',
 'Driggs-Campbell Katherine',
 'Evans David',
 'Vehtari Aki',
 'Shang Fanhua',
 'Huang Yu',
 'Vaswani Namrata',
 'Chernova Sonia',
 'Jaderberg Max',
 'Kira Zsolt',
 'Wang Yunlong',
 'Wang Xianzhi',
 'Wu 

In [75]:
paper_count_by_author.most_common(20)

[('Bengio Yoshua', 216),
 ('Levine Sergey', 216),
 ('Abbeel Pieter', 162),
 ('Jordan Michael I.', 157),
 ('Schölkopf Bernhard', 139),
 ('Liu Yang', 136),
 ('Sugiyama Masashi', 132),
 ('Wang Jun', 114),
 ('Carin Lawrence', 112),
 ('Welling Max', 111),
 ('Li Bo', 106),
 ('Ermon Stefano', 105),
 ('Krause Andreas', 104),
 ('Zhu Jun', 104),
 ('Mannor Shie', 100),
 ('Tao Dacheng', 93),
 ('Gu Quanquan', 93),
 ('Salakhutdinov Ruslan', 92),
 ('Teh Yee Whye', 91),
 ('Chen Pin-Yu', 91)]

In [82]:
coauthor_nodes = getCoAuthorshipNetwork(G, ['Bengio Yoshua', 'Fei-Fei Li', 'Teh Yee Whye'])

In [85]:
len(coauthor_nodes)

273

In [87]:
len(set(coauthor_nodes))

262

In [88]:
coauthor_subgraph = nx.subgraph(G, coauthor_nodes)
print(nx.info(coauthor_subgraph))

Name: 
Type: Graph
Number of nodes: 262
Number of edges: 1952
Average degree:  14.9008


In [41]:
import pyvis

In [42]:
from pyvis.network import Network

In [43]:
pyvis_network = Network(notebook=True, height = '600px', width = '100%', heading = 'Co-Author Network of Fei-Fei Li')
pyvis_network.from_nx(coauthor_subgraph)
pyvis_network.show("Fei-Fei_Li.html")

In [44]:
coauthor_embeddings = np.zeros((len(coauthor_nodes), 128))
for idx, node in enumerate(coauthor_nodes):
    coauthor_embeddings[idx, :] = deepwalk_model.wv[node]
coauthor_embeddings.shape

(643, 128)

In [45]:
from sklearn.metrics.pairwise import cosine_distances

In [46]:
pairwise_cosine_similarity = cosine_distances(coauthor_embeddings)
pairwise_cosine_similarity.shape

(643, 643)

In [47]:
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans

In [48]:
kmeans_model = KMeans(n_clusters=3)
coauthor_clusters = kmeans_model.fit_predict(pairwise_cosine_similarity)
print(kmeans_model.inertia_)

4265.678902368664


In [49]:
coauthor_cluster_dict = {node: str(coauthor_clusters[idx]) for idx, node in enumerate(coauthor_nodes)}

In [50]:
nx.set_node_attributes(coauthor_subgraph, coauthor_cluster_dict, "group")

In [51]:
pyvis_network = Network(notebook=True, height='800px', width='100%', heading = 'Author Network Clustering Effects')
pyvis_network.from_nx(coauthor_subgraph)
pyvis_network.toggle_physics(True)
pyvis_network.show("clustering.html")

In [55]:
fei_coauthor_nodes = getCoAuthorshipNetwork(G, ['Bengio Yoshua'])
fei_network = nx.subgraph(G, fei_coauthor_nodes)
print(nx.info(fei_network))

Name: 
Type: Graph
Number of nodes: 344
Number of edges: 2213
Average degree:  12.8663


In [56]:
pyvis_network = Network(notebook=True, height='800px', width='100%', heading = 'Fei-Fei Li Author Network Clustering Effects')
pyvis_network.from_nx(fei_network)
pyvis_network.toggle_physics(True)
pyvis_network.show("fei_clustering.html")