# Connections of Top Authors and their network

In [1]:
from __future__ import division
from __future__ import print_function
# The %... is an iPython thing, and is not part of the Python language.
# In this case we're just telling the plotting library to draw things on
# the notebook, instead of on a separate window.
%matplotlib inline
import os
os.getcwd()
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
import csv
from collections import Counter
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")


import random
from PIL import Image
from os import path
from nltk.corpus import stopwords
from scipy.misc import imread
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from wordcloud import WordCloud, STOPWORDS

import mysql.connector
from mysql.connector import errorcode
from mysql.connector import Error

import plotly
import networkx as nx
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
# Plotly cloud credentials must not be committed with the notebook. Every figure
# in this notebook is rendered with plotly.offline.iplot, so credentials are
# optional; they are only written to the local plotly credentials file when
# both environment variables are provided.
_plotly_username = os.environ.get('PLOTLY_USERNAME')
_plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if _plotly_username and _plotly_api_key:
    plotly.tools.set_credentials_file(username=_plotly_username, api_key=_plotly_api_key)

import plot_network
import plotly.plotly as py
from plotly.graph_objs import *
#sns.set_context("poster")

from fuzzywuzzy import fuzz
from fuzzywuzzy import process





## Import data from SQL database

In [2]:
# Project-local helper module used for the database queries in this notebook.
import mysql_setup
# Run a SELECT over the whole Author table. The helper returns two values,
# unpacked here as the fetched rows and the column names
# (presumably a list of row tuples and a list of strings -- TODO confirm in mysql_setup).
query_list,field_names = mysql_setup.query_with_fetchmany("SELECT * FROM Author")
# Build the Author table object; later cells use it as a pandas DataFrame.
Author = mysql_setup.make_frame(query_list,field_names)

In [3]:
# Same pattern as the Author load, for the Publishing_Detail table.
# NOTE: query_list and field_names are overwritten here and again by the later
# per-author query cell.
query_list,field_names = mysql_setup.query_with_fetchmany("SELECT * FROM Publishing_Detail")
Publishing_Detail = mysql_setup.make_frame(query_list,field_names)

## Extract top authors and their corresponding publications

In [4]:
# Row labels ordered from most- to least-cited, truncated to the first four.
ind = Author['Author_Cited_By'].sort_values(ascending=False).index[:4]
# Selecting through a boolean mask keeps the rows in their original table order
# (not in citation order).
top_Authors = Author.loc[Author.index.isin(ind)]
# Lookup table: Author_Id -> Author_Name for the selected authors.
a_dict = top_Authors.set_index('Author_Id')['Author_Name'].to_dict()
top_Authors

Unnamed: 0,Author_Id,Author_Name,Author_Affiliation,Author_Email,Author_Interests,Author_Cited_By,Author_hIndex,Author_i10Index,Author_hIndex_recent,Author_i10Index_recent
7,JicYPdAAAAAJ,Geoffrey Hinton,"Emeritus Professor of Computer Science, Univer...",@cs.toronto.edu,"machine learning,neural networks,artificial in...",133698,120,288,87,205
18,vtegaJgAAAAJ,vapnik,"Professor of Columbia, Fellow of NEC Labs Amer...",@nec-labs.com,"machine learning,statistics,computer science",168020,108,372,73,278
20,yxUduqMAAAAJ,Michael I. Jordan,"Professor of EECS and Professor of Statistics,...",@cs.berkeley.edu,"machine learning,statistics,computational biol...",99956,129,397,88,334
21,ZpG_cJwAAAAJ,Robert Tibshirani,"Professor of Health Research and Policy, and S...",@stanford.edu,"Statistics,Applied Statistics,Statistical lear...",222620,130,336,94,283


In [5]:
# Display the Author_Id -> Author_Name mapping built from top_Authors.
a_dict

{u'JicYPdAAAAAJ': u'Geoffrey Hinton',
 u'ZpG_cJwAAAAJ': u'Robert Tibshirani',
 u'vtegaJgAAAAJ': u'vapnik',
 u'yxUduqMAAAAJ': u'Michael I. Jordan'}

In [6]:
#Get the publications of the required authors
# Materialise the ids as a list: later cells index into a_id by position.
a_id = list(a_dict.keys())
DB_NAME = 'NLP_Project'
# One "%s" placeholder per id, so the ids are bound by the driver instead of
# being formatted into the SQL text.
in_p = ', '.join(['%s'] * len(a_id))
# NOTE(review): the earlier cell queries this table as "Publishing_Detail";
# MySQL table names can be case-sensitive depending on the platform -- confirm.
sql = "SELECT * FROM publishing_Detail WHERE Author_Id in (%s)" % in_p

# The database password is read from the MYSQL_PASSWORD environment variable
# so that it is not stored in the notebook.
cnx = mysql.connector.connect(user='root', password=os.environ['MYSQL_PASSWORD'])
try:
    cursor = cnx.cursor()
    try:
        cnx.database = DB_NAME
        cursor.execute(sql, tuple(a_id))
        # Column names come from the cursor metadata, so they are defined even
        # when the query matches no rows.
        field_names = [column[0] for column in cursor.description]
        query_list = cursor.fetchall()
    finally:
        # Always release the cursor and the connection, even if the query fails.
        cursor.close()
finally:
    cnx.close()

Author_Pubs = mysql_setup.make_frame(query_list, field_names)
Author_Pubs.head()

Unnamed: 0,Pub_Id,Pub_Title,Pub_Authors,Pub_Publisher,Pub_Journal,Pub_Abstract,Pub_Volume,Pub_Year,Pub_Citedby,Pub_URL,Author_Id
0,JicYPdAAAAAJ:-6RzNnnwWf8C,"Pittsburgh, PA 15213","David C Plant,Steven J Now1an,Geoffrey E Hinton",,,"7 Abstract Rumelhart, Hinton and Williams [Rum...",,1986,-99,https://www.cs.toronto.edu/~hinton/absps/bptr.pdf,JicYPdAAAAAJ
1,JicYPdAAAAAJ:-7ulzOJl1JYC,Error Propagation,"DE Rummelhart,GE Hinton,RJ Williams Learning I...",,Parallel Distributed Processing,,,-99,20,http://scholar.google.com/scholar?cluster=1578...,JicYPdAAAAAJ
2,JicYPdAAAAAJ:-DxkuPiZhfEC,WJ CLANCEY,"S AMAREL,Y ANZAI,HG BARROW,H BERLINER,RS BOYER...",,,,,-99,-99,http://scholar.google.com/scholar?cluster=6697...,JicYPdAAAAAJ
3,JicYPdAAAAAJ:-f6ydRqryjwC,Learning translation invariant recognition in ...,Geoffrey E Hinton,Springer Berlin Heidelberg,,One major goal of research on massively parall...,,1987,177,http://link.springer.com/chapter/10.1007/3-540...,JicYPdAAAAAJ
4,JicYPdAAAAAJ:-mN3Mh-tlDkC,"E ‘-Jgjfssl ‘-y Cw. 117-tae.\, I987./ttttttttt...","Geoffrey E Hinton,David C Plaut",,,\ Abstract _ Connectionist models usually have...,,-99,-99,https://www.cnbc.pitt.edu/~plaut/papers/pdf/Hi...,JicYPdAAAAAJ


In [7]:
# Keep only the publications whose Author_Id matches the first id in a_id.
first_author_mask = Author_Pubs['Author_Id'] == a_id[0]
pubs = Author_Pubs[first_author_mask]

## Create author networks

Collect all co-authors from a given author's publications and count how many times each pair has co-authored a paper. From these counts, build an adjacency matrix of the connections between the author and his co-authors, as well as among the co-authors themselves. This also shows how the co-authors are connected to one another.

For each graph, hover over a node to see the details related to that node. Edge strength is not yet displayed (adding it turned out to be more complicated than expected). The size and the colour of each node vary with its degree, i.e. its number of connections.

In [8]:
# All the co-authors of a given author
# `pubs` holds the publications selected with a_id[0], so the matching display
# name is looked up from a_dict instead of being hard-coded.
author_name = str(a_dict[a_id[0]])

x = pubs['Pub_Authors'].tolist()
x_authors = ",".join(unicode(i) for i in x if not i.isdigit())
x_authors = x_authors.lower()

#Create adjacency matrix of author connections
x_authors = set(x_authors.split(","))
# A sorted label list gives the matrix (and therefore the integer node ids of
# the graph) a deterministic order; a bare set has no defined order.
author_index = sorted(x_authors)
adj_mat = pd.DataFrame(0, index=author_index, columns=author_index)
for authors in x:
    names = authors.lower().split(",")
    # Count every unordered pair of distinct authors on a paper exactly once and
    # mirror it, so the matrix is symmetric with an all-zero diagonal. (Iterating
    # only up to len-1 on one side while including i == j on the other skips
    # the last author and adds self-loops.)
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            first, second = names[i], names[j]
            if first == second:
                continue
            adj_mat.at[first, second] += 1
            adj_mat.at[second, first] += 1

Gr = nx.from_numpy_matrix(adj_mat.values)

# Keep only the pairs that co-authored more than one paper.
G = [(source, target, attr) for source, target, attr in
     Gr.edges(data=True) if attr['weight'] > 1]
new_network = nx.Graph()
new_network.add_edges_from(G)


position = nx.spring_layout(new_network)
# Node ids are integer positions in adj_mat, so they index its column labels.
labels = adj_mat.columns[list(new_network.nodes())]

traceE = plot_network.scatter_edges(new_network, position)
traceN = plot_network.scatter_nodes(new_network, position, labels=labels)

fig = Figure(data=Data([traceE, traceN]),
             layout=Layout(
                title='<br>Network graph - ' + author_name,
                titlefont=dict(size=18),
                showlegend=False,
                width=1000,
                height=1000,
                margin=dict(b=20, l=5, r=5, t=40),
                hovermode='closest',
                xaxis=XAxis(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=YAxis(showgrid=False, zeroline=False, showticklabels=False)))

iplot(fig, filename=author_name)

In [9]:
# Display the set of lower-cased co-author names collected in the previous cell.
x_authors

{u'p herbert leiderman',
 u'balakanapathy rajaratnam',
 u'robert j huebner',
 u'andrew fire',
 u'emmanuel candes',
 u'joachim torhorst',
 u'cantor michael',
 u'david j inwards',
 u'rj arceci',
 u'efron bradley',
 u'saul a rosenberg',
 u'eugene i rosanoff',
 u'd botstein',
 u'oliver w press',
 u'ra olshen',
 u'chi ma',
 u'matthew w anderson',
 u't louis',
 u'jm vose',
 u'celia mt greenwood',
 u'wonshik han',
 u'robert textor',
 u'jialing zhang',
 u'max grazier g\u2019sell',
 u'tarangini deshpande',
 u'f batliwalla',
 u'terence tao',
 u'louis kouadio',
 u'l tarassenko',
 u'ephraim m hanks',
 u'uwe scherf',
 u'ezequiel martinez',
 u'brett d arnoldo',
 u'ellen schlichting',
 u'roberto herrera-goepfert',
 u'christine steinhoff',
 u'l lu',
 u'nj crellin',
 u'mia levy',
 u'inigo espinosa',
 u'mr chang',
 u'john c parker',
 u'jerome friedman',
 u'mohith sadaram',
 u'pamela m merrick',
 u'jean-marc robin',
 u'tze leung lai',
 u'p eystein lonning',
 u'chris marrison',
 u'millard h lambert',
 u'a

In [9]:
# Publications of the second selected author
author_id = a_id[1]
# Look the name up by id so the title always matches the filtered publications.
author_name = str(a_dict[author_id])
pubs = Author_Pubs[Author_Pubs['Author_Id'] == author_id]

# All the co-authors of a given author
x = pubs['Pub_Authors'].tolist()
x_authors = ",".join(unicode(i) for i in x if not i.isdigit())
x_authors = x_authors.lower()

#Create adjacency matrix of author connections
x_authors = set(x_authors.split(","))
# A sorted label list gives the matrix (and therefore the integer node ids of
# the graph) a deterministic order; a bare set has no defined order.
author_index = sorted(x_authors)
adj_mat = pd.DataFrame(0, index=author_index, columns=author_index)
for authors in x:
    names = authors.lower().split(",")
    # Count every unordered pair of distinct authors on a paper exactly once and
    # mirror it, so the matrix is symmetric with an all-zero diagonal. (Iterating
    # only up to len-1 on one side while including i == j on the other skips
    # the last author and adds self-loops.)
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            first, second = names[i], names[j]
            if first == second:
                continue
            adj_mat.at[first, second] += 1
            adj_mat.at[second, first] += 1

Gr = nx.from_numpy_matrix(adj_mat.values)

# Keep only the pairs that co-authored more than one paper.
G = [(source, target, attr) for source, target, attr in
     Gr.edges(data=True) if attr['weight'] > 1]
new_network = nx.Graph()
new_network.add_edges_from(G)


position = nx.spring_layout(new_network)
# Node ids are integer positions in adj_mat, so they index its column labels.
labels = adj_mat.columns[list(new_network.nodes())]

traceE = plot_network.scatter_edges(new_network, position)
traceN = plot_network.scatter_nodes(new_network, position, labels=labels)

fig = Figure(data=Data([traceE, traceN]),
             layout=Layout(
                title='<br>Network graph - ' + author_name,
                titlefont=dict(size=18),
                showlegend=False,
                width=1000,
                height=750,
                margin=dict(b=20, l=5, r=5, t=40),
                hovermode='closest',
                xaxis=XAxis(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=YAxis(showgrid=False, zeroline=False, showticklabels=False)))

iplot(fig, filename=author_name)


In [10]:
# Publications of the third selected author
author_id = a_id[2]
# Look the name up by id so the title always matches the filtered publications.
author_name = str(a_dict[author_id])
pubs = Author_Pubs[Author_Pubs['Author_Id'] == author_id]

# All the co-authors of a given author
x = pubs['Pub_Authors'].tolist()
x_authors = ",".join(unicode(i) for i in x if not i.isdigit())
x_authors = x_authors.lower()
# How often the main author's name appears verbatim among the co-author names.
# The names were lower-cased above, so the lookup key must be lower-cased too;
# otherwise a mixed-case name can never match.
name_occurrences = x_authors.split(",")
print(name_occurrences.count(author_name.lower()))

#Create adjacency matrix of author connections
x_authors = set(x_authors.split(","))
# A sorted label list gives the matrix (and therefore the integer node ids of
# the graph) a deterministic order; a bare set has no defined order.
author_index = sorted(x_authors)
adj_mat = pd.DataFrame(0, index=author_index, columns=author_index)
for authors in x:
    names = authors.lower().split(",")
    # Count every unordered pair of distinct authors on a paper exactly once and
    # mirror it, so the matrix is symmetric with an all-zero diagonal. (Iterating
    # only up to len-1 on one side while including i == j on the other skips
    # the last author and adds self-loops.)
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            first, second = names[i], names[j]
            if first == second:
                continue
            adj_mat.at[first, second] += 1
            adj_mat.at[second, first] += 1

Gr = nx.from_numpy_matrix(adj_mat.values)

# Keep only the pairs that co-authored more than one paper.
G = [(source, target, attr) for source, target, attr in
     Gr.edges(data=True) if attr['weight'] > 1]
new_network = nx.Graph()
new_network.add_edges_from(G)


position = nx.spring_layout(new_network)
# Node ids are integer positions in adj_mat, so they index its column labels.
labels = adj_mat.columns[list(new_network.nodes())]

traceE = plot_network.scatter_edges(new_network, position)
traceN = plot_network.scatter_nodes(new_network, position, labels=labels)

fig = Figure(data=Data([traceE, traceN]),
             layout=Layout(
                title='<br>Network graph - ' + author_name,
                titlefont=dict(size=18),
                showlegend=False,
                width=1000,
                height=750,
                margin=dict(b=20, l=5, r=5, t=40),
                hovermode='closest',
                xaxis=XAxis(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=YAxis(showgrid=False, zeroline=False, showticklabels=False)))

iplot(fig, filename=author_name)


0


In [11]:
# Publications of the fourth selected author
author_id = a_id[3]
# Look the name up by id so the title always matches the filtered publications.
author_name = str(a_dict[author_id])
pubs = Author_Pubs[Author_Pubs['Author_Id'] == author_id]

# All the co-authors of a given author
x = pubs['Pub_Authors'].tolist()
x_authors = ",".join(unicode(i) for i in x if not i.isdigit())
x_authors = x_authors.lower()
# How often the main author's name appears verbatim among the co-author names.
# The names were lower-cased above, so the lookup key must be lower-cased too;
# otherwise a mixed-case name can never match.
name_occurrences = x_authors.split(",")
print(name_occurrences.count(author_name.lower()))

#Create adjacency matrix of author connections
x_authors = set(x_authors.split(","))
# A sorted label list gives the matrix (and therefore the integer node ids of
# the graph) a deterministic order; a bare set has no defined order.
author_index = sorted(x_authors)
adj_mat = pd.DataFrame(0, index=author_index, columns=author_index)
for authors in x:
    names = authors.lower().split(",")
    # Count every unordered pair of distinct authors on a paper exactly once and
    # mirror it, so the matrix is symmetric with an all-zero diagonal. (Iterating
    # only up to len-1 on one side while including i == j on the other skips
    # the last author and adds self-loops.)
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            first, second = names[i], names[j]
            if first == second:
                continue
            adj_mat.at[first, second] += 1
            adj_mat.at[second, first] += 1

Gr = nx.from_numpy_matrix(adj_mat.values)

# Keep only the pairs that co-authored more than one paper.
G = [(source, target, attr) for source, target, attr in
     Gr.edges(data=True) if attr['weight'] > 1]
new_network = nx.Graph()
new_network.add_edges_from(G)


position = nx.spring_layout(new_network)
# Node ids are integer positions in adj_mat, so they index its column labels.
labels = adj_mat.columns[list(new_network.nodes())]

traceE = plot_network.scatter_edges(new_network, position)
traceN = plot_network.scatter_nodes(new_network, position, labels=labels)

fig = Figure(data=Data([traceE, traceN]),
             layout=Layout(
                title='<br>Network graph - ' + author_name,
                titlefont=dict(size=18),
                showlegend=False,
                width=1000,
                height=750,
                margin=dict(b=20, l=5, r=5, t=40),
                hovermode='closest',
                xaxis=XAxis(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=YAxis(showgrid=False, zeroline=False, showticklabels=False)))

iplot(fig, filename=author_name)

0


## Conclusion

Ideally, each of these graphs should look as follows: one central node for the main author (the author whose publications were extracted), with connections from that node to his co-authors and the rest of the network. Currently, because of differences in spelling and in the way names are written, we don't see every node connected. We need partial (fuzzy) matching of strings, which will bring in its own errors.