In [1]:
#Dimitris Spathis {dispathis@gmail.com} Nov 2016
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#%pylab inline
import mpld3
mpld3.enable_notebook()

In [2]:
# Per-author publication records; later cells read its name, dept, area,
# year, count and adjustedcount columns.
data = pd.read_csv(filepath_or_buffer='generated-author-info.csv')

In [3]:
# Institution/region lookup table, reduced to the rows whose region matches "europe".
# Rows with a missing region are dropped (na=False), exactly as the `== True` comparison did.
countries = pd.read_csv('country-info.csv')
countries = countries.loc[countries['region'].str.contains("europe", na=False)]

In [4]:
# Flag every author row whose dept contains the name of a European institution.
# Names are escaped so characters such as '.', '(' or '+' are matched literally
# instead of being interpreted as regex syntax; missing institution names are skipped.
pattern = '|'.join(re.escape(str(name)) for name in countries.institution.dropna())
# An empty pattern would match every row, so fall back to "nobody is European".
# na=False keeps the column strictly boolean when dept is missing.
data["europe"] = data.dept.str.contains(pattern, na=False) if pattern else False

In [5]:
# 'mlmining' : ['NIPS', 'ICML','KDD']
# 'ai' : ['AAAI', 'AAAI/IAAI', 'IJCAI']
#  'ir' : ['WWW', 'SIGIR']
# 'chi' : ['CHI','UbiComp','Ubicomp','UIST']
# 'nlp' : ['EMNLP','ACL','ACL (1)', 'ACL (2)', 'NAACL', 'HLT-NAACL'] 
# 'vision' : ['CVPR', 'CVPR (1,2,3...)', 'ICCV', 'ECCV (1,2,3...)']

# you can choose your fields here 
# https://github.com/emeryberger/CSrankings/blob/cec0143fbe6cbdbcd6c83f640ac24f665580307b/util/csrankings.py#L55

# Keep only the publications that belong to the research areas of interest.
selected_areas = ['mlmining', 'chi', 'ir', 'nlp', 'ai']
data = data[data['area'].isin(selected_areas)]

In [6]:
# Adjusted counts: each publication is counted exactly once, with credit adjusted by splitting evenly
# across all co-authors. This approach makes it impossible to boost rankings simply by adding authors to a paper.

# Per-column aggregation spec for the groupby on (name, dept).
# Because one entry is a list, pandas returns two-level columns for all of them:
# ('adjustedcount', 'sum'), ('year', 'mean'), ('count', 'sum'), ('europe', 'unique'),
# ('area', 'publications') and ('area', 'fields').
aggregations = {
    'adjustedcount': 'sum',
    'year': 'mean',
    'count': 'sum',
    'europe': 'unique',
    # (output name, function) pairs replace the nested-dict renamer, which
    # current pandas rejects with SpecificationError.
    'area': [
        ('publications', 'count'),
        ('fields', 'unique'),
    ],
}

In [7]:
# One row per (name, dept) pair, summarised with the aggregation spec defined earlier.
per_author = data.groupby(by=['name', 'dept'])
grouped = per_author.aggregate(aggregations)

In [8]:
# Lift the row cap so DataFrames are rendered in full rather than truncated.
pd.options.display.max_rows = None

In [9]:
# Rank authors by summed adjusted publication count, highest first.
# DataFrame.sort no longer exists in pandas; sort_values is its replacement.
df = grouped.sort_values(by=[('adjustedcount', 'sum')], ascending=False)

  if __name__ == '__main__':


In [10]:
# Turn each author's list of areas into indicator columns,
# e.g. [nlp, ai, mlmining, ir] => nlp 1 | ai 1 | chi 0 ... and append them to df.
area_lists = df[('area', 'fields')]
area_flags = area_lists.str.join(',').str.get_dummies(',')
df = pd.concat([df, area_flags], axis=1)

In [11]:
# Preference weights per area (arbitrary, tune to taste); the total drives the scatter marker size.
df["weight"] = (
    df["ai"] * 2
    + df["chi"] * 9
    + df["ir"] * 2
    + df["mlmining"] * 9
    + df["nlp"] * 7
)

In [12]:
# Render the ranked table (with its indicator and weight columns) as the cell output.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,"(count, sum)","(europe, unique)","(adjustedcount, sum)","(area, fields)","(area, publications)","(year, mean)",ai,chi,ir,mlmining,nlp,weight
name,dept,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Michael I. Jordan,University of California - Berkeley,138.0,[False],48.692857,"[nlp, ai, mlmining, ir]",33,2003.393939,1,0,1,1,1,20
Dan Roth,University of Illinois at Urbana-Champaign,99.0,[False],39.792857,"[nlp, ai, ir, mlmining]",48,2006.8125,1,0,1,1,1,20
Craig Boutilier,University of Toronto,72.0,[False],35.492857,"[ai, mlmining]",35,2002.714286,1,0,0,1,0,11
Christopher D. Manning,Stanford University,100.0,[False],35.284524,"[nlp, ai, ir, mlmining, chi]",33,2008.090909,1,1,1,1,1,29
Dan Klein,University of California - Berkeley,89.0,[False],35.033333,"[nlp, ai, ir, mlmining]",27,2008.074074,1,0,1,1,1,20
Tuomas Sandholm,Carnegie Mellon University,77.0,[False],34.216667,"[ai, mlmining]",25,2006.24,1,0,0,1,0,11
Pedro M. Domingos,University of Washington,61.0,[False],32.709524,"[nlp, ai, ir, mlmining]",37,2005.594595,1,0,1,1,1,20
Richard E. Korf,University of California - Los Angeles,47.0,[False],30.916667,[ai],26,2002.0,1,0,0,0,0,2
Raymond J. Mooney,University of Texas at Austin,67.0,[False],29.266667,"[nlp, ai, mlmining]",43,2003.488372,1,0,0,1,1,18
Jieping Ye,University of Michigan,101.0,[False],28.992136,"[nlp, mlmining, ai]",16,2011.125,1,0,0,1,1,18


In [13]:
# Pull the plotted quantities out of df as plain lists, in row order.
# Whole-column .tolist() replaces the per-row loop, which rebuilt the index list
# on every iteration and relied on integer lookups into label-indexed Series.
counts = df[('adjustedcount', 'sum')].tolist()
years = df[('year', 'mean')].tolist()
# Each entry is the (name, dept) index tuple of one row; used as the tooltip label.
names = df.index.tolist()
# NOTE: this rebinds `countries`, which earlier held the country-info DataFrame;
# the cell that builds the europe pattern needs that DataFrame reloaded before a re-run.
countries = df[('europe', 'unique')].tolist()
weights = df['weight'].tolist()

In [14]:
# Convert the numeric/flag lists to NumPy arrays for plotting; `names` stays a plain list.
counts, years, weights, countries = (
    np.asarray(values) for values in (counts, years, weights, countries)
)

In [15]:
%matplotlib inline
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(12,8))


custom_colors = ['red' if countries[i] == True else 'blue' for i in range(df.shape[0])]
scatter = ax.scatter(years,counts, c=custom_colors, s=weights*10, alpha=0.2)

ax.set_title("Computer Science professors in ML/AI/HCI/NLP by their avg. publication time in top venues", fontsize=15)
ax.set_ylabel('Adjusted publication count', fontsize=15)
ax.set_xlabel('Avg year of their publications', fontsize=15)
ax.grid(color='lightgray', alpha=0.7)

plt.locator_params(nbins=25)
labels = names
tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels, location='mouse')
mpld3.plugins.connect(fig, tooltip)

mpld3.save_html(fig, 'interactive_figure.html')
mpld3.display(fig)