In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import networkx as nx

## Read files

In [16]:
col = pd.read_csv('../../data/processed/co_author_2.csv')

col = col[(col.year >= 1990) & (col.year < 2020)]

In [15]:
cite = pd.read_csv('../../data/processed/cites.csv')

In [5]:
col_authors = set(col.author1.unique()) | set(col.author2.unique())

cite_authors = set(cite.target.unique()) | set(cite.source.unique())

In [2]:
article = pd.read_csv('../../data/econometry/base_1_todos.csv')

article = article.dropna(subset=['year', 'gender'])

In [7]:
all_people = pd.read_csv("../../data/processed/people.csv")

In [8]:
places = pd.read_csv("../../data/processed/institution.csv")

places = places[['Handle', 'Primary-Name', 'alpha-3', 'region', 'sub-region']].set_index("Handle")

In [9]:
all_people = pd.merge(all_people,
                  places,
                  left_on="Workplace-Institution",
                  right_index=True,
                  how="left")
genders = dict(zip(all_people['Short-Id'], all_people['gender']))

In [10]:
article = article.sort_values(by='year')

age = article[article.year >= 1970].groupby('Short-Id').year.first().rename('age').reset_index()

all_people = pd.merge(all_people, age, how='left')

In [11]:
ages = dict(zip(all_people['Short-Id'], all_people['age']))

In [12]:
all_people.head()

Unnamed: 0,Short-Id,Workplace-Institution,first_name,last_name,gender,Primary-Name,alpha-3,region,sub-region,age
0,pne333,RePEc:edi:sfeixuk,stephen,nei,male,Oxford University,GBR,Europe,Northern Europe,
1,pnd1,RePEc:edi:permaus,leonce,ndikumana,male,University of Massachusetts-Amherst,USA,Americas,Northern America,1999.0
2,pnd33,RePEc:edi:cbngvng,obiageri,ndukwe,female,Central Bank of Nigeria,NGA,Africa,Sub-Saharan Africa,2018.0
3,pno11,,douglass,north,male,,,,,1970.0
4,pni74,RePEc:edi:cetorit,giovanna,nicodano,female,University of Turin,ITA,Europe,Southern Europe,1998.0


In [13]:
unis = dict(zip(all_people['Short-Id'], all_people['Workplace-Institution']))

In [14]:
countries = dict(zip(all_people['Short-Id'], all_people['alpha-3']))

In [15]:
_ = len(col_authors & cite_authors) / len(col_authors)
print(f"{_:.2%} of researchers in collaboration network are also in the citation network")

95.16% of researchers in collaboration network are also in the citation network


## Networks

In [16]:
col_adj = col.groupby(['author1', 'author2']).size().rename('weight').reset_index()

In [17]:
G_col = nx.from_pandas_edgelist(col_adj,
                                source='author1',
                                target='author2',
                                edge_attr='weight')

In [18]:
deg = pd.DataFrame(G_col.degree, columns=['node', 'degree'])

In [19]:
nx.set_node_attributes(G_col, genders, name='gender')

In [20]:
nx.set_node_attributes(G_col, ages, name='age')

In [21]:
nx.set_node_attributes(G_col, unis, name='university')

In [22]:
nx.set_node_attributes(G_col, countries, name='country')

## Degree

In [23]:
deg.degree.describe()

count    39330.000000
mean         5.530689
std          6.641845
min          1.000000
25%          2.000000
50%          3.000000
75%          7.000000
max        122.000000
Name: degree, dtype: float64

Order (number of nodes)

In [24]:
len(G_col)

39330

Number of edges

In [25]:
len(G_col.edges)

108761

## Assortativity

In [26]:
gac = nx.attribute_assortativity_coefficient(G_col, 'gender')

uac = nx.attribute_assortativity_coefficient(G_col, 'university')

cac = nx.attribute_assortativity_coefficient(G_col, 'country')

aac = nx.degree_assortativity_coefficient(G_col, weight='age')

In [30]:
print(f'GAC: {gac:.2}')
print(f'UAC: {uac:.2}')
print(f'CAC: {cac:.2}')
print(f'AAC: {aac:.2}')

GAC: 0.084
UAC: 0.11
CAC: 0.32
AAC: 0.15


In [31]:
ages_cat = {}
for k in ages:
    if not np.isnan(ages[k]):
        ages_cat[k] = str(int(ages[k]))
    else:
        ages_cat[k] = np.nan

In [32]:
nx.set_node_attributes(G_col, ages_cat, name='age_cat')

In [33]:
acac = nx.attribute_assortativity_coefficient(G_col, 'age_cat')

In [38]:
print(f'Age as a category assortativity: {acac:.2}')

Age as a category assortativity: 0.033


In [35]:
nx.number_connected_components(G_col)

1016

## Groups

In [48]:
def print_ass(g):
    
    gac = nx.attribute_assortativity_coefficient(g, 'gender')
    uac = nx.attribute_assortativity_coefficient(g, 'university')
    cac = nx.attribute_assortativity_coefficient(g, 'country')
    aac = nx.degree_assortativity_coefficient(g, weight='age')
    acac = nx.attribute_assortativity_coefficient(g, 'age_cat')
    
    print(f'GAC: {gac:.2}')
    print(f'UAC: {uac:.2}')
    print(f'CAC: {cac:.2}')
    print(f'AAC: {aac:.2}')
    print(f'Age as a category assortativity: {acac:.2}')

In [39]:
group = pd.read_csv('../../data/processed/network_people.csv')

super cited

In [57]:
cites = cite.groupby('target').size().rename('cites').reset_index()

In [64]:
super_cited = cites[cites.cites >= cites.cites.mean() + 1.5 * (cites.cites.quantile(.75) - cites.cites.quantile(.25))]['target'].unique()

In [65]:
G_sc = G_col.subgraph(super_cited)

In [41]:
G_a = G_col.subgraph(group[group.group=='A']['Short-Id'].values)

In [42]:
G_b = G_col.subgraph(group[group.group=='B']['Short-Id'].values)

In [44]:
deg_a = pd.DataFrame(G_a.degree, columns=['n', 'd'])

In [43]:
deg_b = pd.DataFrame(G_b.degree, columns=['n', 'd'])

In [66]:
deg_sc = pd.DataFrame(G_sc.degree, columns=['n', 'd'])

In [45]:
deg_a.describe()

Unnamed: 0,d
count,7160.0
mean,0.669553
std,1.026412
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,12.0


In [51]:
deg_b.describe()

Unnamed: 0,d
count,1382.0
mean,1.052098
std,1.196256
min,0.0
25%,0.0
50%,1.0
75%,2.0
max,8.0


In [67]:
deg_sc.describe()

Unnamed: 0,d
count,4271.0
mean,7.268555
std,5.66337
min,0.0
25%,3.0
50%,6.0
75%,9.0
max,60.0


In [47]:
len(G_a.edges)

2397

In [52]:
len(G_b.edges)

727

In [68]:
len(G_sc.edges)

15522

In [49]:
print_ass(G_a)

GAC: 0.12
UAC: 0.23
CAC: 0.56
AAC: 0.49
Age as a category assortativity: 0.067


In [53]:
print_ass(G_b)

GAC: 0.12
UAC: 0.077
CAC: 0.34
AAC: 0.28
Age as a category assortativity: 0.035


In [69]:
print_ass(G_sc)

GAC: 0.039
UAC: 0.054
CAC: 0.19
AAC: 0.12
Age as a category assortativity: 0.017


In [50]:
nx.number_connected_components(G_a)

5034

In [54]:
nx.number_connected_components(G_b)

724

In [70]:
nx.number_connected_components(G_sc)

81

# Descriptive stats

In [33]:
authors = article.groupby('Short-Id').agg({'gender': 'first', 'year': 'min', 'n_authors': 'count'}).reset_index()

In [24]:
G = nx.from_pandas_edgelist(col, source='author1', target='author2')

In [29]:
authors_col = pd.DataFrame(G.degree, columns=['Short-Id', 'co-authors'])

In [20]:
author_cite = cite.groupby('target').source.count().reset_index()

In [31]:
author_cite = author_cite.rename(columns={'target': 'Short-Id', 'source': 'cites'})

In [35]:
authors = authors.rename(columns={'year': 'first_paper_date', 'n_authors': 'papers'})

In [37]:
authors = pd.merge(authors, authors_col, how='left')
authors = pd.merge(authors, author_cite, how='left')

In [39]:
authors['age'] = 2019 - authors['first_paper_date']

In [53]:
authors = authors[authors['age'] >= 0]

In [48]:
authors = authors.drop('first_paper_date', axis=1)

In [56]:
print(authors.groupby('gender').mean().reset_index().round(2).to_latex())

\begin{tabular}{llrrrr}
\toprule
{} &  gender &  papers &  co-authors &   cites &    age \\
\midrule
0 &  female &    8.83 &        4.36 &   75.94 &  12.02 \\
1 &    male &   15.23 &        6.09 &  177.79 &  15.86 \\
\bottomrule
\end{tabular}

