In [12]:
import pandas as pd
import networkx as nx
import numpy as np
import seaborn as sns; sns.set(style='darkgrid')
from scipy import stats
from os.path import join
from numpy.random import choice

# Choose two groups of males and females to compare

An analysis of three decades: 1990--2019


We are interested in comparing the topological differences between male and female researchers in two networks:

1. Co-author network
2. Citation network

Two cases: most cited and average cited.

In [13]:
# Load the citation edge list (one row per source -> target citation).
cite = pd.read_csv("../data/processed/cites.csv")

# Keep only edges whose target year and source year both fall in
# [1990, 2020), and discard self-citations (target equal to source).
cite = cite[
    (cite["t_year"] >= 1990) & (cite["t_year"] < 2020)
    & (cite["s_year"] >= 1990) & (cite["s_year"] < 2020)
    & (cite["target"] != cite["source"])
]

In [14]:
# Load the researcher table; later cells rely on its "Short-Id" and
# "gender" columns.
people = pd.read_csv("../data/processed/people.csv")

In [15]:
# Attach researcher attributes to both endpoints of every citation edge.
# Step 1: look up the cited researcher (target) by Short-Id. The new
# columns come in unsuffixed at this point.
cite = pd.merge(
    left=cite,
    right=people.set_index("Short-Id"),
    left_on="target",
    right_index=True,
    how="left",
)

# Step 2: look up the citing researcher (source). The researcher columns
# now collide with those added in step 1, so the existing (target) ones get
# "_t" and the newly merged (source) ones get "_s", e.g. gender_t / gender_s.
cite = pd.merge(
    left=cite,
    right=people.set_index("Short-Id"),
    left_on="source",
    right_index=True,
    how="left",
    suffixes=("_t", "_s"),
)

In [16]:
# Number of citation edges received by each target, as a Series named "cites"
# indexed by the target id.
cite_t = cite.groupby("target").size().rename("cites")

# Left-join the counts onto the researcher table; researchers that never
# appear as a target end up with NaN in "cites".
people = pd.merge(
    left=people,
    right=cite_t,
    how="left",
    left_on="Short-Id",
    right_index=True,
)

In [17]:
# Researchers with no citation count (NaN after the left join) are not part
# of the citation network, so they are dropped here.
people = people.loc[people["cites"].notna()]

In [18]:
# Keep only citation edges where the gender of both the target and the
# source researcher is known.
cite = cite.loc[cite["gender_t"].notna() & cite["gender_s"].notna()]

## Research groups

We take as reference two groups of female researchers:

1) Successful: Outliers in terms of citations

2) Average: Inter-quartile researchers in terms of citations

In [19]:
# Female researchers that have a citation count.
femec = people.loc[people["gender"] == 'female']

In [20]:
# Quartiles and mean of the citation counts over all female researchers.
q1, q2, q3 = femec.cites.quantile([0.25, 0.5, 0.75])
mu = femec.cites.mean()
# Interquartile range
qr = (q3 - q1)

# Group A ("average"): female researchers whose citation count lies in the
# closed interval [q1, q3]. Series.between is inclusive on both ends by
# default; the boolean form `inclusive=True` was deprecated in pandas 1.3
# and is rejected by pandas 2.x, so the argument is simply omitted.
femec_avg = femec[femec.cites.between(q1, q3)]
# Citation counts of group A (same rows as femec_avg)
X_f = femec_avg.cites

# Group B ("successful"): female researchers whose citation count is
# strictly greater than mean + 1.5 * interquartile range.
femec_top = femec[femec.cites > mu + (1.5 * qr)]
# Citation counts of group B (same rows as femec_top)
X_f_top = femec_top.cites

In [21]:
# Summary statistics of the citation counts of group B (female researchers
# above the mean + 1.5 * IQR cutoff).
print("Group B")
X_f_top.describe()

Group B


count     770.000000
mean      450.893506
std       568.552630
min       153.000000
25%       198.000000
50%       268.500000
75%       459.750000
max      7294.000000
Name: cites, dtype: float64

In [22]:
# Summary statistics of the citation counts of group A (female researchers
# between the first and third quartile).
print("Group A")
X_f.describe()

Group A


count    3971.000000
mean       19.619239
std        14.715456
min         4.000000
25%         7.000000
50%        15.000000
75%        29.000000
max        58.000000
Name: cites, dtype: float64

## Let's now take a same-size sample for male researchers

To be able to compare between genders we need:
- A stratified sample as the range of citations in both groups is too large.

To take a stratified sample, we divide each citation group of female researchers into deciles and draw a similar number of male researchers from each decile.

In [23]:
# Male researchers that have a citation count.
malec = people.loc[people["gender"] == 'male']
# Candidates for group A: males whose citation count is exactly one of the
# counts observed among group-A females (value match, not an interval test).
malec_avg = malec.loc[malec["cites"].isin(X_f.unique())]
# Candidates for group B: same matching against the group-B female counts.
malec_top = malec.loc[malec["cites"].isin(X_f_top.unique())]

In [24]:
# Rebuild the male candidate pools from `people` so this cell can be re-run
# on its own: males whose citation count matches a count observed among the
# females of group A / group B.
malec = people.loc[people["gender"] == 'male']
malec_avg = malec.loc[malec["cites"].isin(X_f.unique())]
malec_top = malec.loc[malec["cites"].isin(X_f_top.unique())]

# Decile split of the female citation counts. With retbins=True, qcut
# returns a pair: (decile label 1..10 for every row, array of bin edges).
avg_deciles = pd.qcut(femec_avg["cites"], q=10, labels=range(1, 11), retbins=True)
top_deciles = pd.qcut(femec_top["cites"], q=10, labels=range(1, 11), retbins=True)

# Store each female researcher's decile label in column "q"
# (group A first, then group B).
femec_avg = femec_avg.assign(q=avg_deciles[0].values)
femec_top = femec_top.assign(q=top_deciles[0].values)

# Classify the males with the bin edges obtained from the females, so both
# genders share the same strata.
malec_avg = malec_avg.assign(
    q=pd.cut(
        malec_avg["cites"],
        bins=avg_deciles[1],
        labels=range(1, 11),
        include_lowest=True,
    ).values
)
malec_top = malec_top.assign(
    q=pd.cut(
        malec_top["cites"],
        bins=top_deciles[1],
        labels=range(1, 11),
        include_lowest=True,
    ).values
)

In [25]:
# Number of group-B female researchers in each decile.
femec_top["q"].value_counts()

2     83
9     78
6     78
8     77
4     77
1     77
10    76
7     76
5     75
3     73
Name: q, dtype: int64

In [26]:
# Group-A female head-count per decile; used as the per-decile sample size
# when drawing males.
vals = femec_avg["q"].value_counts()

In [27]:
# Seed NumPy's global RNG so the draws below are repeatable.
np.random.seed(42)
choices = []
# For every decile label i with v group-A females, draw v distinct male
# Short-Ids from the same decile. With replace=False, `choice` raises
# ValueError if a decile holds fewer than v males.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for i, v in vals.items():
    chunk = malec_avg[malec_avg.q==i]
    choices.extend(choice(chunk['Short-Id'], size=v, replace=False))

In [28]:
# Restrict the male group-A pool to the sampled Short-Ids.
malec_avg = malec_avg.loc[malec_avg['Short-Id'].isin(choices)]

In [29]:
# Total citations of the sampled group-A males.
malec_avg["cites"].sum()

77801.0

In [30]:
# Total citations of the group-A females, for comparison with the males.
femec_avg["cites"].sum()

77908.0

In [31]:
# Summary statistics of the sampled group-A males.
malec_avg.describe()

Unnamed: 0,cites
count,3971.0
mean,19.592294
std,14.67585
min,4.0
25%,7.0
50%,15.0
75%,28.0
max,58.0


In [32]:
# Summary statistics of the group-A females, to compare against the males.
femec_avg.describe()

Unnamed: 0,cites
count,3971.0
mean,19.619239
std,14.715456
min,4.0
25%,7.0
50%,15.0
75%,29.0
max,58.0


In [33]:
# Group-B females per decile: the sample sizes the male pool has to cover.
femec_top["q"].value_counts()

2     83
9     78
6     78
8     77
4     77
1     77
10    76
7     76
5     75
3     73
Name: q, dtype: int64

In [34]:
# Group-B male candidates available in each decile.
malec_top["q"].value_counts()

6     402
1     397
2     384
4     368
5     356
7     340
3     310
8     284
9     167
10     45
Name: q, dtype: int64

In [35]:
# Inspect the (decile labels, bin edges) pair that pd.qcut returned for the
# group-B females.
top_deciles

(116       1
 327      10
 515       1
 602       5
 626      10
          ..
 58298     9
 58352     1
 58354     6
 58430    10
 58484     9
 Name: cites, Length: 770, dtype: category
 Categories (10, int64): [1 < 2 < 3 < 4 ... 7 < 8 < 9 < 10],
 array([ 153. ,  168.9,  187. ,  208. ,  235. ,  268.5,  324. ,  396.6,
         517.2,  867. , 7294. ]))

### Malec Top

Because the distribution of female researchers with more than 867 citations (the top decile, which spans 867 to 7294) is very heterogeneous, we'll drop this category.

In [36]:
# Remove decile 10 from group B for both genders.
femec_top = femec_top.loc[femec_top["q"] != 10]
malec_top = malec_top.loc[malec_top["q"] != 10]

In [37]:
# Group-B female head-count per decile = target sample size for the males.
vals = femec_top.q.value_counts()

# NOTE(review): no seed is set in this cell; the draws continue from the
# state of NumPy's global RNG left by earlier cells, so the result is only
# repeatable when the notebook is run top to bottom.
choices = []
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for i, v in vals.items():
    chunk = malec_top[malec_top.q==i]
    if v <= chunk.shape[0]:
        # Enough males in this decile: draw v distinct Short-Ids.
        choices.extend(choice(chunk['Short-Id'], size=v, replace=False))
    else:
        # Fewer males than females in this decile: keep all of them.
        choices.extend(list(chunk['Short-Id'].values))

# Restrict the male group-B pool to the selected Short-Ids.
malec_top = malec_top[malec_top['Short-Id'].isin(choices)]

In [38]:
# Total citations of the sampled group-B males.
malec_top["cites"].sum()

211526.0

In [39]:
# Total citations of the group-B females, for comparison with the males.
femec_top["cites"].sum()

214860.0

In [40]:
# Label every sample with its comparison group: 'A' for the average
# (inter-quartile) groups, 'B' for the top-cited groups.
# Several of these frames were produced by boolean filtering, so writing a
# column into them with frame['group'] = ... can trigger pandas'
# SettingWithCopyWarning; .assign returns a new frame and avoids that.
femec_avg = femec_avg.assign(group='A')
malec_avg = malec_avg.assign(group='A')
femec_top = femec_top.assign(group='B')
malec_top = malec_top.assign(group='B')

In [41]:
# Stack the four samples (group A females, group A males, group B females,
# group B males) into a single table, in that order.
people_2 = pd.concat([femec_avg, malec_avg, femec_top, malec_top])

In [42]:
# Write the combined group table to disk, leaving out the DataFrame index.
people_2.to_csv("../data/processed/network_people.csv", index=False)