# Femec

Julio 2020

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt

## Red de citas

In [2]:
cite = pd.read_csv("../data/processed/cites.csv")

cite = cite[(cite.t_year >= 1990) & (cite.t_year < 2020)]

### Grupos de investigadores

Por número de citas $n$ de las investigadoras.

- Grupo A: Investigadores promedio:
    
    - Citas $n > q1$ y $n < q3$, donde $q1$ y $q3$ son el primer y tercer cuartil.
    
    
- Grupo B: Investigadores sobresalientes

    - Citas $n > \mu + 1.5(q3 - q1)$, dónde $\mu$ es el promedio de citas de mujeres.

## Números

- A: 3778 investigadoras
- B: 796

Para tomar los grupos masculinos respectivos usamos muestreo estratificado (por deciles).

In [3]:
people = pd.read_csv("../data/processed/network_people.csv")
all_people = pd.read_csv("../data/processed/people.csv")

In [4]:
# Overall network
G_i = nx.from_pandas_edgelist(cite, source="source",
                            target="target",
                            create_using=nx.DiGraph)
G_i.remove_edges_from(nx.selfloop_edges(G_i))

In [66]:
len(G_i.edges)

2975736

In [5]:
# Add genders to the overall network
genders = dict(zip(all_people['Short-Id'], all_people['gender']))

nx.set_node_attributes(G_i, genders, name="gender")

In [6]:
# Create the 4 sets of people, two genders on each group

A_m = people[(people.group=='A') & (people.gender=='male')]['Short-Id'].values
A_f = people[(people.group=='A') & (people.gender=='female')]['Short-Id'].values
B_m = people[(people.group=='B') & (people.gender=='male')]['Short-Id'].values
B_f = people[(people.group=='B') & (people.gender=='female')]['Short-Id'].values

## Aristas en la red de citas

In [7]:
print("In-degree:\n"
      f"A total: {sum(dict(G_i.in_degree(np.concatenate([A_m, A_f]))).values()):,}"
      f"  Mujeres: {sum(dict(G_i.in_degree(A_f)).values()):,}"
      f"  Hombres: {sum(dict(G_i.in_degree(A_m)).values()):,}\n"
      f"B total: {sum(dict(G_i.in_degree(np.concatenate([B_m, B_f]))).values()):,}"
      f"  Mujeres: {sum(dict(G_i.in_degree(B_f)).values()):,}"
      f"  Hombres: {sum(dict(G_i.in_degree(B_m)).values()):,}\n")

In-degree:
A total: 120,095  Mujeres: 60,502  Hombres: 59,593
B total: 275,968  Mujeres: 140,739  Hombres: 135,229



In [8]:
print("Out-degree:\n"
      f"A total: {sum(dict(G_i.out_degree(np.concatenate([A_m, A_f]))).values()):,}"
      f"  Mujeres: {sum(dict(G_i.out_degree(A_f)).values()):,}"
      f"  Hombres: {sum(dict(G_i.out_degree(A_m)).values()):,}\n"
      f"B total: {sum(dict(G_i.out_degree(np.concatenate([B_m, B_f]))).values()):,}"
      f"  Mujeres: {sum(dict(G_i.out_degree(B_f)).values()):,}"
      f"  Hombres: {sum(dict(G_i.out_degree(B_m)).values()):,}\n")

Out-degree:
A total: 370,847  Mujeres: 179,866  Hombres: 190,981
B total: 205,306  Mujeres: 97,159  Hombres: 108,147



In [9]:
print("Auto-citas:\n"
      f"A total: {sum([1 for x in nx.selfloop_edges(G_i) if x[0] in np.concatenate([A_m, A_f])]):,}"
      f"  Mujeres: {sum([1 for x in nx.selfloop_edges(G_i) if x[0] in A_f]):,}"
      f"  Hombres: {sum([1 for x in nx.selfloop_edges(G_i) if x[0] in A_m]):,}\n"
      f"B total: {sum([1 for x in nx.selfloop_edges(G_i) if x[0] in np.concatenate([B_m, B_f])]):,}"
      f"  Mujeres: {sum([1 for x in nx.selfloop_edges(G_i) if x[0] in B_f]):,}"
      f"  Hombres: {sum([1 for x in nx.selfloop_edges(G_i) if x[0] in B_m]):,}\n")

Auto-citas:
A total: 0  Mujeres: 0  Hombres: 0
B total: 0  Mujeres: 0  Hombres: 0



## Procedencia de las citas

Fuera de las auto-citas.

In [10]:
G_i.remove_edges_from(nx.selfloop_edges(G_i))

### Grupo A

Citas recibidas

In [11]:
mujeres = [u for u, v in G_i.in_edges(A_f)]
hombres = [u for u, v in G_i.in_edges(A_m)]

In [12]:
p_m = [genders[n] for n in mujeres if n in genders]
p_h = [genders[n] for n in hombres if n in genders]

p_m = sum([1 for n in p_m if n == 'female']) / len(p_m)
p_h = sum([1 for n in p_h if n == 'female']) / len(p_h)

In [13]:
print(f"Grupo A\n"
      "Proporción de citas de mujeres a:\n"
      f"Mujeres {p_m:.2}"
      f"    Hombres {p_h:.2}")

Grupo A
Proporción de citas de mujeres a:
Mujeres 0.22    Hombres 0.18


Citas emitidas

In [14]:
mujeres = [v for u, v in G_i.out_edges(A_f)]
hombres = [v for u, v in G_i.out_edges(A_m)]

In [15]:
p_m = [genders[n] for n in mujeres if n in genders]
p_h = [genders[n] for n in hombres if n in genders]

p_m = sum([1 for n in p_m if n == 'female']) / len(p_m)
p_h = sum([1 for n in p_h if n == 'female']) / len(p_h)

In [16]:
print(f"Grupo A\n"
      "Proporción de citas a mujeres de:\n"
      f"Mujeres {p_m:.2}"
      f"    Hombres {p_h:.2}")

Grupo A
Proporción de citas a mujeres de:
Mujeres 0.16    Hombres 0.13


Al parecer hay un sesgo hacia citar hombres en el grupo A.

### Grupo B

In [17]:
mujeres = [u for u, v in G_i.in_edges(B_f)]
hombres = [u for u, v in G_i.in_edges(B_m)]

In [18]:
p_m = [genders[n] for n in mujeres if n in genders]
p_h = [genders[n] for n in hombres if n in genders]

p_m = sum([1 for n in p_m if n == 'female']) / len(p_m)
p_h = sum([1 for n in p_h if n == 'female']) / len(p_h)

In [19]:
print(f"Grupo B\n"
      "Proporción de citas de mujeres a:\n"
      f"Mujeres {p_m:.2}"
      f"    Hombres {p_h:.2}")

Grupo B
Proporción de citas de mujeres a:
Mujeres 0.21    Hombres 0.17


Citas emitidas

In [20]:
mujeres = [v for u, v in G_i.out_edges(B_f)]
hombres = [v for u, v in G_i.out_edges(B_m)]

In [21]:
p_m = [genders[n] for n in mujeres if n in genders]
p_h = [genders[n] for n in hombres if n in genders]

p_m = sum([1 for n in p_m if n == 'female']) / len(p_m)
p_h = sum([1 for n in p_h if n == 'female']) / len(p_h)

In [22]:
print(f"Grupo B\n"
      "Proporción de citas a mujeres de:\n"
      f"Mujeres {p_m:.2}"
      f"    Hombres {p_h:.2}")

Grupo B
Proporción de citas a mujeres de:
Mujeres 0.15    Hombres 0.13


Sigue habiendo el mismo sesgo en el grupo B. ¿Tal vez son los mismos?

## Assortativity

Assortativity de género: ¿qué tan mezcladas están las citas?

In [23]:
G_A = nx.subgraph(G_i, nbunch=np.concatenate([A_m, A_f])).copy()
G_B = nx.subgraph(G_i, nbunch=np.concatenate([B_m, B_f])).copy()

In [24]:
print("Assortativity por género\n"
      f"Grupo A: {nx.attribute_assortativity_coefficient(G_A, 'gender'):.2}"
      f"    Grupo B: {nx.attribute_assortativity_coefficient(G_B, 'gender'):.2}")

Assortativity por género
Grupo A: 0.072    Grupo B: 0.058


Hay una mezcla parecida en ambos grupos. No se puede decir que hay un sesgo.

### Red de co-autores

In [25]:
work = pd.read_csv("../data/processed/co_author.csv")

In [26]:
work = work[(work.year >= 1990) & (work.year < 2020)]

In [27]:
G = nx.from_pandas_edgelist(work,
                            source='author1',
                            target='author2')

In [28]:
nx.set_node_attributes(G, genders, name="gender")

In [29]:
G_A = nx.subgraph(G, nbunch=np.concatenate([A_m, A_f])).copy()
G_B = nx.subgraph(G, nbunch=np.concatenate([B_m, B_f])).copy()

In [30]:
print("Assortativity por género\n"
      f"Grupo A: {nx.attribute_assortativity_coefficient(G_A, 'gender'):.2}"
      f"    Grupo B: {nx.attribute_assortativity_coefficient(G_B, 'gender'):.2}")

Assortativity por género
Grupo A: 0.13    Grupo B: 0.16


El sesgo en la red de co-autores se diluye en la red de citas de cada grupo. Sin embargo existe un sesgo creado por un tercer grupo en la red de citas.

## Género en la de co-autores

### Grupo A

In [31]:
mujeres = [v for u, v in G.edges(A_f)]
hombres = [v for u, v in G.edges(A_m)]

In [32]:
p_m = [genders[n] for n in mujeres if n in genders]
p_h = [genders[n] for n in hombres if n in genders]

p_m = sum([1 for n in p_m if n == 'female']) / len(p_m)
p_h = sum([1 for n in p_h if n == 'female']) / len(p_h)

In [33]:
print(f"Grupo A\n"
      "Proporción de colaborador mujer con:\n"
      f"Mujeres {p_m:.2}"
      f"    Hombres {p_h:.2}")

Grupo A
Proporción de colaborador mujer con:
Mujeres 0.24    Hombres 0.19


### Grupo B

In [34]:
mujeres = [v for u, v in G.edges(B_f)]
hombres = [v for u, v in G.edges(B_m)]

In [35]:
p_m = [genders[n] for n in mujeres if n in genders]
p_h = [genders[n] for n in hombres if n in genders]

p_m = sum([1 for n in p_m if n == 'female']) / len(p_m)
p_h = sum([1 for n in p_h if n == 'female']) / len(p_h)

In [36]:
print(f"Grupo B\n"
      "Proporción de colaborador mujer con:\n"
      f"Mujeres {p_m:.2}"
      f"    Hombres {p_h:.2}")

Grupo B
Proporción de colaborador mujer con:
Mujeres 0.22    Hombres 0.16


También bajan las colaboraciones con mujeres en el grupo B.

### Centralización

Main core y hubs.

In [37]:
main = nx.k_core(G_i)

min(dict(main.degree).values())

165

In [38]:
main_ca = nx.k_core(G)

min(dict(main_ca.degree).values())

18

In [39]:
core_nodes = list(main.nodes)

p = [genders[n] for n in core_nodes if n in genders]
p = sum([1 for n in p if n == 'female']) / len(p)

In [68]:
mujeres = [genders[n] for n in G_i.nodes if (n in genders) and (genders[n]=="female")]

In [71]:
hombres = [genders[n] for n in G_i.nodes if (n in genders) and (genders[n]=="male")]

In [40]:
mujeres = [genders[n] for n in G_i.nodes if n in genders]
mujeres = sum([1 for n in mujeres if n == 'female']) / len(mujeres)

In [41]:
print(f"Proporción de mujeres en red de citas: {mujeres:.2}")
print(f"Proporción de mujeres en main core de citas: {p:.2}")

Proporción de mujeres en red de citas: 0.24
Proporción de mujeres en main core de citas: 0.11


In [42]:
core_nodes = list(main_ca.nodes)

p = [genders[n] for n in core_nodes if n in genders]
p = sum([1 for n in p if n == 'female']) / len(p)

In [43]:
mujeres = [genders[n] for n in G.nodes if n in genders]
mujeres = sum([1 for n in mujeres if n == 'female']) / len(mujeres)

In [44]:
print(f"Proporción de mujeres en red de co-autores: {mujeres:.2}")
print(f"Proporción de mujeres en main core de co-autores: {p:.2}")

Proporción de mujeres en red de co-autores: 0.24
Proporción de mujeres en main core de co-autores: 0.17


### Proporciones de aristas hacia el main core

In [45]:
print(f"El main core de la red de citas tiene {len(core_nodes):,} nodos")
print(f"Son el {len(core_nodes) / len(G_i):.2%} de todos los nodos")

El main core de la red de citas tiene 19 nodos
Son el 0.04% de todos los nodos


In [46]:
citas_A = [v for u, v in G_i.out_edges(np.concatenate([A_m, A_f]))]
p = sum([1 for n in citas_A if n in core_nodes]) / len(citas_A)

In [47]:
print("Grupo A\n"
      f"Proporción de aristas que van al core {p:.2}")

Grupo A
Proporción de aristas que van al core 0.00061


In [48]:
m_A = [v for u, v in G_i.out_edges(A_f)]
p_m = sum([1 for n in m_A if n in core_nodes]) / len(m_A)

In [49]:
h_A = [v for u, v in G_i.out_edges(A_m)]
p_h = sum([1 for n in h_A if n in core_nodes]) / len(h_A)

In [50]:
print(f"Proporción de aristas hacia el core de:\n"
      f"Mujeres: {p_m:.2}"
      f"     Hombres: {p_h:.2}")

Proporción de aristas hacia el core de:
Mujeres: 0.00054     Hombres: 0.00069


In [51]:
citas_B = [v for u, v in G_i.out_edges(np.concatenate([B_m, B_f]))]
p = sum([1 for n in citas_B if n in core_nodes]) / len(citas_B)

In [52]:
print("Grupo B\n"
      f"Proporción de aristas que van al core {p:.2}")

Grupo B
Proporción de aristas que van al core 0.00041


In [53]:
m_A = [v for u, v in G_i.out_edges(B_f)]
p_m = sum([1 for n in m_A if n in core_nodes]) / len(m_A)

In [54]:
h_A = [v for u, v in G_i.out_edges(B_m)]
p_h = sum([1 for n in h_A if n in core_nodes]) / len(h_A)

In [55]:
print(f"Proporción de aristas hacia el core de:\n"
      f"Mujeres: {p_m:.2}"
      f"     Hombres: {p_h:.2}")

Proporción de aristas hacia el core de:
Mujeres: 0.00021     Hombres: 0.0006


### Desde la red de co-autores

In [56]:
ca_A = [v for u, v in G.edges(np.concatenate([A_m, A_f]))]
p = sum([1 for n in ca_A if n in core_nodes]) / len(ca_A)

In [57]:
print("Grupo A\n"
      f"Proporción de aristas que van al core {p:.2}")

Grupo A
Proporción de aristas que van al core 0.0038


In [58]:
m_A = [v for u, v in G.edges(A_f)]
p_m = sum([1 for n in m_A if n in core_nodes]) / len(m_A)

In [59]:
h_A = [v for u, v in G.edges(A_m)]
p_h = sum([1 for n in h_A if n in core_nodes]) / len(h_A)

In [60]:
print(f"Proporción de aristas hacia el core de:\n"
      f"Mujeres: {p_m:.2}"
      f"     Hombres: {p_h:.2}")

Proporción de aristas hacia el core de:
Mujeres: 0.0037     Hombres: 0.0037


In [61]:
ca_B = [v for u, v in G.edges(np.concatenate([B_m, B_f]))]
p = sum([1 for n in ca_B if n in core_nodes]) / len(ca_B)

In [62]:
print("Grupo B\n"
      f"Proporción de aristas que van al core {p:.2}")

Grupo B
Proporción de aristas que van al core 0.0035


In [63]:
m_A = [v for u, v in G.edges(B_f)]
p_m = sum([1 for n in m_A if n in core_nodes]) / len(m_A)

In [64]:
h_A = [v for u, v in G.edges(B_m)]
p_h = sum([1 for n in h_A if n in core_nodes]) / len(h_A)

In [65]:
print(f"Proporción de aristas hacia el core de:\n"
      f"Mujeres: {p_m:.2}"
      f"     Hombres: {p_h:.2}")

Proporción de aristas hacia el core de:
Mujeres: 0.0     Hombres: 0.0064
