# Crear base con

Para cada año entre 2000 y 2014:

- Colaboró o no con SC
- Fue SC o no 5 años después
- Edad a la hora de colaborar
- Activo 5 años después
- Género

In [131]:
import pandas as pd
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt

### Citation table

We take cites from 1990 to 2019 avoiding self-cites.

In [132]:
# Load the citation table.
cite = pd.read_csv("../data/processed/cites.csv")

# Keep rows whose t_year and s_year both fall in 1990-2019.
cite = cite[
    cite.t_year.ge(1990) & cite.t_year.lt(2020)
    & cite.s_year.ge(1990) & cite.s_year.lt(2020)
]

# Drop self-citations (target equal to source).
cite = cite[cite.target.ne(cite.source)]

In [133]:
# Load the paper adjacency table.
papers = pd.read_csv("../data/processed/adjacency_papers.csv")

# Keep rows whose t_year and s_year both fall in 1990-2019.
papers = papers[
    papers.t_year.ge(1990) & papers.t_year.lt(2020)
    & papers.s_year.ge(1990) & papers.s_year.lt(2020)
]

# Drop rows where target and source are the same.
papers = papers[papers.target.ne(papers.source)]

### Authors table

There are two author tables: 

1. One with the comparable groups A and B (`people`) and 
2. One with all the authors found in the RePEc repository (`all_people`).

In [134]:
# Every author found in the repository.
all_people = pd.read_csv("../data/processed/people.csv")
# Authors belonging to the comparable groups.
people = pd.read_csv("../data/processed/network_people.csv")

### Places table

We use the institution to infer the place of work of the authors. We have:

1. The region (continent)
2. The sub-region (sub-continent)
3. Country 3-letter code
4. The institution's name

In [135]:
# Institution lookup keyed by Handle, restricted to the name, country code,
# region and sub-region columns.
places = (
    pd.read_csv("../data/processed/institution.csv")
    .set_index("Handle")
    [['Primary-Name', 'alpha-3', 'region', 'sub-region']]
)

### Adding place of work to people

In [136]:
# Attach the institution columns to every author. The left join keeps
# authors whose Workplace-Institution has no match in `places`.
all_people = all_people.merge(
    places,
    how="left",
    left_on="Workplace-Institution",
    right_index=True,
)

# Optional filter (disabled): keep only authors with a known region.
# all_people = all_people[all_people.region.notna()]

### Adding gender to the citation table

We have two genders for each cite:

1. Gender of the source (`gender_s`)
2. Gender of the target (`gender`)

In [137]:
# First join: gender of the cited author (target) -> column `gender`.
# Second join: gender of the citing author (source) -> column `gender_s`.
# Both are left joins, so unmatched citations are kept with NaN gender.
cite = cite.merge(
    all_people[["Short-Id", "gender"]],
    how="left",
    left_on="target",
    right_on="Short-Id",
).merge(
    all_people[["Short-Id", "gender"]].rename(columns={"gender": "gender_s"}),
    how="left",
    left_on="source",
    right_on="Short-Id",
)

Let's remove from the table the citations that are missing the gender of either the target or the source.

In [138]:
# Keep only citations where both the target gender and the source gender are known.
cite = cite.dropna(subset=["gender", "gender_s"])

## Super-cited researchers

Let's get some basic statistics of the super-cited researchers in our citation network.

In [139]:
# Directed citation graph: one edge source -> target per distinct pair.
G_cite = nx.from_pandas_edgelist(cite, "source", "target", create_using=nx.DiGraph)

In [140]:
# In-degree of every author in the citation graph.
degree = pd.DataFrame(G_cite.in_degree(), columns=["author", "degree"])
# Mean in-degree.
mu = degree["degree"].mean()
# Interquartile range of the in-degree (Q3 - Q1).
r = degree["degree"].quantile([.25, .75]).diff().iloc[-1]

In [141]:
# Super-cited authors: in-degree at least mean + 1.5 * IQR.
super_cited = degree.loc[degree["degree"] >= mu + 1.5 * r, "author"].unique()
# Citations received by super-cited authors.
cite_sc = cite[cite["target"].isin(super_cited)]

## Collaboration network

In [142]:
# Load the co-authorship table and keep the years 1990-2019.
col = pd.read_csv("../data/processed/co_author_2.csv")

col = col[col.year.ge(1990) & col.year.lt(2020)]

## Add gender to collaboration network

In [143]:
# Attach the gender of each co-author. The second join collides with the
# `Short-Id` / `gender` columns added by the first one, so the suffixes
# turn them into `Short-Id_1` / `gender_1` (author1) and
# `Short-Id_2` / `gender_2` (author2).
col = col.merge(
    all_people[['Short-Id', 'gender']],
    how='left',
    left_on='author1',
    right_on='Short-Id',
).merge(
    all_people[['Short-Id', 'gender']],
    how='left',
    left_on='author2',
    right_on='Short-Id',
    suffixes=["_1", "_2"],
)

In [144]:
# Keep only collaborations where the gender of both authors is known.
col = col[col['gender_1'].notna() & col['gender_2'].notna()]

# Super cited by year

In [145]:
# Exclusive lower bound of the two multi-year windows (2001-2003 and 2004-2005).
lower_bounds = {2003: 2000, 2005: 2003}

# Window labels: 2000 (everything up to 2000), 2003, 2005, then one per year.
years = [2000, 2003] + list(range(2005, 2020))
# super_cited[k] is the set of super-cited authors for the window years[k].
super_cited = []
for year in years:
    if year == 2000:
        in_window = cite.s_year <= year
    elif year in lower_bounds:
        in_window = (cite.s_year > lower_bounds[year]) & (cite.s_year <= year)
    else:
        in_window = cite.s_year == year
    chunk = cite[in_window]

    # Citation graph restricted to the window, and its in-degree table.
    G_year = nx.from_pandas_edgelist(chunk, "source", "target",
                                     create_using=nx.DiGraph)
    degree = pd.DataFrame(G_year.in_degree(), columns=["author", "degree"])

    # Same rule as for the whole network: in-degree >= mean + 1.5 * IQR.
    mu = degree["degree"].mean()
    r = degree["degree"].quantile(.75) - degree["degree"].quantile(.25)
    scited = degree.loc[degree["degree"] >= mu + 1.5 * r, "author"].unique()
    super_cited.append(set(scited))

## People who are not SC and have a first collaboration with one

In [146]:
# Author id -> gender lookup.
genders = dict(zip(all_people['Short-Id'], all_people['gender']))
# Number of windows to look ahead when checking whether someone became super-cited.
YEARS_AHEAD = 5

# One entry per window, in the order of `years[:-YEARS_AHEAD]`:
#   neighbors / not_neighbors        -> non-SC authors who did / did not co-author with an SC
#   women_*                          -> the female subset of the corresponding set
#   succ_* / women_succ_*            -> those who are SC YEARS_AHEAD windows later
neighbors = []
women_neighbors = []
succ_neighbors = []
women_succ_neighbors = []
not_neighbors = []
women_not_neighbors = []
succ_not_neighbors = []
women_succ_not_neighbors = []

# Exclusive lower bound of the two multi-year windows.
window_start = {2003: 2000, 2005: 2003}

for i, year in enumerate(years[:-YEARS_AHEAD]):
    if year == 2000:
        in_window = col.year <= year
    elif year in window_start:
        in_window = (col.year > window_start[year]) & (col.year <= year)
    else:
        in_window = col.year == year
    chunk = col[in_window]

    # Undirected co-authorship graph of the window.
    G_year = nx.from_pandas_edgelist(chunk, "author1", "author2",
                                     create_using=nx.Graph)

    # Everybody adjacent to at least one super-cited author of this window.
    n = set()
    for sc in super_cited[i]:
        if sc in G_year:
            n.update(G_year[sc])
    not_n = set(G_year) - n

    # Remove anyone who is super-cited in this window or in any earlier one.
    already_sc = set().union(*super_cited[:i + 1])
    n = n - already_sc
    not_n = not_n - already_sc

    n_w = {x for x in n if genders[x] == 'female'}
    not_n_w = {x for x in not_n if genders[x] == 'female'}

    # Super-cited set YEARS_AHEAD windows later.
    future_sc = super_cited[i + YEARS_AHEAD]

    neighbors.append(n)
    not_neighbors.append(not_n)
    women_neighbors.append(n_w)
    women_not_neighbors.append(not_n_w)
    succ_neighbors.append(n & future_sc)
    succ_not_neighbors.append(not_n & future_sc)
    women_succ_neighbors.append(n_w & future_sc)
    women_succ_not_neighbors.append(not_n_w & future_sc)

## People active at each year

In [147]:
# Article table; rows without a year or a gender are discarded.
article = pd.read_csv('../data/econometry/base_1_todos.csv')
article = article.dropna(subset=['year', 'gender'])

# Exclusive lower bound of the two multi-year windows.
article_window_start = {2003: 2000, 2005: 2003}

# graphs[k] holds the distinct Short-Ids with an article in window years[k].
graphs = []
for year in years:
    if year == 2000:
        in_window = article.year <= year
    elif year in article_window_start:
        in_window = (article.year > article_window_start[year]) & (article.year <= year)
    else:
        in_window = article.year == year
    chunk = article[in_window]
    graphs.append(chunk['Short-Id'].unique())

## Age of people

Drop everything before 1970 (only articles from 1970 onward are used to date the first publication).

In [148]:
# Sort chronologically so that the first row of each author is the earliest one.
article = article.sort_values(by='year')

# `age` column: year of the earliest article from 1970 onward, per author.
age = (
    article.loc[article.year >= 1970]
    .groupby('Short-Id')['year']
    .first()
    .rename('age')
    .reset_index()
)

# Left join on the columns the two frames share (presumably just `Short-Id`
# -- TODO confirm `all_people` has no `age` column before this cell runs).
all_people = all_people.merge(age, how='left')

In [149]:
# Author id -> value of the `age` column (NaN when the author has none).
ages = {
    short_id: first_year
    for short_id, first_year in zip(all_people['Short-Id'], all_people['age'])
}

In [150]:
# Per window: every non-SC author, whether or not they collaborated with an SC.
not_sc = [with_sc | without_sc for with_sc, without_sc in zip(neighbors, not_neighbors)]

In [151]:
# Build one row per (author, window) for every non-SC author.
#
# Columns:
#   author      - author id
#   year        - window label (years[i])
#   colaborator - 1 if the author co-authored with an SC in the window
#   gender      - 'female' if the author is in one of the women_* sets, else 'male'
#   successful  - 1 if the author is SC YEARS_AHEAD windows later
#   active      - 1 if the author has an article YEARS_AHEAD windows later
#   age         - window label minus the author's value in `ages`
data = []
columns = ['author', 'year', 'colaborator', 'gender', 'successful', 'active', 'age']
# The loop variable is called `group` (as in the network cells) so that it
# does not overwrite the `people` DataFrame loaded earlier.
for i, group in enumerate(not_sc):
    _year = years[i]
    # graphs[...] is an array; building the set once per window replaces a
    # linear scan per author with a constant-time lookup.
    active_later = set(graphs[i + YEARS_AHEAD])
    for node in group:
        _age = _year - ages[node]
        _colaborator = 1 if node in neighbors[i] else 0
        _active = 1 if node in active_later else 0
        if node in women_neighbors[i] or node in women_not_neighbors[i]:
            _gender = 'female'
        else:
            # Everyone not in a women_* set is labelled 'male'.
            _gender = 'male'
        if node in succ_neighbors[i] or node in succ_not_neighbors[i]:
            _success = 1
        else:
            _success = 0
        data.append([node, _year, _colaborator, _gender, _success, _active, _age])

In [152]:
# Tabulate the per-author rows under the column names defined with them.
df = pd.DataFrame(columns=columns, data=data)

# Add network parameters

In [153]:
# Build, for every window, the weighted co-authorship graph made of the
# collaborations that involve at least one non-SC author of that window.
# networks[i] corresponds to years[i]; edge attribute `weight` is the number
# of rows in `col` for that (author1, author2) pair inside the window.
networks = []
for i, group in enumerate(not_sc):
    print(i)
    involves_group = col.author1.isin(group) | col.author2.isin(group)
    if years[i] == 2000:
        # The comparison must be parenthesised: `&` binds tighter than `<=`,
        # so `mask & col.year <= y` is evaluated as `(mask & col.year) <= y`.
        # It must also use years[i], not a `year` left over from another loop.
        in_window = (col.year <= years[i])
    elif years[i] == 2003:
        in_window = (col.year > 2000) & (col.year <= 2003)
    elif years[i] == 2005:
        in_window = (col.year > 2003) & (col.year <= 2005)
    else:
        in_window = (col.year == years[i])
    chunk = col[involves_group & in_window]
    chunk = chunk.groupby(['author1', 'author2']).size().rename('weight').reset_index()
    G_y = nx.from_pandas_edgelist(chunk, source='author1', target='author2', edge_attr='weight')
    networks.append(G_y)

0
1
2
3
4
5
6
7
8
9
10
11


In [154]:
# For every non-SC author and window, describe the subgraph formed by the
# author and their super-cited co-authors in that window's network.
#
# Columns:
#   nodes      - number of nodes in the subgraph (author + SC co-authors)
#   edges      - number of edges in the subgraph
#   weight     - sum of the weighted degrees in the subgraph
#   clustering - average clustering coefficient of the subgraph
data = []
columns = ['author', 'year', 'nodes', 'edges', 'weight', 'clustering']
for i, group in enumerate(not_sc):
    print(i)
    G_y = networks[i]
    for node in group:
        # Named `sc_neighbors` so the per-window `neighbors` list built
        # earlier is not overwritten by this loop.
        sc_neighbors = set(G_y.neighbors(node)) & super_cited[i]
        G = G_y.subgraph(list(sc_neighbors) + [node])
        c = nx.average_clustering(G)
        order = len(G)
        edges = len(G.edges)
        weight = sum(dict(G.degree(weight='weight')).values())
        data.append([node, years[i], order, edges, weight, c])

0
1
2
3
4
5
6
7
8
9
10
11


In [155]:
# Tabulate the per-author network rows under the column names defined with them.
df_net = pd.DataFrame(columns=columns, data=data)

In [156]:
# Left join on the columns shared by both frames, keeping every row of `df`.
df = df.merge(df_net, how='left')

In [157]:
# Write the final table to disk without the row index.
df.to_csv(path_or_buf='../data/processed/new_colabs.csv', index=False)