In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Citation table

We keep citations from 1990 to 2019 and exclude self-citations.

In [4]:
# Load the citation table, keep rows whose t_year and s_year both fall in
# [1990, 2020), and drop rows where target equals source (self-citations).
cite = (
    pd.read_csv("../data/processed/cites.csv")
    .loc[lambda d: (d.t_year >= 1990) & (d.t_year < 2020)]
    .loc[lambda d: (d.s_year >= 1990) & (d.s_year < 2020)]
    .loc[lambda d: d.target != d.source]
)

In [5]:
# Same window and self-citation filters as the citation table, applied to the
# paper adjacency table.
# NOTE(review): `papers` is not referenced again in the visible cells — confirm
# whether this load is still needed.
papers = (
    pd.read_csv("../data/processed/adjacency_papers.csv")
    .loc[lambda d: (d.t_year >= 1990) & (d.t_year < 2020)]
    .loc[lambda d: (d.s_year >= 1990) & (d.s_year < 2020)]
    .loc[lambda d: d.target != d.source]
)

In [6]:
# Store s_year as a string year label ("1990", ...): it becomes the column
# labels of the pivot built below, which are later sliced from '1990' onward.
# `assign` returns a new frame instead of writing a column into a
# boolean-filtered frame, which can trigger SettingWithCopyWarning.
cite = cite.assign(s_year=cite['s_year'].astype(int).astype(str))

### Authors table

There are two author tables: 

1. One with the comparable groups A and B (`people`) and 
2. One with all the authors found in the RePEc repository (`all_people`).

In [7]:
# `people`: the comparable groups A and B; `all_people`: every author found in
# the RePEc repository (used below for the 'Short-Id' -> 'gender' lookups).
# NOTE(review): `people` is not referenced again in the visible cells.
people = pd.read_csv("../data/processed/network_people.csv")
all_people = pd.read_csv("../data/processed/people.csv")

# Base 1

A dataset indexed by author-paper, containing:
   - Year of publication
   - Citations per year (excluding self-citations)
   - Co-authors
   - Gender

In [15]:
# Number of citation rows per (paper, target, t_year) and citing year (s_year).
cite_counts = (
    cite.groupby(['paper', 'target', 't_year'])
    .s_year.value_counts()
    .rename('N')
    .reset_index()
)

# Wide layout: one row per (paper, target, t_year), one column per s_year label.
pivot = pd.pivot_table(
    cite_counts,
    index=['paper', 'target', 't_year'],
    columns='s_year',
    values='N',
).reset_index()

In [16]:
# t_year of every pivot row; it determines how far the row is shifted below.
years = pivot['t_year'].values

# Counts per calendar-year column ('1990' onward); missing cells count as 0.
H = pivot.loc[:, '1990':].fillna(0).values

# Destination matrix of the same shape, initialised to NaN.
M = np.full(H.shape, np.nan)

# Column offset of each distinct t_year relative to 1990.
indices = {year: int(year) - 1990 for year in set(years)}

In [17]:
# Re-align every row from calendar years to "years since t_year": the count in
# the t_year column moves to position 0, the next year to position 1, and so
# on. Positions past the last calendar column keep the NaN that M was
# initialised with.
n_year_cols = H.shape[1]
if n_year_cols != 30:
    # The offsets in `indices` and the 30 cites_year_* names assigned later
    # assume one column per year from 1990 to 2019; fail early otherwise.
    raise ValueError(f"expected 30 calendar-year columns, found {n_year_cols}")

for i, y in enumerate(years):
    start = indices[y]
    # One slice assignment per row instead of up to 30 scalar writes.
    M[i, :n_year_cols - start] = H[i, start:]

In [18]:
# Overwrite the calendar-year columns ('1990' onward) with the re-aligned
# matrix M, so that column k now holds the count k years after t_year.
pivot.loc[:, '1990':] = M

In [19]:
# Rename the three key columns and label the 30 shifted count columns by their
# offset from year_0.
cite_year_cols = [f"cites_year_{i}" for i in range(30)]
pivot.columns = ['paper', 'author', 'year_0', *cite_year_cols]

In [None]:
# Persist base 1 (one row per paper/author pair); the row index is not written.
pivot.to_csv("../data/econometry/base_1.csv", index=False)

In [104]:
# Sanity check: the non-null count of cites_year_k should shrink as k grows,
# because later offsets only exist for rows with an earlier year_0.
pivot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329285 entries, 0 to 329284
Data columns (total 33 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   paper          329285 non-null  object 
 1   author         329285 non-null  object 
 2   year_0         329285 non-null  float64
 3   cites_year_0   329285 non-null  float64
 4   cites_year_1   327777 non-null  float64
 5   cites_year_2   321573 non-null  float64
 6   cites_year_3   310766 non-null  float64
 7   cites_year_4   296565 non-null  float64
 8   cites_year_5   279252 non-null  float64
 9   cites_year_6   259755 non-null  float64
 10  cites_year_7   239044 non-null  float64
 11  cites_year_8   219174 non-null  float64
 12  cites_year_9   199306 non-null  float64
 13  cites_year_10  180067 non-null  float64
 14  cites_year_11  161716 non-null  float64
 15  cites_year_12  143921 non-null  float64
 16  cites_year_13  127944 non-null  float64
 17  cites_year_14  113422 non-nul

In [20]:
# For every paper, the array of distinct `target` values appearing in its
# citation rows.
co = cite.groupby('paper')['target'].unique()

In [21]:
# Turn the 'paper' index into a regular column next to 'target'.
co = co.reset_index()

In [22]:
# Number of distinct targets listed for each paper.
co['size'] = co['target'].map(len)

In [23]:
# One row per paper, one slot per possible author, pre-filled with NaN.
# An object array is used instead of a fixed-width 'U20' string array, which
# would silently truncate any identifier longer than 20 characters.
M = np.full((co.shape[0], co['size'].max()), np.nan, dtype=object)

In [24]:
# Mark every slot as missing before the author ids are written. In a
# string-typed array NaN is stored as the text 'nan', which a later cell
# converts back to a real missing value.
M[:] = np.nan

In [27]:
# Array with one entry per paper; each entry is that paper's array of targets.
authors = co.target.values

In [28]:
# Left-align each paper's author ids in its row of M; unused trailing slots
# keep the missing-value marker written earlier.
for row, aths in enumerate(authors):
    M[row, :len(aths)] = aths

In [29]:
# Put the paper id column in front of the author matrix.
paper_ids = co[['paper']].values
nco = pd.DataFrame(np.hstack((paper_ids, M)))

In [30]:
# Name the author slots co_author_1 ... co_author_<max authors per paper>.
co_author_cols = [f"co_author_{k}" for k in range(1, co['size'].max() + 1)]
nco.columns = ['paper', *co_author_cols]

In [31]:
# Convert the textual 'nan' placeholder back into a real missing value so that
# empty author slots are written as blanks in the CSV.
nco = nco.replace('nan', np.nan)

In [None]:
# Persist the per-paper author table; the row index is not written.
nco.to_csv("../data/econometry/base_1_co_autores.csv", index=False)

In [105]:
# Sanity check: the non-null count of co_author_k shows how many papers have at
# least k authors.
nco.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215987 entries, 0 to 215986
Data columns (total 18 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   paper         215987 non-null  object
 1   co_author_1   215987 non-null  object
 2   co_author_2   88872 non-null   object
 3   co_author_3   20325 non-null   object
 4   co_author_4   2900 non-null    object
 5   co_author_5   555 non-null     object
 6   co_author_6   220 non-null     object
 7   co_author_7   125 non-null     object
 8   co_author_8   82 non-null      object
 9   co_author_9   64 non-null      object
 10  co_author_10  44 non-null      object
 11  co_author_11  32 non-null      object
 12  co_author_12  26 non-null      object
 13  co_author_13  17 non-null      object
 14  co_author_14  16 non-null      object
 15  co_author_15  10 non-null      object
 16  co_author_16  8 non-null       object
 17  co_author_17  2 non-null       object
dtypes: object(18)
memory usa

# Base 2

For each author and each year $t$:

   - Gender of author
   - Degree in $t$
   - Closeness in $t$
   - Betweenness centralities in $t$
   - Number of coauthors in $t$

In [33]:
# Co-authorship table; the cells below read its author1, author2 and year
# columns.
cotor = pd.read_csv("../data/processed/co_author.csv")

In [34]:
# Attach the gender of both authors of every pair. The second merge collides
# with the 'Short-Id'/'gender' columns added by the first, so the suffixes
# yield Short-Id_1/gender_1 (author1) and Short-Id_2/gender_2 (author2).
gender_lookup = all_people[['Short-Id', 'gender']]

cotor = cotor.merge(
    gender_lookup,
    how='left',
    left_on='author1',
    right_on='Short-Id',
)
cotor = cotor.merge(
    gender_lookup,
    how='left',
    left_on='author2',
    right_on='Short-Id',
    suffixes=["_1", "_2"],
)

In [35]:
# Keep only pairs where the gender of both authors is known.
cotor = cotor[cotor['gender_1'].notna() & cotor['gender_2'].notna()]

In [36]:
# Restrict collaborations to 1990-2019, the same window used for citations.
# (The filter was previously applied twice; the second pass was a no-op.)
cotor = cotor[(cotor.year >= 1990) & (cotor.year < 2020)]

In [37]:
def _coauthor_graph(pairs):
    """Build a weighted graph from rows of (author1, author2) pairs.

    The edge attribute 'weight' is the number of rows for each
    (author1, author2) combination in `pairs`.
    """
    weighted_edges = (
        pairs.groupby(['author1', 'author2'])
        .size()
        .rename('weight')
        .reset_index()
    )
    return nx.from_pandas_edgelist(
        weighted_edges,
        source='author1',
        target='author2',
        edge_attr='weight',
    )


# One co-authorship graph per year, in ascending year order (groupby sorts its
# keys), matching the order of `sorted(cotor.year.unique())`.
graphs = [_coauthor_graph(year_rows) for _, year_rows in cotor.groupby('year')]

In [38]:
# Pooled co-authorship network over all years: the edge weight is the number of
# rows in which each (author1, author2) combination appears.
A = (
    cotor.groupby(['author1', 'author2'])
    .size()
    .rename('weight')
    .reset_index()
)

G = nx.from_pandas_edgelist(A, 'author1', 'author2', edge_attr='weight')

In [39]:
# All node labels (author ids) of the pooled graph G.
nodes = list(G)

In [40]:
# Ascending list of years present in cotor; position i corresponds to graphs[i].
# Note: this rebinds `years`, which earlier held the t_year array of `pivot`.
years = sorted(cotor.year.unique())

In [71]:
# Per-year network statistics: one DataFrame per yearly graph, in the same
# order as `graphs`. `degs` and `wdegs` must be built here because the cells
# below iterate over them; with these lines commented out the notebook fails
# with NameError on a fresh kernel.
degs = []   # node degree per author
wdegs = []  # degree weighted by the 'weight' edge attribute
bets = []   # betweenness centrality
clos = []   # closeness centrality

# The loop variable is deliberately not called `G`, so the pooled graph bound
# to `G` earlier is not overwritten by the last yearly graph.
for i, year_graph in enumerate(graphs):
    print(i)  # progress indicator: the centrality computations are slow
    degs.append(pd.DataFrame(list(year_graph.degree()),
                             columns=['author', 'co_authors']))
    wdegs.append(pd.DataFrame(list(year_graph.degree(weight='weight')),
                              columns=['author', 'collaborations']))
    bets.append(pd.DataFrame(nx.betweenness_centrality(year_graph).items(),
                             columns=['author', 'betweenness']))
    clos.append(pd.DataFrame(nx.closeness_centrality(year_graph).items(),
                             columns=['author', 'closeness']))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


In [72]:
# Stamp every per-year frame with the year of the graph it was computed from
# (position i in each list corresponds to years[i]).
for yearly_frames in (degs, wdegs, bets, clos):
    for frame, year in zip(yearly_frames, years):
        frame['year'] = year

In [73]:
# Stack the yearly degree frames into one long author-year table.
df_deg = pd.concat(degs)

In [74]:
# Stack the yearly weighted-degree frames into one long author-year table.
df_wdeg = pd.concat(wdegs)

In [75]:
# Stack the yearly betweenness frames into one long author-year table.
df_bets = pd.concat(bets)

In [76]:
# Stack the yearly closeness frames into one long author-year table.
df_clos = pd.concat(clos)

In [86]:
# Join degree and weighted degree on explicit keys rather than on whatever
# columns the two frames happen to share; `validate` raises if an
# (author, year) pair is duplicated on either side.
df = pd.merge(df_deg, df_wdeg, on=['author', 'year'], validate='one_to_one')

In [87]:
# Add betweenness on explicit (author, year) keys rather than on implicitly
# shared columns; `validate` raises if a key is duplicated on either side.
df = pd.merge(df, df_bets, on=['author', 'year'], validate='one_to_one')

In [89]:
# Add closeness on explicit (author, year) keys rather than on implicitly
# shared columns; `validate` raises if a key is duplicated on either side.
df = pd.merge(df, df_clos, on=['author', 'year'], validate='one_to_one')

In [96]:
# Look up each author's gender; authors without a match in all_people are kept
# with a missing gender (left join).
df = df.merge(
    all_people[['Short-Id', 'gender']],
    how="left",
    left_on="author",
    right_on="Short-Id",
)

In [98]:
# 'Short-Id' duplicates 'author' for matched rows after the join; drop it.
df = df.drop(columns='Short-Id')

In [103]:
# Persist base 2 (one row per author and year); the row index is not written.
df.to_csv("../data/econometry/base_2.csv", index=False)