In [1]:
from itertools import combinations
import re
from tqdm import tqdm
import pandas as pd
import xmltodict
import networkx as nx
import collections
import matplotlib.pyplot as plt
from unidecode import unidecode

In [2]:
def extract_authors(dc):
    if type(dc) == list:
        result = [
            unidecode(i.get("#text").title().strip())
            for i in dc
            if i.get("#text") is not None and i.get("@scheme") != "institution"
        ]
        return [x for x in result if x != "And Others"]
    elif dc.get("#text") is not None and dc.get("@scheme") != "institution":
        return [unidecode(dc.get("#text").title().strip())]


def clean_name(name):
    if ', ' in name:
        lst = name.split(', ')
        lst = [item.split(' ')[0] for item in lst]
        return lst[1] + ' ' + lst[0]
    elif ',' in name:
        lst = name.split(',')
        lst = [item.split(' ')[0] for item in lst]
        return lst[1] + ' ' + lst[0]
    else:
        return name


def get_edges(auth_list):
    return list(combinations(auth_list, 2))

def extract_ids(dc):
    if type(dc) == list:
        return [
            i.get("#text").upper().strip()
            for i in dc
            if i.get("#text") is not None and i.get("@scheme") == "eric_accno"
        ][0]
    elif dc.get("#text") is not None and dc.get("@scheme") == "eric_accno":
        return dc.get("#text").upper().strip()

In [3]:
df_all = []

for year in tqdm(range(1965, 2021)):
    file_name = "data/eric" + str(year)
    with open(file_name + ".xml", encoding="utf-8") as fd:
        dict = xmltodict.parse(fd.read())
    recs = [rec["metadata"] for rec in dict["records"]["record"]]
    df = pd.DataFrame(recs)
    df = df[df['dc:type'].notna()]
    df = df[df['eric:peer_reviewed'].notna()]
    df['type'] = [''.join(map(str, l)).lower() for l in df['dc:type']]
    df = df.loc[df['eric:peer_reviewed'] == 'T']
    # df = df[['ids', 'authors', 'edges', 'dc:type', 'dc:subject', 'eric:keywords', 'eric:keywords_geo', 'dc:title', 'eric:pageCount', 'dc:date', 'eric:dateAdded']]
    df_all.append(df)
df_all = pd.concat(df_all)

df_all = df_all.loc[(df_all['type'].str.contains("journal"))]
df_all["authors"] = df_all.apply(lambda row: extract_authors(row["dc:creator"]), axis=1)
df_all = df_all[df_all['authors'].notna()]
df_all['authors'] = df_all.apply(lambda row: [clean_name(item) for item in row['authors']], axis=1)
df_all["edges"] = df_all.apply(lambda row: get_edges(sorted(row["authors"])), axis=1)
df_all["ids"] = df_all.apply(lambda row: extract_ids(row["dc:identifier"]), axis=1)
df_all.info()

100%|██████████████████████████████████████████████████████████████████████████████████| 56/56 [11:07<00:00, 11.91s/it]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 908421 entries, 17594 to 44897
Data columns (total 39 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   dcterms:accessRights       908421 non-null  object
 1   dc:subject                 908417 non-null  object
 2   dc:creator                 908421 non-null  object
 3   dc:type                    908421 non-null  object
 4   eric:keywords              398883 non-null  object
 5   eric:keywords_geo          301814 non-null  object
 6   eric:issn                  746293 non-null  object
 7   dc:language                905402 non-null  object
 8   dcterms:educationLevel     326980 non-null  object
 9   dc:description             907932 non-null  object
 10  dc:identifier              908421 non-null  object
 11  dc:title                   908421 non-null  object
 12  dc:source                  908421 non-null  object
 13  eric:citation              908376 non-nul

In [4]:
df_all['eric:dateAdded'].value_counts()

2009    41251
2016    38588
2017    38305
2014    37226
2020    37152
2010    37039
2019    36992
2015    36610
2011    36226
2018    35528
2008    35104
2012    35059
2007    28125
2013    19696
2006    18837
2005    18775
2002    18486
2000    17873
2003    17449
1999    17212
1994    16970
1995    16748
1996    16117
1997    15988
2001    15853
1981    15090
1980    15008
1992    14889
1989    14852
1993    14683
1998    14482
1991    14467
1990    14224
1984    13707
1983    13565
1987    13475
1985    13415
1986    13360
1988    12683
1982    12408
1979     7996
2004     6432
1975      131
1976       96
1977       81
1972       42
1973       36
1974       34
1971       28
1978       28
Name: eric:dateAdded, dtype: int64

In [5]:
def generate_graph(df_local):

    nodelist1 = df_local["authors"].tolist()
    nodelist2 = [x for x in nodelist1 if x is not None]  # remove none
    node_list = [item for sublist in nodelist2 for item in sublist]
    node_list = list(set(node_list))

    edge_list1 = df_local["edges"].tolist()
    edge_list2 = [x for x in edge_list1 if x is not None]  # remove none
    edge_list = [item for sublist in edge_list2 for item in sublist]

    G = nx.Graph()
    G.add_nodes_from(node_list)
    G.add_edges_from(edge_list)
    return G

In [7]:
list1 = []

for year in tqdm(range(1965, 2021)):
    
    df_local = df_all.loc[df_all['eric:dateAdded'] == str(year)]
    
    if len(df_local) == 0:
        continue
    
    G = generate_graph(df_local)
    n_papers = len(df_local)
    n_authors = len(G)
    n_collabs = nx.number_of_edges(G)
    n_isolates = nx.number_of_isolates(G)
    mean_collabs = 2 * G.number_of_edges() / float(G.number_of_nodes())
    
    G_largest_comp = G.subgraph(sorted(nx.connected_components(G), key=len, reverse=True)[0])
    largest_component = len(G_largest_comp)/len(G) 
    
    list1.append((year, n_papers, n_authors, n_collabs, n_isolates, mean_collabs, largest_component))

df_summary = pd.DataFrame(list1, columns = ["year", "n_papers", "n_authors", "n_collabs", "n_isolates", "mean_collabs", "largest_component"])
df_summary

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:18<00:00, 18.79s/it]


Unnamed: 0,year,n_papers,n_authors,n_collabs,n_isolates,mean_collabs,largest_component
0,1965,908421,697452,1545765,115526,4.432606,0.635176


In [None]:
len(G.subgraph(sorted(nx.connected_components(G), key=len, reverse=True)[0]))

In [None]:
nx.info(G)

In [None]:
# mean papers per author
len([item for sublist in list2 for item in sublist])/len(node_list)

In [None]:
# nx.degree_histogram(G)

In [None]:
# nx.write_edgelist(G, "all.edgelist.gz")

In [None]:
# G = nx.read_edgelist("all.edgelist.gz")

In [None]:
hub_ego = nx.ego_graph(G, "Prashant Loyalka")
pos = nx.spring_layout(hub_ego)
nx.draw(hub_ego, pos, node_color="b", node_size=50, with_labels=False)
options = {"node_size": 300, "node_color": "r"}
nx.draw_networkx_nodes(hub_ego, pos, nodelist=["Prashant Loyalka"], **options)

In [None]:
sorted([s for s in node_list if "Steven Raphael" in s])

In [None]:
df_all.head(20)