In [1]:
from itertools import combinations
import re
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import xmltodict
import networkx as nx
import pickle
import collections
import matplotlib.pyplot as plt
from unidecode import unidecode
import altair as alt
import seaborn as sns
sns.set()

tqdm.pandas()
from collections import Counter, OrderedDict
import string
import json

pd.set_option("display.max_columns", None)

In [2]:
def extract_authors(dc):
    """Return the cleaned personal author names held in a ``dc:creator`` value.

    Parameters
    ----------
    dc : list, mapping, str or None
        Parsed ``dc:creator`` payload: either a list of entries or one single
        entry. Mapping entries carry the name under ``"#text"`` and may carry
        an ``"@scheme"`` attribute; a plain string is taken as the name itself.

    Returns
    -------
    list of str or None
        Title-cased, stripped, ASCII-transliterated names. Entries whose
        ``"@scheme"`` is ``"institution"``, entries without text, empty names
        and the placeholder "And Others" are skipped. ``None`` is returned
        when no personal author is left, so the caller's ``notna()`` filter
        also drops records that list only institutions.
    """
    # Function-scope import keeps the helper independent of notebook globals.
    from collections.abc import Mapping

    entries = dc if isinstance(dc, list) else [dc]
    authors = []
    for entry in entries:
        if isinstance(entry, Mapping):
            if entry.get("@scheme") == "institution":
                continue
            text = entry.get("#text")
        elif isinstance(entry, str):
            text = entry
        else:
            # None / NaN (element missing) or any other unexpected payload.
            continue
        if text is None:
            continue
        name = unidecode(text.title().strip())
        if name and name != "And Others":
            authors.append(name)
    return authors or None


def clean_name(name):
    """Turn a ``"Last, First ..."`` name into ``"First Last"``.

    Only the first word before the first comma (surname) and the first word
    after it (given name) are kept. Names without a comma are returned
    unchanged.

    Parameters
    ----------
    name : str
        Author name, optionally in "Last, First Middle" form.

    Returns
    -------
    str
        ``"First Last"``; if one of the two parts is empty only the other one
        is returned (no stray leading or trailing space).
    """
    if "," not in name:
        return name

    # Split on the bare comma and let str.split() absorb any amount of
    # whitespace, so "Smith,John", "Smith, John" and "Smith,  John" agree.
    parts = [part.split() for part in name.split(",")]
    last = parts[0][0] if parts[0] else ""
    first = parts[1][0] if parts[1] else ""
    return " ".join(token for token in (first, last) if token)


def get_first_names(author_list):
    """Pick one "first name" token for every author in ``author_list``.

    For each name, punctuation is stripped from its first four
    whitespace-separated words and the first word that is still longer than
    one character is used (so leading initials are skipped). If none of those
    words qualifies, the full original name is kept.

    Returns a list aligned with ``author_list``, or ``None`` when the list is
    empty.
    """
    if len(author_list) == 0:
        return None

    first_names = []
    for full_name in author_list:
        # Only the first four words are ever considered.
        candidates = (
            re.sub(r"[^\w\s]", "", word) for word in full_name.split()[:4]
        )
        first_names.append(
            next((word for word in candidates if len(word) > 1), full_name)
        )
    return first_names


def name_to_gender(first_name_list, gender_lookup=None):
    """Map a list of first names to gender labels.

    Parameters
    ----------
    first_name_list : list of str or None
        First names of one record's authors.
    gender_lookup : mapping, optional
        Name -> gender mapping. Defaults to the notebook-level
        ``dict_genders``.

    Returns
    -------
    list or None
        One label per name, in order. ``None`` when the input is empty/None
        or when any name is missing from the lookup (the whole record is
        treated as unknown).

    Raises
    ------
    NameError
        If no lookup is passed and ``dict_genders`` has not been defined yet.
        This used to be swallowed, silently turning every record into None.
    """
    if not first_name_list:
        return None
    if gender_lookup is None:
        gender_lookup = dict_genders
    try:
        return [gender_lookup[name] for name in first_name_list]
    except (KeyError, TypeError):
        # Unknown or unhashable name: give up on the whole record.
        return None


def get_edges(auth_list):
    """Return every unordered pair of entries in ``auth_list`` as a list of 2-tuples."""
    return [pair for pair in combinations(auth_list, 2)]


def extract_subject(dc):
    """Return the cleaned subject labels held in a ``dc:subject`` value.

    Parameters
    ----------
    dc : list, tuple, mapping, str or None
        Parsed ``dc:subject`` payload. Entries are either plain strings or
        mappings that carry the label under ``"#text"``.

    Returns
    -------
    list of str or None
        Title-cased, stripped, ASCII-transliterated labels in input order.
        ``None`` when the value is missing or any entry has no usable text
        (the whole record is then treated as having no subjects).
    """
    # Function-scope import keeps the helper independent of notebook globals.
    from collections.abc import Mapping

    if dc is None or (isinstance(dc, float) and dc != dc):  # None or NaN
        return None

    # A record with one subject presumably arrives as a bare string/mapping
    # rather than a list; wrap it so it is not iterated char by char.
    items = dc if isinstance(dc, (list, tuple)) else [dc]

    subjects = []
    for item in items:
        # Mapping covers both OrderedDict and plain dict entries.
        text = item.get("#text") if isinstance(item, Mapping) else item
        if not isinstance(text, str):
            return None
        subjects.append(unidecode(text.title().strip()))
    return subjects

In [3]:
# Load one ERIC XML dump per year and stack all records into a single frame.
yearly_frames = []

for year in tqdm(range(1965, 2022)):
    file_name = "data/eric" + str(year)
    with open(file_name + ".xml", encoding="utf-8") as fd:
        # Named `parsed`, not `dict`: rebinding the builtin would break any
        # later dict(...) / isinstance(x, dict) in the same kernel.
        parsed = xmltodict.parse(fd.read())

    records = parsed["records"]["record"]
    if not isinstance(records, list):
        # Presumably a file with a single <record> parses to one mapping.
        records = [records]
    recs = [rec["metadata"] for rec in records]
    df_mini = pd.DataFrame(recs)

    # .copy() so the column assignments below write to an independent frame.
    df_mini = df_mini[df_mini["dc:type"].notna()].copy()
    # dc:type may be a string or a list of strings; flatten to one lowercase string.
    df_mini["type"] = [
        "".join(map(str, parts)).lower() for parts in df_mini["dc:type"]
    ]
    df_mini["eric:dateAdded"] = pd.to_numeric(df_mini["eric:dateAdded"])
    yearly_frames.append(df_mini)

df = pd.concat(yearly_frames)

100%|███████████████████████████████████████████| 57/57 [08:42<00:00,  9.16s/it]


In [4]:
# Keep peer-reviewed journal records plus every book record.
is_peer_reviewed = df["eric:peer_reviewed"] == "T"
is_journal = df["type"].str.contains(pat="journal", case=False)
is_book = df["type"].str.contains(pat="book", case=False)

df = df.loc[(is_peer_reviewed & is_journal) | is_book]

In [5]:
# get author names
# Apply on the single column instead of row-wise over the whole frame:
# same values, without building a Series for each record.
df["authors"] = df["dc:creator"].progress_apply(extract_authors)

# remove rows with no human authors
# .copy() so the assignment below writes to an independent frame.
df = df[df["authors"].notna()].copy()

# clean author name
df["authors"] = df["authors"].progress_apply(
    lambda names: [clean_name(item) for item in names]
)
df["authors"].head(10)

100%|█████████████████████████████████| 987310/987310 [02:59<00:00, 5495.61it/s]
100%|█████████████████████████████████| 980226/980226 [03:43<00:00, 4384.27it/s]


12279              [Rudolph Masciantonio]
17594    [Donivan Watley, Rosalyn Kaplan]
17595                     [Lenore Harmon]
17596    [William Kuvlesky, Jane Dameron]
17730      [Charles Elton, Harriett Rose]
17892                 [Jeffrey Greenhaus]
17961      [Samuel Osipow, August Scheid]
18508                 [Nathaniel Pallone]
18572                        [Nancy Cole]
18573                      [Milton Hakel]
Name: authors, dtype: object

In [6]:
# get edges
# Column-wise apply: same values as the row-wise version, far less overhead.
df["n_authors"] = df["authors"].progress_apply(len)
# Authors are sorted first so a given pair always appears in the same order.
df["edges"] = df["authors"].progress_apply(lambda names: get_edges(sorted(names)))
df[["n_authors", "edges"]].head(10)

100%|█████████████████████████████████| 980226/980226 [05:36<00:00, 2916.40it/s]
100%|█████████████████████████████████| 980226/980226 [03:00<00:00, 5420.00it/s]


Unnamed: 0,n_authors,edges
12279,1,[]
17594,2,"[(Donivan Watley, Rosalyn Kaplan)]"
17595,1,[]
17596,2,"[(Jane Dameron, William Kuvlesky)]"
17730,2,"[(Charles Elton, Harriett Rose)]"
17892,1,[]
17961,2,"[(August Scheid, Samuel Osipow)]"
18508,1,[]
18572,1,[]
18573,1,[]


In [7]:
# get subjects
# .copy() so the assignment below writes to an independent frame.
df = df[df["dc:subject"].notna()].copy()
# Column-wise apply: same values as the row-wise version, far less overhead.
df["subjects"] = df["dc:subject"].progress_apply(extract_subject)
df[["dc:subject", "subjects"]].head(10)

100%|█████████████████████████████████| 980222/980222 [03:00<00:00, 5439.04it/s]


Unnamed: 0,dc:subject,subjects
12279,"[Audiolingual Methods, Classical Languages, Cu...","[Audiolingual Methods, Classical Languages, Cu..."
17594,"[Academic Aspiration, {'@weight': 'MAJOR', '#t...","[Academic Aspiration, Aspiration, Females, Mar..."
17595,"[{'@weight': 'MAJOR', '#text': 'Adolescents'},...","[Adolescents, Aspiration, Career Choice, Child..."
17596,"[{'@weight': 'MAJOR', '#text': 'Aspiration'}, ...","[Aspiration, Black Attitudes, Black Youth, Mil..."
17730,"[{'@weight': 'MAJOR', '#text': 'Ability'}, {'@...","[Ability, Career Choice, Career Development, C..."
17892,"[Aspiration, {'@weight': 'MAJOR', '#text': 'Ca...","[Aspiration, Career Choice, Career Planning, C..."
17961,"[{'@weight': 'MAJOR', '#text': 'Behavior Chang...","[Behavior Change, Behavior Patterns, College S..."
18508,"[Employment Potential, {'@weight': 'MAJOR', '#...","[Employment Potential, Job Satisfaction, Liter..."
18572,"[Career Counseling, {'@weight': 'MAJOR', '#tex...","[Career Counseling, Career Guidance, College F..."
18573,"[College Students, {'@weight': 'MAJOR', '#text...","[College Students, Forced Choice Technique, In..."


In [8]:
# load genders from first names
df_genders = pd.read_csv("df_genders.csv")[["name", "gender"]]
# name -> gender lookup; for duplicated names the last row wins.
dict_genders = df_genders.set_index("name")["gender"].to_dict()

# get author first name and then use it to predict gender
# Column-wise apply: same values as the row-wise version, far less overhead.
df["author_first_names"] = df["authors"].progress_apply(get_first_names)

df["author_genders"] = df["author_first_names"].progress_apply(name_to_gender)
df[["author_first_names", "author_genders"]].head(10)

100%|█████████████████████████████████| 980222/980222 [03:51<00:00, 4228.04it/s]
100%|█████████████████████████████████| 980222/980222 [05:55<00:00, 2758.10it/s]


Unnamed: 0,author_first_names,author_genders
12279,[Rudolph],[male]
17594,"[Donivan, Rosalyn]","[male, female]"
17595,[Lenore],[female]
17596,"[William, Jane]","[male, female]"
17730,"[Charles, Harriett]","[male, female]"
17892,[Jeffrey],[male]
17961,"[Samuel, August]","[male, male]"
18508,[Nathaniel],[male]
18572,[Nancy],[female]
18573,[Milton],[male]


In [9]:
# Number of retained records per eric:dateAdded year, in chronological order.
df["eric:dateAdded"].value_counts(sort=False).sort_index()

1971       29
1972       43
1973       41
1974      386
1975     1375
1976     1296
1977      959
1978      856
1979     8564
1980    15317
1981    15367
1982    12699
1983    13806
1984    13964
1985    13629
1986    13589
1987    13703
1988    12983
1989    15242
1990    14651
1991    14810
1992    15235
1993    15201
1994    18089
1995    17971
1996    17603
1997    17300
1998    15935
1999    18417
2000    18990
2001    17051
2002    19610
2003    18545
2004     7053
2005    19040
2006    19453
2007    28843
2008    35349
2009    41340
2010    37589
2011    37225
2012    36459
2013    19884
2014    37284
2015    36753
2016    39032
2017    38620
2018    36063
2019    37674
2020    40255
2021    39050
Name: eric:dateAdded, dtype: int64

In [None]:
# Inspection: show the last five rows of the prepared frame.
df.tail()

In [None]:
# Inspection: show the record 2223 positions from the end of the frame.
# NOTE(review): the offset is a magic number tied to the current data — confirm it is still the intended record.
df.iloc[-2223]

In [None]:
# Inspection: identifier and raw dc:creator payload of the record 2423 positions from the end.
# NOTE(review): this offset (-2423) differs from the -2223 used in the cell above — confirm which record is meant.
print(df["dc:identifier"].iloc[-2423])
print(df["dc:creator"].iloc[-2423])

In [None]:
# Inspection: column dtypes, non-null counts and memory footprint of the prepared frame.
df.info()

## Graph Analysis

In [10]:
def generate_graph(df_local):
    """Build the undirected co-authorship graph for a set of papers.

    Parameters
    ----------
    df_local : pandas.DataFrame
        Must provide an ``"authors"`` column (list of names per paper) and an
        ``"edges"`` column (list of author pairs per paper). ``None`` cells
        are ignored.

    Returns
    -------
    tuple
        ``(G, n_papers_per_author)``: ``G`` is an ``nx.Graph`` with one node
        per distinct author and one edge per distinct pair (repeated pairs
        collapse). ``n_papers_per_author`` is the total number of authorships
        divided by the number of distinct authors, or ``nan`` when there are
        no authors at all.
    """
    author_lists = [a for a in df_local["authors"].tolist() if a is not None]
    authorships = [name for authors in author_lists for name in authors]
    unique_authors = set(authorships)

    # Guard the ratio: an empty selection used to raise ZeroDivisionError.
    if unique_authors:
        n_papers_per_author = len(authorships) / len(unique_authors)
    else:
        n_papers_per_author = float("nan")

    edge_lists = [e for e in df_local["edges"].tolist() if e is not None]

    G = nx.Graph()
    # Nodes are added explicitly so single-author papers appear as isolates.
    G.add_nodes_from(unique_authors)
    G.add_edges_from(pair for pairs in edge_lists for pair in pairs)

    return (G, n_papers_per_author)

### Cumulative graph

In [None]:
# Cumulative co-authorship network: for every year, build the graph of all
# papers added up to and including that year and record summary statistics.
# NOTE(review): the recorded progress bar shows later iterations taking very
# long; the all-pairs average_shortest_path_length on the largest component is
# the likely bottleneck — consider caching or sampling if a full re-run is needed.
graph_list = []

for year in tqdm(range(1965, 2022)):

    df_local = df.loc[df["eric:dateAdded"] <= year]

    if len(df_local) == 0:
        continue

    result = generate_graph(df_local)
    G = result[0]

    n_authors = len(G)
    if n_authors == 0:
        # No author nodes at all: every ratio below would divide by zero.
        continue

    n_papers = len(df_local)
    n_authors_per_paper = df_local["n_authors"].mean()
    n_papers_per_author = result[1]
    n_collabs = nx.number_of_edges(G)
    n_isolates = nx.number_of_isolates(G)
    # Mean degree: every edge contributes to two authors.
    mean_collabs = 2 * n_collabs / float(n_authors)

    # max(..., key=len) picks the same component as sorting and taking [0].
    G_largest_comp = G.subgraph(max(nx.connected_components(G), key=len))
    largest_component = len(G_largest_comp) / n_authors

    # default=set(): a graph without edges has no biconnected component, which
    # used to raise IndexError; its share is then reported as 0.
    G_largest_bicomp = G.subgraph(
        max(nx.biconnected_components(G), key=len, default=set())
    )
    largest_bicomponent = len(G_largest_bicomp) / n_authors

    deg_assort = nx.degree_assortativity_coefficient(G)
    avg_clustering = nx.average_clustering(G)
    transitivity = nx.transitivity(G)

    avg_path_len = nx.average_shortest_path_length(G_largest_comp)

    graph_list.append(
        (
            year,
            n_authors,
            n_papers,
            n_authors_per_paper,
            n_papers_per_author,
            n_collabs,
            n_isolates,
            mean_collabs,
            largest_component,
            largest_bicomponent,
            deg_assort,
            avg_clustering,
            transitivity,
            avg_path_len,
        )
    )

df_summary = pd.DataFrame(
    graph_list,
    columns=[
        "year",
        "n_authors",
        "n_papers",
        "n_authors_per_paper",
        "n_papers_per_author",
        "n_collabs",
        "n_isolates",
        "mean_collabs",
        "largest_component",
        "largest_bicomponent",
        "deg_assort",
        "avg_clustering",
        "transitivity",
        "avg_path_len",
    ],
)

df_summary

 60%|█████████████████████▍              | 34/57 [3:36:08<14:02:19, 2197.35s/it]

In [None]:
# Persist the per-year network summary (without the index) as UTF-8 CSV.
df_summary.to_csv("df_graph_summary.csv", encoding='utf-8', index=False)

### Descriptives

In [None]:
# Area chart: n_authors by year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", line=True)
    .encode(x="year", y="n_authors")
)

In [None]:
# Area chart: n_papers by year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", line=True)
    .encode(x="year", y="n_papers")
)

In [None]:
# Area chart: n_authors_per_paper by year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", line=True)
    .encode(x="year", y="n_authors_per_paper")
)

In [None]:
# Area chart: n_papers_per_author by year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", line=True)
    .encode(x="year", y="n_papers_per_author")
)

In [None]:
# Area chart: n_collabs by year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", line=True)
    .encode(x="year", y="n_collabs")
)

In [None]:
# Area chart: mean_collabs by year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", line=True)
    .encode(x="year", y="mean_collabs")
)

In [None]:
# Area chart: n_isolates by year.
# `line=True` (was the undefined name `Tru`, which raised NameError).
alt.Chart(df_summary).mark_area(
    color="lightblue", line=True
).encode(x="year", y="n_isolates")

In [None]:
# Area chart: largest_component by year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", line=True)
    .encode(x="year", y="largest_component")
)

In [None]:
# Area chart: deg_assort by year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", line=True)
    .encode(x="year", y="deg_assort")
)

In [None]:
# Area chart: avg_clustering by year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", line=True)
    .encode(x="year", y="avg_clustering")
)

In [None]:
# Area chart: transitivity by year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", line=True)
    .encode(x="year", y="transitivity")
)

In [None]:
# Whole-corpus co-authorship network: same statistics as the per-year summary,
# computed once over every retained record.
list1 = []

result = generate_graph(df)
G = result[0]
n_authors = len(G)
n_papers = len(df)

n_authors_per_paper = df["n_authors"].mean()
n_papers_per_author = result[1]

n_collabs = nx.number_of_edges(G)
n_isolates = nx.number_of_isolates(G)
# Mean degree: every edge contributes to two authors.
mean_collabs = 2 * n_collabs / float(n_authors)

# max(..., key=len) picks the same component as sorting and taking [0].
G_largest_comp = G.subgraph(max(nx.connected_components(G), key=len))
largest_component = len(G_largest_comp) / n_authors

# default=set(): a graph without edges has no biconnected component.
G_largest_bicomp = G.subgraph(
    max(nx.biconnected_components(G), key=len, default=set())
)
largest_bicomponent = len(G_largest_bicomp) / n_authors

deg_assort = nx.degree_assortativity_coefficient(G)
avg_clustering = nx.average_clustering(G)
transitivity = nx.transitivity(G)

avg_path_len = nx.average_shortest_path_length(G_largest_comp)

# Label the row with the latest year present in the data instead of relying
# on a `year` variable left over from an earlier loop (hidden kernel state).
latest_year = int(df["eric:dateAdded"].max())

list1.append(
    (
        latest_year,
        n_authors,
        n_papers,
        n_authors_per_paper,
        n_papers_per_author,
        n_collabs,
        n_isolates,
        mean_collabs,
        largest_component,
        largest_bicomponent,
        deg_assort,
        avg_clustering,
        transitivity,
        avg_path_len,
    )
)

df_overall = pd.DataFrame(
    list1,
    columns=[
        "year",
        "n_authors",
        "n_papers",
        "n_authors_per_paper",
        "n_papers_per_author",
        "n_collabs",
        "n_isolates",
        "mean_collabs",
        "largest_component",
        "largest_bicomponent",
        "deg_assort",
        "avg_clustering",
        "transitivity",
        "avg_path_len",
    ],
)
df_overall

In [None]:
# One-line size summary of the graph. Written out explicitly because
# nx.info() was removed in NetworkX 3.0.
f"Graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges"

In [None]:
# most influential researchers
ec_dict = nx.eigenvector_centrality(G)
# Show only the top of the ranking: rendering an entry for every node in the
# graph would flood the notebook output. The full scores stay in ec_dict.
sorted(ec_dict.items(), key=lambda item: item[1], reverse=True)[:25]

## Analyses for paper

### Analysis 1: Changing likelihood of co-authorships

In [None]:
# Flag co-authored records: 1 when a record has more than one author, else 0.
df["collaboration"] = (df["n_authors"] > 1).astype(int)
df.tail()

In [None]:
# Share of co-authored records per eric:dateAdded year, drawn as a line.
df_fig = df.groupby("eric:dateAdded", as_index=False)[["collaboration"]].mean()

(
    alt.Chart(df_fig)
    .mark_line()
    .encode(x="eric:dateAdded", y="collaboration")
)

### Analysis 2: Size of the largest biconnected component

In [None]:
# Fraction of all authors that belong to the largest biconnected component.
# max(..., key=len) avoids sorting every component just to take the biggest;
# default=set() covers a graph without edges, which has no biconnected
# component and used to raise IndexError.
G_largest_bicomp = G.subgraph(
    max(nx.biconnected_components(G), key=len, default=set())
)
largest_bicomponent = len(G_largest_bicomp) / len(G)
largest_bicomponent

### Analysis 3: Degree distribution and power-law fit

In [None]:
# Degrees of all nodes in G, largest first.
degree_sequence = [degree for _node, degree in G.degree()]
degree_sequence.sort(reverse=True)

In [None]:
# Log-log scatter of how many nodes have each degree; x is limited to 10..1000.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlim([10, 1000])
degree_values, degree_counts = np.unique(degree_sequence, return_counts=True)
ax.scatter(degree_values, degree_counts)

In [None]:
# check if power law holds
# NOTE(review): this import sits mid-notebook; moving it to the top import cell
# would make the dependency visible up front.
import powerlaw
# Fit the degree sequence and report the power-law exponent and lower cutoff.
results = powerlaw.Fit(degree_sequence)
print(results.power_law.alpha)
print(results.power_law.xmin)
# Compare the power-law fit against an exponential fit on the same data.
results.distribution_compare('power_law', 'exponential', normalized_ratio = True)

### Analysis 4: Comparison with a random graph (clustering and path length)

In [None]:
# Random G(n, m) reference graph with as many nodes and edges as G.
# The sizes are read from G instead of being hard-coded (previously 749249
# nodes and 1704772 edges), so the comparison stays matched if the data change.
G_random = nx.gnm_random_graph(
    G.number_of_nodes(), G.number_of_edges(), seed=27, directed=False
)
# Explicit summary string: nx.info() was removed in NetworkX 3.0.
f"Graph with {G_random.number_of_nodes()} nodes and {G_random.number_of_edges()} edges"

In [None]:
# Average clustering coefficient of the co-authorship graph G.
nx.average_clustering(G)

In [None]:
# Average clustering coefficient of the random reference graph.
nx.average_clustering(G_random)

In [None]:
# Average shortest path length within G_largest_comp.
# NOTE(review): G_largest_comp is whatever the most recently run cell assigned;
# the whole-corpus cell must be the last one run for this to describe G.
nx.average_shortest_path_length(G_largest_comp)

In [None]:
# Average shortest path length within the largest connected component of the
# random reference graph.
largest_random_nodes = max(nx.connected_components(G_random), key=len)
G_random_largest_comp = G_random.subgraph(largest_random_nodes)
nx.average_shortest_path_length(G_random_largest_comp)

### Analysis 5: Subjects

In [None]:
# Flatten the per-record subject lists into one list of labels, skipping
# records whose subjects are None.
node_list1 = df["subjects"].to_list()
node_list2 = list(filter(lambda labels: labels is not None, node_list1))
node_list3 = [label for labels in node_list2 for label in labels]

In [None]:
# Export subject frequencies (most common first) for manual quant 0/1 coding.
subject_counts = Counter(node_list3).most_common()
df_temp = pd.DataFrame(subject_counts, columns=["subject", "n"])
df_temp.to_csv("subjects.csv", encoding='utf-8', index=False)