In [1]:
from itertools import combinations
import re
from tqdm import tqdm
import pandas as pd
import xmltodict
import networkx as nx
import collections
import matplotlib.pyplot as plt
from unidecode import unidecode
import altair as alt
tqdm.pandas()

In [2]:
def extract_authors(dc):
    """Extract cleaned personal-author names from a parsed ``dc:creator`` value.

    Parameters
    ----------
    dc : list, mapping, str, or None
        Either a list of creator entries or a single entry. A mapping entry
        is read through its ``"#text"`` (the name) and ``"@scheme"`` keys;
        a plain string entry is taken to be the name itself. Anything else
        (``None``, NaN, ...) is treated as "no creator".

    Returns
    -------
    list of str or None
        For a list input: the title-cased, stripped, ASCII-folded names
        (possibly an empty list). For a single entry: a one-element list,
        or ``None`` when the entry yields no usable name.

    Entries whose ``@scheme`` is ``"institution"``, entries without text, and
    the placeholder ``"And Others"`` are dropped in both the list case and
    the single-entry case.
    """

    def _author_name(entry):
        # Return the cleaned name for one creator entry, or None to drop it.
        if isinstance(entry, str):
            text = entry
        elif hasattr(entry, "get"):
            # Duck-typed on .get() so that any mapping type (OrderedDict or
            # a plain dict) is accepted.
            if entry.get("@scheme") == "institution":
                return None
            text = entry.get("#text")
        else:
            # None / NaN / unexpected types carry no author information.
            return None
        if text is None:
            return None
        name = unidecode(text.title().strip())
        # "And Others" is a placeholder, not a person.
        return None if name == "And Others" else name

    if isinstance(dc, list):
        candidates = (_author_name(entry) for entry in dc)
        return [name for name in candidates if name is not None]

    name = _author_name(dc)
    return [name] if name is not None else None


def clean_name(name):
    """Convert a ``"Last, First Middle"`` style name to ``"First Last"``.

    Parameters
    ----------
    name : str
        Author name. Names without a comma are returned unchanged.

    Returns
    -------
    str
        The first word after the first comma followed by the first word
        before it, separated by a single space. If one of the two sides is
        empty (e.g. ``"Smith,"``), only the other side is returned, so the
        result never carries a stray leading or trailing space.

    Only the first word of each side is kept, so middle names and initials
    are dropped. Anything after a second comma (e.g. a ``"Jr."`` suffix) is
    ignored.
    """
    if ',' not in name:
        return name

    fields = name.split(',')
    # str.split() with no argument splits on any run of whitespace and
    # ignores leading/trailing blanks, so "Smith,John", "Smith, John" and
    # "Smith,  John" are all parsed the same way.
    family_words = fields[0].split()
    given_words = fields[1].split()

    # NOTE(review): the family name is also cut to its first word, so
    # "Van Der Berg, Jan" becomes "Jan Van" - confirm this is intended.
    kept = [words[0] for words in (given_words, family_words) if words]
    return ' '.join(kept)


def get_edges(auth_list):
    """Return every unordered pair of distinct authors of one paper.

    Parameters
    ----------
    auth_list : iterable of hashable
        Author names of a single paper.

    Returns
    -------
    list of tuple
        All 2-combinations of the distinct names, in first-appearance order.
        Empty when fewer than two distinct names are present.

    A name that occurs more than once in ``auth_list`` is used only once, so
    no ``(a, a)`` self-loop edge and no repeated pair is produced.
    """
    seen = set()
    unique_authors = []
    for author in auth_list:
        if author not in seen:
            seen.add(author)
            unique_authors.append(author)
    return list(combinations(unique_authors, 2))

def extract_ids(dc):
    """Return the ERIC accession number from a parsed ``dc:identifier`` value.

    Parameters
    ----------
    dc : list, mapping, or None
        Either a list of identifier entries or a single entry. Each entry is
        read through its ``"#text"`` and ``"@scheme"`` keys.

    Returns
    -------
    str or None
        The upper-cased, stripped text of the first entry whose ``@scheme``
        is ``"eric_accno"`` and whose text is present; ``None`` when there is
        no such entry (including an empty list or a missing value).
    """
    entries = dc if isinstance(dc, list) else [dc]
    for entry in entries:
        # Skip anything that is not mapping-like (None, NaN, bare strings).
        if not hasattr(entry, "get"):
            continue
        text = entry.get("#text")
        if text is not None and entry.get("@scheme") == "eric_accno":
            return text.upper().strip()
    return None

In [3]:
# Load one XML dump per year, keep records that have a document type and are
# flagged as peer reviewed, and collect the per-year frames.
yearly_frames = []

for year in tqdm(range(1965, 2021)):
    file_name = "data/eric" + str(year)
    with open(file_name + ".xml", encoding="utf-8") as fd:
        # Bound to `parsed` (not `dict`) so the builtin `dict` stays usable
        # in every later cell.
        parsed = xmltodict.parse(fd.read())
    recs = [rec["metadata"] for rec in parsed["records"]["record"]]
    df = pd.DataFrame(recs)
    df = df[df['dc:type'].notna()]
    df = df[df['eric:peer_reviewed'].notna()]
    # Joining works whether dc:type holds one string or a list of strings;
    # the result is a single lower-cased string per record.
    df['type'] = [''.join(map(str, doc_type)).lower() for doc_type in df['dc:type']]
    df = df.loc[df['eric:peer_reviewed'] == 'T']
    yearly_frames.append(df)
df_all = pd.concat(yearly_frames)

# Restrict to journal-type records, then derive the author-related columns.
# Each derived column depends on a single source column, so Series.map is
# used instead of a row-wise DataFrame.apply(axis=1).
df_all = df_all.loc[(df_all['type'].str.contains("journal"))]
df_all["authors"] = df_all["dc:creator"].map(extract_authors)
df_all = df_all[df_all['authors'].notna()]
df_all['authors'] = df_all['authors'].map(lambda authors: [clean_name(item) for item in authors])
df_all['n_authors'] = df_all['authors'].map(len)
# Authors are sorted first so that each pair is always emitted in the same order.
df_all["edges"] = df_all["authors"].map(lambda authors: get_edges(sorted(authors)))
df_all["ids"] = df_all["dc:identifier"].map(extract_ids)
df_all.info()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 56/56 [10:41<00:00, 11.46s/it]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 908421 entries, 17594 to 44897
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   dcterms:accessRights       908421 non-null  object
 1   dc:subject                 908417 non-null  object
 2   dc:creator                 908421 non-null  object
 3   dc:type                    908421 non-null  object
 4   eric:keywords              398883 non-null  object
 5   eric:keywords_geo          301814 non-null  object
 6   eric:issn                  746293 non-null  object
 7   dc:language                905402 non-null  object
 8   dcterms:educationLevel     326980 non-null  object
 9   dc:description             907932 non-null  object
 10  dc:identifier              908421 non-null  object
 11  dc:title                   908421 non-null  object
 12  dc:source                  908421 non-null  object
 13  eric:citation              908376 non-nul

In [4]:
# Fraction of records with a missing dcterms:educationLevel, per eric:dateAdded value.
(
    df_all
    .groupby(['eric:dateAdded'])['dcterms:educationLevel']
    .apply(lambda levels: levels.isna().mean())
)

eric:dateAdded
1971    1.000000
1972    1.000000
1973    1.000000
1974    1.000000
1975    1.000000
1976    1.000000
1977    1.000000
1978    1.000000
1979    1.000000
1980    0.999933
1981    1.000000
1982    1.000000
1983    1.000000
1984    1.000000
1985    1.000000
1986    1.000000
1987    1.000000
1988    1.000000
1989    1.000000
1990    1.000000
1991    1.000000
1992    0.999933
1993    1.000000
1994    0.999529
1995    0.999582
1996    0.999814
1997    1.000000
1998    1.000000
1999    1.000000
2000    0.999944
2001    1.000000
2002    1.000000
2003    0.999885
2004    0.998445
2005    0.592437
2006    0.592186
2007    0.481067
2008    0.456899
2009    0.425032
2010    0.389724
2011    0.399492
2012    0.382241
2013    0.383530
2014    0.320045
2015    0.322835
2016    0.318985
2017    0.321002
2018    0.332780
2019    0.324151
2020    0.327627
Name: dcterms:educationLevel, dtype: float64

In [5]:
def extract_subject(dc):
    """Extract cleaned subject labels from a parsed ``dc:subject`` value.

    Parameters
    ----------
    dc : list, tuple, mapping, str, or None
        A sequence of subject entries, or a single entry. A mapping entry is
        read through its ``"#text"`` key; any other entry is used as the
        label text itself.

    Returns
    -------
    list of str or None
        Title-cased, stripped, ASCII-folded labels in input order. Entries
        without text are skipped. ``None`` is returned when ``dc`` itself is
        missing (``None`` or a float such as NaN).
    """
    if dc is None or isinstance(dc, float):
        return None

    # A single entry is wrapped so that a lone string is not iterated
    # character by character and a lone mapping is not iterated key by key.
    # NOTE(review): presumably the parser yields a bare value when a record
    # has exactly one subject (extract_authors handles dc:creator the same
    # way) - confirm against the data.
    items = dc if isinstance(dc, (list, tuple)) else [dc]

    result = []
    for item in items:
        # Duck-typed on .get() so that both OrderedDict and plain dict
        # entries are accepted.
        text = item.get("#text") if hasattr(item, "get") else item
        if text is None:
            continue
        result.append(unidecode(str(text).title().strip()))
    return result

# Spot-check extract_subject on a single record, picked by position (iloc)
# rather than by index label; the cleaned labels are the cell's output.
extract_subject(df_all["dc:subject"].iloc[17722])

['Anxiety',
 'College Students',
 'Counselor Client Relationship',
 'Depression (Psychology)',
 'Males',
 'Methods',
 'Participation',
 'Patients',
 'Psychotherapy']

In [6]:
# Drop records without a dc:subject value, then store the cleaned subject
# labels of every remaining record in a new "subjects" column.
df_all = df_all.loc[df_all['dc:subject'].notna()]
df_all["subjects"] = df_all.progress_apply(
    lambda record: extract_subject(record["dc:subject"]),
    axis=1,
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 908417/908417 [00:46<00:00, 19395.94it/s]


In [7]:
# Show the per-record subject lists produced by extract_subject.
df_all["subjects"]

17594    [Academic Aspiration, Aspiration, Females, Mar...
17595    [Adolescents, Aspiration, Career Choice, Child...
17596    [Aspiration, Black Attitudes, Black Youth, Mil...
17730    [Ability, Career Choice, Career Development, C...
17892    [Aspiration, Career Choice, Career Planning, C...
                               ...                        
44893    [Metacognition, Learning Strategies, College S...
44894    [Educational Technology, Technology Uses In Ed...
44895    [School Administration, Principals, Leadership...
44896    [Mathematics Anxiety, Mathematics Achievement,...
44897    [College Athletics, Racial Bias, Gender Bias, ...
Name: subjects, Length: 908417, dtype: object

In [10]:
# First-listed subject of every record. An empty or missing subject list maps
# to None instead of raising IndexError; value_counts() leaves missing values
# out of the tally by default, so the counts cover labelled records only.
df_all.loc[:, 'sub1'] = df_all['subjects'].map(
    lambda subjects: subjects[0] if subjects else None
)
df_all['sub1'].value_counts()

Foreign Countries          55703
Academic Achievement       14269
Adolescents                12207
Higher Education            9843
College Students            7813
                           ...  
Portfolios Assessment          1
Rural Environment              1
Handwriting Instruction        1
Middle Class Standards         1
Equal Facilities               1
Name: sub1, Length: 5252, dtype: int64

In [11]:
# Build the list of distinct subject labels across all records and show how
# many there are.
node_list1 = list(df_all["subjects"])
node_list2 = [subjects for subjects in node_list1 if subjects is not None]  # drop missing lists
node_list3 = [label for subjects in node_list2 for label in subjects]  # flatten
node_list = [*set(node_list3)]
len(node_list)

6630

In [None]:
# Number of records per eric:dateAdded value, ordered by that value.
df_all['eric:dateAdded'].value_counts().sort_index()

In [None]:
def generate_graph(df_local):
    """Build the co-authorship graph for a set of records.

    Parameters
    ----------
    df_local : pandas.DataFrame
        Must provide an ``"authors"`` column (list of names per record, or
        ``None``) and an ``"edges"`` column (list of name pairs per record,
        or ``None``).

    Returns
    -------
    tuple
        ``(G, n_papers_per_author)`` where ``G`` is an undirected
        ``networkx.Graph`` with one node per distinct author name and one
        edge per distinct pair, and ``n_papers_per_author`` is the total
        number of author entries divided by the number of distinct names.
        The ratio is NaN when there are no author names at all.
    """
    author_lists = [authors for authors in df_local["authors"].tolist() if authors is not None]
    authorships = [author for authors in author_lists for author in authors]
    unique_authors = list(set(authorships))

    # Guard the ratio: with no author names the division would raise
    # ZeroDivisionError.
    if unique_authors:
        n_papers_per_author = len(authorships) / len(unique_authors)
    else:
        n_papers_per_author = float("nan")

    edge_lists = [edges for edges in df_local["edges"].tolist() if edges is not None]
    edge_list = [edge for edges in edge_lists for edge in edges]

    # Nodes are added explicitly so that authors without any co-author still
    # appear in the graph (as isolated nodes).
    G = nx.Graph()
    G.add_nodes_from(unique_authors)
    G.add_edges_from(edge_list)
    return (G, n_papers_per_author)

In [None]:
# year by year (NOT cumulative)
# One summary row of co-authorship network statistics per eric:dateAdded year.
list1 = []

for year in tqdm(range(1965, 2021)):

    df_local = df_all.loc[df_all['eric:dateAdded'] == str(year)]

    if len(df_local) == 0:
        continue

    # A year whose records carry no author names at all would give an empty
    # graph, and every ratio below would divide by zero - skip it.
    if df_local['n_authors'].sum() == 0:
        continue

    result = generate_graph(df_local)
    G = result[0]
    n_authors = len(G)  # number of nodes = distinct author names
    n_papers = len(df_local)

    n_authors_per_paper = df_local['n_authors'].mean()
    n_papers_per_author = result[1]

    n_collabs = nx.number_of_edges(G)
    n_isolates = nx.number_of_isolates(G)
    # Mean degree: every edge contributes to the degree of two nodes.
    mean_collabs = 2 * n_collabs / float(n_authors)

    # max() finds the largest component without sorting all of them.
    G_largest_comp = G.subgraph(max(nx.connected_components(G), key=len))
    # Fraction of all authors that belong to the largest connected component.
    largest_component = len(G_largest_comp) / n_authors

    deg_assort = nx.degree_assortativity_coefficient(G)
    avg_clustering = nx.average_clustering(G)
    transitivity = nx.transitivity(G)

    list1.append((year, n_authors, n_papers, n_authors_per_paper, n_papers_per_author, n_collabs, n_isolates, mean_collabs, largest_component, deg_assort, avg_clustering, transitivity))

df_summary = pd.DataFrame(list1, columns = ["year", "n_authors", "n_papers", "n_authors_per_paper", "n_papers_per_author", "n_collabs", "n_isolates", "mean_collabs", "largest_component", "deg_assort", "avg_clustering", "transitivity"])
df_summary

In [None]:
# Distinct authors (graph nodes) per year.
alt.Chart(df_summary).mark_area(
    color="lightblue",
    interpolate='step-after',
    line=True
).encode(
    # zero=False keeps the year axis from being stretched down to 0, and
    # format='d' prints ticks as "1990" rather than "1,990".
    x=alt.X('year:Q', scale=alt.Scale(zero=False), axis=alt.Axis(format='d'), title='Year'),
    y=alt.Y('n_authors:Q', title='Distinct authors')
).properties(
    title='Distinct authors per year'
)

In [None]:
# Number of records (papers) per year.
alt.Chart(df_summary).mark_area(
    color="lightblue",
    interpolate='step-after',
    line=True
).encode(
    # zero=False keeps the year axis from being stretched down to 0, and
    # format='d' prints ticks as "1990" rather than "1,990".
    x=alt.X('year:Q', scale=alt.Scale(zero=False), axis=alt.Axis(format='d'), title='Year'),
    y=alt.Y('n_papers:Q', title='Papers')
).properties(
    title='Papers per year'
)

In [None]:
# Mean length of the author list per paper, per year.
alt.Chart(df_summary).mark_area(
    color="lightblue",
    interpolate='step-after',
    line=True
).encode(
    # zero=False keeps the year axis from being stretched down to 0, and
    # format='d' prints ticks as "1990" rather than "1,990".
    x=alt.X('year:Q', scale=alt.Scale(zero=False), axis=alt.Axis(format='d'), title='Year'),
    y=alt.Y('n_authors_per_paper:Q', title='Mean authors per paper')
).properties(
    title='Mean authors per paper, by year'
)

In [None]:
# Author entries divided by distinct authors, per year.
alt.Chart(df_summary).mark_area(
    color="lightblue",
    interpolate='step-after',
    line=True
).encode(
    # zero=False keeps the year axis from being stretched down to 0, and
    # format='d' prints ticks as "1990" rather than "1,990".
    x=alt.X('year:Q', scale=alt.Scale(zero=False), axis=alt.Axis(format='d'), title='Year'),
    y=alt.Y('n_papers_per_author:Q', title='Mean papers per author')
).properties(
    title='Mean papers per author, by year'
)

In [None]:
# Number of co-authorship edges in each year's graph.
alt.Chart(df_summary).mark_area(
    color="lightblue",
    interpolate='step-after',
    line=True
).encode(
    # zero=False keeps the year axis from being stretched down to 0, and
    # format='d' prints ticks as "1990" rather than "1,990".
    x=alt.X('year:Q', scale=alt.Scale(zero=False), axis=alt.Axis(format='d'), title='Year'),
    y=alt.Y('n_collabs:Q', title='Co-authorship edges')
).properties(
    title='Co-authorship edges per year'
)

In [None]:
# Mean node degree (2 * edges / nodes) of each year's graph.
alt.Chart(df_summary).mark_area(
    color="lightblue",
    interpolate='step-after',
    line=True
).encode(
    # zero=False keeps the year axis from being stretched down to 0, and
    # format='d' prints ticks as "1990" rather than "1,990".
    x=alt.X('year:Q', scale=alt.Scale(zero=False), axis=alt.Axis(format='d'), title='Year'),
    y=alt.Y('mean_collabs:Q', title='Mean collaborators per author')
).properties(
    title='Mean collaborators per author, by year'
)

In [None]:
# Number of isolated nodes (authors with no co-author edge) per year.
alt.Chart(df_summary).mark_area(
    color="lightblue",
    interpolate='step-after',
    line=True
).encode(
    # zero=False keeps the year axis from being stretched down to 0, and
    # format='d' prints ticks as "1990" rather than "1,990".
    x=alt.X('year:Q', scale=alt.Scale(zero=False), axis=alt.Axis(format='d'), title='Year'),
    y=alt.Y('n_isolates:Q', title='Authors without co-authors')
).properties(
    title='Isolated authors per year'
)

In [None]:
# Fraction of each year's authors that sit in the largest connected component.
alt.Chart(df_summary).mark_area(
    color="lightblue",
    interpolate='step-after',
    line=True
).encode(
    # zero=False keeps the year axis from being stretched down to 0, and
    # format='d' prints ticks as "1990" rather than "1,990".
    x=alt.X('year:Q', scale=alt.Scale(zero=False), axis=alt.Axis(format='d'), title='Year'),
    y=alt.Y('largest_component:Q', title='Share of authors in largest component')
).properties(
    title='Relative size of the largest connected component, by year'
)

In [None]:
# Degree assortativity coefficient of each year's graph.
alt.Chart(df_summary).mark_area(
    color="lightblue",
    interpolate='step-after',
    line=True
).encode(
    # zero=False keeps the year axis from being stretched down to 0, and
    # format='d' prints ticks as "1990" rather than "1,990".
    x=alt.X('year:Q', scale=alt.Scale(zero=False), axis=alt.Axis(format='d'), title='Year'),
    y=alt.Y('deg_assort:Q', title='Degree assortativity coefficient')
).properties(
    title='Degree assortativity, by year'
)

In [None]:
# Average clustering coefficient of each year's graph.
alt.Chart(df_summary).mark_area(
    color="lightblue",
    interpolate='step-after',
    line=True
).encode(
    # zero=False keeps the year axis from being stretched down to 0, and
    # format='d' prints ticks as "1990" rather than "1,990".
    x=alt.X('year:Q', scale=alt.Scale(zero=False), axis=alt.Axis(format='d'), title='Year'),
    y=alt.Y('avg_clustering:Q', title='Average clustering coefficient')
).properties(
    title='Average clustering coefficient, by year'
)

In [None]:
# Transitivity of each year's graph.
alt.Chart(df_summary).mark_area(
    color="lightblue",
    interpolate='step-after',
    line=True
).encode(
    # zero=False keeps the year axis from being stretched down to 0, and
    # format='d' prints ticks as "1990" rather than "1,990".
    x=alt.X('year:Q', scale=alt.Scale(zero=False), axis=alt.Axis(format='d'), title='Year'),
    y=alt.Y('transitivity:Q', title='Transitivity')
).properties(
    title='Transitivity, by year'
)

In [None]:
# Same statistics as the per-year summary, computed once over all records.
list1 = []

result = generate_graph(df_all)
G = result[0]
n_authors = len(G)  # number of nodes = distinct author names
n_papers = len(df_all)

n_authors_per_paper = df_all['n_authors'].mean()
n_papers_per_author = result[1]

n_collabs = nx.number_of_edges(G)
n_isolates = nx.number_of_isolates(G)
# Mean degree: every edge contributes to the degree of two nodes.
mean_collabs = 2 * G.number_of_edges() / float(G.number_of_nodes())

# max() finds the largest component without sorting all of them.
G_largest_comp = G.subgraph(max(nx.connected_components(G), key=len))
largest_component = len(G_largest_comp)/len(G)

deg_assort = nx.degree_assortativity_coefficient(G)
avg_clustering = nx.average_clustering(G)
transitivity = nx.transitivity(G)

# The row covers every year, so it gets an explicit "all" label. The variable
# `year` is not assigned in this cell; reading it here would only pick up
# whatever value an earlier loop happened to leave behind.
list1.append(("all", n_authors, n_papers, n_authors_per_paper, n_papers_per_author, n_collabs, n_isolates, mean_collabs, largest_component, deg_assort, avg_clustering, transitivity))

df_overall = pd.DataFrame(list1, columns = ["year", "n_authors", "n_papers", "n_authors_per_paper", "n_papers_per_author", "n_collabs", "n_isolates", "mean_collabs", "largest_component", "deg_assort", "avg_clustering", "transitivity"])
df_overall

In [None]:
# nx.info() was deprecated in NetworkX 2.7 and removed in 3.0, so the short
# summary is built from the graph's own counters, which exist in every version.
f"Graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges"

In [None]:
# Create the figure that draw_ego() below will draw onto.
plt.figure(figsize=(12,12))

def draw_ego(name, seed=42):
    """Draw the ego network of one author from the module-level graph ``G``.

    Parameters
    ----------
    name : hashable
        Node (author name) at the centre of the ego network. It must be a
        node of ``G``.
    seed : int or None, default 42
        Seed passed to ``nx.spring_layout`` so that re-running the cell gives
        the same node positions. Pass ``None`` for a different random layout
        on every call.

    The ego network is drawn with small blue labelled nodes; the centre node
    is then drawn again, larger and in red, on top of it.
    """
    hub_ego = nx.ego_graph(G, name)
    pos = nx.spring_layout(hub_ego, seed=seed)
    nx.draw(hub_ego, pos, node_color="b", node_size=50, with_labels=True)
    # Highlight the centre node by drawing it again on top.
    options = {"node_size": 300, "node_color": "r"}
    nx.draw_networkx_nodes(hub_ego, pos, nodelist=[name], **options)

draw_ego("Linda Darling-Hammond")