In [1]:
from itertools import combinations
import re
import os
from tqdm import tqdm
import pandas as pd
import xmltodict
import networkx as nx
import pickle
import collections
import matplotlib.pyplot as plt
from unidecode import unidecode
import altair as alt

tqdm.pandas()
from collections import Counter
import string

# from genderize import Genderize
# genderize = Genderize(
#     user_agent='GenderizeDocs/0.0',
#     api_key='c363eacf807f4af4992b358200ebc15c',
#     timeout=30.0)

with open("dict_genders.pickle", "rb") as handle:
    dict_genders = pickle.load(handle)

pd.set_option("display.max_columns", None)

In [2]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import gensim.corpora as corpora
import pyLDAvis.gensim_models
import pyLDAvis

stop_words = stopwords.words('english')
stop_words.extend(['from', 're', 'use'])

def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saurabh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def extract_authors(dc):
    if type(dc) == list:
        result = [
            unidecode(i.get("#text").title().strip())
            for i in dc
            if i.get("#text") is not None and i.get("@scheme") != "institution"
        ]
        return [x for x in result if x != "And Others"]
    elif dc.get("#text") is not None and dc.get("@scheme") != "institution":
        return [unidecode(dc.get("#text").title().strip())]


def clean_name(name):
    if ", " in name:
        lst = name.split(", ")
        lst = [item.split(" ")[0] for item in lst]
        return lst[1] + " " + lst[0]
    elif "," in name:
        lst = name.split(",")
        lst = [item.split(" ")[0] for item in lst]
        return lst[1] + " " + lst[0]
    else:
        return name


def get_first_names(author_list):

    if len(author_list) > 0:
        names = []
        for x in author_list:
            words_in_name = len(x.split())

            if words_in_name > 0:
                first = re.sub(r"[^\w\s]", "", x.split()[0])
            if words_in_name > 1:
                second = re.sub(r"[^\w\s]", "", x.split()[1])
            if words_in_name > 2:
                third = re.sub(r"[^\w\s]", "", x.split()[2])
            if words_in_name > 3:
                fourth = re.sub(r"[^\w\s]", "", x.split()[3])

            if words_in_name > 0 and len(first) > 1:
                names.append(first)
            elif words_in_name > 1 and len(second) > 1:
                names.append(second)
            elif words_in_name > 2 and len(third) > 1:
                names.append(third)
            elif words_in_name > 3 and len(fourth) > 1:
                names.append(fourth)
            else:
                names.append(x)

        return names
    else:
        return None


def name_to_gender(first_name_list):
    if first_name_list and len(first_name_list) > 0:
        return [dict_genders[name] for name in first_name_list]
    else:
        return None


def get_edges(auth_list):
    return list(combinations(auth_list, 2))


def extract_ids(dc):
    if type(dc) == list:
        return [
            i.get("#text").upper().strip()
            for i in dc
            if i.get("#text") is not None and i.get("@scheme") == "eric_accno"
        ][0]
    elif dc.get("#text") is not None and dc.get("@scheme") == "eric_accno":
        return dc.get("#text").upper().strip()


def extract_subject(dc):
    result = []
    for item in dc:
        if type(item) == collections.OrderedDict:
            result.append(unidecode(item.get("#text").title().strip()))
        elif type(item) == str:
            result.append(unidecode(item.title().strip()))
        else:
            result.append(unidecode(item.title().strip()))
    return result


def plot_degree_dist(G):
    degrees = [G.degree(n) for n in G.nodes()]
    plt.hist(degrees)
    plt.show()

In [4]:
df_all = []

for year in tqdm(range(1965, 2021)):
    file_name = "data/eric" + str(year)
    with open(file_name + ".xml", encoding="utf-8") as fd:
        dict = xmltodict.parse(fd.read())
    recs = [rec["metadata"] for rec in dict["records"]["record"]]
    df = pd.DataFrame(recs)

    df = df[df["dc:type"].notna()]
    df = df[df["eric:peer_reviewed"].notna()]
    df["type"] = ["".join(map(str, l)).lower() for l in df["dc:type"]]
    df = df.loc[df["eric:peer_reviewed"] == "T"]
    # df = df[['ids', 'authors', 'edges', 'dc:type', 'dc:subject', 'eric:keywords', 'eric:keywords_geo', 'dc:title', 'eric:pageCount', 'dc:date', 'eric:dateAdded']]
    df_all.append(df)

df_all = pd.concat(df_all)

100%|███████████████████████████████████████████| 56/56 [07:45<00:00,  8.32s/it]


In [93]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 908417 entries, 17594 to 44897
Data columns (total 43 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   dcterms:accessRights       908417 non-null  object
 1   dc:subject                 908417 non-null  object
 2   dc:creator                 908417 non-null  object
 3   dc:type                    908417 non-null  object
 4   eric:keywords              398882 non-null  object
 5   eric:keywords_geo          301814 non-null  object
 6   eric:issn                  746289 non-null  object
 7   dc:language                905398 non-null  object
 8   dcterms:educationLevel     326980 non-null  object
 9   dc:description             907928 non-null  object
 10  dc:identifier              908417 non-null  object
 11  dc:title                   908417 non-null  object
 12  dc:source                  908417 non-null  object
 13  eric:citation              908372 non-nul

In [6]:
df_all["eric:dateAdded"] = pd.to_numeric(df_all["eric:dateAdded"])

In [7]:
df_all = df_all.loc[(df_all["type"].str.contains("journal"))]

# get author names
df_all["authors"] = df_all.progress_apply(lambda row: extract_authors(row["dc:creator"]), axis=1)
df_all = df_all[df_all["authors"].notna()]  # remove nan authors
df_all["authors"] = df_all.progress_apply(
    lambda row: [clean_name(item) for item in row["authors"]], axis=1
)

# get edges
df_all["n_authors"] = df_all.progress_apply(lambda row: len(row["authors"]), axis=1)
df_all["edges"] = df_all.progress_apply(
    lambda row: get_edges(sorted(row["authors"])), axis=1
)
# df_all["ids"] = df_all.progress_apply(lambda row: extract_ids(row["dc:identifier"]), axis=1)

# get subjects
df_all = df_all[df_all["dc:subject"].notna()]
df_all["subjects"] = df_all.progress_apply(
    lambda row: extract_subject(row["dc:subject"]), axis=1
)
df_all["subjects"] = df_all["subjects"].map(lambda x: list(map(str.lower, x)))
# df_all.loc[:, "subject_top"] = df_all.subjects.map(lambda x: x[0])

# get author first name and then use it to predict gender
df_all["author_first_names"] = df_all.progress_apply(
    lambda row: get_first_names(row["authors"]), axis=1
)
df_all["author_genders"] = df_all.progress_apply(
    lambda row: name_to_gender(row["author_first_names"]), axis=1
)  # get genders from dict

100%|█████████████████████████████████| 913835/913835 [02:25<00:00, 6279.98it/s]
100%|█████████████████████████████████| 908421/908421 [01:54<00:00, 7908.64it/s]
100%|█████████████████████████████████| 908421/908421 [01:50<00:00, 8201.76it/s]
100%|████████████████████████████████| 908421/908421 [00:50<00:00, 17933.17it/s]
100%|█████████████████████████████████| 908417/908417 [01:39<00:00, 9086.89it/s]
100%|█████████████████████████████████| 908417/908417 [03:21<00:00, 4516.54it/s]
100%|█████████████████████████████████| 908417/908417 [02:57<00:00, 5113.09it/s]


In [8]:
df_all["subjects_str"] = [",".join(map(str, l)).lower() for l in df_all["subjects"]]

## Subject categories

Subjects are terms from the ERIC Thesaurus added to records to help identify materials on topics in the field of education. Subject terms are assigned to every record in ERIC and reflect the subjects specified in the content.

In [89]:
df_temp["dc:title"].iloc[1000]

'The Development and Implementation of a Peer-Led Intervention to Prevent Smoking among Secondary School Students Using Their Established Social Networks'

In [90]:
df_temp["dc:description"].iloc[1000]

'Objective: To design, implement and evaluate a peer-led intervention to reduce smoking amongst secondary school students. Design: A health promotion intervention combining peer education with diffusion of innovation theory, to be rigorously evaluated by means of a cluster randomised controlled trial with concurrent process and economic evaluations. Setting: Year 8 students (12/13 year olds) in 30 secondary schools in south-west England and south Wales. Method: Approximately 15 per cent of students, identified by their peers as being influential within the school, were trained to intervene in everyday situations and encourage their fellow students not to smoke. These &quot;peer supporters&quot; received two days of intensive training from a team of trainers led by professional health educators at training venues and four follow-up sessions back at school. The training aimed to equip the peer supporters with the knowledge, skills and confidence to encourage their peers not to smoke. Res

In [91]:
df_temp["subjects"].iloc[1000]

['intervention',
 'health promotion',
 'smoking',
 'foreign countries',
 'social networks',
 'secondary school students',
 'student attitudes',
 'peer influence',
 'innovation',
 'grade 8',
 'program development',
 'program implementation',
 'program evaluation',
 'educational theories',
 'costs']

### Network

In [94]:
key = "network"

In [95]:
df_temp = df_all.loc[(df_all["subjects_str"].str.contains(key))]
print("Total peer-reviewd articles:", len(df_temp))

Total peer-reviewd articles: 12083


In [96]:
temp_list1 = df_temp["subjects"].tolist()
temp_list2 = [x for x in temp_list1 if x is not None]  # remove nans
temp_list3 = [item for sublist in temp_list2 for item in sublist]
temp_list4 = [x for x in temp_list3 if x != key]  # remove key
temp_list5 = [x for x in temp_list4 if x != key + "s"]  # remove key
Counter(temp_list5).most_common()[:30]

[('social networks', 6467),
 ('foreign countries', 4052),
 ('higher education', 1842),
 ('computer networks', 1691),
 ('computer mediated communication', 1590),
 ('information networks', 1375),
 ('internet', 1336),
 ('network analysis', 1305),
 ('educational technology', 1258),
 ('teaching methods', 1125),
 ('student attitudes', 1021),
 ('case studies', 945),
 ('information technology', 942),
 ('interviews', 887),
 ('college students', 881),
 ('telecommunications', 841),
 ('web sites', 819),
 ('technology uses in education', 700),
 ('models', 683),
 ('questionnaires', 662),
 ('computer uses in education', 656),
 ('library networks', 641),
 ('qualitative research', 589),
 ('educational change', 583),
 ('elementary secondary education', 582),
 ('interpersonal relationship', 581),
 ('computer software', 569),
 ('college faculty', 546),
 ('correlation', 545),
 ('social capital', 545)]

### Social Network

In [109]:
key = "social network"

In [110]:
df_temp = df_all.loc[(df_all["subjects_str"].str.contains(key))]
print("Total peer-reviewd articles:", len(df_temp))

Total peer-reviewd articles: 6466


In [111]:
temp_list1 = df_temp["subjects"].tolist()
temp_list2 = [x for x in temp_list1 if x is not None]  # remove nans
temp_list3 = [item for sublist in temp_list2 for item in sublist]
temp_list4 = [x for x in temp_list3 if x != key]  # remove key
temp_list5 = [x for x in temp_list4 if x != key + "s"]  # remove key
Counter(temp_list5).most_common()

[('foreign countries', 2463),
 ('computer mediated communication', 1245),
 ('internet', 829),
 ('student attitudes', 802),
 ('network analysis', 795),
 ('web sites', 743),
 ('educational technology', 724),
 ('college students', 713),
 ('teaching methods', 712),
 ('interviews', 650),
 ('case studies', 584),
 ('higher education', 546),
 ('technology uses in education', 533),
 ('interpersonal relationship', 506),
 ('social capital', 496),
 ('questionnaires', 487),
 ('social support groups', 480),
 ('qualitative research', 449),
 ('peer relationship', 426),
 ('social media', 423),
 ('correlation', 422),
 ('electronic publishing', 420),
 ('communities of practice', 413),
 ('statistical analysis', 406),
 ('adolescents', 399),
 ('electronic learning', 377),
 ('undergraduate students', 361),
 ('web 2.0 technologies', 358),
 ('college faculty', 346),
 ('information technology', 344),
 ('friendship', 335),
 ('online courses', 315),
 ('gender differences', 311),
 ('interaction', 297),
 ('comparat

### Social Capital

In [100]:
key = "social capital"

In [101]:
df_temp = df_all.loc[(df_all["subjects_str"].str.contains(key))]
print("Total peer-reviewd articles:", len(df_temp))

Total peer-reviewd articles: 2877


In [102]:
temp_list1 = df_temp["subjects"].tolist()
temp_list2 = [x for x in temp_list1 if x is not None]  # remove nans
temp_list3 = [item for sublist in temp_list2 for item in sublist]
temp_list4 = [x for x in temp_list3 if x != key]  # remove key
temp_list5 = [x for x in temp_list4 if x != key + "s"]  # remove key
Counter(temp_list5).most_common()[:30]

[('foreign countries', 1352),
 ('social networks', 496),
 ('cultural capital', 414),
 ('interviews', 344),
 ('student attitudes', 317),
 ('higher education', 289),
 ('case studies', 289),
 ('academic achievement', 277),
 ('correlation', 263),
 ('human capital', 257),
 ('qualitative research', 220),
 ('college students', 218),
 ('educational attainment', 181),
 ('educational policy', 180),
 ('interpersonal relationship', 172),
 ('semi structured interviews', 168),
 ('immigrants', 162),
 ('teacher attitudes', 162),
 ('trust (psychology)', 157),
 ('high school students', 155),
 ('socioeconomic status', 154),
 ('educational change', 152),
 ('adolescents', 150),
 ('equal education', 147),
 ('parent participation', 142),
 ('self concept', 141),
 ('questionnaires', 139),
 ('access to education', 138),
 ('gender differences', 138),
 ('statistical analysis', 138)]

### Peer

In [103]:
key = "peer"

In [104]:
df_temp = df_all.loc[(df_all["subjects_str"].str.contains(key))]
print("Total peer-reviewd articles:", len(df_temp))

Total peer-reviewd articles: 23517


In [105]:
temp_list1 = df_temp["subjects"].tolist()
temp_list2 = [x for x in temp_list1 if x is not None]  # remove nans
temp_list3 = [item for sublist in temp_list2 for item in sublist]
temp_list4 = [x for x in temp_list3 if x != key]  # remove key
temp_list5 = [x for x in temp_list4 if x != key + "s"]  # remove key
Counter(temp_list5).most_common()[:30]

[('peer relationship', 11279),
 ('foreign countries', 6725),
 ('peer influence', 4073),
 ('student attitudes', 4039),
 ('peer evaluation', 3830),
 ('adolescents', 3242),
 ('peer teaching', 3184),
 ('teaching methods', 2805),
 ('higher education', 2313),
 ('college students', 2100),
 ('teacher student relationship', 2011),
 ('gender differences', 2007),
 ('correlation', 1986),
 ('elementary school students', 1965),
 ('intervention', 1865),
 ('interpersonal competence', 1847),
 ('peer acceptance', 1806),
 ('peer groups', 1751),
 ('academic achievement', 1707),
 ('questionnaires', 1663),
 ('friendship', 1571),
 ('comparative analysis', 1543),
 ('program effectiveness', 1481),
 ('interviews', 1453),
 ('teacher attitudes', 1374),
 ('cooperative learning', 1340),
 ('bullying', 1322),
 ('undergraduate students', 1279),
 ('feedback (response)', 1268),
 ('aggression', 1267)]

## Topic Model

In [None]:
# Remove punctuation
df_all['dc:description'] = df_all['dc:description'].map(lambda x: re.sub('[,\.!?]', '', str(x)))

# Convert to lowercase
df_all['dc:description'] = df_all['dc:description'].map(lambda x: x.lower())

# Print out the first rows of papers
df_all['dc:description'].head()

In [None]:
data = df_all["dc:description"].values.tolist()
data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)

In [None]:
len(data_words)

In [None]:
# del df_all
data_words[27]

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_words)

# Create Corpus
texts = data_words

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
# number of topics
num_topics = 10

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
# Print the Keyword in the 10 topics
# pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('results/ldavis_prepared_'+ str(num_topics))

# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, 'results/educ-sn-lda/ldavis_prepared_'+ str(num_topics) +'.html')
# LDAvis_prepared

## Graph Analysis

In [None]:
def generate_graph(df_local):

    node_list1 = df_local["authors"].tolist()
    node_list2 = [x for x in node_list1 if x is not None]  # remove none
    node_list3 = [item for sublist in node_list2 for item in sublist]
    node_list = list(set(node_list3))

    n_papers_per_author = len(node_list3) / len(node_list)

    edge_list1 = df_local["edges"].tolist()
    edge_list2 = [x for x in edge_list1 if x is not None]  # remove none
    edge_list = [item for sublist in edge_list2 for item in sublist]

    G = nx.Graph()
    G.add_nodes_from(node_list)
    G.add_edges_from(edge_list)
    return (G, n_papers_per_author)

### Yearwise graph

In [None]:
# CUMULATIVE NOW
list1 = []

for year in tqdm(range(1965, 2021)):

    df_local = df_all.loc[df_all["eric:dateAdded"] <= year]

    if len(df_local) == 0:
        continue

    result = generate_graph(df_local)
    G = result[0]
    n_authors = len(G)
    n_papers = len(df_local)

    n_authors_per_paper = df_local["n_authors"].mean()
    n_papers_per_author = result[1]

    n_collabs = nx.number_of_edges(G)
    n_isolates = nx.number_of_isolates(G)
    mean_collabs = 2 * G.number_of_edges() / float(G.number_of_nodes())

    G_largest_comp = G.subgraph(
        sorted(nx.connected_components(G), key=len, reverse=True)[0]
    )
    largest_component = len(G_largest_comp) / len(G)

    deg_assort = nx.degree_assortativity_coefficient(G)
    avg_clustering = nx.average_clustering(G)
    transitivity = nx.transitivity(G)

    list1.append(
        (
            year,
            n_authors,
            n_papers,
            n_authors_per_paper,
            n_papers_per_author,
            n_collabs,
            n_isolates,
            mean_collabs,
            largest_component,
            deg_assort,
            avg_clustering,
            transitivity,
        )
    )

df_summary = pd.DataFrame(
    list1,
    columns=[
        "year",
        "n_authors",
        "n_papers",
        "n_authors_per_paper",
        "n_papers_per_author",
        "n_collabs",
        "n_isolates",
        "mean_collabs",
        "largest_component",
        "deg_assort",
        "avg_clustering",
        "transitivity",
    ],
)
df_summary

In [None]:
alt.Chart(df_summary).mark_area(
    color="lightblue", interpolate="step-after", line=True
).encode(x="year", y="n_papers")

In [None]:
alt.Chart(df_summary).mark_area(
    color="lightblue", interpolate="step-after", line=True
).encode(x="year", y="n_authors")

In [None]:
alt.Chart(df_summary).mark_area(
    color="lightblue", interpolate="step-after", line=True
).encode(x="year", y="n_authors_per_paper")

In [None]:
alt.Chart(df_summary).mark_area(
    color="lightblue", interpolate="step-after", line=True
).encode(x="year", y="n_papers_per_author")

In [None]:
alt.Chart(df_summary).mark_area(
    color="lightblue", interpolate="step-after", line=True
).encode(x="year", y="n_collabs")

In [None]:
alt.Chart(df_summary).mark_area(
    color="lightblue", interpolate="step-after", line=True
).encode(x="year", y="mean_collabs")

In [None]:
alt.Chart(df_summary).mark_area(
    color="lightblue", interpolate="step-after", line=True
).encode(x="year", y="n_isolates")

In [None]:
alt.Chart(df_summary).mark_area(
    color="lightblue", interpolate="step-after", line=True
).encode(x="year", y="largest_component")

In [None]:
alt.Chart(df_summary).mark_area(
    color="lightblue", interpolate="step-after", line=True
).encode(x="year", y="deg_assort")

In [None]:
alt.Chart(df_summary).mark_area(
    color="lightblue", interpolate="step-after", line=True
).encode(x="year", y="avg_clustering")

In [None]:
alt.Chart(df_summary).mark_area(
    color="lightblue", interpolate="step-after", line=True
).encode(x="year", y="transitivity")

### Cumulative Graph

In [None]:
list1 = []

result = generate_graph(df_all)
G = result[0]
n_authors = len(G)
n_papers = len(df_all)

n_authors_per_paper = df_all["n_authors"].mean()
n_papers_per_author = result[1]

n_collabs = nx.number_of_edges(G)
n_isolates = nx.number_of_isolates(G)
mean_collabs = 2 * G.number_of_edges() / float(G.number_of_nodes())

G_largest_comp = G.subgraph(
    sorted(nx.connected_components(G), key=len, reverse=True)[0]
)
largest_component = len(G_largest_comp) / len(G)

deg_assort = nx.degree_assortativity_coefficient(G)
avg_clustering = nx.average_clustering(G)
transitivity = nx.transitivity(G)

list1.append(
    (
        year,
        n_authors,
        n_papers,
        n_authors_per_paper,
        n_papers_per_author,
        n_collabs,
        n_isolates,
        mean_collabs,
        largest_component,
        deg_assort,
        avg_clustering,
        transitivity,
    )
)

df_overall = pd.DataFrame(
    list1,
    columns=[
        "year",
        "n_authors",
        "n_papers",
        "n_authors_per_paper",
        "n_papers_per_author",
        "n_collabs",
        "n_isolates",
        "mean_collabs",
        "largest_component",
        "deg_assort",
        "avg_clustering",
        "transitivity",
    ],
)
df_overall

In [None]:
nx.info(G)

In [None]:
# prop of isolates
nx.number_of_isolates(G)/len(G)

In [None]:
sorted(G.degree, key=lambda x: x[1], reverse=True)[:20]

In [None]:
# function to plot egonet
def draw_ego(name):
    hub_ego = nx.ego_graph(G, name)
    pos = nx.spring_layout(hub_ego)
    nx.draw(hub_ego, pos, node_color="b", node_size=50, with_labels=True)
    options = {"node_size": 300, "node_color": "r"}
    nx.draw_networkx_nodes(hub_ego, pos, nodelist=[name], **options)

In [None]:
plt.figure(figsize=(10, 10))
draw_ego("Alan Daly")

In [None]:
# df_summary.to_csv("df_summary.csv", encoding='utf-8', index=False)