In [None]:
from itertools import combinations
import re
import os
from tqdm import tqdm
import pandas as pd
import xmltodict
import networkx as nx
import pickle
import collections
import matplotlib.pyplot as plt
from unidecode import unidecode
import altair as alt

tqdm.pandas()
from collections import Counter
import string

# from genderize import Genderize
# genderize = Genderize(
#     user_agent='GenderizeDocs/0.0',
#     api_key=os.environ["GENDERIZE_API_KEY"],  # key redacted: never commit credentials
#     timeout=30.0)

# Load the precomputed name lookup that `name_to_gender` indexes by first name
# (presumably first name -> gender label; verify against how the pickle was built).
# NOTE(review): pickle.load can execute arbitrary code, so only load this file
# when it comes from a trusted source.
with open("dict_genders.pickle", "rb") as handle:
    dict_genders = pickle.load(handle)

# Do not truncate columns when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

In [None]:
import gensim
from gensim.utils import simple_preprocess
import nltk

nltk.download("stopwords")
from nltk.corpus import stopwords

import gensim.corpora as corpora
import pyLDAvis.gensim_models
import pyLDAvis

# NLTK's English stop-word list, extended with three extra tokens that
# `remove_stopwords` should also drop.
stop_words = stopwords.words("english")
stop_words.extend(["from", "re", "use"])


def sent_to_words(sentences):
    """Lazily tokenize each document in ``sentences``.

    Every item is converted with ``str()`` and passed through gensim's
    ``simple_preprocess``; one token list is yielded per input item.
    """
    # deacc=True strips accent marks from tokens; punctuation is already
    # discarded by simple_preprocess's tokenizer.
    yield from (simple_preprocess(str(text), deacc=True) for text in sentences)


def remove_stopwords(texts):
    """Tokenize each document and drop every token found in ``stop_words``.

    Each item of ``texts`` is converted with ``str()`` and re-tokenized with
    ``simple_preprocess``, so both raw strings and token lists are accepted.

    Returns:
        A list with one list of kept tokens per input document.
    """
    # Build the set once per call: testing membership against the module-level
    # list is linear in its length and would run for every single token.
    excluded = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in excluded]
        for doc in texts
    ]

In [None]:
def extract_authors(dc):
    """Return cleaned author names from a parsed ``dc:creator`` value.

    Args:
        dc: A list of creator entries, a single entry, or a missing value.
            An entry is either a mapping with ``"#text"`` / ``"@scheme"``
            keys or a bare string (a creator element without attributes).

    Returns:
        For a list: the title-cased, stripped, ASCII-folded names of all
        non-institution entries, excluding the placeholder "And Others".
        For a single entry: a one-element list, or None when the entry is
        an institution or has no text. For anything else (None, NaN): None,
        which the caller filters out with ``notna()``.
    """

    def _author_name(entry):
        # Normalise one creator entry to a cleaned name, or None to skip it.
        if isinstance(entry, str):
            text, scheme = entry, None
        elif isinstance(entry, dict):
            text, scheme = entry.get("#text"), entry.get("@scheme")
        else:
            # Missing values (None / NaN) and unexpected types carry no name.
            return None
        if text is None or scheme == "institution":
            return None
        return unidecode(text.title().strip())

    if isinstance(dc, list):
        names = (_author_name(entry) for entry in dc)
        return [
            name for name in names if name is not None and name != "And Others"
        ]

    name = _author_name(dc)
    return [name] if name is not None else None


def clean_name(name):
    """Convert a "Last, First ..." name into "First Last".

    Only the first whitespace-delimited token on each side of the first
    comma is kept, so middle names and multi-word surnames are truncated
    (e.g. "Van Berg, Jan Piet" becomes "Jan Van"). Names without a comma
    are returned unchanged.

    Args:
        name: The author name string.

    Returns:
        The reordered name, without stray leading or trailing spaces even
        when one side of the comma is empty or padded with extra blanks.
    """
    if "," not in name:
        return name

    # Prefer the ", " separator when present, as the original logic did.
    separator = ", " if ", " in name else ","
    parts = name.split(separator)

    # str.split() with no argument ignores leading and repeated whitespace,
    # so a padded part such as " John" still yields "John".
    surname, given = (
        (part.split() or [""])[0] for part in parts[:2]
    )
    return " ".join(token for token in (given, surname) if token)


def get_first_names(author_list):
    """Pick one usable name token for every author in ``author_list``.

    For each name, the first four whitespace-separated tokens are stripped
    of punctuation and the first one longer than a single character is
    chosen (this skips initials such as "J."). When none qualifies, the
    full original name is used instead.

    Returns:
        A list with one entry per author, or None for an empty input list.
    """
    if len(author_list) == 0:
        return None

    picked = []
    for full_name in author_list:
        # Punctuation-free versions of the first four tokens, in order.
        candidates = (
            re.sub(r"[^\w\s]", "", token) for token in full_name.split()[:4]
        )
        picked.append(next((c for c in candidates if len(c) > 1), full_name))

    return picked


def name_to_gender(first_name_list):
    """Look up every first name in the module-level ``dict_genders``.

    Returns:
        A list with one looked-up value per name, or None when the input is
        None or empty. A name absent from ``dict_genders`` raises KeyError
        (assuming it is a plain dict).
    """
    if not first_name_list:
        return None

    genders = []
    for first_name in first_name_list:
        genders.append(dict_genders[first_name])
    return genders


def get_edges(auth_list):
    """Return every unordered pair of entries in ``auth_list`` as a tuple.

    Pairs follow the input order; a list with fewer than two entries gives
    an empty list.
    """
    pairs = []
    for pair in combinations(auth_list, 2):
        pairs.append(pair)
    return pairs


def extract_ids(dc):
    """Return the ERIC accession number from a parsed identifier value.

    Args:
        dc: A list of identifier entries or a single entry, where an entry
            is a mapping with ``"#text"`` and ``"@scheme"`` keys.

    Returns:
        The upper-cased, stripped text of the first entry whose scheme is
        ``"eric_accno"``, or None when there is no such entry (previously a
        list without one raised IndexError).
    """
    entries = dc if isinstance(dc, list) else [dc]
    for entry in entries:
        if (
            isinstance(entry, dict)
            and entry.get("#text") is not None
            and entry.get("@scheme") == "eric_accno"
        ):
            return entry["#text"].upper().strip()
    return None


def extract_subject(dc):
    """Return cleaned subject strings from a parsed ``dc:subject`` value.

    Args:
        dc: A list of subject entries, or a single entry. An entry is either
            a string or a mapping whose ``"#text"`` key holds the subject.

    Returns:
        A list of title-cased, stripped, ASCII-folded subjects. Entries
        without text are skipped; a missing value gives an empty list.
    """
    if isinstance(dc, (str, dict)):
        # A lone entry must be wrapped: iterating a str would yield single
        # characters and iterating a mapping would yield its keys.
        dc = [dc]
    elif not isinstance(dc, (list, tuple)):
        return []

    subjects = []
    for item in dc:
        # isinstance(dict) also matches OrderedDict, so both old and new
        # xmltodict return types are handled.
        text = item.get("#text") if isinstance(item, dict) else item
        if isinstance(text, str):
            subjects.append(unidecode(text.title().strip()))
    return subjects


def plot_degree_dist(G):
    """Show a histogram of the degree of every node in graph ``G``."""
    node_degrees = []
    for node in G.nodes():
        node_degrees.append(G.degree(node))

    plt.hist(node_degrees)
    plt.show()

In [None]:
# Read one ERIC XML export per year and keep only peer-reviewed records
# that have a publication type.
yearly_frames = []

for year in tqdm(range(1965, 2021)):
    file_name = "data/eric" + str(year)
    with open(file_name + ".xml", encoding="utf-8") as fd:
        # Named `parsed` rather than `dict` so the builtin is not shadowed
        # for every later cell of the notebook.
        parsed = xmltodict.parse(fd.read())
    recs = [rec["metadata"] for rec in parsed["records"]["record"]]
    df = pd.DataFrame(recs)

    # A missing peer-review flag never equals "T", so one mask covers both
    # the not-null and the == "T" filters. `.copy()` makes the result an
    # independent frame, so adding a column does not warn about slices.
    keep = df["dc:type"].notna() & (df["eric:peer_reviewed"] == "T")
    df = df[keep].copy()

    # dc:type may hold several values; join them into one lowercase string.
    df["type"] = ["".join(map(str, kinds)).lower() for kinds in df["dc:type"]]
    # df = df[['ids', 'authors', 'edges', 'dc:type', 'dc:subject', 'eric:keywords', 'eric:keywords_geo', 'dc:title', 'eric:pageCount', 'dc:date', 'eric:dateAdded']]
    yearly_frames.append(df)

df_all = pd.concat(yearly_frames)

In [None]:
df_all.head()

In [None]:
# Convert the date-added values to numbers so later cells can compare them
# with integer years (presumably they are parsed from XML as strings — TODO confirm).
df_all["eric:dateAdded"] = pd.to_numeric(df_all["eric:dateAdded"])

In [None]:
# Keep journal publications only. `.copy()` yields an independent frame, so
# the column assignments below do not trigger SettingWithCopyWarning.
df_all = df_all.loc[df_all["type"].str.contains("journal")].copy()

# get author names
# Each step reads a single column, so it is applied to that Series directly
# instead of building a row object per record with DataFrame.apply(axis=1).
df_all["authors"] = df_all["dc:creator"].progress_apply(extract_authors)
df_all = df_all[df_all["authors"].notna()].copy()  # remove nan authors
df_all["authors"] = df_all["authors"].progress_apply(
    lambda names: [clean_name(item) for item in names]
)

# get edges
df_all["n_authors"] = df_all["authors"].progress_apply(len)
# Sorting first gives every co-author pair a canonical (a, b) orientation.
df_all["edges"] = df_all["authors"].progress_apply(
    lambda names: get_edges(sorted(names))
)
# df_all["ids"] = df_all.progress_apply(lambda row: extract_ids(row["dc:identifier"]), axis=1)

# get subjects
df_all = df_all[df_all["dc:subject"].notna()].copy()
df_all["subjects"] = df_all["dc:subject"].progress_apply(extract_subject)
# df_all.loc[:, "subject_top"] = df_all.subjects.map(lambda x: x[0])

# get author first name and then use it to predict gender
df_all["author_first_names"] = df_all["authors"].progress_apply(get_first_names)
df_all["author_genders"] = df_all["author_first_names"].progress_apply(
    name_to_gender
)  # get genders from dict

In [None]:
df_all.info()

In [None]:
df_all.tail()

In [None]:
df_all["eric:dateAdded"].value_counts().sort_index()

## Topic Model

In [None]:
# Remove punctuation and convert to lowercase in a single pass.
# The pattern is a raw string so that `\.` is a regex escape instead of an
# invalid Python string escape (which newer interpreters warn about).
# Every value goes through str(), so a non-string entry is kept as its
# string form rather than dropped.
df_all["dc:description"] = df_all["dc:description"].map(
    lambda text: re.sub(r"[,\.!?]", "", str(text)).lower()
)
# Print out the first rows of papers
df_all["dc:description"].head()

In [None]:
# One cleaned description string per paper.
data = df_all["dc:description"].values.tolist()
# Tokenize every description into a list of words.
data_words = list(sent_to_words(data))
# remove stop words
# NOTE(review): remove_stopwords calls str() on each item, so the token lists
# built above are turned back into text and tokenized a second time.
data_words = remove_stopwords(data_words)

In [None]:
len(data_words)

In [None]:
# Release only the raw description strings, which are no longer needed once
# `data_words` exists. `df_all` must stay alive: the graph-analysis cells
# further down still read it, and deleting it here makes a fresh
# Restart & Run All fail with NameError.
del data

In [None]:
# Create Dictionary
# Built from the tokenized, stop-word-free documents.
id2word = corpora.Dictionary(data_words)

# Create Corpus
# `texts` is just another name for the same token lists.
texts = data_words

# Term Document Frequency
# One bag-of-words representation per document, produced by the dictionary.
corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
# number of topics
num_topics = 10
# Fixed seed so the fitted topics are repeatable between kernel restarts
# (as far as the multi-process trainer allows).
lda_random_state = 42

# Build LDA model
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=lda_random_state,
)
# Print the Keyword in the 10 topics
# pprint(lda_model.print_topics())
# Applying the model to the corpus gives its per-document topic output.
doc_lda = lda_model[corpus]

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()

# The output folder has to exist before anything is written into it.
os.makedirs("results", exist_ok=True)
LDAvis_data_filepath = "results/ldavis_prepared_" + str(num_topics)

# Preparing the visualization is time consuming. Set this to False to reuse
# the pickle from a previous run; it is recomputed anyway when no pickle
# exists yet, so the load below can never hit a missing file.
recompute_ldavis = True
if recompute_ldavis or not os.path.exists(LDAvis_data_filepath):
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, "wb") as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
# NOTE(review): pickle.load executes arbitrary code; only load files this
# notebook wrote itself.
with open(LDAvis_data_filepath, "rb") as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(
    LDAvis_prepared, "results/ldavis_prepared_" + str(num_topics) + ".html"
)
LDAvis_prepared

## Graph Analysis

In [None]:
def generate_graph(df_local):
    """Build the co-authorship graph for the papers in ``df_local``.

    Args:
        df_local: DataFrame with an ``"authors"`` column (list of names per
            paper) and an ``"edges"`` column (list of name pairs per paper).
            None entries in either column are ignored.

    Returns:
        A tuple ``(G, n_papers_per_author)``: an undirected graph with one
        node per distinct author and one edge per co-author pair, and the
        number of authorships divided by the number of distinct authors
        (NaN when there are no authors at all).
    """
    author_lists = [
        names for names in df_local["authors"].tolist() if names is not None
    ]
    authorships = [name for names in author_lists for name in names]
    # Sorted so that node insertion order does not depend on set ordering.
    unique_authors = sorted(set(authorships))

    # Without this guard an author-free frame raised ZeroDivisionError.
    if unique_authors:
        n_papers_per_author = len(authorships) / len(unique_authors)
    else:
        n_papers_per_author = float("nan")

    edge_lists = [
        pairs for pairs in df_local["edges"].tolist() if pairs is not None
    ]
    all_edges = [pair for pairs in edge_lists for pair in pairs]

    G = nx.Graph()
    G.add_nodes_from(unique_authors)
    G.add_edges_from(all_edges)
    return (G, n_papers_per_author)

### Cumulative graph

In [None]:
# CUMULATIVE NOW
# For every year, rebuild the co-authorship graph from all papers added up
# to and including that year and record its summary statistics.
summary_columns = [
    "year",
    "n_authors",
    "n_papers",
    "n_authors_per_paper",
    "n_papers_per_author",
    "n_collabs",
    "n_isolates",
    "mean_collabs",
    "largest_component",
    "deg_assort",
    "avg_clustering",
    "transitivity",
]
list1 = []

for year in tqdm(range(1965, 2021)):

    df_local = df_all.loc[df_all["eric:dateAdded"] <= year]

    if len(df_local) == 0:
        continue

    G, n_papers_per_author = generate_graph(df_local)
    n_authors = len(G)
    n_collabs = nx.number_of_edges(G)

    # max() finds the largest component in one pass; sorting every component
    # and building a subgraph just to measure its size is unnecessary.
    largest_cc = max(nx.connected_components(G), key=len)

    # Rows are keyed by column name so a value can never end up in the
    # wrong column through a positional mix-up.
    list1.append(
        {
            "year": year,
            "n_authors": n_authors,
            "n_papers": len(df_local),
            "n_authors_per_paper": df_local["n_authors"].mean(),
            "n_papers_per_author": n_papers_per_author,
            "n_collabs": n_collabs,
            "n_isolates": nx.number_of_isolates(G),
            # Mean degree: every edge contributes to two nodes.
            "mean_collabs": 2 * n_collabs / float(n_authors),
            # Share of all authors that sit in the largest component.
            "largest_component": len(largest_cc) / n_authors,
            "deg_assort": nx.degree_assortativity_coefficient(G),
            "avg_clustering": nx.average_clustering(G),
            "transitivity": nx.transitivity(G),
        }
    )

df_summary = pd.DataFrame(list1, columns=summary_columns)
df_summary

In [None]:
# Cumulative number of distinct authors (graph nodes) per year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", interpolate="step-after", line=True)
    .encode(x="year", y="n_authors")
)

In [None]:
# Cumulative number of papers per year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", interpolate="step-after", line=True)
    .encode(x="year", y="n_papers")
)

In [None]:
# Mean number of authors per paper, over all papers up to each year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", interpolate="step-after", line=True)
    .encode(x="year", y="n_authors_per_paper")
)

In [None]:
# Authorships divided by distinct authors, over all papers up to each year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", interpolate="step-after", line=True)
    .encode(x="year", y="n_papers_per_author")
)

In [None]:
# Cumulative number of co-authorship edges per year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", interpolate="step-after", line=True)
    .encode(x="year", y="n_collabs")
)

In [None]:
# Mean degree of the cumulative graph (2 * edges / nodes) per year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", interpolate="step-after", line=True)
    .encode(x="year", y="mean_collabs")
)

In [None]:
# Number of isolated nodes (authors without any co-author edge) per year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", interpolate="step-after", line=True)
    .encode(x="year", y="n_isolates")
)

In [None]:
# Fraction of all nodes that belong to the largest connected component.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", interpolate="step-after", line=True)
    .encode(x="year", y="largest_component")
)

In [None]:
# Degree assortativity coefficient of the cumulative graph per year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", interpolate="step-after", line=True)
    .encode(x="year", y="deg_assort")
)

In [None]:
# Average clustering coefficient of the cumulative graph per year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", interpolate="step-after", line=True)
    .encode(x="year", y="avg_clustering")
)

In [None]:
# Transitivity of the cumulative graph per year.
(
    alt.Chart(df_summary)
    .mark_area(color="lightblue", interpolate="step-after", line=True)
    .encode(x="year", y="transitivity")
)

In [None]:
# Summary statistics for the graph built from every paper at once.
# `G` stays a notebook-level name because the cells below keep using it.
G, n_papers_per_author = generate_graph(df_all)
n_authors = len(G)
n_collabs = nx.number_of_edges(G)

# max() finds the largest component in one pass; no sort or subgraph needed.
largest_cc = max(nx.connected_components(G), key=len)

overall_stats = {
    # Latest year present in the data. This used to be read from the loop
    # variable leaked by the cumulative-graph cell, i.e. hidden kernel state.
    "year": int(df_all["eric:dateAdded"].max()),
    "n_authors": n_authors,
    "n_papers": len(df_all),
    "n_authors_per_paper": df_all["n_authors"].mean(),
    "n_papers_per_author": n_papers_per_author,
    "n_collabs": n_collabs,
    "n_isolates": nx.number_of_isolates(G),
    # Mean degree: every edge contributes to two nodes.
    "mean_collabs": 2 * n_collabs / float(n_authors),
    # Share of all authors that sit in the largest component.
    "largest_component": len(largest_cc) / n_authors,
    "deg_assort": nx.degree_assortativity_coefficient(G),
    "avg_clustering": nx.average_clustering(G),
    "transitivity": nx.transitivity(G),
}

# A single-row frame; the column order follows the dict above.
df_overall = pd.DataFrame([overall_stats])
df_overall

In [None]:
# nx.info() was removed in NetworkX 3.0; build the same kind of summary from
# methods that exist in every version.
f"Graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges"

In [None]:
plt.figure(figsize=(12, 12))


def draw_ego(name):
    """Draw the ego network of ``name`` taken from the notebook-level graph ``G``.

    All nodes are drawn small and blue with labels; the node ``name`` itself
    is then drawn again on top, larger and in red.
    """
    ego_net = nx.ego_graph(G, name)
    layout = nx.spring_layout(ego_net)
    nx.draw(ego_net, layout, node_color="b", node_size=50, with_labels=True)
    nx.draw_networkx_nodes(
        ego_net, layout, nodelist=[name], node_size=300, node_color="r"
    )


draw_ego("Linda Darling-Hammond")

In [None]:
# df_summary.to_csv("df_summary.csv", encoding='utf-8', index=False)

In [None]:
# Collect one name token per graph node: the first of the node's first three
# tokens that is longer than one character once punctuation is removed.
# Nodes without such a token are skipped.
names = []

for author in tqdm(G.nodes()):
    cleaned_tokens = (
        re.sub(r"[^\w\s]", "", token) for token in author.split()[:3]
    )
    chosen = next((tok for tok in cleaned_tokens if len(tok) > 1), None)
    if chosen is not None:
        names.append(chosen)

Counter(names).most_common()

In [None]:
len(set(names))

In [None]:
nx.effective_size(G)

In [None]:
# Count how many nodes have each degree. The previous loop incremented the
# counter only inside the "first time seen" branch, so every degree was
# counted exactly once.
degs = Counter(deg for _, deg in G.degree())

# A logarithmic axis cannot represent degree 0, so isolates are not plotted.
items = sorted((k, v) for (k, v) in degs.items() if k > 0)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot([k for (k, v) in items], [v for (k, v) in items])
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlabel("degree")
ax.set_ylabel("number of nodes")
plt.title("My Degree Distribution")

In [None]:
# from networkx.algorithms import bipartite
# remove = [node for node, degree in G.degree() if degree < 2]
# G.remove_nodes_from(remove)
# bipartite.node_redundancy(G)

In [None]:
len(G)

In [None]:
G.degree()