In [None]:
from itertools import combinations
from tqdm import tqdm
import pandas as pd
import xmltodict
import networkx as nx

In [None]:
def extract_authors(dc):
    """Collect the personal-author names from a parsed ``dc:creator`` value.

    Parameters
    ----------
    dc : list | dict | other
        Either one creator mapping or a list of them. A mapping is expected
        to carry the name under ``"#text"`` and, optionally, a ``"@scheme"``
        attribute. Anything else (e.g. a missing value such as NaN) is
        treated as "no creators".

    Returns
    -------
    list[str] | None
        Title-cased, stripped names, skipping entries whose scheme is
        ``"institution"``, entries without text, and the "And Others"
        placeholder. A list input always yields a list (possibly empty).
        A single mapping yields a one-element list, or ``None`` when it
        holds no usable name. Non-list, non-mapping input yields ``None``.
    """
    if isinstance(dc, list):
        entries = dc
    elif isinstance(dc, dict):
        entries = [dc]
    else:
        # Missing value (NaN/None) or an unexpected type: nothing to extract.
        return None

    names = []
    for entry in entries:
        # Skip entries that are not mappings (e.g. None for an empty element).
        if not isinstance(entry, dict):
            continue
        text = entry.get("#text")
        if text is None or entry.get("@scheme") == "institution":
            continue
        names.append(text.title().strip())
    names = [name for name in names if name != "And Others"]

    # A lone creator must come back as a list too: callers sort and iterate
    # the result, and a bare string would be split into characters.
    if not names and not isinstance(dc, list):
        return None
    return names


def get_edges(auth_list):
    """Return every unordered pair drawn from ``auth_list``.

    Pairs are 2-tuples emitted in positional order: the first element is
    paired with each later element, then the second, and so on. Fewer than
    two entries gives an empty list.
    """
    pool = tuple(auth_list)
    size = len(pool)
    return [
        (pool[first], pool[second])
        for first in range(size)
        for second in range(first + 1, size)
    ]

def extract_ids(dc):
    """Return the ERIC accession number from a parsed ``dc:identifier`` value.

    Parameters
    ----------
    dc : list | dict | other
        Either one identifier mapping or a list of them. A mapping is
        expected to carry the value under ``"#text"`` and its kind under
        ``"@scheme"``.

    Returns
    -------
    str | None
        The upper-cased, stripped text of the first entry whose scheme is
        ``"eric_accno"``, or ``None`` when there is no such entry or the
        input is neither a list nor a mapping (e.g. NaN).
    """
    if isinstance(dc, list):
        entries = dc
    elif isinstance(dc, dict):
        entries = [dc]
    else:
        return None

    for entry in entries:
        # Skip entries that are not mappings (e.g. None for an empty element).
        if not isinstance(entry, dict):
            continue
        text = entry.get("#text")
        if text is not None and entry.get("@scheme") == "eric_accno":
            return text.upper().strip()
    # No accession number present: return None rather than raising IndexError.
    return None

In [None]:
# Build ``df_all``: one row per record flagged peer-reviewed ('T') whose type
# mentions "journal", with derived ``authors``, ``ids`` and ``edges`` columns.
yearly_frames = []

for year in tqdm(range(1965, 2021)):
    file_name = "data/eric" + str(year)
    with open(file_name + ".xml", encoding="utf-8") as fd:
        # Named ``parsed`` (not ``dict``) so the built-in is not shadowed
        # for the rest of the kernel session.
        parsed = xmltodict.parse(fd.read())
    records = parsed["records"]["record"]
    # A file with a single <record> parses to one mapping instead of a list.
    if isinstance(records, dict):
        records = [records]
    recs = [rec["metadata"] for rec in records]
    df = pd.DataFrame(recs)
    # Keep rows that have a type and are flagged peer-reviewed; a missing
    # flag compares unequal to 'T', so it is dropped by the same mask.
    keep = df['dc:type'].notna() & (df['eric:peer_reviewed'] == 'T')
    # .copy() so the new column is written to an independent frame rather
    # than to a slice of ``df`` (avoids SettingWithCopyWarning).
    df = df.loc[keep].copy()
    # ``dc:type`` may be a single string or a list of strings; flatten it to
    # one lower-cased string so a substring test works on either form.
    df['type'] = [''.join(map(str, types)).lower() for types in df['dc:type']]
    yearly_frames.append(df)
# ignore_index: per-year frames all start at 0, which would otherwise leave
# duplicate index labels in the combined frame.
df_all = pd.concat(yearly_frames, ignore_index=True)

df_all = df_all.loc[df_all['type'].str.contains("journal")].copy()
df_all["authors"] = df_all["dc:creator"].apply(extract_authors)
df_all["ids"] = df_all["dc:identifier"].apply(extract_ids)
df_all = df_all[df_all['authors'].notna()].copy()
# Sort names first so each co-author pair is always emitted in the same order.
df_all["edges"] = df_all["authors"].apply(lambda authors: get_edges(sorted(authors)))
df_all.info()

In [None]:
# Export only the 'dc:date' and 'eric:dateAdded' columns, without the index.
date_columns = ['dc:date', 'eric:dateAdded']
df_all.loc[:, date_columns].to_csv("data/all.csv", index=False, encoding='utf-8')

In [None]:
# Sanity check: tally the values left in the peer-review flag column.
peer_review_flags = df_all['eric:peer_reviewed']
peer_review_flags.value_counts()

In [None]:
# Unique author names across all records; these become the graph's nodes.
# Sorted because a set of strings iterates in a different order in each
# interpreter session, which would make the node order irreproducible.
node_list = sorted(
    {
        author
        for authors in df_all["authors"]
        if authors is not None
        for author in authors
    }
)

In [None]:
# Flatten the per-record co-author pairs into one list of edges
# (duplicates are kept).
list1 = list(df_all["edges"])
list2 = [pairs for pairs in list1 if pairs is not None]  # drop missing rows
edge_list = []
for pairs in list2:
    edge_list.extend(pairs)

In [None]:
# Undirected co-authorship graph: every author is a node (including those
# with no co-authors) and every co-author pair is an edge.
G = nx.Graph()
# With both arguments given, update() adds the nodes first, then the edges.
G.update(edges=edge_list, nodes=node_list)

In [None]:
# Node labels are author names that contain spaces, so the default
# single-space delimiter produces lines that cannot be split back into two
# nodes. Write tab-separated instead; reading the file back needs the same
# delimiter: nx.read_edgelist("all.edgelist.gz", delimiter="\t").
nx.write_edgelist(G, "all.edgelist.gz", delimiter="\t")

In [None]:
# G = nx.read_edgelist("all.edgelist.gz")

In [None]:
# Draw the ego network of one author: the author, their direct neighbours,
# and the edges among them. The hub is redrawn larger and in red.
EGO_AUTHOR = "Bettinger, Eric"
hub_ego = nx.ego_graph(G, EGO_AUTHOR)
# Fixed seed so the force-directed layout is identical on every run.
pos = nx.spring_layout(hub_ego, seed=42)
nx.draw(hub_ego, pos, node_color="b", node_size=50, with_labels=True)
options = {"node_size": 300, "node_color": "r"}
nx.draw_networkx_nodes(hub_ego, pos, nodelist=[EGO_AUTHOR], **options);

In [None]:
# List the node names that contain the given substring (case-sensitive).
list(filter(lambda name: "Mcfarland, Dan" in name, node_list))

In [None]:
# nx.info() was removed in NetworkX 3.0; build the summary from the graph's
# own counters so this cell works on both old and new releases.
f"Graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges"