In [1]:
import json
import sqlite3
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
import networkx as nx
import matplotlib.pyplot as plt

In [2]:

# Open the interim SQLite database that holds the articles and the author mapping;
# `con` and `cur` are reused by the following cells.
db_path = '../data/interim/articles_with_author_mapping.db'
con = sqlite3.connect(db_path)
cur = con.cursor()


In [3]:
# Read every author row into a DataFrame indexed by the author id.
# The database column "matching_certainty" is exposed as "certainty".
author_columns = ["id", "name", "abbreviation", "certainty"]
rows = cur.execute("select id, name, abbreviation, matching_certainty from authors").fetchall()
authors = pd.DataFrame(rows, columns=author_columns).set_index("id")

In [4]:
# Turn the literal string "null" into a real missing value (NaN) in every column.
# NOTE(review): presumably the database stores missing entries as the text "null" - confirm.
authors = authors.replace(to_replace="null", value=np.nan)

In [5]:
# Set the certainty of every row to the mean certainty of all rows that share
# its (name, abbreviation) pair, ignoring missing values.
#
# pandas' built-in "mean" skips NaN by itself and yields NaN for a pair whose
# certainties are all missing. Unlike np.nanmean inside a Python lambda, it does
# so without emitting a "Mean of empty slice" RuntimeWarning and without calling
# back into Python once per group.
# The column is converted to a numeric dtype first because the built-in mean
# needs one; unexpected non-numeric values raise instead of being hidden.
certainty_numeric = pd.to_numeric(authors["certainty"])
mean_certainty = certainty_numeric.groupby(
    [authors["name"], authors["abbreviation"]]
).transform("mean")
authors["certainty"] = mean_certainty

  mean_certainty = authors.groupby(["name", "abbreviation"])["certainty"].transform(lambda x: np.nanmean(x))


In [6]:
# Count how many rows exist for each (name, abbreviation) pair.
# Both columns are grouped by the same pair, so they carry the same per-pair
# count; they are kept separately because later cells sum one per abbreviation
# and the other per name.
pair_groups = authors.groupby(["name", "abbreviation"])
rows_per_pair_by_name = pair_groups["name"].transform("count")
rows_per_pair_by_abbreviation = pair_groups["abbreviation"].transform("count")
authors["name_pointing_to_abbreviation_count"] = rows_per_pair_by_name
authors["abbreviation_pointing_to_name_count"] = rows_per_pair_by_abbreviation

In [7]:
# Keep a single row per (name, abbreviation, certainty) combination.
# Nothing is lost by this: the *_count columns already record how many rows
# each pair had before de-duplication.
dedup_keys = ["name", "abbreviation", "certainty"]
authors = authors.drop_duplicates(subset=dedup_keys, keep="first")

In [8]:
# Share of a name among all names that point to the same abbreviation:
# pair count / summed pair counts of that abbreviation.
# Note: the merge is an inner join on "abbreviation" and returns a frame with a
# fresh index; re-running this cell on the already merged frame would duplicate
# the sum column.
authors_with_name_pointing_to_abbreviation_sum = (
    authors.groupby(["abbreviation"])
    .agg(names_pointing_to_abbreviation_sum=("name_pointing_to_abbreviation_count", "sum"))
    .reset_index()
)
authors = authors.merge(authors_with_name_pointing_to_abbreviation_sum, on="abbreviation")
pair_count = authors["name_pointing_to_abbreviation_count"]
abbreviation_total = authors["names_pointing_to_abbreviation_sum"]
authors["name_pointing_to_abbreviation_share"] = pair_count.div(abbreviation_total)

In [9]:
# Share of an abbreviation among all abbreviations that point to the same name:
# pair count / summed pair counts of that name.
# Note: the merge is an inner join on "name"; re-running this cell on the
# already merged frame would duplicate the sum column.
authors_with_abbreviation_pointing_to_name_sum = (
    authors.groupby(["name"])
    .agg(abbreviations_pointing_to_name_sum=("abbreviation_pointing_to_name_count", "sum"))
    .reset_index()
)
authors = authors.merge(authors_with_abbreviation_pointing_to_name_sum, on="name")
pair_count = authors["abbreviation_pointing_to_name_count"]
name_total = authors["abbreviations_pointing_to_name_sum"]
authors["abbreviation_pointing_to_name_share"] = pair_count.div(name_total)

In [10]:
# Final score of the weighted edge between a name and an abbreviation:
# the plain sum of the pair's mean certainty and its two shares.
# NOTE(review): a missing (NaN) certainty makes the whole score NaN - confirm
# that this is intended before the score is used as an edge weight.
certainty_part = authors["certainty"]
name_share_part = authors["name_pointing_to_abbreviation_share"]
abbreviation_share_part = authors["abbreviation_pointing_to_name_share"]
authors["score"] = certainty_part.add(name_share_part).add(abbreviation_share_part)

In [11]:
# Idea (currently disabled, code kept below for reference): reduce the problem space before building the bipartite graph by marking every (name, abbreviation) pair that is not connected to any other pair as final/solved.
# Such one-to-one pairs (abbreviations_pointing_to_name_sum == 1 and names_pointing_to_abbreviation_sum == 1) would be added to the final mapping and removed from the authors df.
#one_to_one_mappings = authors[(authors["abbreviations_pointing_to_name_sum"] == 1) & (authors["names_pointing_to_abbreviation_sum"] == 1)]
#final_mapping = pd.concat([final_mapping, one_to_one_mappings[["name", "abbreviation"]]], ignore_index=True)

#authors = authors[(authors["abbreviations_pointing_to_name_sum"] != 1) | (authors["names_pointing_to_abbreviation_sum"] != 1)]

In [23]:
# Exclude one specific author name from the matching.
# NOTE(review): the reason for this exclusion is not recorded here, and the names
# shown in the matching output are all lower-case, so this exact-case filter may
# never match - confirm.
excluded_name = "A. Krieger"
authors = authors.loc[authors["name"].ne(excluded_name)]


In [24]:
# Drop rows whose name is identical to its abbreviation; they would become
# self-referencing nodes in the graph.
points_to_other_label = authors["name"].ne(authors["abbreviation"])
authors = authors.loc[points_to_other_label]

In [25]:
# Node labels: the distinct names followed by the distinct abbreviations.
# Each column is de-duplicated on its own, so a label that occurs in both
# columns appears twice in this list.
distinct_names = authors["name"].unique()
distinct_abbreviations = authors["abbreviation"].unique()
nodes = [*distinct_names, *distinct_abbreviations]

# One (name, abbreviation) edge per row of the authors table
edges = list(zip(authors["name"], authors["abbreviation"]))

In [26]:
# Build the weighted bipartite graph used for the matching:
#   side 0: every distinct name
#   side 1: every distinct abbreviation plus one "<name>_dummy" placeholder per name
# NOTE(review): names and abbreviations share one label namespace; a label that
# is both a name and an abbreviation would end up as a single node - confirm
# that this cannot happen in the data.
G = nx.Graph()
author_list = authors["name"].unique().tolist()
abbr_list = authors.loc[authors["name"].isin(author_list), "abbreviation"].unique().tolist()
dummy_nodes = [f"{name}_dummy" for name in author_list]

# Insert the three node groups in a fixed order: names, abbreviations, placeholders.
G.add_nodes_from(author_list, bipartite=0)
G.add_nodes_from(abbr_list, bipartite=1)
G.add_nodes_from(dummy_nodes, bipartite=1)

edges = []
# The score is negated so that a minimum-weight matching prefers high scores.
candidate_rows = authors[authors["name"].isin(author_list) & authors["abbreviation"].isin(abbr_list)]
for author_name, abbreviation, score in zip(
    candidate_rows["name"], candidate_rows["abbreviation"], candidate_rows["score"]
):
    G.add_edge(author_name, abbreviation, weight=round(score, 2) * -1)

# Zero-weight edge to the name's own placeholder, so every name has at least
# one possible partner.
for author_name in author_list:
    G.add_edge(author_name, f"{author_name}_dummy", weight=0)


In [27]:
# Match the name side of G against the abbreviation/placeholder side with
# minimum total weight (weights are negated scores, so high scores win).
# The result is passed to add_edges_from as (key, value) pairs; g_new is an
# undirected graph, so a pair listed in both directions yields a single edge.
name_side = list(authors["name"].unique())
new_edges = nx.bipartite.minimum_weight_full_matching(G, top_nodes=name_side, weight="weight")

# Store the matching in a fresh graph. All names and all abbreviations are added
# as nodes up front, whether or not they end up matched.
g_new = nx.Graph()
for side, members in enumerate((author_list, abbr_list)):
    g_new.add_nodes_from(members, bipartite=side)
g_new.add_edges_from(new_edges.items())

In [39]:
# Inspect the matched edges: each name is paired either with an abbreviation or with its own "<name>_dummy" placeholder
g_new.edges

EdgeView([('kai-uwe brandt', 'kub'), ('jan peter', 'jap'), ('manfred lüttich', 'malü'), ('märz frank pfütze', 'märz frank pfütze_dummy'), ('magdalena froehlich', 'mf'), ('michael frömmert', 'maf'), ('heiko trebs', 'ht'), ('andreas tappert', 'art'), ('annett riedel', 'ari'), ('angelika raulien', 'arau'), ('andrea richter', 'ar'), ('nadja topfstedt', 'dpe'), ('norbert töpfer', 'nt'), ('niklas tolkamp', 'niklas tolkamp_dummy'), ('thomas lang', 'thlang'), ('thomas haegeler', 'thl'), ('antje henselin-rudolph', 'ahr'), ('nico fliegner', 'nf'), ('lutz schmidt', 'ls'), ('heike liesaus', 'hl'), ('juliane streich', 'just'), ('lisa garn', 'lisa garn_dummy'), ('lisa berins', 'lisa'), ('lisa seliger', 'lis'), ('melanie steitz', 'mes'), ('lisa schliep', 'liep'), ('laurine schubert', 'laurine schubert_dummy'), ('frank pfütze', 'pfü'), ('pfütze frank', 'pfütze'), ('kendra reinhardt', 'kir'), ('karoline maria keybe', 'key'), ('kinderwerkstatt anmeldet', 'kinderwerkstatt anmeldet_dummy'), ('kommentar jo

In [29]:
node_count = g_new.number_of_nodes()
edge_count = g_new.number_of_edges()
print(f"new graph has {node_count} nodes and {edge_count} edges")
# TODO (11.07): with 586 edges, i.e. 1-1 mappings, shouldn't there be exactly 1172 nodes?
# NOTE(review): likely explanation - g_new is seeded with every entry of author_list
# and abbr_list, not only the matched ones, so unmatched abbreviations stay in the
# graph as isolated nodes and push the node count above 2 x edges. Confirm.

new graph has 1372 nodes and 586 edges


In [43]:
# Turn the matching graph into a DataFrame with one row per edge and the
# columns "name" and "abbreviation".
# NOTE(review): this relies on every edge being reported with its name endpoint
# first, presumably because the name nodes were added to g_new before all other
# nodes - confirm.
author_mapping = pd.DataFrame(list(g_new.edges), columns=["name", "abbreviation"]).astype(str)

# Drop names that were matched to their own "<name>_dummy" placeholder, i.e.
# names without a real abbreviation. The comparison is exact on purpose: a
# substring test for "dummy" would also throw away a genuine abbreviation that
# merely contains that text.
matched_to_placeholder = author_mapping["abbreviation"] == author_mapping["name"] + "_dummy"
author_mapping = author_mapping[~matched_to_placeholder]

# Report how many real name -> abbreviation mappings remain
print(f"author_mapping has {author_mapping.shape[0]} rows")

author_mapping has 470 rows


In [44]:
# Display the final name -> abbreviation mapping (placeholder matches already removed; the original edge index is kept)
author_mapping

Unnamed: 0,name,abbreviation
0,kai-uwe brandt,kub
1,jan peter,jap
2,manfred lüttich,malü
4,magdalena froehlich,mf
5,michael frömmert,maf
...,...,...
581,max hempel,m hempel
582,tim niklas herholz,tnh
583,susanne plecher,sup
584,michael klamp,cmp


In [45]:
# Number of distinct author names, i.e. the size of the name side of the bipartite graph
len(author_list)

586