In [57]:
import json
import sqlite3
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
import networkx as nx
import matplotlib.pyplot as plt
from dateutil import relativedelta
from datetime import datetime
import re

In [2]:

con = sqlite3.connect('../data/interim/articles_with_author_mapping.db')
cur = con.cursor()


In [3]:
cur.execute("select id, name, abbreviation, matching_certainty from authors")
rows = cur.fetchall()
authors = pd.DataFrame(columns=["id", "name", "abbreviation", "certainty"], data=rows)
authors.set_index("id", inplace=True)

In [4]:
# convert null to nans
authors = authors.replace("null", np.nan)

In [5]:
# set certainty of a row to the average certainty over all rows with same name and abbreviation, ignore nan values
mean_certainty = authors.groupby(["name", "abbreviation"])["certainty"].transform(lambda x: np.nanmean(x))
authors["certainty"] = mean_certainty

  mean_certainty = authors.groupby(["name", "abbreviation"])["certainty"].transform(lambda x: np.nanmean(x))


In [6]:
authors["name_pointing_to_abbreviation_count"] = authors.groupby(["name", "abbreviation"])["name"].transform("count")
authors["abbreviation_pointing_to_name_count"] = authors.groupby(["name", "abbreviation"])["abbreviation"].transform("count")

In [7]:
# drop duplicates based on name, abbreviation, certainty. I can drop them because ..._count saved the count
authors.drop_duplicates(subset=["name", "abbreviation", "certainty"], inplace=True)

In [8]:
# set the share that the name has of all names that point to that abbreviation
authors_with_name_pointing_to_abbreviation_sum =  authors.groupby(["abbreviation"]).agg(names_pointing_to_abbreviation_sum=("name_pointing_to_abbreviation_count", "sum")).reset_index()
authors = pd.merge(authors, authors_with_name_pointing_to_abbreviation_sum, on="abbreviation")
authors["name_pointing_to_abbreviation_share"] = authors["name_pointing_to_abbreviation_count"] / authors["names_pointing_to_abbreviation_sum"]

In [9]:
# set the share that the abbreviation has of all abbreviations that point to that name
authors_with_abbreviation_pointing_to_name_sum = authors.groupby(["name"]).agg(abbreviations_pointing_to_name_sum=("abbreviation_pointing_to_name_count", "sum")).reset_index()
authors = pd.merge(authors, authors_with_abbreviation_pointing_to_name_sum, on="name")
authors["abbreviation_pointing_to_name_share"] = authors["abbreviation_pointing_to_name_count"] / authors["abbreviations_pointing_to_name_sum"]

In [10]:
# calculate a final score for the weighted edge between name and abbreviation
authors["score"] = authors["certainty"] + authors["name_pointing_to_abbreviation_share"] + authors["abbreviation_pointing_to_name_share"]

In [11]:
# reduce the problem space to a bipartite graph, assigning all (name, abbreviation) pairs that are not connected to the graph as final/solved
# add pairs to final mapping where abbreviation_pointing_to_name_count == 1 or name_pointing_to_abbreviation_count == 1 and remove those from the authors df#
#one_to_one_mappings = authors[(authors["abbreviations_pointing_to_name_sum"] == 1) & (authors["names_pointing_to_abbreviation_sum"] == 1)]
#final_mapping = pd.concat([final_mapping, one_to_one_mappings[["name", "abbreviation"]]], ignore_index=True)

#authors = authors[(authors["abbreviations_pointing_to_name_sum"] != 1) | (authors["names_pointing_to_abbreviation_sum"] != 1)]

In [12]:
authors = authors[authors["name"] != "A. Krieger"]


In [13]:
# remove self referencing nodes
authors = authors[authors["name"] != authors["abbreviation"]]

In [14]:
# add name and abbreviation as nodes list and remove duplicates
nodes = list(authors["name"].unique()) + list(authors["abbreviation"].unique())

# add unique edges based on entries in the authors table
edges = []
for index, row in authors.iterrows():
    edges.append((row["name"], row["abbreviation"]))

In [15]:
G = nx.Graph()
G.clear()
author_list = list(authors["name"].unique())
abbr_list = list(authors[authors["name"].isin(author_list)]["abbreviation"].unique())
G.add_nodes_from(author_list, bipartite=0)
G.add_nodes_from(abbr_list, bipartite=1)
dummy_nodes = [f"{name}_dummy" for name in author_list]
G.add_nodes_from(dummy_nodes, bipartite=1)

edges = []
for index, row in authors[authors["name"].isin(author_list) & authors["abbreviation"].isin(abbr_list)].iterrows():
    G.add_edges_from([(row["name"], row["abbreviation"])], weight=round(row["score"],2) * -1)

for name in author_list:
    G.add_edges_from([(name, f"{name}_dummy")], weight=0)


In [16]:
new_edges = nx.bipartite.minimum_weight_full_matching(G, top_nodes=list(authors["name"].unique()), weight="weight")
g_new = nx.Graph()
g_new.clear()
g_new.add_nodes_from(author_list, bipartite=0)
g_new.add_nodes_from(abbr_list, bipartite=1)
g_new.add_edges_from(new_edges.items())

In [17]:
g_new.edges

EdgeView([('kai-uwe brandt', 'kub'), ('jan peter', 'jap'), ('manfred lüttich', 'malü'), ('märz frank pfütze', 'märz frank pfütze_dummy'), ('magdalena froehlich', 'mf'), ('michael frömmert', 'maf'), ('heiko trebs', 'ht'), ('andreas tappert', 'art'), ('annett riedel', 'ari'), ('angelika raulien', 'arau'), ('andrea richter', 'ar'), ('nadja topfstedt', 'dpe'), ('norbert töpfer', 'nt'), ('niklas tolkamp', 'niklas tolkamp_dummy'), ('thomas lang', 'thlang'), ('thomas haegeler', 'thl'), ('antje henselin-rudolph', 'ahr'), ('nico fliegner', 'nf'), ('lutz schmidt', 'ls'), ('heike liesaus', 'hl'), ('juliane streich', 'just'), ('lisa garn', 'lisa garn_dummy'), ('lisa berins', 'lisa'), ('lisa seliger', 'lis'), ('melanie steitz', 'mes'), ('lisa schliep', 'liep'), ('laurine schubert', 'laurine schubert_dummy'), ('frank pfütze', 'pfü'), ('pfütze frank', 'pfütze'), ('kendra reinhardt', 'kir'), ('karoline maria keybe', 'key'), ('kinderwerkstatt anmeldet', 'kinderwerkstatt anmeldet_dummy'), ('kommentar jo

In [36]:
print(f"new graph has {len(g_new.nodes)} nodes and {len(g_new.edges)} edges")

new graph has 1372 nodes and 586 edges


In [None]:
# transform graph to dataframe author_mapping with columns name and abbreviation
author_mapping = pd.DataFrame(columns=["name", "abbreviation"], data=g_new.edges)
author_mapping = author_mapping.astype(str)

print(f"author_mapping has {author_mapping.shape[0]} rows")


# remove all abbreviations that contain "dummy" in their name
author_mapping = author_mapping[~author_mapping["abbreviation"].str.contains("dummy")]
# print shape
print(f"author_mapping has {author_mapping.shape[0]} rows")

### In the following there are some analyses covering the correctness of the approach

In [60]:
def has_german_umlauts(input_string):
    umlaut_pattern = re.compile(r'[äöüÄÖÜ]')
    return bool(umlaut_pattern.search(input_string))

# test if there is an authors that was mapped to an abbreviation that exceeds the authors name writing time by more than 6 months in the time before or after
for index, row in author_mapping.iterrows():
    author = row["name"]
    if has_german_umlauts(author):
        # do not handle authors with umlauts because issues with sqlite LIKE operator
        continue
    abbreviation = row["abbreviation"]
    author_like = f"%\"{author}\"%"
    abbr_like = f"%\"{abbreviation}\"%"
    first_article_author = cur.execute(f"SELECT MIN(published_at) FROM articles where author_array like ?", (author_like,)).fetchone()[0]
    first_article_abbr = cur.execute(f"SELECT MIN(published_at) FROM articles where author_array like ?", (abbr_like,)).fetchone()[0]
    first_article_author = datetime.strptime(first_article_author, "%Y-%m-%dT%H:%M:%S+00:00")
    first_article_abbr = datetime.strptime(first_article_abbr, "%Y-%m-%dT%H:%M:%S+00:00")

    # check if first_article_abbr is six month ahead of first_article_author
    if first_article_abbr < first_article_author - relativedelta.relativedelta(months=6):
        print(f"author: {author}, first_article_author: {first_article_author}, abbreviation: {abbreviation}, first_article_abbr: {first_article_abbr}")

    last_article_author = cur.execute(f"SELECT MAX(published_at) FROM articles where author_array like ?", (author_like,)).fetchone()[0]
    last_article_abbr = cur.execute(f"SELECT MAX(published_at) FROM articles where author_array like ?", (abbr_like,)).fetchone()[0]
    last_article_author = datetime.strptime(last_article_author, "%Y-%m-%dT%H:%M:%S+00:00")
    last_article_abbr = datetime.strptime(last_article_abbr, "%Y-%m-%dT%H:%M:%S+00:00")

    # check if last_article_abbr is six month behind of last_article_author
    if last_article_abbr > last_article_author + relativedelta.relativedelta(months=6):
        print(f"author: {author}, last_article_author: {last_article_author}, abbreviation: {abbreviation}, last_article_abbr: {last_article_abbr}")

author: magdalena froehlich, last_article_author: 2011-03-25 15:35:46, abbreviation: mf, last_article_abbr: 2018-06-28 10:55:16
author: andrea richter, first_article_author: 2013-11-21 22:23:00, abbreviation: ar, first_article_abbr: 2010-01-05 16:04:50
author: lisa seliger, first_article_author: 2014-01-14 19:13:00, abbreviation: lis, first_article_abbr: 2010-01-07 15:35:14
author: lisa seliger, last_article_author: 2014-01-14 19:13:00, abbreviation: lis, last_article_abbr: 2021-12-14 15:31:00
author: kai-uwe arnold, first_article_author: 2020-05-13 10:01:00, abbreviation: ka, first_article_abbr: 2010-01-07 23:00:00
author: kai-uwe arnold, last_article_author: 2020-05-13 10:01:00, abbreviation: ka, last_article_abbr: 2021-12-29 14:11:47
author: dominik bath, first_article_author: 2011-04-01 12:25:03, abbreviation: dom, first_article_abbr: 2010-01-21 18:05:19
author: dominik bath, last_article_author: 2013-05-01 15:50:00, abbreviation: dom, last_article_abbr: 2021-12-27 08:49:57
author:

TypeError: strptime() argument 1 must be str, not None

We can conclude that we need a mapping that takes into account the authors' lifespans. Abbreviations can be used for more than one author.

Further, we notice that there are abbreviations that do not have an associated author. E.g. "joka", does not have any good fit.

In [None]:
print(f"new graph has {len(g_new.nodes)} nodes and {len(g_new.edges)} edges")

There are abbr nodes that were not matched with any author. That's why we have: edges * 2 < nodes.

In [31]:
# test that only abbrs are not in the matches nodes
for node in g_new.nodes:
    if g_new.degree(node) == 0 and node in author_list:
        print(f"author node {node} has no edges")
    elif g_new.degree(node) == 0 and node in abbr_list:
        print(f"abbreviation node {node} has no edges")

abbreviation node ast has no edges
abbreviation node dap has no edges
abbreviation node at has no edges
abbreviation node ept has no edges
abbreviation node ata has no edges
abbreviation node agr has no edges
abbreviation node tha has no edges
abbreviation node tmn has no edges
abbreviation node th has no edges
abbreviation node thoma has no edges
abbreviation node the has no edges
abbreviation node thg has no edges
abbreviation node thll has no edges
abbreviation node fliegner has no edges
abbreviation node nie has no edges
abbreviation node heike has no edges
abbreviation node berins has no edges
abbreviation node franz has no edges
abbreviation node ken has no edges
abbreviation node kas has no edges
abbreviation node kst has no edges
abbreviation node würker has no edges
abbreviation node kretz has no edges
abbreviation node kaz has no edges
abbreviation node kri has no edges
abbreviation node mafa has no edges
abbreviation node in has no edges
abbreviation node ri has no edges
abb

In [35]:
# list the abbreviations that were connected to the author that the edges with zero edges were pointing to
# so we can e.g. check if we need to enable multiple abbreviations for the same author
for node in g_new.nodes:
    if g_new.degree(node) == 0:
        # check the authors of the old graph that were pointing to this abbreviation
        old_neighbors = G.neighbors(node)
        for old_neighbor in old_neighbors:
            print(f"new neighbors of old abbr {node} node neighbors {old_neighbor} are {list(g_new.neighbors(old_neighbor))}")


new neighbors of old abbr ast node neighbors andreas tappert are ['art']
new neighbors of old abbr dap node neighbors andreas tappert are ['art']
new neighbors of old abbr dap node neighbors nadja topfstedt are ['dpe']
new neighbors of old abbr at node neighbors andreas tappert are ['art']
new neighbors of old abbr ept node neighbors andreas tappert are ['art']
new neighbors of old abbr ata node neighbors andreas tappert are ['art']
new neighbors of old abbr agr node neighbors angelika raulien are ['arau']
new neighbors of old abbr tha node neighbors thomas lang are ['thlang']
new neighbors of old abbr tha node neighbors thomas haegeler are ['thl']
new neighbors of old abbr tmn node neighbors thomas lang are ['thlang']
new neighbors of old abbr tmn node neighbors thomas steingen are ['totei']
new neighbors of old abbr th node neighbors thomas haegeler are ['thl']
new neighbors of old abbr thoma node neighbors thomas haegeler are ['thl']
new neighbors of old abbr the node neighbors thom

From this we do see that some authors have several abbreviations
For example: krysta brown has probably two abbreviations: "krysta" and "brown"

In [41]:
print(f"{(len(g_new.edges))} abbreviations were matched")
print(f"There are {len(g_new.nodes) - (len(g_new.edges) * 2)} abbreviations that were not matched")

586 abbreviations were matched
There are 200 abbreviations that were not matched


In [42]:
# print not matched abbreviations
print([node for node in g_new.nodes if g_new.degree(node) == 0])

['ast', 'dap', 'at', 'ept', 'ata', 'agr', 'tha', 'tmn', 'th', 'thoma', 'the', 'thg', 'thll', 'fliegner', 'nie', 'heike', 'berins', 'franz', 'ken', 'kas', 'kst', 'würker', 'kretz', 'kaz', 'kri', 'mafa', 'in', 'ri', 'rol', 'mape', 'mp', 'rö', 'robn', 'ben', 'ret', 'dimo', 'awo', 'mir', 'mahr', 'mot', 'mabe', 'mato', 'mathias orb', 'm orbeck', 'mmc', 'bau', 'birgit', 'bpa', 'seb', 'fb', 'brown', 'müller', 'hn', 'guido', 'kar', 'das', 'ard', 'maw', 'mtg', 'mwi', 'mas', 'syr', 'stroh', 'caro', 'uk', 'ts', 'tos', 'meine', 'tin', 'thomas', 'th sparrer', 'bley', 'ulrike', 'rka', 'rare', 'ck', 'haku', 'chk', 'ja', 'jana', 'krenz', 'frei', 'senf', 'fink', 'döring', 'großnick', 'trache', 'kfm', 'koch', 'pom', 'nelle', 'sho', 'kha', 'skg', 'ski', 'grätz', 'sag', 'fel', 'fö', 'hörügel', 'isc', 'kerst', 'keine', 'decker', 'kern', 'kecke', 'krieck', 'lmk', 'axl', 'bü', 'jl', 'rd', 'roger', 'rie', 'rabe', 'kl', 'kel', 'büchel', 'rb', 'pe', 'cat', 'ang', 'kreuz', 'sabine', 'sbu', 'kat', 'kunze', 'sas',