In [21]:
import json
import sqlite3
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
from src.models.MatchingType import MatchingType

In [22]:
con = sqlite3.connect('../data/interim/articles_with_author_mapping.db')
cur = con.cursor()

In [23]:
def get_abbreviations_with_names():
    cur.execute("select id, name, abbreviation, matching_certainty from unmapped_authors where matching_type = ? or matching_type = ?", (MatchingType.FUZZY_MATCH.name, MatchingType.DIRECT_MATCH.name))
    rows = cur.fetchall()
    authors = pd.DataFrame(columns=["id", "name", "abbreviation", "certainty"], data=rows)
    authors.set_index("id", inplace=True)
    return authors

authors = get_abbreviations_with_names()

In [24]:
final_mapping = pd.DataFrame(columns=["name", "abbreviation"])

In [25]:
# set certainty of a row to the average certainty over all rows with same name and abbreviation, ignore nan values
mean_certainty = authors.groupby(["name", "abbreviation"])["certainty"].transform(lambda x: np.nanmean(x))
authors["certainty"] = mean_certainty

In [26]:
authors["name_pointing_to_abbreviation_count"] = authors.groupby(["name", "abbreviation"])["name"].transform("count")
authors["abbreviation_pointing_to_name_count"] = authors.groupby(["name", "abbreviation"])["abbreviation"].transform("count")

In [27]:
# drop duplicates based on name, abbreviation, certainty. I can drop them because ..._count saved the count
authors.drop_duplicates(subset=["name", "abbreviation", "certainty"], inplace=True)

In [28]:
# set the share that the name has of all names that point to that abbreviation
authors_with_name_pointing_to_abbreviation_sum =  authors.groupby(["abbreviation"]).agg(names_pointing_to_abbreviation_sum=("name_pointing_to_abbreviation_count", "sum")).reset_index()
authors = pd.merge(authors, authors_with_name_pointing_to_abbreviation_sum, on="abbreviation")
authors["name_pointing_to_abbreviation_share"] = authors["name_pointing_to_abbreviation_count"] / authors["names_pointing_to_abbreviation_sum"]

In [29]:
# set the share that the abbreviation has of all abbreviations that point to that name
authors_with_abbreviation_pointing_to_name_sum = authors.groupby(["name"]).agg(abbreviations_pointing_to_name_sum=("abbreviation_pointing_to_name_count", "sum")).reset_index()
authors = pd.merge(authors, authors_with_abbreviation_pointing_to_name_sum, on="name")
authors["abbreviation_pointing_to_name_share"] = authors["abbreviation_pointing_to_name_count"] / authors["abbreviations_pointing_to_name_sum"]

In [30]:
authors

Unnamed: 0,name,abbreviation,certainty,name_pointing_to_abbreviation_count,abbreviation_pointing_to_name_count,names_pointing_to_abbreviation_sum,name_pointing_to_abbreviation_share,abbreviations_pointing_to_name_sum,abbreviation_pointing_to_name_share
0,Kai-Uwe Brandt,kub,0.800000,35,35,35,1.000000,50,0.700000
1,Kai-Uwe Brandt,ka,0.422200,15,15,452,0.033186,50,0.300000
2,Nadja Topfstedt,jap,0.300000,21,21,21,1.000000,21,1.000000
3,Andreas Tappert,art,0.691664,137,137,137,1.000000,202,0.678218
4,Andreas Tappert,ast,0.711140,43,43,43,1.000000,202,0.212871
...,...,...,...,...,...,...,...,...,...
273,Lilly Günthner,lg,0.900000,70,70,70,1.000000,70,1.000000
274,Yvonne Schmidt,ys,0.800000,78,78,78,1.000000,78,1.000000
275,Gislinde Redepenning,gislinde redepenning,0.800000,2,2,2,1.000000,2,1.000000
276,Tim Niklas Herholz,tnh,0.800000,77,77,77,1.000000,77,1.000000


In [31]:
# calculate a final score for the weighted edge between name and abbreviation
authors["score"] = authors["certainty"] + authors["name_pointing_to_abbreviation_share"] + authors["abbreviation_pointing_to_name_share"]

In [32]:
authors[authors["name"] == "Kai-Uwe Brandt"]

Unnamed: 0,name,abbreviation,certainty,name_pointing_to_abbreviation_count,abbreviation_pointing_to_name_count,names_pointing_to_abbreviation_sum,name_pointing_to_abbreviation_share,abbreviations_pointing_to_name_sum,abbreviation_pointing_to_name_share,score
0,Kai-Uwe Brandt,kub,0.8,35,35,35,1.0,50,0.7,2.5
1,Kai-Uwe Brandt,ka,0.4222,15,15,452,0.033186,50,0.3,0.755386


In [33]:
# todo 10.07 data viz, hungarian anwenden evtl. das von scipy sonst:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.linear_sum_assignment.html
# https://www.hpl.hp.com/techreports/2012/HPL-2012-40R1.pdf
# https://iopscience.iop.org/article/10.1088/1742-6596/1963/1/012104/pdf
# https://link.springer.com/article/10.1007/BF02278710
# evtl. auch was von networkx (vielleicht mal hier: https://towardsdatascience.com/matching-of-bipartite-graphs-using-networkx-6d355b164567)
# https://link.springer.com/content/pdf/10.1007/s10479-010-0757-3.pdf

In [34]:
# reduce the problem space to a bipartite graph, assigning all (name, abbreviation) pairs that are not connected to the graph as final/solved
# add pairs to final mapping where abbreviation_pointing_to_name_count == 1 or name_pointing_to_abbreviation_count == 1 and remove those from the authors df
one_to_one_mappings = authors[(authors["abbreviations_pointing_to_name_sum"] == 1) & (authors["names_pointing_to_abbreviation_sum"] == 1)]
final_mapping = pd.concat([final_mapping, one_to_one_mappings[["name", "abbreviation"]]], ignore_index=True)

authors = authors[(authors["abbreviations_pointing_to_name_sum"] != 1) | (authors["names_pointing_to_abbreviation_sum"] != 1)]

In [35]:
authors

Unnamed: 0,name,abbreviation,certainty,name_pointing_to_abbreviation_count,abbreviation_pointing_to_name_count,names_pointing_to_abbreviation_sum,name_pointing_to_abbreviation_share,abbreviations_pointing_to_name_sum,abbreviation_pointing_to_name_share,score
0,Kai-Uwe Brandt,kub,0.800000,35,35,35,1.000000,50,0.700000,2.500000
1,Kai-Uwe Brandt,ka,0.422200,15,15,452,0.033186,50,0.300000,0.755386
2,Nadja Topfstedt,jap,0.300000,21,21,21,1.000000,21,1.000000,2.300000
3,Andreas Tappert,art,0.691664,137,137,137,1.000000,202,0.678218,2.369882
4,Andreas Tappert,ast,0.711140,43,43,43,1.000000,202,0.212871,1.924011
...,...,...,...,...,...,...,...,...,...,...
273,Lilly Günthner,lg,0.900000,70,70,70,1.000000,70,1.000000,2.900000
274,Yvonne Schmidt,ys,0.800000,78,78,78,1.000000,78,1.000000,2.800000
275,Gislinde Redepenning,gislinde redepenning,0.800000,2,2,2,1.000000,2,1.000000,2.800000
276,Tim Niklas Herholz,tnh,0.800000,77,77,77,1.000000,77,1.000000,2.800000
