In [2]:
import json
import sqlite3
import numpy as np
import pandas as pd
import seaborn as sns
import scipy

In [3]:
con = sqlite3.connect('../data/interim/articles_with_author_mapping.db')
cur = con.cursor()

In [4]:
cur.execute("select id, name, abbreviation, matching_certainty from authors")
rows = cur.fetchall()
authors = pd.DataFrame(columns=["id", "name", "abbreviation", "certainty"], data=rows)
authors.set_index("id", inplace=True)

In [24]:
final_mapping = pd.DataFrame(columns=["name", "abbreviation"])

In [6]:
# convert null to nans
authors = authors.replace("null", np.nan)

In [7]:
# set certainty of a row to the average certainty over all rows with same name and abbreviation, ignore nan values
mean_certainty = authors.groupby(["name", "abbreviation"])["certainty"].transform(lambda x: np.nanmean(x))
authors["certainty"] = mean_certainty

  mean_certainty = authors.groupby(["name", "abbreviation"])["certainty"].transform(lambda x: np.nanmean(x))


In [8]:
authors["name_pointing_to_abbreviation_count"] = authors.groupby(["name", "abbreviation"])["name"].transform("count")
authors["abbreviation_pointing_to_name_count"] = authors.groupby(["name", "abbreviation"])["abbreviation"].transform("count")

In [9]:
# drop duplicates based on name, abbreviation, certainty. I can drop them because ..._count saved the count
authors.drop_duplicates(subset=["name", "abbreviation", "certainty"], inplace=True)

In [10]:
# set the share that the name has of all names that point to that abbreviation
authors_with_name_pointing_to_abbreviation_sum =  authors.groupby(["abbreviation"]).agg(names_pointing_to_abbreviation_sum=("name_pointing_to_abbreviation_count", "sum")).reset_index()
authors = pd.merge(authors, authors_with_name_pointing_to_abbreviation_sum, on="abbreviation")
authors["name_pointing_to_abbreviation_share"] = authors["name_pointing_to_abbreviation_count"] / authors["names_pointing_to_abbreviation_sum"]

In [11]:
# set the share that the abbreviation has of all abbreviations that point to that name
authors_with_abbreviation_pointing_to_name_sum = authors.groupby(["name"]).agg(abbreviations_pointing_to_name_sum=("abbreviation_pointing_to_name_count", "sum")).reset_index()
authors = pd.merge(authors, authors_with_abbreviation_pointing_to_name_sum, on="name")
authors["abbreviation_pointing_to_name_share"] = authors["abbreviation_pointing_to_name_count"] / authors["abbreviations_pointing_to_name_sum"]

In [20]:
authors

Unnamed: 0,name,abbreviation,certainty,name_pointing_to_abbreviation_count,abbreviation_pointing_to_name_count,names_pointing_to_abbreviation_sum,name_pointing_to_abbreviation_share,abbreviations_pointing_to_name_sum,abbreviation_pointing_to_name_share,score,abbreviation_to_name_score
0,kai-uwe brandt,kub,0.810345,29.0,29.0,29.0,1.000000,29.0,1.000000,2.810345,1.00
1,jan peter,jap,0.600000,21.0,21.0,21.0,1.000000,21.0,1.000000,2.600000,1.00
2,manfred lüttich,maf,0.600000,99.0,99.0,1213.0,0.081616,104.0,0.951923,1.633539,0.25
3,manfred lüttich,malü,0.800000,2.0,2.0,2.0,1.000000,104.0,0.019231,1.819231,1.00
4,manfred lüttich,ml,0.333333,3.0,3.0,7.0,0.428571,104.0,0.028846,0.790751,0.20
...,...,...,...,...,...,...,...,...,...,...,...
1025,tim niklas herholz,tnh,0.800000,2.0,2.0,2.0,1.000000,2.0,1.000000,2.800000,1.00
1026,susanne plecher,sup,0.602000,1.0,1.0,1.0,1.000000,1.0,1.000000,2.602000,1.00
1027,michael klamp,cmp,0.300000,2.0,2.0,2.0,1.000000,2.0,1.000000,2.300000,1.00
1028,josa mania-schlegel,jms,0.900000,1.0,1.0,1.0,1.000000,1.0,1.000000,2.900000,1.00


In [13]:
# calculate a final score for the weighted edge between name and abbreviation
authors["score"] = authors["certainty"] + authors["name_pointing_to_abbreviation_share"] + authors["abbreviation_pointing_to_name_share"]

In [14]:
authors[authors["name"] == "manfred lüttich"]

Unnamed: 0,name,abbreviation,certainty,name_pointing_to_abbreviation_count,abbreviation_pointing_to_name_count,names_pointing_to_abbreviation_sum,name_pointing_to_abbreviation_share,abbreviations_pointing_to_name_sum,abbreviation_pointing_to_name_share,score
2,manfred lüttich,maf,0.6,99.0,99.0,1213.0,0.081616,104.0,0.951923,1.633539
3,manfred lüttich,malü,0.8,2.0,2.0,2.0,1.0,104.0,0.019231,1.819231
4,manfred lüttich,ml,0.333333,3.0,3.0,7.0,0.428571,104.0,0.028846,0.790751


In [15]:
# todo 10.07 data viz, hungarian anwenden evtl. das von scipy sonst:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.linear_sum_assignment.html
# https://www.hpl.hp.com/techreports/2012/HPL-2012-40R1.pdf
# https://iopscience.iop.org/article/10.1088/1742-6596/1963/1/012104/pdf
# https://link.springer.com/article/10.1007/BF02278710
# evtl. auch was von networkx (vielleicht mal hier: https://towardsdatascience.com/matching-of-bipartite-graphs-using-networkx-6d355b164567)
# https://link.springer.com/content/pdf/10.1007/s10479-010-0757-3.pdf

In [33]:
# reduce the problem space to a bipartite graph, assigning all (name, abbreviation) pairs that are not connected to the graph as final/solved
# add pairs to final mapping where abbreviation_pointing_to_name_count == 1 or name_pointing_to_abbreviation_count == 1 and remove those from the authors df
one_to_one_mappings = authors[(authors["abbreviations_pointing_to_name_sum"] == 1) & (authors["names_pointing_to_abbreviation_sum"] == 1)]
final_mapping = pd.concat([final_mapping, one_to_one_mappings[["name", "abbreviation"]]], ignore_index=True)

authors = authors[(authors["abbreviations_pointing_to_name_sum"] != 1) | (authors["names_pointing_to_abbreviation_sum"] != 1)]

In [35]:
authors

Unnamed: 0,name,abbreviation,certainty,name_pointing_to_abbreviation_count,abbreviation_pointing_to_name_count,names_pointing_to_abbreviation_sum,name_pointing_to_abbreviation_share,abbreviations_pointing_to_name_sum,abbreviation_pointing_to_name_share,score,abbreviation_to_name_score
0,kai-uwe brandt,kub,0.810345,29.0,29.0,29.0,1.000000,29.0,1.000000,2.810345,1.00
1,jan peter,jap,0.600000,21.0,21.0,21.0,1.000000,21.0,1.000000,2.600000,1.00
2,manfred lüttich,maf,0.600000,99.0,99.0,1213.0,0.081616,104.0,0.951923,1.633539,0.25
3,manfred lüttich,malü,0.800000,2.0,2.0,2.0,1.000000,104.0,0.019231,1.819231,1.00
4,manfred lüttich,ml,0.333333,3.0,3.0,7.0,0.428571,104.0,0.028846,0.790751,0.20
...,...,...,...,...,...,...,...,...,...,...,...
1019,lilly günthner,lg,0.900000,13.0,13.0,13.0,1.000000,13.0,1.000000,2.900000,1.00
1020,lisa konstantinidis,liko,0.800000,42.0,42.0,48.0,0.875000,42.0,1.000000,2.675000,0.50
1021,luise jahn und victoria kovacs,liko,0.500000,6.0,6.0,48.0,0.125000,6.0,1.000000,1.625000,0.50
1025,tim niklas herholz,tnh,0.800000,2.0,2.0,2.0,1.000000,2.0,1.000000,2.800000,1.00
