In [1]:
import json
import sqlite3
import numpy as np
import pandas as pd
import seaborn as sns
import scipy

In [2]:
con = sqlite3.connect('../data/interim/articles_with_author_mapping.db')
cur = con.cursor()

In [9]:
cur.execute("select id, name, abbreviation, matching_certainty from authors")
rows = cur.fetchall()
authors = pd.DataFrame(columns=["id", "name", "abbreviation", "certainty"], data=rows)
authors.set_index("id", inplace=True)

In [10]:
# convert null to nans
authors = authors.replace("null", np.nan)

In [11]:
# set certainty of a row to the average certainty over all rows with same name and abbreviation, ignore nan values
mean_certainty = authors.groupby(["name", "abbreviation"])["certainty"].transform(lambda x: np.nanmean(x))
authors["certainty"] = mean_certainty

  mean_certainty = authors.groupby(["name", "abbreviation"])["certainty"].transform(lambda x: np.nanmean(x))


In [12]:
authors["name_pointing_to_abbreviation_count"] = authors.groupby(["name", "abbreviation"])["name"].transform("count")
authors["abbreviation_pointing_to_name_count"] = authors.groupby(["name", "abbreviation"])["abbreviation"].transform("count")

In [13]:
# drop duplicates based on name, abbreviation, certainty. I can drop them because ..._count saved the count
authors.drop_duplicates(subset=["name", "abbreviation", "certainty"], inplace=True)

In [14]:
# set the share that the name has of all names that point to that abbreviation
authors_with_name_pointing_to_abbreviation_sum =  authors.groupby(["abbreviation"]).agg(names_pointing_to_abbreviation_sum=("name_pointing_to_abbreviation_count", "sum")).reset_index()
authors = pd.merge(authors, authors_with_name_pointing_to_abbreviation_sum, on="abbreviation")
authors["name_pointing_to_abbreviation_share"] = authors["name_pointing_to_abbreviation_count"] / authors["names_pointing_to_abbreviation_sum"]

In [15]:
# set the share that the abbreviation has of all abbreviations that point to that name
authors_with_abbreviation_pointing_to_name_sum = authors.groupby(["name"]).agg(abbreviations_pointing_to_name_sum=("abbreviation_pointing_to_name_count", "sum")).reset_index()
authors = pd.merge(authors, authors_with_abbreviation_pointing_to_name_sum, on="name")
authors["abbreviation_pointing_to_name_share"] = authors["abbreviation_pointing_to_name_count"] / authors["abbreviations_pointing_to_name_sum"]

In [21]:
authors

Unnamed: 0,name,abbreviation,certainty,name_pointing_to_abbreviation_count,abbreviation_pointing_to_name_count,names_pointing_to_abbreviation_sum,name_pointing_to_abbreviation_share,abbreviations_pointing_to_name_sum,abbreviation_pointing_to_name_share
0,kai-uwe brandt,kub,0.810345,29.0,29.0,29.0,1.000000,29.0,1.000000
1,jan peter,jap,0.600000,21.0,21.0,21.0,1.000000,21.0,1.000000
2,manfred lüttich,maf,0.600000,99.0,99.0,1213.0,0.081616,104.0,0.951923
3,manfred lüttich,malü,0.800000,2.0,2.0,2.0,1.000000,104.0,0.019231
4,manfred lüttich,ml,0.333333,3.0,3.0,7.0,0.428571,104.0,0.028846
...,...,...,...,...,...,...,...,...,...
1025,tim niklas herholz,tnh,0.800000,2.0,2.0,2.0,1.000000,2.0,1.000000
1026,susanne plecher,sup,0.602000,1.0,1.0,1.0,1.000000,1.0,1.000000
1027,michael klamp,cmp,0.300000,2.0,2.0,2.0,1.000000,2.0,1.000000
1028,josa mania-schlegel,jms,0.900000,1.0,1.0,1.0,1.000000,1.0,1.000000


In [23]:
# calculate a final score for the weighted edge between name and abbreviation
authors["score"] = authors["certainty"] + authors["name_pointing_to_abbreviation_share"] + authors["abbreviation_pointing_to_name_share"]

In [27]:
authors[authors["name"] == "manfred lüttich"]

Unnamed: 0,name,abbreviation,certainty,name_pointing_to_abbreviation_count,abbreviation_pointing_to_name_count,names_pointing_to_abbreviation_sum,name_pointing_to_abbreviation_share,abbreviations_pointing_to_name_sum,abbreviation_pointing_to_name_share,score
2,manfred lüttich,maf,0.6,99.0,99.0,1213.0,0.081616,104.0,0.951923,1.633539
3,manfred lüttich,malü,0.8,2.0,2.0,2.0,1.0,104.0,0.019231,1.819231
4,manfred lüttich,ml,0.333333,3.0,3.0,7.0,0.428571,104.0,0.028846,0.790751


In [9]:
# todo 10.07 data viz, hungarian anwenden evtl. das von scipy sonst:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.linear_sum_assignment.html
# https://www.hpl.hp.com/techreports/2012/HPL-2012-40R1.pdf
# https://iopscience.iop.org/article/10.1088/1742-6596/1963/1/012104/pdf
# evtl. auch was von networkx (vielleicht mal hier: https://towardsdatascience.com/matching-of-bipartite-graphs-using-networkx-6d355b164567)

In [10]:
# Generate edge weights based on certainty and frequency

def mapping_func(abbr):
    abbr = calc_abbreviation_to_name_score(abbr)
    return abbr

def name_mapping_func(name):
    print("Name: " + name)
    # get all names pointing to that abbreviation
    abbreviations = authors[authors.name == name].abbreviation
    certainties = authors[authors.name == name][["abbreviation", "certainty"]].groupby("abbreviation").agg({"certainty": "mean"}).reset_index()
    frequencies = abbreviations.value_counts(normalize=True)
    # merge certainties and frequencies on name
    merged = pd.merge(certainties, frequencies, on="abbreviation")
    print(merged)
    return name

def calc_abbreviation_to_name_score(row):
    abbr, name = row["abbreviation"], row["name"]
    # get all names pointing to that abbreviation
    names = authors[authors.abbreviation == abbr].name
    # get the proportion of the name in the list
    proportions = names.value_counts(normalize=True).to_frame().reset_index()
    proportion = proportions[proportions["name"] == name]["proportion"].iloc[0]
    return proportion

# loop through names
authors["abbreviation_to_name_score"] = authors.apply(lambda x: calc_abbreviation_to_name_score(x), axis=1)
authors

IndexError: single positional indexer is out-of-bounds