In [20]:
import json
import sqlite3
import pandas as pd
import seaborn as sns
from src.models.MatchingType import MatchingType

In [21]:
# Relative path of the interim SQLite database queried by this notebook.
DB_PATH = '../data/interim/articles_with_author_mapping.db'

# Module-level connection and cursor; the query helper below executes through `cur`.
con = sqlite3.connect(DB_PATH)
cur = con.cursor()

In [22]:
def get_abbreviations_with_names():
    """Load the matched authors from the ``unmapped_authors`` table.

    The query runs on the module-level cursor ``cur`` and keeps only rows whose
    ``matching_type`` equals the name of ``MatchingType.FUZZY_MATCH`` or
    ``MatchingType.DIRECT_MATCH``.

    Returns:
        pandas.DataFrame indexed by ``id`` with the columns ``name``,
        ``abbreviation`` and ``certainty`` (the table's ``matching_certainty``
        column under a shorter label).
    """
    accepted_types = (MatchingType.FUZZY_MATCH.name, MatchingType.DIRECT_MATCH.name)
    cur.execute("select id, name, abbreviation, matching_certainty from unmapped_authors where matching_type = ? or matching_type = ?", accepted_types)
    frame = pd.DataFrame(cur.fetchall(), columns=["id", "name", "abbreviation", "certainty"])
    # Return a new frame keyed by the row id instead of mutating in place.
    return frame.set_index("id")

# Materialise the matched authors once; the mapping functions in later cells read this module-level frame.
authors = get_abbreviations_with_names()

In [23]:
# Bare expression as the last line of the cell, so the notebook renders the frame.
authors

Unnamed: 0_level_0,name,abbreviation,certainty
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
632071,Kai-Uwe Brandt,kub,0.800
632099,Nadja Topfstedt,jap,0.300
632112,Andreas Tappert,art,0.672
632130,Kai-Uwe Brandt,kub,0.800
632148,Andreas Tappert,ast,0.786
...,...,...,...
842075,Roger Dietze,red,0.843
842084,Janina Fleischer,jaf,0.900
842102,Matthias Roth,mro,0.800
842107,Lilly Günthner,lg,0.900


### Potential Final mapping of an abbreviation to a name
Test whether a significant share of an abbreviation's occurrences maps to the same name. If there is a dominant share, assign that name as the final mapping.

In [24]:
def mapping_func(abbr):
    """Forward ``abbr`` to ``abbr_mapping_func`` and hand back its result.

    Thin adapter intended as the callback for ``Series.apply`` over the
    ``abbreviation`` column.
    """
    return abbr_mapping_func(abbr)

def abbr_mapping_func(abbr):
    """Print how the rows carrying ``abbr`` are distributed over author names.

    Reads the module-level ``authors`` frame. For every name that occurs with
    this abbreviation, the printed table shows the mean ``certainty``, the
    ``proportion`` of the abbreviation's rows that carry that name, and
    ``score`` = certainty * proportion.

    Args:
        abbr: Abbreviation string to inspect.

    Returns:
        ``abbr`` unchanged; the table is only printed, no mapping is stored.
    """
    print("Abbreviation: " + abbr)
    # Rows that carry this abbreviation, reduced to the two columns needed below.
    rows_for_abbr = authors.loc[authors["abbreviation"] == abbr, ["name", "certainty"]]
    # Mean certainty per name.
    mean_certainty = rows_for_abbr.groupby("name")["certainty"].mean().reset_index()
    # Relative frequency of each name; pandas >= 2.0 labels normalized counts "proportion".
    name_share = rows_for_abbr["name"].value_counts(normalize=True)
    # Join both statistics on the name and weight certainty by frequency.
    merged = mean_certainty.merge(name_share, on="name")
    merged["score"] = merged["certainty"] * merged["proportion"]
    print(merged)
    return abbr
    
# Run the per-abbreviation report for the abbreviations of the first 20 author rows.
authors["abbreviation"].head(20).apply(mapping_func)

Abbreviation: kub
             name  certainty  proportion  score
0  Kai-Uwe Brandt        0.8         1.0    0.8
Abbreviation: jap
              name  certainty  proportion  score
0  Nadja Topfstedt        0.3         1.0    0.3
Abbreviation: art
              name  certainty  proportion     score
0  Andreas Tappert   0.691664         1.0  0.691664
Abbreviation: kub
             name  certainty  proportion  score
0  Kai-Uwe Brandt        0.8         1.0    0.8
Abbreviation: ast
              name  certainty  proportion    score
0  Andreas Tappert    0.71114         1.0  0.71114
Abbreviation: ar
               name  certainty  proportion     score
0    Andrea Richter   0.822222    0.136364  0.112121
1  Angelika Raulien   0.900000    0.015152  0.013636
2     Annett Riedel   0.900000    0.848485  0.763636
Abbreviation: jap
              name  certainty  proportion  score
0  Nadja Topfstedt        0.3         1.0    0.3
Abbreviation: ar
               name  certainty  proportion     score

id
632071    kub
632099    jap
632112    art
632130    kub
632148    ast
632149     ar
632168    jap
632182     ar
632204    jap
632208    ahr
632211    lis
632212     nf
632231    pfü
632232     ka
632247    ast
632270     ka
632280     ar
632312    ahr
632314     ar
632340    ahr
Name: abbreviation, dtype: object

### Potential Final mapping of a name to an abbreviation
Test whether a significant share of a name's occurrences maps to the same abbreviation. If there is a dominant share, assign that abbreviation as the final mapping.

In [25]:
def name_mapping_func(name):
    """Print how the rows carrying ``name`` are distributed over abbreviations.

    Reads the module-level ``authors`` frame. For every abbreviation that
    occurs with this name, the printed table shows the mean ``certainty`` and
    the ``proportion`` of the name's rows that carry that abbreviation.

    Args:
        name: Author name string to inspect.

    Returns:
        ``name`` unchanged; the table is only printed, no mapping is stored.
    """
    print("Name: " + name)
    # Rows that carry this name, reduced to the two columns needed below.
    rows_for_name = authors.loc[authors["name"] == name, ["abbreviation", "certainty"]]
    # Mean certainty per abbreviation.
    mean_certainty = rows_for_name.groupby("abbreviation")["certainty"].mean().reset_index()
    # Relative frequency of each abbreviation; pandas >= 2.0 labels normalized counts "proportion".
    abbreviation_share = rows_for_name["abbreviation"].value_counts(normalize=True)
    # Join both statistics on the abbreviation.
    merged = mean_certainty.merge(abbreviation_share, on="abbreviation")
    print(merged)
    return name

# Run the per-name report for the names of the first 5 author rows.
authors["name"].head(5).apply(name_mapping_func)

Name: Kai-Uwe Brandt
  abbreviation  certainty  proportion
0           ka     0.4222         0.3
1          kub     0.8000         0.7
Name: Nadja Topfstedt
  abbreviation  certainty  proportion
0          jap        0.3         1.0
Name: Andreas Tappert
  abbreviation  certainty  proportion
0          art   0.691664    0.678218
1          ast   0.711140    0.212871
2           at   0.800000    0.108911
Name: Kai-Uwe Brandt
  abbreviation  certainty  proportion
0           ka     0.4222         0.3
1          kub     0.8000         0.7
Name: Andreas Tappert
  abbreviation  certainty  proportion
0          art   0.691664    0.678218
1          ast   0.711140    0.212871
2           at   0.800000    0.108911


id
632071     Kai-Uwe Brandt
632099    Nadja Topfstedt
632112    Andreas Tappert
632130     Kai-Uwe Brandt
632148    Andreas Tappert
Name: name, dtype: object