This notebook develops a score for mapping abbreviations to full names based on department ("Ressort") membership.
Acceptance criteria:
- The score should be between 0 and 1
- The higher the similarity between the abbreviation's department article distribution and the full name's department article distribution, the higher the score
- 

In [1]:
import json
import sqlite3
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import networkx as nx
from dateutil import relativedelta
from datetime import datetime
import re
import tqdm
from src.models.MatchingType import MatchingType
from scipy.stats import wasserstein_distance, spearmanr

In [2]:
# Toy example: article counts per department position for one full name and
# two candidate abbreviations (values are illustrative, not read from the DB).
full_name_articles = [55, 50, 20, 10]
# Same counts as the full name, but with the first and third positions swapped.
abbr1_articles = [20, 50, 55, 10]
# Differs from the full name in the first and last positions.
abbr2_articles = [20, 50, 20, 45]

def summed_distance(distr1, distr2):
    """Return the sum of element-wise absolute differences of two sequences.

    Args:
        distr1: Sequence of numeric values (e.g. article counts).
        distr2: Sequence of numeric values aligned position-by-position with
            ``distr1``.

    Returns:
        The sum of ``abs(a - b)`` over all aligned pairs; ``0`` when both
        sequences are empty.

    Raises:
        ValueError: If the two sequences differ in length. Comparing them
            position-by-position would otherwise silently ignore the surplus
            entries of the longer one.
    """
    if len(distr1) != len(distr2):
        raise ValueError(
            "distributions must have the same length, "
            f"got {len(distr1)} and {len(distr2)}"
        )
    # Use the builtin sum instead of shadowing it with a local accumulator.
    return sum(abs(a - b) for a, b in zip(distr1, distr2))

# Report three candidate metrics for the full-name distribution against each
# abbreviation candidate; output is grouped by metric, then by abbreviation.
candidates = (("abbr1", abbr1_articles), ("abbr2", abbr2_articles))
# Department positions are the support of both distributions; the article
# counts are passed to wasserstein_distance as weights.
positions = range(0, len(full_name_articles))
metrics = (
    ("summed distance", lambda other: summed_distance(full_name_articles, other)),
    ("spearman correlation", lambda other: spearmanr(full_name_articles, other).statistic),
    (
        "wasserstein distance",
        lambda other: wasserstein_distance(positions, positions, full_name_articles, other),
    ),
)
for metric_name, compute in metrics:
    for label, counts in candidates:
        print(f"{metric_name} between full name and {label}: {compute(counts)}")


summed distance between full name and abbr1: 70
summed distance between full name and abbr2: 70
spearman correlation between full name and abbr1: 0.19999999999999998
spearman correlation between full name and abbr2: -0.21081851067789195
wasserstein distance between full name and abbr1: 0.5185185185185186
wasserstein distance between full name and abbr2: 0.7777777777777779


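We conclude that the Wasserstein distance is an appropriate metric for measuring differences in department membership: the summed distance is identical (70) for both abbreviations, whereas the Wasserstein distance separates abbr1 (0.52) from abbr2 (0.78). Before applying it, we must first arrange the departments in an ordinal order based on article count.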

In [3]:
# Open the interim SQLite database and keep a cursor for the queries below.
db_path = '../data/interim/articles_with_author_mapping.db'
con = sqlite3.connect(db_path)
cur = con.cursor()

In [4]:
# Fetch id, namespace array, publication timestamp and author name for every
# article whose author row has matching_type equal to IS_FULL_NAME.
full_name_articles_query = (
    "SELECT ar.id, ar.article_namespace_array, ar.published_at, a.name "
    "FROM articles ar "
    "join article_authors aa on ar.id = aa.article_id "
    "join unmapped_authors a on aa.author_id = a.id "
    "where a.matching_type = ?"
)
query_params = (MatchingType.IS_FULL_NAME.name,)
rows = cur.execute(full_name_articles_query, query_params).fetchall()

In [5]:
# Wrap the query result in a DataFrame; the namespace array column is
# labelled 'department'.
column_names = ['id', 'department', 'published_at', 'name']
departments = pd.DataFrame(rows, columns=column_names)
departments

Unnamed: 0,id,department,published_at,name
0,1504,"[""Region"", ""Delitzsch""]",2021-12-10T06:22:00+00:00,Mathias Schönknecht
1,1505,"[""Region"", ""Altenburg""]",2021-12-10T05:27:19+00:00,Kay Würker
2,1507,"[""Leipzig"", ""Lokales""]",2021-12-10T06:01:00+00:00,Andreas Dunte
3,1514,"[""Region"", ""Markkleeberg""]",2021-12-10T08:01:00+00:00,Rainer Küster
4,1517,"[""Region"", ""Oschatz""]",2021-12-10T08:17:00+00:00,Christian Kunze
...,...,...,...,...
130961,367701,"[""Region"", ""Geithain""]",2010-01-01T15:11:44+00:00,Thomas Lang
130962,367703,"[""Region"", ""Markranstaedt""]",2010-01-01T17:14:06+00:00,Kendra Reinhardt
130963,367705,"[""Region"", ""Delitzsch""]",2010-01-01T17:32:16+00:00,Thomas Steingen
130964,367706,"[""Region"", ""Bad-Dueben""]",2010-01-01T17:46:10+00:00,Daniel Kaiser


In [6]:
# Decode the JSON-encoded department list of each article, then give every
# list entry its own row.
departments["department"] = departments["department"].apply(json.loads)
departments = departments.explode('department')
# Drop the labels "Region" and "Nachrichten": they are too unspecific to help
# distinguish authors.
for unspecific_label in ('Region', 'Nachrichten'):
    departments = departments[departments["department"] != unspecific_label]

In [7]:
# Spot check: show all department rows of a single author.
departments.loc[departments["name"] == "Anke Herold"]

Unnamed: 0,id,department,published_at,name
1664,4444,Delitzsch,2021-11-07T13:19:00+00:00,Anke Herold
1990,5019,Eilenburg,2021-10-31T09:57:00+00:00,Anke Herold
2011,5059,Eilenburg,2021-10-31T18:14:00+00:00,Anke Herold
2671,6149,Eilenburg,2021-10-17T16:02:00+00:00,Anke Herold
2674,6152,Delitzsch,2021-10-17T17:02:00+00:00,Anke Herold
...,...,...,...,...
108888,311271,Eilenburg,2013-11-10T19:13:00+00:00,Anke Herold
109186,311885,Delitzsch,2013-10-27T19:52:25+00:00,Anke Herold
114822,323509,Delitzsch,2013-02-03T15:39:58+00:00,Anke Herold
117253,331228,Delitzsch,2012-07-22T21:54:00+00:00,Anke Herold


In [None]:
# TODO: 22.08: figure out how to compare distributions where the name and the abbreviation do not have the same set of departments
# also note: the abbreviation's departments must be ordered according to the name's departments after those have been ordered by article count, or vice versa