This notebook develops a score for mapping abbreviations to full names based on department ("Ressort") membership.
Acceptance criteria:
- The score should be between 0 and 1
- The higher the similarity between the abbreviation's department article distribution and the full name's department article distribution, the higher the score
- 

In [7]:
import json
import sqlite3
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import networkx as nx
from dateutil import relativedelta
from datetime import datetime
import re
import tqdm
from src.models.MatchingType import MatchingType
from scipy.stats import wasserstein_distance, spearmanr

In [36]:
# Note: the lower the Wasserstein distance, the better the match.
# Toy article-count lists used to compare candidate metrics.

full_name_articles = [55, 50, 20, 10]
abbr1_articles = [55, 30, 40, 10] # this is the better abbreviation match: its mismatched article counts (idx 1, 2) lie closer together than those of abbr2 (idx 1, 3)
abbr2_articles = [55, 30, 20, 30] 

def summed_distance(distr1, distr2):
    """Return the sum of absolute element-wise differences (L1 distance).

    Args:
        distr1: Sequence of numbers, e.g. article counts per department.
        distr2: Sequence of numbers with the same length as ``distr1``.

    Returns:
        The sum of ``abs(a - b)`` over all positionally paired elements;
        ``0`` when both sequences are empty.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    # Unequal lengths mean the two distributions are not aligned position by
    # position; fail loudly instead of silently dropping surplus entries.
    if len(distr1) != len(distr2):
        raise ValueError(
            f"distributions must have the same length, got {len(distr1)} and {len(distr2)}"
        )
    # The builtin ``sum`` is used directly (the accumulator no longer shadows it).
    return sum(abs(a - b) for a, b in zip(distr1, distr2))

# Summed absolute difference between the full name and each abbreviation.
l1_abbr1 = summed_distance(full_name_articles, abbr1_articles)
l1_abbr2 = summed_distance(full_name_articles, abbr2_articles)
print(f"summed distance between full name and abbr1: {l1_abbr1}")
print(f"summed distance between full name and abbr2: {l1_abbr2}")

# Spearman rank correlation of the raw count lists.
rho_abbr1 = spearmanr(full_name_articles, abbr1_articles).statistic
rho_abbr2 = spearmanr(full_name_articles, abbr2_articles).statistic
print(f"spearman correlation between full name and abbr1: {rho_abbr1}")
print(f"spearman correlation between full name and abbr2: {rho_abbr2}")

# Wasserstein distance: list positions are the values, article counts the weights.
positions = range(len(full_name_articles))
w_abbr1 = wasserstein_distance(positions, positions, full_name_articles, abbr1_articles)
w_abbr2 = wasserstein_distance(positions, positions, full_name_articles, abbr2_articles)
print(f"wasserstein distance between full name and abbr1: {w_abbr1}")
print(f"wasserstein distance between full name and abbr2: {w_abbr2}")


summed distance between full name and abbr1: 40
summed distance between full name and abbr2: 40
spearman correlation between full name and abbr1: 0.7999999999999999
spearman correlation between full name and abbr2: 0.632455532033676
wasserstein distance between full name and abbr1: 0.14814814814814814
wasserstein distance between full name and abbr2: 0.2962962962962963


We conclude that the Wasserstein distance is an appropriate metric for measuring differences in department membership. To apply it, we must first put the departments into an ordinal order based on their article counts.

In [9]:
# Open the interim SQLite database and create a cursor for the queries that follow.
db_path = '../data/interim/articles_with_author_mapping.db'
con = sqlite3.connect(db_path)
cur = con.cursor()

In [10]:
# One row per (article, author) pair, limited to authors whose matching_type
# is IS_FULL_NAME or IS_ABBREVIATION.
author_article_query = "SELECT ar.id, ar.article_namespace_array, ar.published_at, a.name, a.abbreviation FROM articles ar join article_authors aa on ar.id = aa.article_id join unmapped_authors a on aa.author_id = a.id where a.matching_type = ? or a.matching_type = ?"
matching_types = (MatchingType.IS_FULL_NAME.name, MatchingType.IS_ABBREVIATION.name)
rows = cur.execute(author_article_query, matching_types).fetchall()

In [11]:
# Load the query result; the article_namespace_array column is named "department" here.
departments = pd.DataFrame(rows, columns=['id', 'department', 'published_at', 'name', 'abbreviation'])
# Where no name is present, fall back to the abbreviation so "name" identifies every author.
departments["name"] = departments["name"].fillna(departments["abbreviation"])
departments = departments.drop(columns="abbreviation")
departments

Unnamed: 0,id,department,published_at,name
0,1504,"[""Region"", ""Delitzsch""]",2021-12-10T06:22:00+00:00,Mathias Schönknecht
1,1505,"[""Region"", ""Altenburg""]",2021-12-10T05:27:19+00:00,Kay Würker
2,1507,"[""Leipzig"", ""Lokales""]",2021-12-10T06:01:00+00:00,Andreas Dunte
3,1513,"[""Region"", ""Borna""]",2021-12-10T06:53:43+00:00,es
4,1514,"[""Region"", ""Markkleeberg""]",2021-12-10T08:01:00+00:00,Rainer Küster
...,...,...,...,...
166023,367701,"[""Region"", ""Geithain""]",2010-01-01T15:11:44+00:00,Thomas Lang
166024,367703,"[""Region"", ""Markranstaedt""]",2010-01-01T17:14:06+00:00,Kendra Reinhardt
166025,367705,"[""Region"", ""Delitzsch""]",2010-01-01T17:32:16+00:00,Thomas Steingen
166026,367706,"[""Region"", ""Bad-Dueben""]",2010-01-01T17:46:10+00:00,Daniel Kaiser


In [12]:
# Decode the JSON-encoded department lists and produce one row per (article, department).
departments["department"] = departments["department"].apply(json.loads)
departments = departments.explode('department')
# Drop "Region" and "Nachrichten": both are too unspecific.
department_column = departments["department"]
departments = departments[department_column.ne('Region') & department_column.ne('Nachrichten')]

In [13]:
# Inspect every department row recorded for the author "lyn".
departments.loc[departments["name"].eq("lyn")]

Unnamed: 0,id,department,published_at,name
69297,142430,Leipzig,2018-09-27T10:34:00+00:00,lyn
69297,142430,Polizeiticker,2018-09-27T10:34:00+00:00,lyn
69297,142430,Polizeiticker-Leipzig,2018-09-27T10:34:00+00:00,lyn
69384,142651,Mitteldeutschland,2018-09-26T12:35:00+00:00,lyn
69433,142795,Leipzig,2018-09-25T10:18:00+00:00,lyn
...,...,...,...,...
159377,356487,Wirtschaft-Regional,2010-10-27T07:01:05+00:00,lyn
159506,356723,Wirtschaft,2010-10-21T11:44:50+00:00,lyn
159506,356723,Wirtschaft-Regional,2010-10-21T11:44:50+00:00,lyn
159559,356819,Leipzig,2010-10-19T12:57:39+00:00,lyn


In [14]:
# TODO (22.08): figure out how to compare distributions where name and abbr do not have the same set of departments
# also note: the abbr departments must be ordered according to the name departments after those have been sorted by article count, or vice versa
# Idea: first remove, per name, every department with fewer than 10 articles because it is not meaningful enough. Apply a threshold on similar departments, then compute the score on the same support (same departments), and then penalize every department that does not occur in both - the more articles in that department, the higher the penalty
# Note: this will be skewed because there are probably some departments where authors almost always publish under their full name and vice versa (does this really skew the results?)

In [15]:
# Select one full name and the abbreviation to compare it with:
# real_abbr is the matched abbreviation, fake_abbr a non-matching one.
name = "Matthias Puppe"
real_abbr = "mpu"
fake_abbr = "lyn"
abbr = real_abbr
# Keep only the rows belonging to the chosen full name or abbreviation.
departments_filtered = departments.loc[departments["name"].isin([name, abbr])]

In [16]:
# Number of rows per (name, department), sorted by name and then by descending count.
author_department_count = (
    departments_filtered
    .groupby(['name', 'department'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['name', 'count'], ascending=[True, False])
)
author_department_count

Unnamed: 0,name,department,count
30,Matthias Puppe,Leipzig,726
33,Matthias Puppe,Mitteldeutschland,656
31,Matthias Puppe,Lokales,592
38,Matthias Puppe,Polizeiticker,148
51,Matthias Puppe,Sportbuzzer,110
...,...,...,...
96,mpu,Polizeiticker-Weltweit,1
99,mpu,Reisereporter,1
101,mpu,Schkeuditz,1
102,mpu,So-koennen-Sie-helfen,1


In [17]:
# Keep only (name, department) rows with a count of at least 10.
author_department_count = author_department_count.loc[author_department_count["count"].ge(10)]

In [18]:
# Remaining department counts of the abbreviation.
author_department_count.loc[author_department_count["name"].eq(abbr)]

Unnamed: 0,name,department,count
83,mpu,Leipzig,1065
94,mpu,Polizeiticker,673
95,mpu,Polizeiticker-Leipzig,585
85,mpu,Lokales,442
106,mpu,Sportbuzzer,180
79,mpu,Kultur,80
89,mpu,Mitteldeutschland,72
61,mpu,BSG-Chemie,51
105,mpu,Sport-Regional,50
103,mpu,Specials,48


In [19]:
# Remaining department counts of the full name.
author_department_count.loc[author_department_count["name"].eq(name)]

Unnamed: 0,name,department,count
30,Matthias Puppe,Leipzig,726
33,Matthias Puppe,Mitteldeutschland,656
31,Matthias Puppe,Lokales,592
38,Matthias Puppe,Polizeiticker,148
51,Matthias Puppe,Sportbuzzer,110
39,Matthias Puppe,Polizeiticker-Leipzig,109
23,Matthias Puppe,Kultur,97
42,Matthias Puppe,RB-Leipzig,44
41,Matthias Puppe,RB-Archiv,34
48,Matthias Puppe,Specials,34


In [20]:
# Rows of the abbreviation, restricted to departments for which the full name has a row too.
departments_of_name = author_department_count.loc[author_department_count["name"] == name, "department"].tolist()
abbr_same_departments_count = author_department_count.loc[
    author_department_count["name"].eq(abbr)
    & author_department_count["department"].isin(departments_of_name)
]
abbr_same_departments_count

Unnamed: 0,name,department,count
83,mpu,Leipzig,1065
94,mpu,Polizeiticker,673
95,mpu,Polizeiticker-Leipzig,585
85,mpu,Lokales,442
106,mpu,Sportbuzzer,180
79,mpu,Kultur,80
89,mpu,Mitteldeutschland,72
61,mpu,BSG-Chemie,51
105,mpu,Sport-Regional,50
103,mpu,Specials,48


In [21]:
# Rows of the full name, restricted to departments for which the abbreviation has a row too.
departments_of_abbr = author_department_count.loc[author_department_count["name"] == abbr, "department"].tolist()
full_name_same_departments_count = author_department_count.loc[
    author_department_count["name"].eq(name)
    & author_department_count["department"].isin(departments_of_abbr)
]
full_name_same_departments_count

Unnamed: 0,name,department,count
30,Matthias Puppe,Leipzig,726
33,Matthias Puppe,Mitteldeutschland,656
31,Matthias Puppe,Lokales,592
38,Matthias Puppe,Polizeiticker,148
51,Matthias Puppe,Sportbuzzer,110
39,Matthias Puppe,Polizeiticker-Leipzig,109
23,Matthias Puppe,Kultur,97
42,Matthias Puppe,RB-Leipzig,44
41,Matthias Puppe,RB-Archiv,34
48,Matthias Puppe,Specials,34


In [22]:
# Categorical over the shared departments; the category order is the row order
# of full_name_same_departments_count.
department_order = full_name_same_departments_count["department"].tolist()
categorical = pd.Categorical(department_order, categories=department_order)

In [23]:
# Bring the abbreviation's rows into the department order given by `categorical`.
# NOTE(review): reindex does not skip missing keys - a department without an
# abbr row is expected to yield a NaN row; confirm this is acceptable downstream.
abbr_same_departments_count = (
    abbr_same_departments_count
    .set_index("department")
    .reindex(categorical)
    .reset_index()
)
abbr_same_departments_count

Unnamed: 0,department,name,count
0,Leipzig,mpu,1065
1,Mitteldeutschland,mpu,72
2,Lokales,mpu,442
3,Polizeiticker,mpu,673
4,Sportbuzzer,mpu,180
5,Polizeiticker-Leipzig,mpu,585
6,Kultur,mpu,80
7,RB-Leipzig,mpu,37
8,RB-Archiv,mpu,25
9,Specials,mpu,48


In [24]:
# Counts of the abbreviation as a plain Python list, in the row order of abbr_same_departments_count.
abbr_department_count_list = abbr_same_departments_count.loc[:, "count"].to_list()
abbr_department_count_list

[1065, 72, 442, 673, 180, 585, 80, 37, 25, 48, 48, 51, 50, 22, 28]

In [25]:
# Counts of the full name as a plain Python list, in the row order of full_name_same_departments_count.
full_name_department_count_list = full_name_same_departments_count.loc[:, "count"].to_list()
full_name_department_count_list

[726, 656, 592, 148, 110, 109, 97, 44, 34, 34, 34, 25, 22, 16, 11]

In [33]:
# Compare both count lists with the Wasserstein metric: list positions are the
# values, the article counts act as weights.
full_name_positions = range(len(full_name_department_count_list))
abbr_positions = range(len(abbr_department_count_list))
w_distance = wasserstein_distance(
    full_name_positions,
    abbr_positions,
    u_weights=full_name_department_count_list,
    v_weights=abbr_department_count_list,
)
w_distance

0.8671480903659147

# Following: code for penalizing differences in the department array

In [27]:
# Rows of the abbreviation for departments in which the full name has no row.
departments_of_name = author_department_count.loc[author_department_count["name"] == name, "department"].tolist()
abbr_different_departments_count = author_department_count.loc[
    author_department_count["name"].eq(abbr)
    & ~author_department_count["department"].isin(departments_of_name)
]
abbr_different_departments_count

Unnamed: 0,name,department,count
107,mpu,Stadtpolitik,16
114,mpu,Wirtschaft,16
115,mpu,Wirtschaft-Regional,16
62,mpu,Bildung,13
82,mpu,Legida-und-Proteste,11
74,mpu,HC-Leipzig,10
100,mpu,SC-DHfK-Leipzig,10


In [28]:

# Rows of the full name for departments in which the abbreviation has no row.
departments_of_abbr = author_department_count.loc[author_department_count["name"] == abbr, "department"].tolist()
full_name_different_departments_count = author_department_count.loc[
    author_department_count["name"].eq(name)
    & ~author_department_count["department"].isin(departments_of_abbr)
]
full_name_different_departments_count

Unnamed: 0,name,department,count
1,Matthias Puppe,Achtung-Baustelle,10


In [32]:
# Linear penalty for differences between the two department sets:
# the number of rows whose department occurs for only one of the two entities,
# divided by the total number of rows in author_department_count and scaled by alpha.
# NOTE(review): author_department_count has one row per (name, department), so a
# department shared by both entities contributes two rows to the denominator.
alpha = 1
unshared_row_count = len(abbr_different_departments_count) + len(full_name_different_departments_count)
penalization_score = alpha * unshared_row_count / len(author_department_count)
penalization_score
# TODO: normalize penalization score

0.21052631578947367

In [ ]:
# Combine the Wasserstein distance with the penalty for unshared departments.
# A lower Wasserstein distance means a better match, so the penalty has to be
# ADDED: subtracting it would make pairs with many unshared departments look
# more similar instead of less.
# TODO: this value is a dissimilarity (lower is better) and is not bounded by 1;
# it still has to be mapped to a similarity in [0, 1] to meet the acceptance criteria.
score = w_distance + penalization_score