# Burrows Delta

### Imports

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
from scipy.spatial import distance
from typing import Dict, List, Tuple
from nltk import word_tokenize
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
#from pydelta import delta

import warnings
warnings.filterwarnings('ignore')

max_features = 2000

In [3]:
corpus = pd.read_csv("../data/rcorpus.csv")

In [4]:
corpus.head()

Unnamed: 0,filename,author,title,year,textlength,text
0,Wilhelm_Walloth_-_Das_Schatzhaus_des_Königs_(1...,Wilhelm Walloth,Das Schatzhaus des Königs,1883,105651,Roman aus dem alten Ägypten Und die Ägypter zw...
1,Max_Eyth_-_Der_Schneider_von_Ulm_(1906),Max Eyth,Der Schneider von Ulm,1906,272961,"[ cover ] Historischer Roman um den Mann , der..."
2,Friedrich_Spielhagen_-_Platt_Land_(1878),Friedrich Spielhagen,Platt Land,1878,205088,"Erstes Buch Erstes Kapitel . Der Wagen hielt ,..."
3,Jakob_Christoph_Heer_-_Der_Wetterwart_(1905),Jakob Christoph Heer,Der Wetterwart,1905,120934,1925 I Die feierliche Abendhelle steht über de...
4,Robert_Kraft_-_Detektiv_Nobody's_Erlebnisse_un...,Robert Kraft,Detektiv Nobodys Erlebnisse und Reiseabenteuer...,1904,203192,. Detektiv Nobody 's Erlebnisse und Reiseabent...


## 2000 frequent words + term document matrix (= tdm)

In [5]:
vectorizer = CountVectorizer(max_features=max_features)
vector = vectorizer.fit_transform(corpus["text"])

In [16]:
features = vectorizer.get_feature_names()
authors = corpus["author"]
M = pd.DataFrame(vector.toarray(), index=list(authors), columns=features)

In [17]:
M.head()

Unnamed: 0,ab,abend,abends,aber,abermals,abschied,absicht,ach,achseln,acht,...,öffnete,übel,über,überall,überhaupt,überrascht,überzeugt,übrig,übrigen,übrigens
Wilhelm Walloth,44,22,8,256,0,8,12,7,0,9,...,13,5,218,2,7,6,5,5,17,9
Max Eyth,56,61,15,1243,18,25,6,7,0,55,...,30,35,527,29,29,2,14,19,19,27
Friedrich Spielhagen,37,80,6,859,25,8,11,24,6,27,...,8,13,411,27,25,2,47,5,32,27
Jakob Christoph Heer,27,87,5,720,0,21,1,6,0,6,...,8,8,382,5,12,16,8,2,7,4
Robert Kraft,98,17,4,896,15,6,23,54,2,32,...,18,2,306,18,109,8,9,9,7,10


In [18]:
M.shape

(866, 2000)

## train test split

In [30]:
X_train, X_test, y_train, y_test = train_test_split(vector,
                                                    authors,
                                                    test_size=0.33,
                                                   stratify=authors,
                                                   random_state=42)
#stratify=list(authors)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [26]:
X_train.shape

(692, 2000)

## z-score

$Z_i = \dfrac{C_i - \mu_i}{\sigma_i}$<br>

$C_i$ = observed frequency<br>
$\mu$ = mean of means<br>
$\sigma$ = standard deviation

In [9]:
def z_score(x):
    return (x-x.mean()) / x.std()

In [10]:
z_M = M.apply(z_score)

In [11]:
z_M.head()

Unnamed: 0,ab,abend,abends,aber,abermals,abschied,absicht,ach,achseln,acht,...,öffnete,übel,über,überall,überhaupt,überrascht,überzeugt,übrig,übrigen,übrigens
Wilhelm Walloth,-0.244003,-0.469858,-0.208756,-0.805282,-0.66261,-0.076802,0.42705,-0.767392,-0.866642,-0.313689,...,-0.102749,-0.346517,-0.186845,-0.821588,-0.61607,0.142765,-0.199766,-0.055809,0.59085,-0.172205
Max Eyth,0.051772,0.901467,0.483519,1.46268,0.94473,1.862604,-0.196426,-0.767392,-0.866642,3.888094,...,1.074431,3.579567,1.648236,0.955998,0.415696,-0.527342,0.687401,2.639921,0.757982,1.102407
Friedrich Spielhagen,-0.416539,1.569548,-0.406549,0.580312,1.569806,-0.076802,0.323137,-0.155589,0.277722,1.330487,...,-0.448979,0.700439,0.959338,0.824325,0.228102,-0.527342,3.94035,-0.055809,1.844343,1.102407
Jakob Christoph Heer,-0.663018,1.815683,-0.505445,0.260913,-0.66261,1.406273,-0.71599,-0.80338,-0.866642,-0.587718,...,-0.448979,0.046091,0.787114,-0.624078,-0.381578,1.818033,0.095956,-0.633465,-0.244812,-0.526264
Robert Kraft,1.086985,-0.645668,-0.604341,0.665331,0.67684,-0.304967,1.570089,0.924062,-0.485187,1.787203,...,0.24348,-0.739125,0.335767,0.231796,4.16757,0.477819,0.19453,0.7144,-0.244812,-0.101393


## comparing unknown

In [15]:
#dict with words as keys and z_scores as values of unknown
unknown_dict = z_dtm.loc[[u]].to_dict(orient="records")[0]

In [16]:
unknown_dict["und"]

-0.024898252355505832

### delta score

$\Delta_c = \sum_i{\dfrac{|Z_{c(i)} - Z_{t(i)}|}{n}}$<br>

$n$ = most frequent words

In [17]:
def compute_delta(df: pd.DataFrame, 
                  unknown_freq: dict,
                  features_count: int,
                  unknown: str) -> pd.DataFrame:

    compare_unknown = {}

    for index, row in df.iterrows():
        if index != unknown:
            delta = 0
            for key, value in dict(row).items():
                delta += (abs(value - unknown_freq[key])/features_count)
            compare_unknown[index] = delta

    output_df = pd.DataFrame.from_dict(compare_unknown, orient="index").reset_index()
    output_df.columns = ["title", "delta"]
    return output_df.sort_values("delta", ascending=True)

In [18]:
delta_df = compute_delta(z_dtm, unknown_dict, max_features, u)
delta_df

Unnamed: 0,title,delta
22,"Marlitt,-Eugenie_Die Frau mit den Karfunkelste...",0.335293
61,"Marlitt,-Eugenie_Das Geheimnis der alten Mamsell",0.348770
23,"Marlitt,-Eugenie_Goldelse",0.367287
68,"Wassermann,-Jakob_Die Juden von Zirndorf",0.468939
57,"Keller,-Gottfried_Das Sinngedicht",0.473243
...,...,...
32,"Freytag,-Gustav_Soll und Haben",1.111732
42,"Wieland,-Christoph-Martin_Aristipp und einige ...",1.400727
10,"Freytag,-Gustav_Die Ahnen",2.665838
52,"Gutzkow,-Karl_Der Zauberer von Rom",3.197457


## compare with pydelta

In [19]:
corpus2 = delta.Corpus(p).top_n(2000)

In [20]:
corpus2.head()

Unnamed: 0,und,die,der,zu,in,den,sie,er,ich,nicht,...,Ansehen,Liebhaber,bösen,ungefähr,käme,Atem,begleitete,Staat,schwach,Kaum
"Huber,-Therese_Ellen Percy",1946.0,1852.0,1503.0,2124.0,1214.0,837.0,1245.0,632.0,2082.0,832.0,...,8.0,10.0,2.0,3.0,1.0,0.0,6.0,0.0,3.0,12.0
"Fischer,-Caroline-Auguste_Margarethe",1266.0,605.0,477.0,508.0,310.0,200.0,660.0,437.0,1129.0,619.0,...,1.0,0.0,2.0,0.0,1.0,0.0,2.0,0.0,1.0,2.0
"Tieck,-Ludwig_Geschichte des Herrn William Lovell",5444.0,3323.0,2429.0,2128.0,2357.0,1290.0,1317.0,1246.0,4651.0,1834.0,...,3.0,15.0,3.0,1.0,7.0,9.0,3.0,1.0,17.0,2.0
"Huber,-Therese_Luise",950.0,793.0,567.0,1091.0,487.0,352.0,1329.0,451.0,61.0,369.0,...,4.0,7.0,2.0,1.0,1.0,0.0,0.0,0.0,3.0,9.0
"Spielhagen,-Friedrich_Hammer und Amboá",8124.0,6309.0,6570.0,3811.0,4416.0,3398.0,2157.0,2591.0,6339.0,3339.0,...,8.0,12.0,22.0,24.0,4.0,0.0,11.0,0.0,4.0,3.0


In [27]:
distances = delta.functions.burrows(corpus2)
delta_df2 = pd.DataFrame.from_dict(dict(distances.loc[u]), orient="index").reset_index()
delta_df2.columns = ["title", "delta"]
delta_df2 = delta_df2.sort_values("delta", ascending=True)
delta_df2 = delta_df2[1:]
delta_df2

Unnamed: 0,title,delta
23,"Marlitt,-Eugenie_Die Frau mit den Karfunkelste...",0.330120
62,"Marlitt,-Eugenie_Das Geheimnis der alten Mamsell",0.334827
24,"Marlitt,-Eugenie_Goldelse",0.359167
69,"Wassermann,-Jakob_Die Juden von Zirndorf",0.461083
58,"Keller,-Gottfried_Das Sinngedicht",0.463065
...,...,...
33,"Freytag,-Gustav_Soll und Haben",1.119223
43,"Wieland,-Christoph-Martin_Aristipp und einige ...",1.353386
11,"Freytag,-Gustav_Die Ahnen",2.620803
53,"Gutzkow,-Karl_Der Zauberer von Rom",3.235148


## compare the two result dataframes

In [22]:
dflist = list(delta_df["title"])
df2list = list(delta_df2["title"])

c = 0
for element, element2 in zip(dflist, df2list):
    if element != element2:
        c+=1
print(f"Anzahl der Unterschiede: {c}")

Anzahl der Unterschiede: 48


In [23]:
columns = list(delta_df)+list(delta_df2)
result2 = pd.concat([delta_df, delta_df2])
result2 = result2.loc[:, columns]
result2

Unnamed: 0,title,delta,title.1,delta.1
22,"Marlitt,-Eugenie_Die Frau mit den Karfunkelste...",0.335293,"Marlitt,-Eugenie_Die Frau mit den Karfunkelste...",0.335293
61,"Marlitt,-Eugenie_Das Geheimnis der alten Mamsell",0.348770,"Marlitt,-Eugenie_Das Geheimnis der alten Mamsell",0.348770
23,"Marlitt,-Eugenie_Goldelse",0.367287,"Marlitt,-Eugenie_Goldelse",0.367287
68,"Wassermann,-Jakob_Die Juden von Zirndorf",0.468939,"Wassermann,-Jakob_Die Juden von Zirndorf",0.468939
57,"Keller,-Gottfried_Das Sinngedicht",0.473243,"Keller,-Gottfried_Das Sinngedicht",0.473243
...,...,...,...,...
33,"Freytag,-Gustav_Soll und Haben",1.119223,"Freytag,-Gustav_Soll und Haben",1.119223
43,"Wieland,-Christoph-Martin_Aristipp und einige ...",1.353386,"Wieland,-Christoph-Martin_Aristipp und einige ...",1.353386
11,"Freytag,-Gustav_Die Ahnen",2.620803,"Freytag,-Gustav_Die Ahnen",2.620803
53,"Gutzkow,-Karl_Der Zauberer von Rom",3.235148,"Gutzkow,-Karl_Der Zauberer von Rom",3.235148


## euclidean and manhatten distance

In [30]:
manhattan = delta.functions.manhattan(corpus2)
manhattan

Unnamed: 0,"Huber,-Therese_Ellen Percy","Fischer,-Caroline-Auguste_Margarethe","Tieck,-Ludwig_Geschichte des Herrn William Lovell","Huber,-Therese_Luise","Spielhagen,-Friedrich_Hammer und Amboá","May,-Karl_Kurdistan","Raabe,-Wilhelm_Stopfkuchen Eine See- und Mordgeschichte",UNBEKANNT_Das Heideprinzesschen,"Dohm,-Hedwig_Christa Ruland","Wieland,-Christoph-Martin_Die Abenteuer des Don Sylvio von Rosalva",...,"Goethe,-Johann-Wolfgang_Die Leiden des jungen Werther","Hauff,-Wilhelm_Die Bettlerin vom Pont des Arts","Francois,-Louise-von_Stufenjahre eines Gluecklichen","Wassermann,-Jakob_Die Juden von Zirndorf","Keller,-Gottfried_Der gruene Heinrich [Erste Fassung]","Francois,-Louise-von_Judith die Kluswirtin","Wassermann,-Jakob_Christian Wahnschaffe","Lewald,-Fanny_Eine Lebensfrage","Huber,-Therese_Die Familie Seldorf","Hauff,-Wilhelm_Lichtenstein"
"Huber,-Therese_Ellen Percy",0.0000,21.5310,36.1360,20.1240,76.9500,34.7050,19.5880,24.1875,17.6470,26.4535,...,21.0330,21.3820,36.4110,19.5015,67.3130,20.2140,62.3295,17.4860,16.6340,26.9785
"Fischer,-Caroline-Auguste_Margarethe",21.5310,0.0000,48.4220,11.5830,92.7130,44.3480,14.2180,33.2125,13.1680,35.4055,...,7.4500,7.8680,45.3240,19.6795,82.4220,11.4680,75.5015,23.2630,24.6940,34.1435
"Tieck,-Ludwig_Geschichte des Herrn William Lovell",36.1360,48.4220,0.0000,49.6360,56.0620,34.8660,41.8120,33.1085,42.4560,31.0885,...,47.6470,48.9290,37.1960,39.6385,53.7180,47.6630,51.6905,34.1700,39.2170,33.3675
"Huber,-Therese_Luise",20.1240,11.5830,49.6360,0.0000,93.5580,46.4830,17.5160,35.2725,13.7990,35.8715,...,10.3150,9.3140,45.3110,20.3175,82.6460,11.5320,76.2085,24.0520,21.0740,35.2225
"Spielhagen,-Friedrich_Hammer und Amboá",76.9500,92.7130,56.0620,93.5580,0.0000,62.9650,85.1460,64.4975,85.2630,65.7885,...,92.6580,93.4150,56.0270,80.0495,43.2980,91.8710,45.1915,74.2100,77.7870,66.6945
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Francois,-Louise-von_Judith die Kluswirtin",20.2140,11.4680,47.6630,11.5320,91.8710,43.3770,13.1030,32.1795,11.3570,35.1155,...,8.7750,8.7730,42.9610,16.1615,81.2800,0.0000,74.2995,23.0700,21.1460,32.4505
"Wassermann,-Jakob_Christian Wahnschaffe",62.3295,75.5015,51.6905,76.2085,45.1915,49.7725,67.4905,50.4250,68.3055,52.6700,...,75.0285,75.8795,42.9525,61.3510,42.3395,74.2995,0.0000,58.5525,61.0365,50.9630
"Lewald,-Fanny_Eine Lebensfrage",17.4860,23.2630,34.1700,24.0520,74.2100,33.0250,20.4650,23.3415,18.1730,25.3375,...,23.0990,24.0440,32.4530,18.5635,66.6380,23.0700,58.5525,0.0000,18.6540,23.7395
"Huber,-Therese_Die Familie Seldorf",16.6340,24.6940,39.2170,21.0740,77.7870,36.4270,22.0980,26.1875,19.9040,26.4165,...,23.2130,22.5960,32.4120,16.8065,67.1800,21.1460,61.0365,18.6540,0.0000,24.1365


In [31]:
euclidean = delta.functions.euclidean(corpus2)
euclidean

Unnamed: 0,"Huber,-Therese_Ellen Percy","Fischer,-Caroline-Auguste_Margarethe","Tieck,-Ludwig_Geschichte des Herrn William Lovell","Huber,-Therese_Luise","Spielhagen,-Friedrich_Hammer und Amboá","May,-Karl_Kurdistan","Raabe,-Wilhelm_Stopfkuchen Eine See- und Mordgeschichte",UNBEKANNT_Das Heideprinzesschen,"Dohm,-Hedwig_Christa Ruland","Wieland,-Christoph-Martin_Die Abenteuer des Don Sylvio von Rosalva",...,"Goethe,-Johann-Wolfgang_Die Leiden des jungen Werther","Hauff,-Wilhelm_Die Bettlerin vom Pont des Arts","Francois,-Louise-von_Stufenjahre eines Gluecklichen","Wassermann,-Jakob_Die Juden von Zirndorf","Keller,-Gottfried_Der gruene Heinrich [Erste Fassung]","Francois,-Louise-von_Judith die Kluswirtin","Wassermann,-Jakob_Christian Wahnschaffe","Lewald,-Fanny_Eine Lebensfrage","Huber,-Therese_Die Familie Seldorf","Hauff,-Wilhelm_Lichtenstein"
"Huber,-Therese_Ellen Percy",0.000000,3719.032132,6349.830076,3938.978548,14241.242853,5100.784253,2987.102610,4051.245364,3018.450596,4075.576401,...,3502.510528,3852.960161,6464.686999,3377.005330,15674.030624,3607.021209,12032.138837,2624.179110,3191.066593,4340.244809
"Fischer,-Caroline-Auguste_Margarethe",3719.032132,0.000000,8910.754738,2007.212993,17227.421339,7216.508297,2375.207780,6401.117324,2141.875813,6479.751924,...,924.041125,1008.230132,8777.111940,3885.067181,18210.264358,1801.194048,14598.954517,4254.797527,4561.749007,6297.296642
"Tieck,-Ludwig_Geschichte des Herrn William Lovell",6349.830076,8910.754738,0.000000,9576.615686,9421.804604,4562.182153,7510.597979,4468.946073,8192.318353,5183.993152,...,8770.523588,9246.662857,6206.575545,7157.540290,11349.101638,8844.420953,8684.542302,6141.755612,7228.428598,5731.912683
"Huber,-Therese_Luise",3938.978548,2007.212993,9576.615686,0.000000,17558.827865,7733.145932,3030.321105,6788.510661,2019.722753,6387.134960,...,1808.906852,1472.926339,8610.455621,3801.992767,18293.936044,1956.865351,14517.351997,4262.403313,3956.700393,6250.727558
"Spielhagen,-Friedrich_Hammer und Amboá",14241.242853,17227.421339,9421.804604,17558.827865,0.000000,11135.298200,15486.388475,11365.787478,16064.492336,11988.249789,...,17019.261676,17408.562376,10299.928155,14621.289991,8137.196077,16753.339727,7318.572743,13698.708114,14333.407690,12099.344156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Francois,-Louise-von_Judith die Kluswirtin",3607.021209,1801.194048,8844.420953,1956.865351,16753.339727,6940.212677,1910.477427,5857.807354,1592.261913,6093.951345,...,1239.521682,1342.578117,7788.636594,2896.214598,17585.285781,0.000000,13953.311184,3832.403423,3694.174874,5461.613773
"Wassermann,-Jakob_Christian Wahnschaffe",12032.138837,14598.954517,8684.542302,14517.351997,7318.572743,9139.602672,12941.499411,9078.283208,13169.036449,9528.031696,...,14334.606343,14624.661740,7483.262323,11364.444377,7573.424391,13953.311184,0.000000,10851.800358,11077.409941,9224.948022
"Lewald,-Fanny_Eine Lebensfrage",2624.179110,4254.797527,6141.755612,4262.403313,13698.708114,4656.507704,3168.510691,3494.551044,2978.176959,3546.895121,...,4005.980779,4273.442874,5382.653249,2528.778559,14859.953567,3832.403423,10851.800358,0.000000,2520.011111,3342.663459
"Huber,-Therese_Die Familie Seldorf",3191.066593,4561.749007,7228.428598,3956.700393,14333.407690,5486.262662,3377.882473,4170.073261,3162.187850,3978.548605,...,4162.650598,4320.496731,5327.858294,2297.589824,14909.566862,3694.174874,11077.409941,2520.011111,0.000000,3263.361610
