In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from collections import defaultdict
import seaborn as sns
import json

tqdm.pandas()

### Some functions

In [2]:
def change_format(y):
    """Convert a binary page-boundary vector into a list of document lengths.

    ``y`` holds one entry per page; an entry equal to 1 marks the first page
    of a document. The first page always starts a document, regardless of the
    value stored in ``y[0]``.

    Example: ``[1, 0, 0, 1, 0]`` -> ``[3, 2]``.

    Parameters
    ----------
    y : sequence of 0/1 values (list, numpy array, ...), one per page.

    Returns
    -------
    list of int
        Lengths of the consecutive documents; they sum to ``len(y)``.
        An empty input yields an empty list.

    Notes
    -----
    The input is only read, never written. Forcing ``y[0] = 1`` on the
    caller's array would silently alter labels that are scored afterwards.
    """
    n_pages = len(y)
    if n_pages == 0:
        return []
    # Page 0 is always a document start; every later page flagged 1 starts a new one.
    starts = [0] + [i for i, flag in enumerate(y) if i > 0 and flag == 1]
    # Closing the last document at n_pages makes each length a simple difference.
    bounds = starts + [n_pages]
    return [end - start for start, end in zip(bounds, bounds[1:])]

In [3]:
def make_index(split):
    '''Build a page -> cluster lookup from a document length vector.

    A vector like [1,2,1,3,3,5] describes consecutive documents of 1, 2, 1, ...
    pages. Pages are numbered from 0 in reading order. The returned mapping has
    every page number as key and, as value, the set of all page numbers that
    belong to the same document (the page itself included).

    Returns a defaultdict(set); each page owns its own set object.'''
    page_ids = list(np.arange(sum(split)))
    index = defaultdict(set)
    start = 0
    for block_length in split:
        stop = start + block_length
        members = page_ids[start:stop]
        # Every page of the block gets a fresh copy of the block's page set.
        index.update((page, set(members)) for page in members)
        start = stop
    return index

In [4]:
def Bcubed(truth,pred):
    """Per-page BCubed precision, recall and F1 for a predicted segmentation.

    Parameters
    ----------
    truth, pred : document length vectors (e.g. [1,2,1,3]) describing the true
        and the predicted segmentation of the same page stream.

    Returns
    -------
    pandas.DataFrame
        One row per page (index named 'PageNr') with columns
        'size' (number of pages in the page's true document), 'P', 'R', 'F1'.

    Raises
    ------
    AssertionError
        If the two vectors do not cover the same number of pages.
    """
    assert sum(truth)==sum(pred)  # same amount of pages
    truth_index, pred_index = make_index(truth), make_index(pred)

    rows = {}
    for page, truth_cluster in truth_index.items():
        pred_cluster = pred_index[page]
        # A page is always in both of its own clusters, so overlap >= 1 and
        # neither division below can hit zero.
        overlap = len(truth_cluster & pred_cluster)
        precision = overlap/len(pred_cluster)
        recall = overlap/len(truth_cluster)
        rows[page] = {'size': len(truth_cluster),
                      'P': precision,
                      'R': recall,
                      'F1': (2*precision*recall)/(precision+recall)}
    scores = pd.DataFrame.from_dict(rows, orient='index')
    # `index.name` labels the index; assigning `index_name` on the frame only
    # created an unrelated attribute.
    scores.index.name = 'PageNr'
    return scores


def MeanBcubed(truth,pred):
    """Average the per-page BCubed table over all pages.

    Takes the same document length vectors as Bcubed and returns a Series with
    the column means of its result ('size', 'P', 'R', 'F1').
    """
    assert sum(pred)==sum(truth)  # both segmentations must cover the same pages
    per_page = Bcubed(truth,pred)
    return per_page.mean()

### Baseline

In [5]:
# Load the ground-truth document lengths of the training PDFs.
# Used below as a mapping from a PDF identifier to its document length vector.
# The context manager closes the file handle once the JSON has been parsed.
with open('corpus1/TrainTestSet/Trainset/Doclengths_of_the_individual_docs_TRAIN.json') as f:
    truth_corpus = json.load(f)

In [6]:
def fixedpage(truth,docsize=3):
    """Baseline segmentation: cut the page stream into fixed-size documents.

    The total page count is taken from the length vector `truth`. The result
    is a length vector of `docsize`-page documents, followed by one shorter
    document holding the leftover pages when the total is not a multiple of
    `docsize`.
    """
    full_blocks, leftover = divmod(sum(truth), docsize)
    prediction = [docsize] * full_blocks
    if leftover != 0:
        prediction.append(leftover)
    return prediction
    
# Score the fixed-size baseline (6 pages per document) on every PDF of the
# corpus, then average the per-PDF mean scores.
D = {
    pdf: MeanBcubed(doc_lengths, fixedpage(doc_lengths, 6))
    for pdf, doc_lengths in truth_corpus.items()
}
results = pd.DataFrame.from_dict(D, orient='index')
print(results.mean())

size    17.871638
P        0.712924
R        0.661530
F1       0.558666
dtype: float64


### Import

In [7]:
from ast import literal_eval

# Page-level feature table for corpus 1; the first CSV column is used as the row index.
df_1 = pd.read_csv('dataframes/corpus_1_df_col.csv', index_col = 0)
# Preview the first rows only.
df_1.head()

Unnamed: 0,full_name,file_name,page,cropbox_x,cropbox_y,text_x,text_y,text_y_cleaned,header,footer,...,footer_tokenized_gensim,header_tokenized_gensim,text_vector_d2v,text_d2v_sim_score,font_diff1,font_diff2,crop_diff,isLastPage,isImage,label
0,868212__concatenated-001.txt,868212,1,419.528015,595.276001,handreiking \nveilige moskee\n,\n\nHandreiking\n\nVeilige Moskee\n\n \n,Handreiking Veilige Moskee,0,0,...,[],[],"[-0.08071482, 0.056871757, -0.0013572321, 0.03...",0.0,-1,-1,-1,0.0,0,1
1,868212__concatenated-002.txt,868212,2,419.528015,595.276001,1\nhandreiking | veilige moskee\ninhoudsopgave...,Handreiking | Veilige Moskee\n\nInhoudsopgave\...,Handreiking Veilige Moskee Inhoudsopgave Inlei...,Handreiking | Veilige Moskee\n,1\n,...,[],"['Handreiking', 'Veilige', 'Moskee']","[-0.041891757, 0.38801417, -0.31855652, -0.298...",0.495501,1,1,1,0.0,0,0
2,868212__concatenated-003.txt,868212,3,419.528015,595.276001,3\nhandreiking | veilige moskee\n1. inleiding\...,Handreiking | Veilige Moskee\n\n1. Inleiding\n...,Handreiking Veilige Moskee Inleiding Een moske...,Handreiking | Veilige Moskee\n,3\n,...,[],"['Handreiking', 'Veilige', 'Moskee']","[-0.35729724, -0.10733895, 0.27150506, 0.01633...",0.18311,1,1,1,0.0,0,0
3,868212__concatenated-004.txt,868212,4,419.528015,595.276001,4\nhandreiking | veilige moskee\ntips voor mos...,Handreiking | Veilige Moskee\n\nTips voor mosk...,Handreiking Veilige Moskee Tips voor moskee ge...,Handreiking | Veilige Moskee\n,4\n,...,[],"['Handreiking', 'Veilige', 'Moskee']","[-0.28620702, 1.0634485, 0.011591506, -0.56470...",0.591445,1,1,1,0.0,0,0
4,868212__concatenated-005.txt,868212,5,419.528015,595.276001,5\nhandreiking | veilige moskee\naanvullingen\...,Handreiking | Veilige Moskee\n\nAanvullingen\n...,Handreiking Veilige Moskee Aanvullingen De han...,Handreiking | Veilige Moskee\n,5\n,...,[],"['Handreiking', 'Veilige', 'Moskee']","[-0.32099822, 0.53849864, -0.055533577, -0.168...",0.537075,1,1,1,0.0,0,0


In [8]:
# Page-level feature table for corpus 2; the first CSV column is used as the row index.
df_2 = pd.read_csv('dataframes/corpus_2_df_col.csv', index_col = 0)
# Preview the first rows only.
df_2.head()

Unnamed: 0,full_name,file_name,page,cropbox_x,cropbox_y,text_x,text_y,text_y_cleaned,header,footer,...,footer_tokenized_gensim,header_tokenized_gensim,text_vector_d2v,text_d2v_sim_score,font_diff1,font_diff2,crop_diff,isLastPage,isImage,label
0,Aanvullend-besluit-Wob-verzoek-project-COILED....,Aanvullend-besluit-Wob-verzoek-project-COILED....,1,595.320007,841.919983,\n \n \n \n \n \n \ndatum \n 19 augustus 2021...,Datum\n19 augustus 2021\n\nZaaknummer\n2021-01...,Datum augustus Zaaknummer Onderwerp Aanvullend...,0,www.gelderland.nl \nKvK-nummer: 51468751 \n \n,...,"['www', 'gelderland', 'nl', 'KvK', 'nummer']",[],"[-0.54855424, -1.3298025, 0.19989337, -0.20702...",0.0,-1,-1,-1,0.0,0,1
1,Aanvullend-besluit-Wob-verzoek-project-COILED....,Aanvullend-besluit-Wob-verzoek-project-COILED....,2,595.320007,841.919983,\n \n \n \n \ndatum \n 19 augustus 2021 \n \n...,Datum\n19 augustus 2021\n\nZaaknummer\n2021-01...,Datum augustus Zaaknummer Blad van Niet openba...,0,0,...,[],[],"[-0.11132708, -0.8158811, 0.107630394, -0.1691...",0.721262,1,1,1,0.0,0,0
2,Aanvullend-besluit-Wob-verzoek-project-COILED....,Aanvullend-besluit-Wob-verzoek-project-COILED....,3,595.320007,841.919983,,Datum\n19 augustus 2021\n\nZaaknummer\n2021-01...,Datum augustus Zaaknummer Blad van Deze vindt ...,0,0,...,[],[],"[0.053682096, -0.7946833, 0.140492, -0.0468661...",0.39222,0,0,1,1.0,1,0
3,Aanvullend-besluit-Wob-verzoek-project-COILED....,Aanvullend-besluit-Wob-verzoek-project-COILED....,4,595.0,842.0,inventarislijst aanvullend besluit wob-verzoek...,Inventarislijst aanvullend besluit Wob-verzoek...,Inventarislijst aanvullend besluit Wob verzoek...,0,0,...,[],[],"[-0.19504319, -2.220936, 1.263472, 0.9210858, ...",0.32171,-1,0,-1,1.0,0,1
4,Aanvullend-besluit-Wob-verzoek-project-COILED....,Aanvullend-besluit-Wob-verzoek-project-COILED....,5,612.0,792.0,,Joost\n\ncpenstioneel programma oost\n\n“\nEur...,Joost cpenstioneel programma oost Europese Uni...,0,0,...,[],[],"[0.04210999, -0.36448914, -0.57351416, 0.46199...",0.363172,0,0,-1,1.0,1,1


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

### Make predictions - Corpus 1 split into train/test

In [30]:
# Positional train/test split of corpus 1: rows before `split` train the
# model, the remaining rows are held out for evaluation.
split = 15036
features = ['font_diff1','crop_diff','text_d2v_sim_score']

X_train = df_1.iloc[:split][features]
y_train = df_1.iloc[:split]['label']
X_test = df_1.iloc[split:][features]
y_test = df_1.iloc[split:]['label']

model = LogisticRegression()
model.fit(X_train, y_train)

true = y_test
preds = model.predict(X_test)
# Hand copies to change_format so the boundary conversion can never alter the
# labels/predictions that the classification metrics below are computed on.
# The predictions are computed once and reused.
vb_truth, vb_pred = change_format(y_test.to_numpy(copy=True)), change_format(preds.copy())
print(MeanBcubed(vb_truth,vb_pred))
precision_score(true, preds), recall_score(true, preds), f1_score(true, preds)

size    57.580126
P        0.705696
R        0.935606
F1       0.701904
dtype: float64


(0.6473317865429234, 0.38803894297635605, 0.48521739130434777)

In [28]:
### Images taken out ###
# Keep only the pages that are not flagged as images.
df_1_imoff = df_1[df_1['isImage'] == 0]
df_2_imoff = df_2[df_2['isImage'] == 0]

# Positional train/test split of the filtered corpus 1.
split = 8701
features = ['font_diff1','crop_diff','text_d2v_sim_score']

X_train = df_1_imoff.iloc[:split][features]
y_train = df_1_imoff.iloc[:split]['label']
X_test = df_1_imoff.iloc[split:][features]
y_test = df_1_imoff.iloc[split:]['label']

model = LogisticRegression()
model.fit(X_train, y_train)

true = y_test
preds = model.predict(X_test)
# Hand copies to change_format so the boundary conversion can never alter the
# labels/predictions that the classification metrics below are computed on.
# The predictions are computed once and reused.
vb_truth, vb_pred = change_format(y_test.to_numpy(copy=True)), change_format(preds.copy())
print(MeanBcubed(vb_truth,vb_pred))
precision_score(true, preds), recall_score(true, preds), f1_score(true, preds)

size    62.190901
P        0.828393
R        0.922802
F1       0.809887
dtype: float64


(0.7006651884700665, 0.6042065009560229, 0.648870636550308)

### Train on df_1, predict df_2

In [43]:
# Cross-corpus evaluation: fit on all of corpus 1, predict corpus 2.
features = ['font_diff1','crop_diff','text_d2v_sim_score']
X_train = df_1[features]
y_train = df_1['label']
X_test = df_2[features]
y_test = df_2['label']

model = LogisticRegression()
model.fit(X_train, y_train)

true = y_test
preds = model.predict(X_test)
# Hand copies to change_format so the boundary conversion can never alter the
# labels (a column of df_2) or the predictions scored below.
# The predictions are computed once and reused.
vb_truth, vb_pred = change_format(y_test.to_numpy(copy=True)), change_format(preds.copy())
print(MeanBcubed(vb_truth,vb_pred))
precision_score(true, preds), recall_score(true, preds), f1_score(true, preds)

size    70.295543
P        0.496487
R        0.805348
F1       0.404806
dtype: float64


(0.4050387596899225, 0.29814550641940085, 0.34346754313886607)

In [47]:
### Images taken out ###
# Keep only the pages that are not flagged as images.
df_1_imoff = df_1[df_1['isImage'] == 0]
df_2_imoff = df_2[df_2['isImage'] == 0]

# Cross-corpus evaluation on the filtered data: fit on corpus 1, predict corpus 2.
features = ['font_diff1','crop_diff','text_d2v_sim_score']
X_train = df_1_imoff[features]
y_train = df_1_imoff['label']
X_test = df_2_imoff[features]
y_test = df_2_imoff['label']

model = LogisticRegression()
model.fit(X_train, y_train)

true = y_test
preds = model.predict(X_test)
# Hand copies to change_format so the boundary conversion can never alter the
# labels/predictions that the classification metrics below are computed on.
# The predictions are computed once and reused.
vb_truth, vb_pred = change_format(y_test.to_numpy(copy=True)), change_format(preds.copy())
print(MeanBcubed(vb_truth,vb_pred))
precision_score(true, preds), recall_score(true, preds), f1_score(true, preds)

size    37.446129
P        0.421555
R        0.914817
F1       0.419496
dtype: float64


(0.6143187066974596, 0.20556414219474498, 0.30804863925883036)

### Other way around: train on df_2, predict df_1

In [45]:
# Cross-corpus evaluation in the other direction: fit on all of corpus 2,
# predict corpus 1.
features = ['font_diff1','crop_diff','text_d2v_sim_score']
X_train = df_2[features]
y_train = df_2['label']
X_test = df_1[features]
y_test = df_1['label']

# NOTE(review): this cell uses a decision tree while the sibling experiments
# use LogisticRegression - confirm that is intended.
# A fixed random_state makes the fitted tree, and so the scores, reproducible.
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)

true = y_test
preds = model.predict(X_test)
# Hand copies to change_format so the boundary conversion can never alter the
# labels (a column of df_1) or the predictions scored below.
# The predictions are computed once and reused.
vb_truth, vb_pred = change_format(y_test.to_numpy(copy=True)), change_format(preds.copy())
print(MeanBcubed(vb_truth,vb_pred))
precision_score(true, preds), recall_score(true, preds), f1_score(true, preds)

size    34.081002
P        0.694798
R        0.612148
F1       0.481437
dtype: float64


(0.3966059082338152, 0.3225971370143149, 0.3557936284183817)

In [38]:
### Images taken out ###
# Keep only the pages that are not flagged as images.
df_1_imoff = df_1[df_1['isImage'] == 0]
df_2_imoff = df_2[df_2['isImage'] == 0]

# Cross-corpus evaluation on the filtered data: fit on corpus 2, predict corpus 1.
features = ['font_diff1','crop_diff','text_d2v_sim_score']
X_train = df_2_imoff[features]
y_train = df_2_imoff['label']
X_test = df_1_imoff[features]
y_test = df_1_imoff['label']

model = LogisticRegression()
model.fit(X_train, y_train)

true = y_test
preds = model.predict(X_test)

# Hand copies to change_format so the boundary conversion can never alter the
# labels/predictions that the classification metrics below are computed on.
# The predictions are computed once and reused.
vb_truth, vb_pred = change_format(y_test.to_numpy(copy=True)), change_format(preds.copy())
print(MeanBcubed(vb_truth,vb_pred))
precision_score(true, preds), recall_score(true, preds), f1_score(true, preds)

size    45.037142
P        0.722855
R        0.805904
F1       0.630732
dtype: float64


(0.6598984771573604, 0.42266604737575475, 0.5152887882219704)

In [None]:
# Features to evaluate one at a time, and the classifiers to try on each.
# NOTE(review): 'crop_is_diff' and 'text_sim_score' differ from the
# 'crop_diff' / 'text_d2v_sim_score' names used by the other experiment cells;
# confirm these columns exist in the dataframes (this cell has no recorded run).
features = ['font_diff1','font_diff2', 'crop_is_diff', 'text_sim_score']
models = [DecisionTreeClassifier(), LogisticRegression()]

def score_summary(df_1, df_2, features, models):
    """Display mean BCubed scores for every (model, single feature) pair.

    Each model is fitted on one feature column of `df_1` at a time and used to
    predict the 'label' column of `df_2`. The predicted and the true labels are
    converted to document length vectors and scored with MeanBcubed.

    Parameters
    ----------
    df_1 : DataFrame used for fitting; needs the feature columns and 'label'.
    df_2 : DataFrame used for evaluation; needs the feature columns and 'label'.
    features : iterable of column names, each tried on its own.
    models : iterable of estimators offering `fit` and `predict`; every
        estimator is refitted in place for each feature.

    Returns
    -------
    None. A table with columns 'model', 'feature', 'P', 'R', 'F1' is displayed.
    """
    y_train = df_1['label']
    # The true segmentation is the same for every model/feature, so convert it
    # once. A copy is passed so the conversion can never alter df_2's labels.
    vb_truth = change_format(df_2['label'].to_numpy(copy=True))

    rows = []
    for model in models:
        for feature in features:
            model.fit(df_1[[feature]], y_train)
            vb_pred = change_format(model.predict(df_2[[feature]]))
            score = MeanBcubed(vb_truth,vb_pred)
            rows.append({'model': str(model),
                         'feature': feature,
                         'P': score['P'],
                         'R': score['R'],
                         'F1': score['F1']})

    display(pd.DataFrame(rows, columns=['model','feature','P','R','F1']))
            
# Fit on corpus 1 and evaluate on corpus 2, one feature and one model at a time.
score_summary(df_1, df_2, features, models)

#### Precision
Precision is the ratio of correctly predicted positive observations to the total predicted positive observations.

#### Recall
Recall is the ratio of correctly predicted positive observations to all observations that actually belong to the positive class (label 1).