In [47]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score


# Needed for the `progress_apply` calls used further down when the loaded columns are parsed.
tqdm.pandas()

### Helper functions (label conversion and BCubed scoring)

In [48]:
def change_format(y):
    '''Convert a binary "new document starts here" vector into document lengths.

    y is an iterable with one 0/1 label per page, where 1 marks the first page
    of a document, e.g. [1,0,0,1,0] -> [3,2].

    Pages that come before the first 1 are counted as a document of their own
    instead of being dropped, so the returned lengths always sum to the number
    of pages. An empty input gives an empty list.
    '''
    labels = list(y)
    if not labels:
        return []

    # Positions at which a document starts. Position 0 always starts one, even
    # when its label is 0 (e.g. a slice that begins in the middle of a document
    # or a prediction that misses the very first boundary).
    starts = [i for i, x in enumerate(labels) if x == 1]
    if not starts or starts[0] != 0:
        starts.insert(0, 0)

    # Each document runs from its start up to the next start (or the end).
    boundaries = starts + [len(labels)]
    return [end - start for start, end in zip(boundaries, boundaries[1:])]

In [49]:
def make_index(split):
    '''Build a page -> document lookup from a vector of document lengths.

    split is a list of document lengths such as [1,2,1,3,3,5]. Pages are
    numbered consecutively from 0. The result maps every page number to the
    set of all page numbers that belong to the same document (its cluster);
    each page gets its own copy of that set.
    '''
    page_numbers = list(np.arange(sum(split)))
    index = defaultdict(set)
    offset = 0
    for doc_length in split:
        # Pages of the current document: the next doc_length page numbers.
        members = page_numbers[offset:offset + doc_length]
        offset += doc_length
        for page in members:
            index[page] = set(members)
    return index

In [50]:
def Bcubed(truth,pred):
    '''Compute per-page BCubed precision, recall and F1.

    truth and pred are document-length vectors (see make_index) that must
    describe the same total number of pages; otherwise an AssertionError is
    raised.

    Returns a DataFrame indexed by page number (index name 'PageNr') with the
    columns:
      size -- number of pages in the true document of the page
      P    -- share of the predicted document that is also in the true document
      R    -- share of the true document that is also in the predicted document
      F1   -- harmonic mean of P and R
    '''
    assert sum(truth)==sum(pred)  # same amount of pages
    truth_index, pred_index = make_index(truth), make_index(pred)

    rows = {}
    for page, true_doc in truth_index.items():
        pred_doc = pred_index[page]
        # A page is always part of both its true and its predicted document,
        # so the overlap is at least 1 and P + R can never be zero.
        overlap = len(true_doc & pred_doc)
        precision = overlap/len(pred_doc)
        recall = overlap/len(true_doc)
        rows[page] = {'size': len(true_doc),
                      'P': precision,
                      'R': recall,
                      'F1': (2*precision*recall)/(precision+recall)}

    scores = pd.DataFrame.from_dict(rows, orient='index')
    # The name belongs on the Index object; assigning `index_name` on the
    # frame would only create an unrelated attribute.
    scores.index.name = 'PageNr'
    return scores


def MeanBcubed(truth,pred):
    '''Column-wise mean of the per-page table returned by Bcubed.

    truth and pred are document-length vectors that must add up to the same
    number of pages; otherwise an AssertionError is raised.
    '''
    # Both vectors have to describe the same amount of pages.
    assert sum(pred)==sum(truth)
    per_page_scores = Bcubed(truth,pred)
    return per_page_scores.mean()

### Load the feature data

In [46]:
from ast import literal_eval

df = pd.read_csv('feature_df.csv', index_col = 0)

# These columns come out of the CSV as strings; literal_eval turns each
# cell back into the Python object it spells out.
for literal_column in ('fonts', 'tokenized'):
    df[literal_column] = df[literal_column].progress_apply(literal_eval)

df.head()

100%|██████████| 19101/19101 [00:00<00:00, 50531.58it/s]
100%|██████████| 19101/19101 [00:08<00:00, 2194.00it/s]


Unnamed: 0,full_name,file_name,page,cropbox_x,cropbox_y,header,footer,text_x,text_y,tokenized,fonts,isImage,crop_is_diff,font_is_diff,isLastPage,label
0,868212__concatenated-001.txt,868212,1,419.528015,595.276001,0,0,handreiking \nveilige moskee\n,\n\nHandreiking\n\nVeilige Moskee\n\n \n,"[Handreiking, Veilige, Moskee]",{KPXZKR},0,1,1,0.0,1
1,868212__concatenated-002.txt,868212,2,419.528015,595.276001,Handreiking | Veilige Moskee\n,1\n,1\nhandreiking | veilige moskee\ninhoudsopgave...,Handreiking | Veilige Moskee\n\nInhoudsopgave\...,"[Handreiking, |, Veilige, Moskee, Inhoudsopgav...",{KPXZKR},0,0,1,0.0,0
2,868212__concatenated-003.txt,868212,3,419.528015,595.276001,Handreiking | Veilige Moskee\n,3\n,3\nhandreiking | veilige moskee\n1. inleiding\...,Handreiking | Veilige Moskee\n\n1. Inleiding\n...,"[Handreiking, |, Veilige, Moskee, 1, ., Inleid...","{JBOTWT, KPXZKR}",0,0,1,0.0,0
3,868212__concatenated-004.txt,868212,4,419.528015,595.276001,Handreiking | Veilige Moskee\n,4\n,4\nhandreiking | veilige moskee\ntips voor mos...,Handreiking | Veilige Moskee\n\nTips voor mosk...,"[Handreiking, |, Veilige, Moskee, Tips, voor, ...","{JBOTWT, KPXZKR}",0,0,1,0.0,0
4,868212__concatenated-005.txt,868212,5,419.528015,595.276001,Handreiking | Veilige Moskee\n,5\n,5\nhandreiking | veilige moskee\naanvullingen\...,Handreiking | Veilige Moskee\n\nAanvullingen\n...,"[Handreiking, |, Veilige, Moskee, Aanvullingen...","{JBOTWT, KPXZKR}",0,0,1,0.0,0


In [51]:
def font_diff(df, pages = 1):
    '''Flag rows whose fonts do not overlap with the fonts of the preceding rows.

    df    -- DataFrame with a 'fonts' column holding a set of font names per row
    pages -- number of directly preceding rows whose fonts are collected
             (default 1)

    Returns a list with one int per row: 0 when the row shares at least one
    font with the collected preceding fonts, 1 when it shares none or when no
    preceding fonts are available (e.g. for the first row).
    '''
    # Read the column once. Looking back is then a positional lookup instead
    # of shifting the whole frame again for every single row.
    fonts = df['fonts'].tolist()
    is_diff = []

    for i in tqdm(range(len(fonts))):
        current = fonts[i]
        previous = set()

        for back in range(1, pages+1):
            if i - back < 0:
                continue  # there is no row that far back
            earlier = fonts[i - back]
            # float entries (NaN) are missing values and contribute no fonts
            if type(earlier) != float:
                previous.update(earlier)

        # Without any preceding fonts the row counts as different.
        is_diff.append(0 if previous and current & previous else 1)

    return is_diff

# Stores the flags computed by font_diff (default pages=1) in the `font_is_diff` column of df.
df['font_is_diff'] = font_diff(df)

100%|██████████| 19101/19101 [01:28<00:00, 214.79it/s]


### Make predictions

In [74]:
split = 15036
features = ['font_is_diff']

# Positional split: the first `split` rows are used for fitting, the rest for evaluation.
X_train, y_train = df.iloc[:split][features], df.iloc[:split]['label']
X_test, y_test = df.iloc[split:][features], df.iloc[split:]['label']

model = LogisticRegression().fit(X_train, y_train)

true, preds = y_test, model.predict(X_test)
# Page labels converted to document-length vectors for the BCubed scoring below.
vb_truth, vb_pred = change_format(true), change_format(preds)
tuple(metric(true, preds) for metric in (precision_score, recall_score, f1_score))

(0.6733254994124559, 0.6459977452085682, 0.6593785960874569)

### BCubed score

In [75]:
# Mean scores first (printed), then the per-page table as the cell output.
print(MeanBcubed(truth=vb_truth, pred=vb_pred))
Bcubed(truth=vb_truth, pred=vb_pred)

size    50.743665
P        0.807107
R        0.894423
F1       0.778658
dtype: float64


Unnamed: 0,size,P,R,F1
0,20,1.0,0.350000,0.518519
1,20,1.0,0.350000,0.518519
2,20,1.0,0.350000,0.518519
3,20,1.0,0.350000,0.518519
4,20,1.0,0.350000,0.518519
...,...,...,...,...
4060,6,1.0,0.166667,0.285714
4061,4,1.0,1.000000,1.000000
4062,4,1.0,1.000000,1.000000
4063,4,1.0,1.000000,1.000000


### Make predictions - Images filtered out

In [76]:
# Keep only the rows whose `isImage` flag is 0.
df2 = df[df['isImage'] == 0]

split = 8701
features = ['font_is_diff']

# Positional split on the filtered frame: first `split` rows for fitting, the rest for evaluation.
X_train, y_train = df2.iloc[:split][features], df2.iloc[:split]['label']
X_test, y_test = df2.iloc[split:][features], df2.iloc[split:]['label']

model = LogisticRegression().fit(X_train, y_train)

true, preds = y_test, model.predict(X_test)
# Page labels converted to document-length vectors for the BCubed scoring below.
vb_truth, vb_pred = change_format(true), change_format(preds)
tuple(metric(true, preds) for metric in (precision_score, recall_score, f1_score))

(0.6793650793650794, 0.8075471698113208, 0.7379310344827587)

### BCubed score - images filtered out

In [77]:
# Mean scores first (printed), then the per-page table as the cell output.
print(MeanBcubed(truth=vb_truth, pred=vb_pred))
Bcubed(truth=vb_truth, pred=vb_pred)

size    61.412027
P        0.883782
R        0.894938
F1       0.833231
dtype: float64


Unnamed: 0,size,P,R,F1
0,3,1.0,1.00,1.0
1,3,1.0,1.00,1.0
2,3,1.0,1.00,1.0
3,1,1.0,1.00,1.0
4,1,1.0,1.00,1.0
...,...,...,...,...
3354,4,1.0,0.25,0.4
3355,4,1.0,1.00,1.0
3356,4,1.0,1.00,1.0
3357,4,1.0,1.00,1.0
