In [1]:
import pandas as pd
import json

from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


from collections import Counter, defaultdict
import numpy as np
import os
from tqdm import tqdm

import fitz

### Import data

In [2]:
# Load the gold-standard document lengths of the training set; `data` is indexed
# by file name further down and yields the page counts of the individual documents.
with open('corpus1/TrainTestSet/Trainset/Doclengths_of_the_individual_docs_TRAIN.json', 'r') as lengths_file:
    data = json.load(lengths_file)

[36, 30, 17, 12, 3, 19, 18, 4, 46]

### Testing

In [3]:
# a = pd.read_csv('corpus1/TrainTestSet/Trainset/amsterdam_files_df.csv',index_col = 0)
# b = a[a['name'].str.contains('868212')]
# b[b['page'] == 2]['text'].iloc[0]
# b = b.sort_values(by = 'page')
# b[b['page'] == 39]['text'].iloc[0]

In [21]:
# Open one concatenated training PDF for manual inspection and show the page at index 9.
sample_pdf_path = 'corpus1/TrainTestSet/Trainset/data/967331_files.zip__concatenated.pdf'
doc = fitz.open(sample_pdf_path)
doc[9]

page 9 of corpus1/TrainTestSet/Trainset/data/967331_files.zip__concatenated.pdf

### Get header and footer functions

In [24]:
from operator import itemgetter
    
def get_header(page, doc):
    """Return the header text of a page, or a marker when there is none.

    Parameters
    ----------
    page : unused here; kept so the signature mirrors ``get_footer``.
    doc : sequence of text blocks for the page. Presumably tuples shaped like
        ``(x0, y0, x1, y1, text, ...)`` as returned by ``page.get_text('blocks')``
        -- TODO confirm.

    Returns
    -------
    ``'empty'`` when there are no blocks, ``0`` when the block with the smallest
    value at index 1 lies below 30 or its text contains ``'image'``, otherwise
    the text (index 4) of that block.
    """
    if not doc:
        return 'empty'

    # The candidate header is the block with the smallest value at index 1.
    top_block = min(doc, key=lambda block: block[1])

    # Too far down the page to be a header.
    if top_block[1] > 30:
        return 0

    text = top_block[4]
    return 0 if 'image' in text else text
    
def get_footer(page, doc):
    """Return the footer text of a page, or a marker when there is none.

    Parameters
    ----------
    page : object exposing ``rect``; the last element of ``rect`` is used as the
        page length.
    doc : sequence of text blocks for the page. Presumably tuples shaped like
        ``(x0, y0, x1, y1, text, ...)`` as returned by ``page.get_text('blocks')``
        -- TODO confirm.

    Returns
    -------
    ``'empty'`` when there are no blocks, ``0`` when the block with the largest
    value at index 1 ends (index 3) above 95% of the page length or its text
    contains ``'image'``, otherwise the text (index 4) of that block.
    """
    if not doc:
        return 'empty'

    page_length = page.rect[-1]

    # The candidate footer is the block with the largest value at index 1.
    bottom_block = max(doc, key=lambda block: block[1])

    # The block must reach into the last 5% of the page to count as a footer.
    if bottom_block[3] < page_length * .95:
        return 0

    text = bottom_block[4]
    return 0 if 'image' in text else text

def get_header_footer(page):
    """Extract the text blocks of ``page`` and return ``(header, footer)``.

    Both values come from ``get_header`` / ``get_footer`` applied to the result
    of ``page.get_text('blocks')``.
    """
    blocks = page.get_text('blocks')
    header = get_header(page, blocks)
    footer = get_footer(page, blocks)
    return header, footer

# Try the extraction on the page at index 1 of the currently opened `doc`.
get_header_footer(doc[1])

(0,
 'Datum aanvraag: 15 j\nanuari 2015 \nAanvraagnummer: 1607213 \nPagina 2 van 2\n')

### Function to check whether the font of a page differs from the previous page(s)

In [6]:
def font_diff(df, pages = 3):
    """Flag rows whose fonts share nothing with the fonts of the preceding rows.

    Parameters
    ----------
    df : DataFrame with a ``'fonts'`` column holding one set of font names per row.
    pages : number of preceding rows to compare against.

    Returns
    -------
    list of int, one entry per row: ``1`` when the row has no font in common with
    the union of the fonts of the previous ``pages`` rows (or when there are no
    previous fonts at all, e.g. for the first row), ``0`` otherwise.

    The look-back is purely positional; it does not stop at a change of file.
    """
    # Read the column once. Calling df.shift() inside the loop copied the whole
    # frame for every row and every look-back step.
    fonts = df['fonts'].tolist()
    is_diff = []

    for i in tqdm(range(len(fonts))):
        previous = set()

        # Rows i-pages .. i-1, clipped at the start of the frame.
        for earlier in fonts[max(0, i - pages):i]:
            # Missing values (NaN floats) carry no font information.
            if type(earlier) != float:
                previous.update(earlier)

        if previous and (fonts[i] & previous):
            is_diff.append(0)
        else:
            is_diff.append(1)

    return is_diff

### Create dataframe with features containing all concatenated training documents

In [7]:
pd.set_option('display.max_rows', 20)
path = 'corpus1/TrainTestSet/Trainset/data/'
a = defaultdict(list)  # column name -> list with one value per page

# NOTE(review): os.walk yields files in directory order, so the row order of `df`
# (and any positional train/test split) depends on the file system -- confirm.
for r, d, f in os.walk(path):
    # Do not descend into Jupyter's checkpoint folder.
    if '.ipynb_checkpoints' in d:
        d.remove('.ipynb_checkpoints')
    for file in tqdm(f):
        # os.path.join also works for sub-directories (where `r` has no trailing
        # slash); the context manager closes every PDF once its pages are read.
        with fitz.open(os.path.join(r, file)) as pdf:
            for page in pdf:

                # Best effort: a page whose blocks cannot be read is skipped
                # entirely so that all columns keep the same length.
                try:
                    header, footer = get_header_footer(page)
                except Exception as exc:
                    tqdm.write(f'Skipped page {page.number + 1} of {file}: {exc!r}')
                    continue

                a['header'].append(header)
                a['footer'].append(footer)

                a['file_name'].append(file.split('__')[0])
                a['page'].append(page.number + 1)  # 1-based page number

                # Last two values of the page rectangle.
                cropbox = page.rect[-2:]
                a['cropbox_x'].append(cropbox[0])
                a['cropbox_y'].append(cropbox[1])

                # Font names with everything from the first '+' onwards dropped.
                fonts = {font[3].split('+')[0] for font in page.get_fonts()}
                a['fonts'].append(fonts if fonts else {'none'})

                # A page without any extractable text is treated as an image.
                a['isImage'].append(1 if page.get_text() == '' else 0)

df = pd.DataFrame(a)
df['label'] = 0

### Label the first page of every individual document as 1, following the gold standard
for file_name in tqdm(df['file_name'].unique()):
    in_file = df['file_name'] == file_name
    first_page = 1
    # data[file_name] lists the lengths of the consecutive documents in this file;
    # adding them up gives the page on which each document starts.
    for doc_length in data[file_name]:
        df.loc[in_file & (df['page'] == first_page), 'label'] = 1
        first_page += doc_length

# 1 when cropbox_x differs from the row above. The first row is always 1,
# because it is compared against a missing value.
previous_cropbox_x = df['cropbox_x'].shift()
df['crop_is_diff'] = (df['cropbox_x'] != previous_cropbox_x).astype(int)
df['font_is_diff'] = font_diff(df, 3)

df.to_csv('feature_df.csv')

100%|██████████| 113/113 [07:41<00:00,  4.09s/it]
100%|██████████| 113/113 [00:05<00:00, 20.17it/s]
100%|██████████| 19101/19101 [01:34<00:00, 201.52it/s]


<a id="df"></a>

In [83]:
# Show rows 25-44 of the feature frame with the columns in a fixed order.
preview_columns = [
    'file_name', 'page', 'cropbox_x', 'cropbox_y', 'header', 'footer',
    'fonts', 'isImage', 'label', 'crop_is_diff', 'font_is_diff',
]
df[preview_columns].iloc[25:45]

Unnamed: 0,file_name,page,cropbox_x,cropbox_y,header,footer,fonts,isImage,label,crop_is_diff,font_is_diff
25,868212,26,419.528015,595.276001,Handreiking | Veilige Moskee\n,26\n,"{KPXZKR, JBOTWT}",0,0,0,0
26,868212,27,419.528015,595.276001,Handreiking | Veilige Moskee\n,27\n,"{JBOTWT, KPXZKR}",0,0,0,0
27,868212,28,419.528015,595.276001,Handreiking | Veilige Moskee\n,28\n,"{JBOTWT, KPXZKR}",0,0,0,0
28,868212,29,419.528015,595.276001,Handreiking | Veilige Moskee\n,29\n,"{JBOTWT, KPXZKR}",0,0,0,0
29,868212,30,419.528015,595.276001,Handreiking | Veilige Moskee\n,30\n,"{JBOTWT, KPXZKR}",0,0,0,0
30,868212,31,419.528015,595.276001,Handreiking | Veilige Moskee\n,31\n,"{JBOTWT, KPXZKR}",0,0,0,0
31,868212,32,419.528015,595.276001,Handreiking | Veilige Moskee\n,32\n,"{JBOTWT, KPXZKR}",0,0,0,0
32,868212,33,419.528015,595.276001,Handreiking | Veilige Moskee\n,33\n,"{KPXZKR, JBOTWT}",0,0,0,0
33,868212,34,419.528015,595.276001,Handreiking | Veilige Moskee\n,34\n,"{JBOTWT, KPXZKR}",0,0,0,0
34,868212,35,419.528015,595.276001,Handreiking | Veilige Moskee\n,35\n,"{KPXZKR, JBOTWT}",0,0,0,0


### Onehot encoding if needed

In [81]:
def onehot(df):
    """Append one indicator column per font name to ``df``.

    Parameters
    ----------
    df : DataFrame with a ``'fonts'`` column holding one set of font names per row.

    Returns
    -------
    A new DataFrame: ``df`` joined with one column per distinct font, as built
    by ``combine_ohe``.

    NOTE(review): LabelBinarizer emits a single column when it has seen two or
    fewer classes, which would not match ``lb.classes_`` in ``combine_ohe`` --
    confirm the data always contains more than two distinct fonts.
    """
    lb = LabelBinarizer()

    # Collect every font name that occurs anywhere in the frame.
    all_fonts = set()
    for fonts in df['fonts']:
        all_fonts.update(fonts)

    lb.fit(list(all_fonts))
    ohe_list = [lb.transform(list(fonts)) for fonts in df['fonts']]

    encoded = combine_ohe(ohe_list, lb)
    # combine_ohe numbers its rows 0..n-1; reuse df's index so the join lines up
    # row by row even when df does not have a default index.
    encoded.index = df.index
    return df.join(encoded)



def combine_ohe(ohe_list, lb):
    """Collapse per-font one-hot rows into a single indicator row per page.

    Parameters
    ----------
    ohe_list : iterable of 2-D arrays, one per page, each with one one-hot row
        per font on that page.
    lb : fitted binarizer; only ``lb.classes_`` is used, for the column names.

    Returns
    -------
    DataFrame with one row per entry of ``ohe_list`` and one column per class.
    """
    # Summing over the rows merges the one-hot vectors of a page. A single row is
    # returned unchanged and an empty encoding becomes an all-zero row instead of
    # raising IndexError.
    a = [np.sum(i, axis=0) for i in ohe_list]
    return pd.DataFrame(a, columns = lb.classes_)

### Classification
<a id="classification"></a>

In [72]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

def score(true, preds):
    """Return the share of actual positives (label 1) that were predicted as 1.

    Parameters
    ----------
    true : iterable of gold labels.
    preds : iterable of predicted labels, aligned with ``true``.

    Returns
    -------
    float: correctly predicted positives divided by the number of positives in
    ``true``; ``0.0`` when ``true`` contains no positives.
    """
    true = list(true)
    # Count the positives in the argument itself, not in a global test set.
    total = sum(1 for t in true if t == 1)
    if total == 0:
        return 0.0

    correct = sum(1 for t, p in zip(true, preds) if t == p == 1)
    return correct/total

#### Font_is_diff only

In [73]:
pd.set_option('display.max_rows', 200)
features = ['font_is_diff']
# Positional row index separating train from test rows.
# NOTE(review): presumably chosen to fall on a file boundary -- confirm.
split = 15027

X_train = df.iloc[:split][features]
y_train = df.iloc[:split]['label']
X_test = df.iloc[split:][features]
y_test = df.iloc[split:]['label']
# Fixed seed: the forest is randomised, so an unseeded model gives results that
# cannot be reproduced exactly on a re-run.
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)

true = y_test
preds = model.predict(X_test)
score(true,preds)

0.5039370078740157

#### Crop_is_diff only

In [74]:
features = ['crop_is_diff']
# Positional row index separating train from test rows.
# NOTE(review): presumably chosen to fall on a file boundary -- confirm.
split = 15027

X_train = df.iloc[:split][features]
y_train = df.iloc[:split]['label']
X_test = df.iloc[split:][features]
y_test = df.iloc[split:]['label']

# Fixed seed: the forest is randomised, so an unseeded model gives results that
# cannot be reproduced exactly on a re-run.
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)

true = y_test
preds = model.predict(X_test)

score(true,preds)

0.33970753655793023

#### Both font_is_diff and crop_is_diff

In [78]:
features = ['font_is_diff','crop_is_diff']
# Positional row index separating train from test rows.
# NOTE(review): presumably chosen to fall on a file boundary -- confirm.
split = 15027

X_train = df.iloc[:split][features]
y_train = df.iloc[:split]['label']
X_test = df.iloc[split:][features]
y_test = df.iloc[split:]['label']

# Fixed seed: the forest is randomised, so an unseeded model gives results (and
# feature importances) that cannot be reproduced exactly on a re-run.
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)

true = y_test
preds = model.predict(X_test)

score(true,preds)

0.33970753655793023

The score for `font_is_diff` and `crop_is_diff` combined is the same as for `crop_is_diff` alone.

In [80]:
# Importance of each entry of `features` (same order) in the most recently fitted model.
model.feature_importances_

array([0.34896885, 0.65103115])