In [1]:
import pandas as pd
import json

from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


from collections import Counter, defaultdict
import numpy as np
import os
from tqdm import tqdm

import fitz

### Import data

In [2]:
# Gold-standard document lengths. The labelling step further down indexes
# `data` by file name and iterates over the values stored under each key,
# adding them up as page counts.
# JSON is read as UTF-8 explicitly so decoding does not depend on the
# platform's locale encoding.
with open('corpus1/TrainTestSet/Trainset/Doclengths_of_the_individual_docs_TRAIN.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

[36, 30, 17, 12, 3, 19, 18, 4, 46]

### Testing

In [3]:
# a = pd.read_csv('corpus1/TrainTestSet/Trainset/amsterdam_files_df.csv',index_col = 0)
# b = a[a['name'].str.contains('868212')]
# b[b['page'] == 2]['text'].iloc[0]
# b = b.sort_values(by = 'page')
# b[b['page'] == 39]['text'].iloc[0]

In [4]:
# Open one concatenated training PDF for manual inspection; `doc` is reused by
# the demo call at the end of the header/footer cell below.
doc = fitz.open('corpus1/TrainTestSet/Trainset/data/967331_files.zip__concatenated.pdf')
# Bare expression so the notebook echoes the page object at index 9.
# NOTE(review): the saved output reads "page 0 of ...", which does not match
# index 9 -- the output looks stale; re-run the cell to confirm.
doc[9]

page 0 of corpus1/TrainTestSet/Trainset/data/967331_files.zip__concatenated.pdf

### Get header and footer functions

In [5]:
from operator import itemgetter
    
def get_header(page, doc, max_top=30):
    """Return the text of the top-most block if it qualifies as a page header.

    Parameters
    ----------
    page
        Not used by this function; accepted so the signature mirrors
        ``get_footer``.
    doc
        Sequence of block tuples as returned by ``page.get_text('blocks')``.
        Index 1 is read as the block's top y-coordinate and index 4 as the
        block's text.
    max_top : number, default 30
        Largest top y-coordinate at which a block still counts as a header.

    Returns
    -------
    str or int
        ``'empty'`` when ``doc`` contains no blocks; ``0`` when the top-most
        block starts below ``max_top`` or its text contains ``'image'``;
        otherwise the text of that block.
    """
    if not doc:
        return 'empty'

    # The block with the smallest top coordinate is the header candidate.
    header = min(doc, key=itemgetter(1))

    if header[1] > max_top:
        return 0

    # Plain substring match on the block text, used to discard image blocks.
    # NOTE(review): a textual header containing the word "image" is discarded
    # as well -- confirm this is acceptable.
    if 'image' in header[4]:
        return 0

    return header[4]
    
def get_footer(page, doc, min_bottom_frac=.95):
    """Return the text of the lowest block if it qualifies as a page footer.

    Parameters
    ----------
    page
        Page object; only ``page.rect[-1]`` (the last component of the page
        rectangle) is read and used as the page's bottom coordinate.
    doc
        Sequence of block tuples as returned by ``page.get_text('blocks')``.
        Index 1 is read as the block's top y-coordinate, index 3 as its
        bottom y-coordinate and index 4 as its text.
    min_bottom_frac : float, default 0.95
        Fraction of the page's bottom coordinate that the candidate block's
        bottom edge must reach to count as a footer.

    Returns
    -------
    str or int
        ``'empty'`` when ``doc`` contains no blocks; ``0`` when the candidate
        block ends above the threshold or its text contains ``'image'``;
        otherwise the text of that block.
    """
    if not doc:
        return 'empty'

    page_bottom = page.rect[-1]

    # The candidate is the block that *starts* lowest on the page (largest
    # top coordinate); its bottom edge is then tested against the threshold.
    footer = max(doc, key=itemgetter(1))

    if footer[3] < page_bottom * min_bottom_frac:
        return 0

    # Plain substring match on the block text, used to discard image blocks.
    # NOTE(review): a textual footer containing the word "image" is discarded
    # as well -- confirm this is acceptable.
    if 'image' in footer[4]:
        return 0

    return footer[4]

def get_header_footer(page):
    """Extract the header and footer candidates of a single page.

    The page's text blocks are fetched once via ``page.get_text('blocks')``
    and handed to ``get_header`` and ``get_footer``; the result is the tuple
    ``(header, footer)``.
    """
    blocks = page.get_text('blocks')
    header = get_header(page, blocks)
    footer = get_footer(page, blocks)
    return header, footer

# Smoke test on the page at index 1 of the sample PDF opened in the cell above;
# the returned (header, footer) tuple is echoed as the cell output.
get_header_footer(doc[1])

(0,
 'Datum aanvraag: 15 j\nanuari 2015 \nAanvraagnummer: 1607213 \nPagina 2 van 2\n')

### Function to check whether the font of a page differs from the previous page(s)

In [6]:
def font_diff(df, pages = 3):
    """Flag rows whose font set shares nothing with the preceding rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``'fonts'`` column holding a set of font names per row.
        Rows are compared by position, in the order they appear in ``df``.
    pages : int, default 3
        Number of immediately preceding rows whose fonts are pooled and
        compared against the current row.

    Returns
    -------
    list of int
        One entry per row: ``0`` when the row's fonts intersect the pooled
        fonts of the preceding ``pages`` rows, ``1`` when they do not or when
        there is no usable history (e.g. the very first row).
    """
    # Pull the column out once. Rebuilding shifted copies of the frame for
    # every row makes the scan quadratic in the number of rows; this single
    # linear pass is also why no progress bar is shown.
    fonts = df['fonts'].tolist()
    is_diff = []

    for i, current in enumerate(fonts):
        previous = set()
        # Look back at most `pages` rows; the slice start is clamped at 0 so
        # the first rows simply see a shorter history.
        for earlier in fonts[max(0, i - pages):i]:
            # Missing values (float NaN) carry no font information.
            if not isinstance(earlier, float):
                previous.update(earlier)

        # An empty history counts as "different"; `current & previous` is
        # only evaluated when there is something to compare against.
        is_diff.append(0 if previous and current & previous else 1)

    return is_diff

### Create dataframe with features containing all concatenated training documents

In [7]:
pd.set_option('display.max_rows', 20)
path = 'corpus1/TrainTestSet/Trainset/data/'
# Column name -> list of per-page values; converted into `df` after the loop.
a = defaultdict(list)

for root, dirs, files in os.walk(path):
    # Removing the entry from `dirs` in place stops os.walk from descending
    # into notebook checkpoint folders.
    if '.ipynb_checkpoints' in dirs:
        dirs.remove('.ipynb_checkpoints')
    for file in tqdm(files):
        # os.path.join keeps the path valid for nested folders too, where
        # `root` does not end in a separator. The context manager closes each
        # PDF once its pages are processed. The handle is bound to `pdf` so
        # the `doc` opened for manual inspection earlier is not overwritten.
        with fitz.open(os.path.join(root, file)) as pdf:
            for page in pdf:
                try:
                    header, footer = get_header_footer(page)
                except Exception:
                    # Best effort: a page whose blocks cannot be analysed is
                    # skipped entirely, so all column lists stay aligned.
                    # KeyboardInterrupt/SystemExit are no longer swallowed.
                    continue

                a['header'].append(header)
                a['footer'].append(footer)
                a['file_name'].append(file.split('__')[0])
                a['page'].append(page.number+1)

                # Last two components of the page rectangle.
                cropbox = page.rect[-2:]
                a['cropbox_x'].append(cropbox[0])
                a['cropbox_y'].append(cropbox[1])

                # Font names with any "PREFIX+" part stripped; pages without
                # fonts get the placeholder set {'none'}.
                fonts = {font[3].split('+')[0] for font in page.get_fonts()}
                a['fonts'].append(fonts if fonts else {'none'})

                # A page with no extractable text is treated as an image page.
                a['isImage'].append(1 if page.get_text() == '' else 0)

df = pd.DataFrame(a)
df['label'] = 0

# Label the first page of each document as 1 according to the gold standard:
# walk through the document lengths of a file and mark each starting page.
for file in tqdm(df['file_name'].unique()):
    first_page = 1
    for doc_length in data[file]:
        df.loc[(df['file_name'] == file) & (df['page'] == first_page), 'label'] = 1
        first_page += doc_length

# 1 when cropbox_x differs from the previous row (the first row is always 1).
# NOTE(review): only cropbox_x is compared (cropbox_y is collected but unused
# here) and the comparison crosses file boundaries -- confirm this is intended.
df['crop_is_diff'] = (df['cropbox_x'] != df['cropbox_x'].shift()).astype(int)
df['font_is_diff'] = font_diff(df, 3)

df.to_csv('feature_df.csv')

100%|██████████| 113/113 [07:41<00:00,  4.09s/it]
100%|██████████| 113/113 [00:05<00:00, 20.17it/s]
100%|██████████| 19101/19101 [01:34<00:00, 201.52it/s]


In [20]:
# Show the final 20 rows of the feature table, columns in reading order.
df.loc[:, [
    'file_name', 'page', 'cropbox_x', 'cropbox_y',
    'header', 'footer', 'fonts', 'isImage',
    'label', 'crop_is_diff', 'font_is_diff',
]].tail(20)

Unnamed: 0,file_name,page,cropbox_x,cropbox_y,header,footer,fonts,isImage,label,crop_is_diff,font_is_diff
19081,993914_files.zip,189,595.320007,841.919983,0,0,"{XGBFBJ, XYDKBJ, JYAZNL, SWOBJT}",0,0,0,0
19082,993914_files.zip,190,595.320007,841.919983,0,0,"{SWOBJT, JYAZNL, MNMRDF, XYDKBJ, PMCJTZ, XGBFBJ}",0,0,0,0
19083,993914_files.zip,191,595.320007,841.919983,0,0,"{PMCJTZ, JYAZNL, XYDKBJ}",0,0,0,0
19084,993914_files.zip,192,595.320007,841.919983,0,0,"{EGUFKV, NWFMGD, SBSQYT, XLFUQJ, IMSDON}",0,1,0,1
19085,993914_files.zip,193,595.320007,841.919983,0,0,"{EGUFKV, NWFMGD, SBSQYT, XLFUQJ, IMSDON}",0,0,0,0
19086,993914_files.zip,194,595.320007,841.919983,0,0,"{EGUFKV, SBSQYT, IMSDON, XLFUQJ}",0,0,0,0
19087,993914_files.zip,195,595.320007,841.919983,0,0,"{EGUFKV, SBSQYT, XLFUQJ}",0,0,0,0
19088,993914_files.zip,196,595.320007,841.919983,0,0,"{RVPEYL, IFZCCD, DVMYKN, ZPOAGV, XEROEZ}",0,1,0,1
19089,993914_files.zip,197,595.320007,841.919983,0,0,"{RVPEYL, IFZCCD, DVMYKN, XEROEZ, LHNUSX, GCAQAH}",0,0,0,0
19090,993914_files.zip,198,595.320007,841.919983,0,0,"{TGMQAH, RVPEYL, IFZCCD, QZYYKN, DVMYKN, JRMIQ...",0,0,0,0


### One-hot encoding (if needed)

In [118]:
def onehot(df):
    """Return ``df`` joined with one indicator column per distinct font name.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``'fonts'`` column holding an iterable of font names
        per row.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the columns produced by ``combine_ohe`` (one per class
        of the fitted ``LabelBinarizer``).
    """
    lb = LabelBinarizer()

    # Collect every font name that occurs anywhere in the column.
    all_fonts = set()
    for fonts in df['fonts']:
        all_fonts.update(fonts)

    # NOTE(review): with exactly two distinct fonts LabelBinarizer emits a
    # single column, which would not match `lb.classes_` in combine_ohe --
    # verify on small inputs.
    lb.fit(list(all_fonts))

    # One encoded matrix per row (one matrix line per font on that page).
    ohe_list = [lb.transform(list(fonts)) for fonts in df['fonts']]

    encoded = combine_ohe(ohe_list, lb)
    # combine_ohe builds its frame in row order with a fresh default index;
    # adopt df's index so the join pairs rows by position even when df has
    # been filtered or re-indexed.
    encoded.index = df.index
    return df.join(encoded)



def combine_ohe(ohe_list, lb):
    """Collapse per-font encodings into one row per page.

    Each element of ``ohe_list`` is a sequence of encoded rows. When it holds
    more than one row, the rows are added together element-wise; otherwise
    its single row is used as is. The resulting rows form a DataFrame whose
    columns are ``lb.classes_``.
    """
    merged_rows = [
        sum(encoded) if len(encoded) > 1 else encoded[0]
        for encoded in ohe_list
    ]
    return pd.DataFrame(merged_rows, columns = lb.classes_)

Unnamed: 0,header,footer,file_name,page,cropbox_x,cropbox_y,fonts,isImage,label,crop_is_diff
0,0,0,868212,1,419.528015,595.276001,{KPXZKR},0,1,1
1,Handreiking | Veilige Moskee\n,1\n,868212,2,419.528015,595.276001,{KPXZKR},0,0,0
2,Handreiking | Veilige Moskee\n,3\n,868212,3,419.528015,595.276001,"{JBOTWT, KPXZKR}",0,0,0
3,Handreiking | Veilige Moskee\n,4\n,868212,4,419.528015,595.276001,"{JBOTWT, KPXZKR}",0,0,0
4,Handreiking | Veilige Moskee\n,5\n,868212,5,419.528015,595.276001,"{JBOTWT, KPXZKR}",0,0,0
...,...,...,...,...,...,...,...,...,...,...
19096,0,6\n,993914_files.zip,204,595.219971,842.000000,"{MBAHDD, MBAHDE, MBAHBB}",0,0,0
19097,0,0,993914,1,595.320007,841.919983,{ABCDEE},0,1,1
19098,0,0,993914,2,595.320007,841.919983,{ABCDEE},0,0,0
19099,0,0,993914,3,595.320007,841.919983,{ABCDEE},0,0,0


### Classification

In [10]:
pd.set_option('display.max_rows', 200)
features = ['font_is_diff']
# Positional cut-off: rows before it are used for training, the rest for testing.
# NOTE(review): 15027 is hard-coded; presumably chosen to fall on a file
# boundary -- confirm against the data.
split = 15027

X_train, X_test = df[features].iloc[:split], df[features].iloc[split:]
y_train, y_test = df['label'].iloc[:split], df['label'].iloc[split:]

In [11]:
from sklearn.tree import DecisionTreeClassifier
# Fit a logistic-regression classifier on the training rows, then predict the
# labels of the held-out rows; `true` is an alias for the held-out labels.
model = LogisticRegression().fit(X_train, y_train)

preds = model.predict(X_test)
true = y_test

In [12]:
# Share of test rows labelled 1 that were also predicted as 1.
total = Counter(y_test)[1]
correct = sum(1 for t, p in zip(true, preds) if t == p == 1)
correct/total

0.5039370078740157

### Ideas
- Remove empty pages if that page is not the first page