In [1]:
import pandas as pd
import json

from collections import Counter, defaultdict
import numpy as np
import os
from tqdm import tqdm

import fitz

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from gensim.utils import simple_preprocess
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
from gensim.utils import tokenize

from sklearn.metrics.pairwise import cosine_similarity

tqdm.pandas()

### Testing

In [2]:
doc = fitz.open('corpus1/TrainTestSet/Trainset/data/967331_files.zip__concatenated.pdf')
doc[8].get_text()

'Datum aanvraag: 15 j\nanuari 2015 \nAanvraagnummer: 1607213 \nPagina 3 van 3\nNaam bij\nlage \nBestandsnaam \nType\nDatum \ningediend\nStatus\ndocument\n412 Brandveil\nigheid 2e\nverdieping\n412 Brandveil\nigheid\n2e verdieping_\nedit.pdf\nBrandveil\nigheid \n15-01-2015 \nIn\nbehandel\ning\nAanvraag ontvangen\nDatum: 15-01-2015\nKenmerk: OLO 1607213\n01 \nBehoort bij de beschikking namens het \ndagelijks bestuur van het stadsdeel \nNieuw-West \n \nDatum: 15 juni 2015 \nKenmerk: OLO 1607213 \nBijlage:\n'

### Get header and footer functions

In [3]:
from operator import itemgetter
    
def get_header(page, doc):
    if not doc:
        return 0
    
    header = min(doc, key=itemgetter(1))

    if header[1] > 30:
        return 0
    
    elif 'image' in header[4]:
        return 0
    else:
        return header[4]
    
def get_footer(page, doc):
    if not doc:
        return 0
    
    doc_length = page.rect[-1]
    
    footer = max(doc, key=itemgetter(1))
    
    if footer[3] < doc_length * .95:
        return 0
    
    elif 'image' in footer[4]:
        return 0
    else:
        return footer[4]

def get_header_footer(page):
    doc = page.get_text('blocks')
    return get_header(page, doc), get_footer(page, doc)

get_header_footer(doc[1])

(0,
 'Datum aanvraag: 15 j\nanuari 2015 \nAanvraagnummer: 1607213 \nPagina 2 van 2\n')

### Function to check whether the font of a page differs from the previous page(s)

In [4]:
def font_diff1(df, pages = 3):
    is_diff = []
    
    for i in tqdm(range(len(df))):
        a = df.iloc[i]['fonts']
        b = set()
        
        for j in range(1, pages + 1):
            if pd.isnull(df.shift(j).iloc[i]['fonts']) == False:
                b.update(df.shift(j).iloc[i]['fonts'])
        
        if len(b) == 0:
            is_diff.append(-1)
        
        elif a == {'none'} or b == {'none'}:
            is_diff.append(0)
            
        elif bool(a & b):
            is_diff.append(1)
            
        else:
            is_diff.append(-1)
            
    return is_diff

def font_diff2(df):
    is_diff = [-1]
    _1 = df.shift()

    for i in tqdm(range(1, len(df))):
        a = df.iloc[i]['fonts']
        b = _1.iloc[i]['fonts']

        if a == {'none'} or b == {'none'}:
            is_diff.append(0)

        elif bool(a & b):
            is_diff.append(1)


        else:
            is_diff.append(-1)

    return is_diff

In [5]:
def crop_diff(df):
    is_diff = []
    is_diff.append(-1)
    _1 = df.shift()
    
    for i in tqdm(range(1, len(df))):
        a = (df.iloc[i]['cropbox_x'], df.iloc[i]['cropbox_y'])
        b = (_1.iloc[i]['cropbox_x'], _1.iloc[i]['cropbox_y'])
        
        if a == b:
            is_diff.append(1)
        else:
            is_diff.append(-1)

    return is_diff

In [6]:
def simple_tokenize(text, _stopwords):
    return [token for token in simple_preprocess(text) if token not in _stopwords]

In [7]:
def compare_sim_d2v(df, seg = 'text_vector'):
    _1 = df.shift()
    sim_scores = [0]

    for i in tqdm(range(1,len(df))):
        a = df.iloc[i][seg]
        b = _1.iloc[i][seg]

        score = cosine_similarity([a],[b])[0][0]
        sim_scores.append(score)
    return sim_scores

In [8]:
### D2V Model trained on corpus1
# from ast import literal_eval

# df_1 = pd.read_csv('dataframes/corpus_1_feature_df.csv', index_col = 0)
# df_1['fonts'] = df_1['fonts'].progress_apply(literal_eval)
# df_1['text_tokenized'] = df_1['text_tokenized'].progress_apply(literal_eval)
# df_1['header_tokenized'] = df_1['header_tokenized'].progress_apply(literal_eval)
# df_1['footer_tokenized'] = df_1['footer_tokenized'].progress_apply(literal_eval)
# df_1['text_vector'] = df_1['text_vector'].progress_apply(literal_eval)

# _documents = df_1[df_1['text_tokenized'].astype(bool)]['text_tokenized'].values
# documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(_documents)]
# model = Doc2Vec(documents, vector_size=32, window=2, min_count=1, workers=4)
# model.random.seed(42)

# model.save('d2v_models/corpus1_d2v')

In [9]:
### D2V Model trained on corpus2
# df_2 = pd.read_csv('dataframes/corpus_2_feature_df.csv', index_col = 0)
# df_2['fonts'] = df_2['fonts'].progress_apply(literal_eval)
# df_2['text_tokenized'] = df_2['text_tokenized'].progress_apply(literal_eval)
# df_2['header_tokenized'] = df_2['header_tokenized'].progress_apply(literal_eval)
# df_2['footer_tokenized'] = df_2['footer_tokenized'].progress_apply(literal_eval)
# df_2['text_vector'] = df_2['text_vector'].progress_apply(literal_eval)

# _documents = df_2[df_2['text_tokenized'].astype(bool)]['text_tokenized'].values
# documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(_documents)]
# model = Doc2Vec(documents, vector_size=32, window=2, min_count=1, workers=4)
# model.random.seed(42)

# model.save('d2v_models/corpus2_d2v')

### Create dataframe with features containing all concatenated training documents

In [11]:
pd.set_option('display.max_columns', 40)

def preprocess(corpus_no):
    path = 'corpus'+str(corpus_no)+'/TrainTestSet/Trainset/data/'
    a = defaultdict(list)
    with open('corpus'+str(corpus_no)+'/TrainTestSet/Trainset/Doclengths_of_the_individual_docs_TRAIN.json', 'r') as f:
        data = json.load(f)

    for r, d, f in os.walk(path):
        if '.ipynb_checkpoints' in d:
            d.remove('.ipynb_checkpoints')
        for file in tqdm(f):
            doc = fitz.open(r+file)

            for page in doc:

                try:
                    header, footer = get_header_footer(page)
                    a['header'].append(header)
                    a['footer'].append(footer)
                except:
                    continue

                a['file_name'].append(file.split('__')[0])
                a['page'].append(page.number+1)

                cropbox = page.rect[-2:]
                a['cropbox_x'].append(cropbox[0])
                a['cropbox_y'].append(cropbox[1])

                a['text'].append(page.get_text().lower())

                fonts = set()
                for font in page.get_fonts():
                    fonts.add(font[3].split('+')[0])
                if fonts:
                    a['fonts'].append(fonts)
                else:
                    a['fonts'].append({'none'})

                if page.get_text() == '':
                    a['isImage'].append(1)

                else:
                    a['isImage'].append(0)

            # break
    
    df = pd.DataFrame(a)

    ### Labeling ###
    print('labeling')
    df['label'] = 0
    for file in tqdm(df['file_name'].unique()):
        split = 1
        for i in data[file]:
            df.loc[(df['file_name'] == file) & (df['page'] == split), 'label'] = 1
            split+=i
    ################
    
    ### Join OCR ###
    ocr = pd.read_csv('corpus'+str(corpus_no)+'/TrainTestSet/Trainset/ocred_text.csv',index_col = 0)
    ocr.rename(columns = {'name':'file_name'}, inplace = True)
    ocr['file_name'] = ocr['file_name'].apply(lambda x: x.split('__')[0])
    df = df.merge(ocr, on = ['file_name','page'], how = 'left')
    ################
    
    df.dropna(subset=['text_y'], inplace = True)
    
    ### Tokenization & Cleaning ###
    print('tokenizing')
    _stopwords = stopwords.words('dutch')
    df['text_tokenized_simple'] = df['text_y'].progress_apply(lambda x: simple_tokenize(x, _stopwords) if pd.isnull(x) == False else [])
    df['footer_tokenized_simple'] = df['footer'].progress_apply(lambda x: simple_tokenize(x, _stopwords) if x != 0 else [])
    df['header_tokenized_simple'] = df['header'].progress_apply(lambda x: simple_tokenize(x, _stopwords) if x != 0 else [])
    
    df['text_tokenized_gensim'] = df['text_y'].progress_apply(lambda x: list(tokenize(x)))
    df['footer_tokenized_gensim'] = df['footer'].progress_apply(lambda x: list(tokenize(x)) if x != 0 else [])
    df['header_tokenized_gensim'] = df['header'].progress_apply(lambda x: list(tokenize(x)) if x != 0 else [])
    
    df['text_y_cleaned'] = df['text_tokenized_gensim'].apply(lambda x: ' '.join([str(item) for item in x]))
    ####################
            
    ### Differences ###
    print('calculating differences')
    df['font_diff1'] = font_diff1(df)
    df['font_diff2'] = font_diff2(df)
    df['crop_diff'] = crop_diff(df)
    df['isLastPage'] = np.nan_to_num(df['label'].shift(-1).values, nan = 1)
    ###################
    
    ### Load D2V Model ###
    if corpus_no == 1:
        model = Doc2Vec.load('d2v_models/corpus2_d2v')
    elif corpus_no == 2:
        model = Doc2Vec.load('d2v_models/corpus1_d2v')
    #######################
    
    ### Vectorize texts ###
    print('vectorizing texts')
    df['text_vector_d2v'] = df['text_tokenized_simple'].progress_apply(lambda x: list(model.infer_vector(x)))
    df['text_d2v_sim_score'] = compare_sim_d2v(df, 'text_vector_d2v')
    #######################
    
    df.to_csv('dataframes/corpus_'+str(corpus_no)+'_df.csv')

    c = df.columns
    df = df[['full_name', 'file_name', 'page', 'cropbox_x', 'cropbox_y',
       'text_x', 'text_y', 'text_y_cleaned', 'header', 'footer', 'fonts',
       'text_tokenized_simple', 'footer_tokenized_simple',
       'header_tokenized_simple', 'text_tokenized_gensim',
       'footer_tokenized_gensim', 'header_tokenized_gensim', 'text_vector_d2v', 'text_d2v_sim_score',
       'font_diff1', 'font_diff2', 'crop_diff', 'isLastPage', 'isImage', 'label']]
    
    print([d for d in c if d not in df.columns])
    df.to_csv('dataframes/corpus_'+str(corpus_no)+'_df_col.csv')
    display(df)
    return df

df = preprocess(2)

100%|██████████| 52/52 [01:12<00:00,  1.39s/it]


labeling


100%|██████████| 52/52 [00:02<00:00, 20.54it/s]


tokenizing


100%|██████████| 16512/16512 [00:07<00:00, 2305.18it/s]
100%|██████████| 16512/16512 [00:00<00:00, 198943.32it/s]
100%|██████████| 16512/16512 [00:00<00:00, 72104.02it/s]
100%|██████████| 16512/16512 [00:03<00:00, 4999.10it/s]
100%|██████████| 16512/16512 [00:00<00:00, 270681.69it/s]
100%|██████████| 16512/16512 [00:00<00:00, 611486.48it/s]


calculating differences


100%|██████████| 16512/16512 [08:14<00:00, 33.42it/s]
100%|██████████| 16511/16511 [00:03<00:00, 4592.77it/s]
100%|██████████| 16511/16511 [00:07<00:00, 2302.79it/s]


vectorizing texts


100%|██████████| 16512/16512 [00:21<00:00, 781.31it/s] 
100%|██████████| 16511/16511 [00:07<00:00, 2280.53it/s]


['gelderland_files_df.csv']


Unnamed: 0,full_name,file_name,page,cropbox_x,cropbox_y,text_x,text_y,text_y_cleaned,header,footer,fonts,text_tokenized_simple,footer_tokenized_simple,header_tokenized_simple,text_tokenized_gensim,footer_tokenized_gensim,header_tokenized_gensim,text_vector_d2v,text_d2v_sim_score,font_diff1,font_diff2,crop_diff,isLastPage,isImage,label
0,Aanvullend-besluit-Wob-verzoek-project-COILED....,Aanvullend-besluit-Wob-verzoek-project-COILED....,1,595.320007,841.919983,\n \n \n \n \n \n \ndatum \n 19 augustus 2021...,Datum\n19 augustus 2021\n\nZaaknummer\n2021-01...,Datum augustus Zaaknummer Onderwerp Aanvullend...,0,www.gelderland.nl \nKvK-nummer: 51468751 \n \n,"{PNWTQT, JPXFWQ, WEMLVK}","[datum, augustus, zaaknummer, onderwerp, aanvu...","[www, gelderland, nl, kvk, nummer]",[],"[Datum, augustus, Zaaknummer, Onderwerp, Aanvu...","[www, gelderland, nl, KvK, nummer]",[],"[-0.54855424, -1.3298025, 0.19989337, -0.20702...",0.000000,-1,-1,-1,0.0,0,1
1,Aanvullend-besluit-Wob-verzoek-project-COILED....,Aanvullend-besluit-Wob-verzoek-project-COILED....,2,595.320007,841.919983,\n \n \n \n \ndatum \n 19 augustus 2021 \n \n...,Datum\n19 augustus 2021\n\nZaaknummer\n2021-01...,Datum augustus Zaaknummer Blad van Niet openba...,0,0,"{PNWTQT, JPXFWQ, WEMLVK}","[datum, augustus, zaaknummer, blad, openbaar, ...",[],[],"[Datum, augustus, Zaaknummer, Blad, van, Niet,...",[],[],"[-0.11132708, -0.8158811, 0.107630394, -0.1691...",0.721262,1,1,1,0.0,0,0
2,Aanvullend-besluit-Wob-verzoek-project-COILED....,Aanvullend-besluit-Wob-verzoek-project-COILED....,3,595.320007,841.919983,,Datum\n19 augustus 2021\n\nZaaknummer\n2021-01...,Datum augustus Zaaknummer Blad van Deze vindt ...,0,0,{none},"[datum, augustus, zaaknummer, blad, vindt, bov...",[],[],"[Datum, augustus, Zaaknummer, Blad, van, Deze,...",[],[],"[0.053682096, -0.7946833, 0.140492, -0.0468661...",0.392220,0,0,1,1.0,1,0
3,Aanvullend-besluit-Wob-verzoek-project-COILED....,Aanvullend-besluit-Wob-verzoek-project-COILED....,4,595.000000,842.000000,inventarislijst aanvullend besluit wob-verzoek...,Inventarislijst aanvullend besluit Wob-verzoek...,Inventarislijst aanvullend besluit Wob verzoek...,0,0,"{JJUKDK, IQEYES, XNGBWX, WVHQSO}","[inventarislijst, aanvullend, besluit, wob, ve...",[],[],"[Inventarislijst, aanvullend, besluit, Wob, ve...",[],[],"[-0.19504319, -2.220936, 1.263472, 0.9210858, ...",0.321710,-1,0,-1,1.0,0,1
4,Aanvullend-besluit-Wob-verzoek-project-COILED....,Aanvullend-besluit-Wob-verzoek-project-COILED....,5,612.000000,792.000000,,Joost\n\ncpenstioneel programma oost\n\n“\nEur...,Joost cpenstioneel programma oost Europese Uni...,0,0,{none},"[joost, cpenstioneel, programma, oost, europes...",[],[],"[Joost, cpenstioneel, programma, oost, Europes...",[],[],"[0.04210999, -0.36448914, -0.57351416, 0.46199...",0.363172,0,0,-1,1.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16532,Wob-besluiten-over-emissies-en-incidenten-Sach...,Wob-besluiten-over-emissies-en-incidenten-Sach...,366,612.000000,792.000000,\n \n \n \n \n \n3 \ngeluidsgevoelige bestemm...,GELDERLAND\n\nGELUIDSGEVOELIGE BESTEMMINGEN\nG...,GELDERLAND GELUIDSGEVOELIGE BESTEMMINGEN Gebou...,0,\n3 \n,"{GPWGHL, PFTZPI}","[gelderland, bestemmingen, gebouwen, objecten,...",[],[],"[GELDERLAND, GELUIDSGEVOELIGE, BESTEMMINGEN, G...",[],[],"[-0.5214059, -0.6770154, -0.76133853, 0.524511...",0.523393,1,1,1,0.0,0,0
16533,Wob-besluiten-over-emissies-en-incidenten-Sach...,Wob-besluiten-over-emissies-en-incidenten-Sach...,367,612.000000,792.000000,\n \n \n \n \n \n4 \n \nlicht ontvlambare sto...,GELDERLAND\n\nLICHT ONTVLAMBARE STOFFEN\n\nSto...,GELDERLAND LICHT ONTVLAMBARE STOFFEN Stoffen d...,0,\n4 \n,"{GPWGHL, PFTZPI}","[gelderland, licht, ontvlambare, stoffen, stof...",[],[],"[GELDERLAND, LICHT, ONTVLAMBARE, STOFFEN, Stof...",[],[],"[-0.06287716, -0.3350579, 0.0720587, 1.2124405...",0.718779,1,1,1,0.0,0,0
16534,Wob-besluiten-over-emissies-en-incidenten-Sach...,Wob-besluiten-over-emissies-en-incidenten-Sach...,368,612.000000,792.000000,\n \n \n \n \n \n5 \npbv-verklaring vloeistof...,GELDERLAND\n\nPBV-VERKLARING VLOEISTOFDICHTE V...,GELDERLAND PBV VERKLARING VLOEISTOFDICHTE VOOR...,0,\n5 \n,"{GPWGHL, PFTZPI}","[gelderland, pbv, verklaring, vloeistofdichte,...",[],[],"[GELDERLAND, PBV, VERKLARING, VLOEISTOFDICHTE,...",[],[],"[-0.06918676, -0.12621342, 0.31464157, 0.21689...",0.575068,1,1,1,0.0,0,0
16535,Wob-besluiten-over-emissies-en-incidenten-Sach...,Wob-besluiten-over-emissies-en-incidenten-Sach...,369,612.000000,792.000000,\n \n \n \n \n \n6 \nkomen. \n \nvluchtige or...,GELDERLAND\n\nkomen.\n\nVLUCHTIGE ORGANISCHE S...,GELDERLAND komen VLUCHTIGE ORGANISCHE STOF Een...,0,\n6 \n,"{GPWGHL, PFTZPI}","[gelderland, komen, vluchtige, organische, sto...",[],[],"[GELDERLAND, komen, VLUCHTIGE, ORGANISCHE, STO...",[],[],"[-0.116890505, -0.15046203, -0.48416546, 0.245...",0.644168,1,1,1,0.0,0,0


<a id="df"></a>

### Onehot encoding if needed

In [165]:
def onehot(df):
    lb = LabelBinarizer()

    all_fonts = set()
    df['fonts'].apply(lambda x: all_fonts.update(x))

    lb.fit(list(all_fonts))
    ohe_list = []
    df['fonts'].apply(lambda x: ohe_list.append(lb.transform(list(x))))
    return df.join(combine_ohe(ohe_list, lb))



def combine_ohe(ohe_list, lb):
    a = []
    for i in ohe_list:
        combined = 0
        if len(i) > 1:
            for y in i:
                combined+=y
            a.append(combined)
        else:
            a.append(i[0])
    return pd.DataFrame(a, columns = lb.classes_)