In [1]:
import pandas as pd
import json

from collections import Counter, defaultdict
import numpy as np
import os
from tqdm import tqdm

import fitz
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from gensim.utils import simple_preprocess
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
from gensim.utils import tokenize

from sklearn.metrics.pairwise import cosine_similarity

tqdm.pandas()

### Testing

In [2]:
# Open one concatenated training PDF so the helper functions below can be
# tried out on individual pages (see the get_header / get_footer calls).
doc = fitz.open('corpus1/TrainTestSet/Trainset/data/868212__concatenated.pdf')
# doc[98].get_text('blocks')

### Functions to extract the header and footer of a page

In [3]:
def get_header(page, max_fraction=.095):
    """Return the text found in the top strip of a page as a single line.

    Parameters
    ----------
    page : object exposing ``get_text('blocks')`` and ``rect``
        The notebook passes PyMuPDF pages (``doc[n]``). Every block is read
        as a sequence whose item 1 is compared against the header limit and
        whose item 4 is the block text.
    max_fraction : float, optional
        Share of ``page.rect[-1]`` that counts as header area. Defaults to
        the previously hard-coded 0.095.

    Returns
    -------
    str
        The texts of all qualifying blocks joined with single spaces, with
        every newline replaced by a space. ``''`` when the page yields no
        blocks or when extraction raises an ordinary exception.
    """
    try:
        blocks = page.get_text('blocks')
        if not blocks:
            return ''

        max_header = page.rect[-1] * max_fraction

        # A block qualifies when it starts inside the header strip, does not
        # start exactly at y == 0 and its text does not contain 'image'.
        # NOTE(review): the 'image' test presumably drops image placeholder
        # blocks - confirm against the PyMuPDF block format.
        texts = [
            str(block[4])
            for block in blocks
            if block[1] <= max_header and block[1] != 0 and 'image' not in block[4]
        ]
        return ' '.join(texts).replace('\n', ' ')

    except Exception:
        # Best effort: a page that cannot be parsed simply has no header.
        # Catching Exception (not a bare except) keeps Ctrl-C / SystemExit
        # working during the long corpus loop.
        return ''

# Try the header extractor on one page of the sample document.
get_header(doc[51])


''

In [4]:
def get_footer(page, min_fraction=.905):
    """Return the text found in the bottom strip of a page as a single line.

    Parameters
    ----------
    page : object exposing ``get_text('blocks')`` and ``rect``
        The notebook passes PyMuPDF pages (``doc[n]``). Every block is read
        as a sequence whose item 1 is compared against the footer limit and
        whose item 4 is the block text.
    min_fraction : float, optional
        Share of ``page.rect[-1]`` from which the footer area starts.
        Defaults to the previously hard-coded 0.905.

    Returns
    -------
    str
        The texts of all qualifying blocks joined with single spaces, with
        every newline replaced by a space. ``''`` when the page yields no
        blocks or when extraction raises an ordinary exception.
    """
    try:
        blocks = page.get_text('blocks')
        if not blocks:
            return ''

        min_footer = page.rect[-1] * min_fraction

        # A block qualifies when it starts at or below the footer limit, does
        # not start exactly at y == 0 and its text does not contain 'image'.
        # NOTE(review): the 'image' test presumably drops image placeholder
        # blocks - confirm against the PyMuPDF block format.
        texts = [
            str(block[4])
            for block in blocks
            if block[1] >= min_footer and block[1] != 0 and 'image' not in block[4]
        ]
        return ' '.join(texts).replace('\n', ' ')

    except Exception:
        # Best effort: a page that cannot be parsed simply has no footer.
        # Catching Exception (not a bare except) keeps Ctrl-C / SystemExit
        # working during the long corpus loop.
        return ''
        
# Try the footer extractor on one page of the sample document.
get_footer(doc[51])

'16 Jaarrapportage Aanpak Radicalisering en Polarisatie 2016 '

### Functions to check whether the fonts of a page differ from those of the previous page(s)

In [5]:
def font_diff1(df, font):
    """Compare every row's font set with the fonts of up to three rows before it.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per page; column ``font`` holds a set of font names, with
        ``{'none'}`` marking a page without fonts.
    font : str
        Name of the column holding the font sets.

    Returns
    -------
    list of int
        One flag per row: ``1`` when the row shares at least one font with
        the union of the preceding (max. three) rows, ``0`` when either the
        row or that union equals ``{'none'}``, ``-1`` otherwise (including
        the first row, which has no history).
    """
    # lagged[k - 1].iloc[i] is the row k positions before row i.
    lagged = [df.shift(k) for k in (1, 2, 3)]
    flags = []

    for row in tqdm(range(len(df))):
        current = df.iloc[row][font]

        # Row 0 has no history, row 1 one earlier row, row 2 two, then three.
        history = set()
        for shifted in lagged[:min(row, 3)]:
            history.update(shifted.iloc[row][font])

        if not history:
            flag = -1
        elif current == {'none'} or history == {'none'}:
            flag = 0
        elif current & history:
            flag = 1
        else:
            flag = -1
        flags.append(flag)

    return flags


def font_diff2(df, font):
    """Compare every row's font set with the font set of the row directly before it.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per page; column ``font`` holds a set of font names, with
        ``{'none'}`` marking a page without fonts.
    font : str
        Name of the column holding the font sets.

    Returns
    -------
    list of int
        Starts with ``-1`` for the first row. Every later row gets ``0`` when
        it or its predecessor equals ``{'none'}``, ``1`` when both share at
        least one font, and ``-1`` otherwise.
    """
    previous = df.shift()
    flags = [-1]

    for row in tqdm(range(1, len(df))):
        here = df.iloc[row][font]
        before = previous.iloc[row][font]

        if here == {'none'} or before == {'none'}:
            flags.append(0)
            continue

        flags.append(1 if here & before else -1)

    return flags

def font_diff3(df, font='fonts'):
    """Flag font changes between pages, looking through runs of font-less pages.

    A page whose font set equals ``{'none'}`` is flagged ``0`` and is ignored
    as a comparison partner: the next page that does have fonts is compared
    with the page in front of the ``{'none'}`` run.

    The previous implementation looked ahead with an unbounded ``while`` loop
    and raised ``IndexError`` when the frame ended with ``{'none'}`` pages; it
    also returned ``[-1]`` for an empty frame. This version makes one bounded
    pass and always returns exactly ``len(df)`` flags.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per page; column ``font`` holds a set of font names.
    font : str, optional
        Name of the column holding the font sets. Defaults to ``'fonts'``,
        the column the original implementation was hard-wired to.

    Returns
    -------
    list of int
        ``-1`` for the first row; for later rows ``0`` when the row equals
        ``{'none'}``, ``1`` when it shares a font with the reference page and
        ``-1`` when it does not.
    """
    # Pull the column out once instead of materialising a full row per page.
    font_sets = df[font].tolist()
    if not font_sets:
        return []

    diff = [-1]
    # Fonts of the page later pages are compared with. It starts as row 0
    # (even if that row is {'none'}) and afterwards only moves to pages that
    # have fonts.
    reference = font_sets[0]

    for i in tqdm(range(1, len(font_sets))):
        current = font_sets[i]

        if current == {'none'}:
            diff.append(0)
            continue

        diff.append(1 if current & reference else -1)
        reference = current

    return diff

In [6]:
def crop_diff(df):
    """Flag whether each page has the same crop box size as the page before it.

    The comparison is done column-wise on ``cropbox_x`` and ``cropbox_y``
    only, instead of shifting a copy of the whole frame and building one row
    Series per page. Because nothing is iterated row by row any more, no
    progress bar is shown. An empty frame now yields ``[]`` (previously
    ``[-1]``, which did not match the frame length).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per page with the columns ``cropbox_x`` and ``cropbox_y``.

    Returns
    -------
    list of int
        One flag per row: ``1`` when both values equal those of the previous
        row, ``-1`` otherwise. The first row is always ``-1``.
    """
    box = df[['cropbox_x', 'cropbox_y']]

    # shift() puts NaN in front of the first row; NaN never compares equal,
    # so the first row falls out as "different" without special-casing.
    unchanged = box.eq(box.shift()).all(axis=1)

    return [1 if same else -1 for same in unchanged]

In [7]:
def simple_tokenize(text, _stopwords):
    """Tokenize ``text`` with ``simple_preprocess`` and drop every token found in ``_stopwords``.

    Returns the remaining tokens as a list, in their original order.
    """
    kept = []
    for token in simple_preprocess(text):
        if token in _stopwords:
            continue
        kept.append(token)
    return kept

In [8]:
def preprocess_text(text, _stopwords, stemmer):
    """Turn a page text into a list of stemmed, stop-word-free tokens.

    Missing text (anything ``pd.isnull`` reports as null) gives an empty
    list. Otherwise the text is tokenized with ``simple_tokenize`` and each
    token is passed through ``stemmer.stem``.
    """
    if pd.isnull(text):
        return []

    return [stemmer.stem(token) for token in simple_tokenize(text, _stopwords)]

### Create dataframe with features containing all concatenated training documents

In [9]:
# Let pandas display up to 40 columns so the wide feature frame is not truncated.
pd.set_option('display.max_columns', 40)

def read(corpus_no):
    """Build, save and return the per-page feature frame of one training corpus.

    Steps, in order:

    1. Walk ``corpus<corpus_no>/TrainTestSet/Trainset/data/`` and collect one
       row per PDF page: file name (the part before ``'__'``), 1-based page
       number, the last two ``page.rect`` values, extracted text, font names,
       an "empty text" flag, header and footer.
    2. Label the first page of every individual document with ``1``, using
       the document lengths from
       ``Doclengths_of_the_individual_docs_TRAIN.json``.
    3. Left-join ``ocred_text.csv`` on ``file_name`` and ``page``.
    4. Tokenize, stop-word-filter and stem the ``text_y`` column.
    5. Add ``crop_diff``, ``isLastPage`` and ``font_diff3``.
    6. Keep a fixed column selection, print the columns that were dropped
       and write the frame to ``dfs/c<corpus_no>.csv``.

    Parameters
    ----------
    corpus_no : int or str
        Corpus number used to build all input and output paths.

    Returns
    -------
    pandas.DataFrame
        The feature frame that was written to disk.

    Raises
    ------
    KeyError
        When a PDF's file name is missing from the document-length JSON, or
        when one of the finally selected columns does not exist.
    """
    trainset = 'corpus'+str(corpus_no)+'/TrainTestSet/Trainset/'
    path = trainset+'data/'
    a = defaultdict(list)
    with open(trainset+'Doclengths_of_the_individual_docs_TRAIN.json', 'r') as lengths_file:
        data = json.load(lengths_file)

    for root, dirs, files in os.walk(path):
        # Pruning dirs in place stops os.walk from descending into it.
        if '.ipynb_checkpoints' in dirs:
            dirs.remove('.ipynb_checkpoints')
        for file in tqdm(files):
            # os.path.join (not root+file): for sub-directories os.walk yields
            # a root without a trailing separator. The with-block closes each
            # document once its pages have been read.
            with fitz.open(os.path.join(root, file)) as doc:
                for page in doc:
                    a['file_name'].append(file.split('__')[0])
                    a['page'].append(page.number+1)

                    cropbox = page.rect[-2:]
                    a['cropbox_x'].append(cropbox[0])
                    a['cropbox_y'].append(cropbox[1])

                    # Extract the text once and reuse it for the isImage flag.
                    text = page.get_text()
                    a['text'].append(text)

                    # Keep the part of each font entry's item 3 before '+'.
                    fonts = {font[3].split('+')[0] for font in page.get_fonts()}
                    # {'none'} is the marker the font_diff helpers test for.
                    a['fonts'].append(fonts if fonts else {'none'})

                    # A page without extractable text is treated as an image page.
                    a['isImage'].append(1 if text == '' else 0)

                    a['header'].append(get_header(page))
                    a['footer'].append(get_footer(page))

    df = pd.DataFrame(a)

    ### Labeling ###
    print('labeling')
    df['label'] = 0
    for file in tqdm(df['file_name'].unique()):
        # The first document starts on page 1; each next one starts where the
        # previous one ended.
        split = 1
        for i in data[file]:
            df.loc[(df['file_name'] == file) & (df['page'] == split), 'label'] = 1
            split+=i
    ################

    ### Join OCR ###
    ocr = pd.read_csv(trainset+'ocred_text.csv',index_col = 0)
    ocr = ocr.rename(columns = {'name':'file_name'})
    ocr['file_name'] = ocr['file_name'].apply(lambda x: x.split('__')[0])
    # NOTE(review): the 'text_x' / 'text_y' columns used below presumably come
    # from merge suffixes because the OCR file also has a 'text' column, and
    # 'full_name' presumably comes from the OCR file as well - confirm.
    df = df.merge(ocr, on = ['file_name','page'], how = 'left')
    ################

    ### Tokenization & Cleaning ###
    print('cleaning')
    # A set gives constant-time membership tests in simple_tokenize.
    _stopwords = set(stopwords.words('dutch'))
    stemmer = SnowballStemmer('dutch')

    df['preprocessed_text'] = df['text_y'].progress_apply(lambda x: preprocess_text(x, _stopwords, stemmer))
    df['text_y_cleaned'] = df['preprocessed_text'].apply(lambda x: ' '.join([str(item) for item in x]))
    ####################

    ### Differences ###
    print('calculating differences')
    df['crop_diff'] = crop_diff(df)
    # A page is "last" when the next row starts a new document; the final row
    # has no successor, so its NaN is replaced by 1.
    df['isLastPage'] = np.nan_to_num(df['label'].shift(-1).values, nan = 1)
    df['font_diff3'] = font_diff3(df)
    ###################

    c = df.columns
    df = df[['full_name', 'file_name', 'page', 'cropbox_x', 'cropbox_y',
       'text_x', 'text_y', 'text_y_cleaned', 'header', 'footer', 'fonts',
       'preprocessed_text','isImage','isLastPage','crop_diff','font_diff3', 'label']]

    # Show which columns were dropped by the selection above.
    print([d for d in c if d not in df.columns])

    # Create the output folder if needed so a fresh checkout does not fail
    # after the long extraction run.
    os.makedirs('dfs', exist_ok=True)
    df.to_csv('dfs/c'+str(corpus_no)+'.csv')

    return df

# Build the feature frame for corpus 2; read() also writes it to dfs/c2.csv.
df = read(2)
# df[['file_name','page','text_y','header','footer','label']][:200]

  7%|▋         | 8/113 [02:04<27:17, 15.60s/it]

In [12]:
# Reload the feature frames of corpus 1 and 2 from the CSV files under dfs/
# (the location read() writes to); the first CSV column is used as the index.
c1 = pd.read_csv('dfs/c1.csv',index_col = 0)
c2 = pd.read_csv('dfs/c2.csv',index_col = 0)

In [13]:
# The 'fonts' column comes back from CSV as text; literal_eval parses each
# value back into a Python object (the font sets built in read()).
from ast import literal_eval
c1['fonts'] = c1['fonts'].apply(literal_eval)
c2['fonts'] = c2['fonts'].apply(literal_eval)

In [18]:
# Allow up to 1000 displayed rows, then inspect fonts, the two diff flags and
# the label for the first 200 rows of corpus 2.
pd.set_option('display.max_rows', 1000)
c2[['page','fonts','font_diff3','crop_diff','label']][:200]

Unnamed: 0,page,fonts,font_diff3,crop_diff,label
0,1,"{WEMLVK, PNWTQT, JPXFWQ}",-1,-1,1
1,2,"{WEMLVK, PNWTQT, JPXFWQ}",1,1,0
2,3,{none},0,1,0
3,4,"{WVHQSO, XNGBWX, JJUKDK, IQEYES}",-1,-1,1
4,5,{none},0,-1,1
5,6,{none},0,-1,1
6,7,{none},0,1,0
7,8,{none},0,1,0
8,9,{none},0,-1,1
9,10,{none},0,1,0
