In [1]:
import pandas as pd
from tqdm import tqdm
import re
from nltk.util import ngrams

In [2]:
c1 = pd.read_csv('dfs/c1.csv',index_col = 0)
c2 = pd.read_csv('dfs/c2.csv',index_col = 0)

In [3]:
import re
import math
import numpy as np
from itertools import chain
from collections import Counter, defaultdict
import nltk
from nltk.util import ngrams
from textblob import TextBlob

NGRAM = 2

re_sent_ends_naive = re.compile(r'[.\n]')
re_stripper_alpha = re.compile('[^a-zA-Z]+')
re_stripper_naive = re.compile('[^a-zA-Z\.\n]')

splitter_naive = lambda x: re_sent_ends_naive.split(re_stripper_naive.sub(' ', x))

sent_detector = nltk.data.load('tokenizers/punkt/dutch.pickle')

In [4]:
def get_tuples_nosentences(txt, NGRAM):
    """Get tuples that ignores all punctuation (including sentences)."""
    if not txt: return None
    ng = ngrams(re_stripper_alpha.sub(' ', txt).split(), NGRAM)
    return list(ng)

def get_char_tuples(txt, n):
    if not txt: return None
    return [tuple(txt[i:i+n]) for i in range(len(txt)-n+1)]

def get_tuples_manual_sentences(txt):
    """Naive get tuples that uses periods or newlines to denote sentences."""
    if not txt: return None
    sentences = (x.split() for x in splitter_naive(txt) if x)
    ng = (ngrams(x, NGRAM) for x in sentences if len(x) >= NGRAM)
    return list(chain(*ng))

def get_tuples_nltk_punkt_sentences(txt):
    """Get tuples that doesn't use textblob."""
    if not txt: return None
    sentences = (re_stripper_alpha.split(x) for x in sent_detector.tokenize(txt) if x)
    # Need to filter X because of empty 'words' from punctuation split
    ng = (ngrams(filter(None, x), NGRAM) for x in sentences if len(x) >= NGRAM)
    return list(chain(*ng))

def get_tuples_textblob_sentences(txt):
    """New get_tuples that does use textblob."""
    if not txt: return None
    tb = TextBlob(txt)
    ng = (ngrams(x.words, NGRAM) for x in tb.sentences if len(x.words) > NGRAM)
    return [item for sublist in ng for item in sublist]

def jaccard_distance(a, b):
    """Calculate the jaccard distance between sets A and B"""
    a = set(a)
    b = set(b)
    if len(a|b) == 0:
        return 0.0
    return 1.0 * len(a&b)/len(a|b)

def cosine_similarity_ngrams(a, b):
    vec1 = Counter(a)
    vec2 = Counter(b)
    
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    return float(numerator) / denominator

In [5]:
NGRAM = 1

a = get_tuples_nosentences(c1.iloc[36]['text_y_cleaned'],NGRAM)
b = get_tuples_nosentences(c1.iloc[34]['text_y_cleaned'],NGRAM)
print("Jaccard: {}   Cosine: {}".format(jaccard_distance(a,b), cosine_similarity_ngrams(a,b)))

Jaccard: 0.011111111111111112   Cosine: 0.01934558081335342


In [6]:
def compare(df, max_ngrams):
    it = iter(range(1,len(df)))
    diffs = defaultdict(list)
    for n in range(max_ngrams):
        diffs[str(n+1)+'_cos'].append(0)
        diffs[str(n+1)+'_jac'].append(0)
        diffs[str(n+1)+'_cos_char'].append(0)
        diffs[str(n+1)+'_jac_char'].append(0)


    for i in tqdm(it):
        a = df.iloc[i]['text_y_cleaned']
        b = df.iloc[i-1]['text_y_cleaned']

        if pd.isnull(a):
            j = i
            skips = 0

            while pd.isnull(df.iloc[j]['text_y_cleaned']):
                for n in range(max_ngrams):
                    diffs[str(n+1)+'_cos'].append(0)
                    diffs[str(n+1)+'_jac'].append(0)
                    diffs[str(n+1)+'_cos_char'].append(0)
                    diffs[str(n+1)+'_jac_char'].append(0)
                j+=1
                skips+=1
            
            a = df.iloc[j]['text_y_cleaned']

            for n in range(max_ngrams):
                NGRAM = n+1
                # Word ngrams
                a_tup = get_tuples_nosentences(a, NGRAM)
                b_tup = get_tuples_nosentences(b, NGRAM)
                diffs[str(n+1)+'_cos'].append(cosine_similarity_ngrams(a_tup,b_tup))
                diffs[str(n+1)+'_jac'].append(jaccard_distance(a_tup,b_tup))

                # Char ngrams
                a_tup = get_char_tuples(a, NGRAM)
                b_tup = get_char_tuples(b, NGRAM)
                diffs[str(n+1)+'_cos_char'].append(cosine_similarity_ngrams(a_tup,b_tup))
                diffs[str(n+1)+'_jac_char'].append(jaccard_distance(a_tup,b_tup))


            for _ in range(skips):
                i = next(it)

        else:
            for n in range(max_ngrams):
                NGRAM = n+1
                # Word ngrams
                a_tup = get_tuples_nosentences(a, NGRAM)
                b_tup = get_tuples_nosentences(b, NGRAM)
                diffs[str(n+1)+'_cos'].append(cosine_similarity_ngrams(a_tup,b_tup))
                diffs[str(n+1)+'_jac'].append(jaccard_distance(a_tup,b_tup))

                # Char ngrams
                a_tup = get_char_tuples(a, NGRAM)
                b_tup = get_char_tuples(b, NGRAM)
                diffs[str(n+1)+'_cos_char'].append(cosine_similarity_ngrams(a_tup,b_tup))
                diffs[str(n+1)+'_jac_char'].append(jaccard_distance(a_tup,b_tup))


    return pd.DataFrame(diffs)

In [13]:
d1 = c1.join(compare(c1,3))

18116it [01:03, 283.23it/s]


In [12]:
pd.set_option('display.max_rows', 200)
d1.columns

Index(['full_name', 'file_name', 'page', 'cropbox_x', 'cropbox_y', 'text_x',
       'text_y', 'text_y_cleaned', 'header', 'footer', 'fonts',
       'preprocessed_text', 'isImage', 'isLastPage', 'crop_diff', 'font_diff3',
       'label', '1_cos', '1_jac', '1_cos_char', '1_jac_char', '2_cos', '2_jac',
       '2_cos_char', '2_jac_char', '3_cos', '3_jac', '3_cos_char',
       '3_jac_char'],
      dtype='object')

In [8]:
n = 4



def compare2(df, max_ngrams, n):
    shifts = dict()
    diffs = defaultdict(list)

    

    for i in range(n):
        shifts[(i+1)*-1] = c1.shift(i+1)
        shifts[(-i-1)*-1] = c1.shift(-i-1)

    for i in tqdm(range(len(df))):
        a = df.iloc[i]['text_y_cleaned']

        for j in shifts.keys():
            b = shifts[j].iloc[i]['text_y_cleaned']

            for k in range(max_ngrams):
                if pd.isnull(a) or pd.isnull(b):
                    diffs[str(k+1)+'_cos_'+str(j)].append(0)
                    diffs[str(k+1)+'_jac_'+str(j)].append(0)
                    diffs[str(k+1)+'_cos_char_'+str(j)].append(0)
                    diffs[str(k+1)+'_jac_char_'+str(j)].append(0)
                    continue

                # Word ngrams
                a_tup = get_tuples_nosentences(a,k+1)
                b_tup = get_tuples_nosentences(b,k+1)
                diffs[str(k+1)+'_cos_'+str(j)].append(cosine_similarity_ngrams(a_tup,b_tup))
                diffs[str(k+1)+'_jac_'+str(j)].append(jaccard_distance(a_tup,b_tup))

                # Char ngrams
                a_tup = get_char_tuples(a,k+1)
                b_tup = get_char_tuples(b,k+1)
                diffs[str(k+1)+'_cos_char_'+str(j)].append(cosine_similarity_ngrams(a_tup,b_tup))
                diffs[str(k+1)+'_jac_char_'+str(j)].append(jaccard_distance(a_tup,b_tup))

    return pd.DataFrame(diffs)





In [9]:
c1_sims = c1.join(compare2(c1, 3, 3))
c2_sims = c2.join(compare2(c2, 3, 3))

100%|██████████| 19102/19102 [06:15<00:00, 50.84it/s] 
100%|██████████| 16537/16537 [05:30<00:00, 50.01it/s] 


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [17]:
def change_format(y):
    y[0] = 1
    indices = [i for i, x in enumerate(y) if x == 1]+[len(y)-1]
    result = []
    for i in range(len(indices)):
        if i != len(indices)-1:
            result.append(indices[i+1] - indices[i])
    result[-1]+=1
    return result

In [18]:
def make_index(split):
    '''Turns a doc length vector like [1,2,1,3,3,5] into a dict with pagenumbers as keys and the set of all 
    pagenumbers in the same document as value.
    This thus is an index which gives for every page its cluster.'''
    l= sum(split)
    pages= list(np.arange(l))
    out = defaultdict(set)
    for block_length in split:
        block= pages[:block_length]
        pages= pages[block_length:]
        for page in block:
            out[page]= set(block)
    return out

In [19]:
def Bcubed(truth,pred):
    assert sum(truth)==sum(pred)  # same amount of pages
    truth,pred = make_index(truth), make_index(pred)
    
    df  ={i:{'size':len(truth[i]),'P':0,'R':0,'F1':0} for i in truth}
    for i in truth:
        df[i]['P']= len(truth[i] & pred[i])/len(pred[i]) 
        df[i]['R']= len(truth[i] & pred[i])/len(truth[i])
        df[i]['F1']= (2*df[i]['P']*df[i]['R'])/(df[i]['P']+df[i]['R'])
    df= pd.DataFrame.from_dict(df, orient='index')
    df.index_name='PageNr'
    return  df


def MeanBcubed(truth,pred):
    assert sum(truth)==sum(pred)  # same amount of pages
    return Bcubed(truth,pred).mean()

In [49]:
train_clean = c1_sims[~pd.isnull(c1_sims['text_y_cleaned'])]
test_clean = c2_sims[~pd.isnull(c2_sims['text_y_cleaned'])]

train = c1_sims
test = c2_sims

Unnamed: 0,full_name,file_name,page,cropbox_x,cropbox_y,text_x,text_y,text_y_cleaned,header,footer,...,1_cos_char3,1_jac_char3,2_cos_3,2_jac_3,2_cos_char3,2_jac_char3,3_cos_3,3_jac_3,3_cos_char3,3_jac_char3
0,868212__concatenated-001.txt,868212,1,419.528015,595.276001,Handreiking \nVeilige Moskee\n,\n\nHandreiking\n\nVeilige Moskee\n\n \n,handreik veilig moskee,,,...,0.898450,0.625000,0.164153,0.013605,0.512104,0.082645,0.081111,0.006579,0.476742,0.032573
1,868212__concatenated-002.txt,868212,2,419.528015,595.276001,1\nHandreiking | Veilige Moskee\nInhoudsopgave...,Handreiking | Veilige Moskee\n\nInhoudsopgave\...,handreik veilig moskee inhoudsopgav inleid aar...,Handreiking | Veilige Moskee,1,...,0.975678,0.880000,0.042796,0.022222,0.701130,0.497561,0.022283,0.011111,0.400137,0.187359
2,868212__concatenated-003.txt,868212,3,419.528015,595.276001,3\nHandreiking | Veilige Moskee\n1. Inleiding\...,Handreiking | Veilige Moskee\n\n1. Inleiding\n...,handreik veilig moskee inleid moskee gebedshui...,Handreiking | Veilige Moskee,3,...,0.986773,0.920000,0.038490,0.019900,0.873152,0.602362,0.019631,0.009901,0.650754,0.256065
3,868212__concatenated-004.txt,868212,4,419.528015,595.276001,4\nHandreiking | Veilige Moskee\nTips voor mos...,Handreiking | Veilige Moskee\n\nTips voor mosk...,handreik veilig moskee tip moskee gemeent poli...,Handreiking | Veilige Moskee,4,...,0.983317,0.857143,0.022635,0.009585,0.879288,0.628205,0.006080,0.003096,0.630186,0.325933
4,868212__concatenated-005.txt,868212,5,419.528015,595.276001,5\nHandreiking | Veilige Moskee\nAanvullingen\...,Handreiking | Veilige Moskee\n\nAanvullingen\n...,handreik veilig moskee aanvull handreik defini...,Handreiking | Veilige Moskee,5,...,0.982468,0.960000,0.046524,0.017045,0.760221,0.547325,0.012189,0.005587,0.480749,0.231579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19097,993914_files.zip__concatenated-204.txt,993914_files.zip,204,595.219971,842.000000,6\nT + 31 (0)\n \n@amsterdam.nl \n \n,T + 32 N\n\n-®amsterdam.nl\n\n \n,amsterdam nl,T + 31 (0) @amsterdam.nl,6,...,0.659141,0.416667,0.000000,0.000000,0.059602,0.040000,0.000000,0.000000,0.004637,0.006079
19098,993914__concatenated-1.txt,993914,1,595.320007,841.919983,Aantekeningen \nZienswijzeverzoeken \nBelanghe...,Aantekeningen\nZienswijzeverzoeken\n\n \n\n \n...,aanteken belanghebb partij document politie dr...,Aantekeningen Zienswijzeverzoeken,,...,0.960295,1.000000,0.000000,0.000000,0.667565,0.420635,0.000000,0.000000,0.356729,0.146233
19099,993914__concatenated-2.txt,993914,2,595.320007,841.919983,"Geheimhouding politiegegevens \n(artikel 7, tw...",\n\nGeheimhouding politiegegevens\n(artikel 7...,geheimhoud politiegegeven artikel twed lid wpg...,"Geheimhouding politiegegevens (artikel 7, twe...",,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
19100,993914__concatenated-3.txt,993914,3,595.320007,841.919983,55 \nGemeente \nE-mails over cijfermatige\ndat...,\n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n ...,gemeent mail cijfermat gedeelt open art lid da...,55 Gemeente E-mails over cijfermatige datage...,,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [61]:
features = ['1_cos_-1','1_cos_1']
# features = ['2_cos_-3','2_cos_-2','2_cos_-1','2_cos_1','2_cos_2','2_cos_3']
# features = ['3_cos_-3','3_cos_-2','3_cos_-1','3_cos_1','3_cos_2','3_cos_3']
# features = ['2_jac_-3','2_jac_-2','2_jac_-1','2_jac_1','2_jac_2','2_jac_3']
# features = ['2_cos_char-3','2_cos_char-2','2_cos_char-1','2_cos_char1','2_cos_char2','2_cos_char3']
features = ['2_jac_char-3','2_jac_char-2','2_jac_char-1','2_jac_char1','2_jac_char2','2_jac_char3']
model = DecisionTreeClassifier()
model.fit(train_clean[features], train_clean['label'])

X_test = test_clean[features]
y_test = test_clean['label']
true = y_test
preds = model.predict(X_test)
vb_truth, vb_pred = change_format(y_test.values), change_format(model.predict(X_test))

MeanBcubed(vb_truth, vb_pred)

size    69.707824
P        0.823313
R        0.331486
F1       0.314790
dtype: float64

In [47]:
get_tuples_nosentences('dit is een test', 2)

[('dit', 'is'), ('is', 'een'), ('een', 'test')]

In [62]:
get_tuples_char(train.iloc[5]['text_y_cleaned'], 3)

NameError: name 'get_tuples_char' is not defined