In [1]:
import pandas as pd
from ast import literal_eval
import numpy as np
from collections import defaultdict
from tqdm import tqdm
import pickle

In [2]:
# Load the precomputed per-corpus CSV tables; in every file the first CSV
# column is used as the DataFrame index.
c1_hf = pd.read_csv('sim_scores/c1_headerfooter.csv', index_col=0)
c2_hf = pd.read_csv('sim_scores/c2_headerfooter.csv', index_col=0)

c1_d2v = pd.read_csv('sim_scores/d2v_skip_c1.csv', index_col=0)
c2_d2v = pd.read_csv('sim_scores/d2v_skip_c2.csv', index_col=0)

c1_tfidf = pd.read_csv('sim_scores/tfidf_skip_c1.csv', index_col=0)
c2_tfidf = pd.read_csv('sim_scores/tfidf_skip_c2.csv', index_col=0)

# Base frames that the feature tables are merged onto in the next cell.
c1 = pd.read_csv('dfs/c1.csv', index_col=0)
c2 = pd.read_csv('dfs/c2.csv', index_col=0)

In [3]:
# Left-join the header/footer, d2v and tf-idf tables onto each base frame.
# Columns that already exist in the base frame ('label', 'text_y_cleaned')
# are dropped from the right-hand side before joining.
# NOTE: this rebinds c1/c2, so re-running the cell without re-running the
# load cell merges the feature columns a second time.
c1 = (
    c1
    .merge(c1_hf.drop(columns=['label']), how='left', on=['file_name','page'])
    .merge(c1_d2v.drop(columns=['text_y_cleaned','label']), how='left', on=['full_name','page'])
    .merge(c1_tfidf.drop(columns=['text_y_cleaned','label']), how='left', on=['full_name','page'])
)
c2 = (
    c2
    .merge(c2_hf.drop(columns=['label']), how='left', on=['file_name','page'])
    .merge(c2_d2v.drop(columns=['text_y_cleaned','label']), how='left', on=['full_name','page'])
    .merge(c2_tfidf.drop(columns=['text_y_cleaned','label']), how='left', on=['full_name','page'])
)


# Evaluation

In [4]:
def change_format(y):
    '''Convert a per-page boundary vector into a list of document lengths.

    `y` is a sequence with one entry per page; an entry equal to 1 marks the
    first page of a new document. The first page is always treated as a
    document start, whatever y[0] contains.

    Example: [1,0,0,1,0] -> [3,2]

    The input is never modified. (The previous implementation wrote
    `y[0] = 1` into the caller's array, which silently altered the label
    column behind `Series.values` and fails on read-only arrays.)

    Returns a list of ints whose sum equals len(y); an empty input gives [].
    '''
    n_pages = len(y)
    if n_pages == 0:
        return []
    # Page 0 always opens a document; later pages open one when flagged with 1.
    starts = [0] + [i for i, flag in enumerate(y) if i > 0 and flag == 1]
    # Each document runs from its start up to the next start (or the end).
    bounds = starts + [n_pages]
    return [bounds[k + 1] - bounds[k] for k in range(len(starts))]

In [5]:
def make_index(split):
    '''Build a page -> cluster lookup from a list of document lengths.

    Pages are numbered 0 .. sum(split)-1 and handed out to the documents in
    order, so a split of [1,2] yields {0: {0}, 1: {1,2}, 2: {1,2}}.
    Every page maps to its own set object containing all pages of its document.
    The result is a defaultdict(set).'''
    total_pages = sum(split)
    remaining = list(np.arange(total_pages))
    index = defaultdict(set)
    for size in split:
        # Peel the next document off the front of the not-yet-assigned pages.
        members, remaining = remaining[:size], remaining[size:]
        index.update((page, set(members)) for page in members)
    return index

In [6]:
def Bcubed(truth,pred):
    '''Per-page B-cubed precision, recall and F1 for a predicted segmentation.

    truth, pred: document-length vectors (e.g. [1,2,1]) over the same pages.

    For every page i, with T and P the true and predicted documents
    containing i:
        P  = |T & P| / |P|
        R  = |T & P| / |T|
        F1 = 2*P*R / (P+R)
    Page i is always in both sets, so no denominator can be zero.

    Returns a DataFrame indexed by page number (index name 'PageNr') with the
    columns 'size' (length of the true document), 'P', 'R' and 'F1'.
    '''
    assert sum(truth)==sum(pred)  # same amount of pages
    truth,pred = make_index(truth), make_index(pred)

    rows = {}
    for i in truth:
        overlap = len(truth[i] & pred[i])
        precision = overlap/len(pred[i])
        recall = overlap/len(truth[i])
        rows[i] = {
            'size': len(truth[i]),
            'P': precision,
            'R': recall,
            'F1': (2*precision*recall)/(precision+recall),
        }
    df = pd.DataFrame.from_dict(rows, orient='index')
    # The original wrote `df.index_name`, which only attaches a stray
    # attribute to the frame and leaves the index unnamed.
    df.index.name = 'PageNr'
    return df


def MeanBcubed(truth,pred):
    '''Column-wise mean of the per-page B-cubed table.

    Returns a Series with the mean of 'size', 'P', 'R' and 'F1' over all pages.
    Both arguments are document-length vectors covering the same pages.'''
    assert sum(truth)==sum(pred)  # same amount of pages
    per_page = Bcubed(truth, pred)
    return per_page.mean()

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import defaultdict

# Keep only the rows that have a 'text_y_cleaned' value.
_c1 = c1[c1['text_y_cleaned'].notna()]
_c2 = c2[c2['text_y_cleaned'].notna()]

# Visual features only

## Train C1, Predict C2

In [134]:
# Parallel lists filled by every experiment cell below: the mean B-cubed
# Series, the feature-set label and the train/test direction ('c1c2'/'c2c1').
# Re-run this cell before re-running experiments to avoid duplicate entries.
res, types, trainings = [], [], []

In [135]:
# Visual features only: fit on C1, evaluate on C2.
vis_features = ['font_diff3','crop_diff','header_diff','footer_diff']
model = LogisticRegression()
model.fit(_c1[vis_features], _c1['label'])

X_test = _c2[vis_features]
y_test = _c2['label']
true = y_test
preds = model.predict(X_test)  # predict once and reuse below
# A copy is passed so that `preds` keeps the raw model output.
vb_truth, vb_pred = change_format(y_test.values), change_format(preds.copy())

score = MeanBcubed(vb_truth, vb_pred)  # computed once: stored and displayed
res.append(score)
types.append('visual only')
trainings.append('c1c2')
score

size    69.707824
P        0.448520
R        0.827935
F1       0.369598
dtype: float64

## Train C2, Predict C1

In [136]:
# Visual features only: fit on C2, evaluate on C1.
vis_features = ['font_diff3','crop_diff','header_diff','footer_diff']
model = LogisticRegression()
model.fit(_c2[vis_features], _c2['label'])

X_test = _c1[vis_features]
y_test = _c1['label']
true = y_test
preds = model.predict(X_test)  # predict once and reuse below
# A copy is passed so that `preds` keeps the raw model output.
vb_truth, vb_pred = change_format(y_test.values), change_format(preds.copy())

score = MeanBcubed(vb_truth, vb_pred)  # computed once: stored and displayed
res.append(score)
types.append('visual only')
trainings.append('c2c1')
score

size    34.403323
P        0.369691
R        0.942905
F1       0.404325
dtype: float64

# TFIDF + visual results

## Train C1, predict C2

In [137]:
# Visual features + tf-idf similarity: fit on C1, evaluate on C2.
vis_features = ['font_diff3','crop_diff','header_diff','footer_diff','tfidf_sim']
model = LogisticRegression()
model.fit(_c1[vis_features], _c1['label'])

X_test = _c2[vis_features]
y_test = _c2['label']
true = y_test
preds = model.predict(X_test)  # predict once and reuse below
# A copy is passed so that `preds` keeps the raw model output.
vb_truth, vb_pred = change_format(y_test.values), change_format(preds.copy())

score = MeanBcubed(vb_truth, vb_pred)  # computed once: stored and displayed
res.append(score)
types.append('visual + tfidf')
trainings.append('c1c2')
# The score must be the last expression, otherwise the cell shows no output.
score

## Train C2, Predict C1

In [138]:
# Visual features + tf-idf similarity: fit on C2, evaluate on C1.
vis_features = ['font_diff3','crop_diff','header_diff','footer_diff', 'tfidf_sim']
model = LogisticRegression()
model.fit(_c2[vis_features], _c2['label'])

X_test = _c1[vis_features]
y_test = _c1['label']
true = y_test
preds = model.predict(X_test)  # predict once and reuse below
# A copy is passed so that `preds` keeps the raw model output.
vb_truth, vb_pred = change_format(y_test.values), change_format(preds.copy())

score = MeanBcubed(vb_truth, vb_pred)  # computed once: stored and displayed
res.append(score)
types.append('visual + tfidf')
trainings.append('c2c1')
score

size    34.403323
P        0.455809
R        0.942125
F1       0.490227
dtype: float64

# D2V + Visual

## Train C1, predict C2

In [139]:
# Visual features + d2v similarity: fit on C1, evaluate on C2.
vis_features = ['font_diff3','crop_diff','header_diff','footer_diff','d2v_sim']
model = LogisticRegression()
model.fit(_c1[vis_features], _c1['label'])

X_test = _c2[vis_features]
y_test = _c2['label']
true = y_test
preds = model.predict(X_test)  # predict once and reuse below
# A copy is passed so that `preds` keeps the raw model output.
vb_truth, vb_pred = change_format(y_test.values), change_format(preds.copy())

score = MeanBcubed(vb_truth, vb_pred)  # computed once: stored and displayed
res.append(score)
types.append('visual + d2v')
trainings.append('c1c2')
score

size    69.707824
P        0.559806
R        0.768265
F1       0.446680
dtype: float64

## Train C2, Predict C1

In [140]:
# Visual features + d2v similarity: fit on C2, evaluate on C1.
vis_features = ['font_diff3','crop_diff','header_diff','footer_diff', 'd2v_sim']
model = LogisticRegression()
model.fit(_c2[vis_features], _c2['label'])

X_test = _c1[vis_features]
y_test = _c1['label']
true = y_test
preds = model.predict(X_test)  # predict once and reuse below
# A copy is passed so that `preds` keeps the raw model output.
vb_truth, vb_pred = change_format(y_test.values), change_format(preds.copy())

score = MeanBcubed(vb_truth, vb_pred)  # computed once: stored and displayed
res.append(score)
types.append('visual + d2v')
trainings.append('c2c1')
score

size    34.403323
P        0.471147
R        0.905422
F1       0.483325
dtype: float64

# BERT

In [112]:
# Frames and BERT similarity scores used by the BERT experiments.
c1_old = pd.read_csv('dfs/c1_old.csv',index_col = 0)
c2_old = pd.read_csv('dfs/c2_old.csv',index_col = 0)
c1_bert = pd.read_csv('sim_scores/bert_skip_c1.csv',index_col = 0)
c2_bert = pd.read_csv('sim_scores/bert_skip_c2.csv',index_col = 0)

# Pickled vectors. Files are opened with `with` so the handles are closed
# again (the previous `pickle.load(open(...))` left them open).
# SECURITY: pickle.load executes arbitrary code from the file; only load
# pickles that were produced by this project.
with open('pickles/c1_vectors.p','rb') as fh:
    v1 = pickle.load(fh)
with open('pickles/c2_vectors.p','rb') as fh:
    v2 = pickle.load(fh)

with open('pickles/c1_vectors_trunc.p','rb') as fh:
    v1_trunc = pickle.load(fh)
with open('pickles/c2_vectors_trunc.p','rb') as fh:
    v2_trunc = pickle.load(fh)

In [113]:
# Attach the BERT score columns to the *_old frames (default inner join on
# document name and page). Columns already present in the base frame are
# dropped from the right-hand side first.
# NOTE(review): v1/v2 are later paired with these frames' 'label' column, so
# the join must not drop or reorder rows; confirm the frames stay aligned.
c1_bert_scores = c1_bert.drop(columns=['text_y_cleaned','label'])
c1_old = c1_old.merge(c1_bert_scores, on=['full_name','page'])

c2_bert_scores = c2_bert.drop(columns=['text_y_cleaned','label'])
c2_old = c2_old.merge(c2_bert_scores, on=['full_name','page'])

In [114]:
from sklearn.neural_network import MLPClassifier

# MLP trained on the C1 vectors; its probabilities are later used as the
# 'bert_proba' feature for C2.
clf1 = MLPClassifier(
    hidden_layer_sizes = (128,),
    batch_size = 50,
    solver = 'adam',
    # Fixed seed: weight initialisation and batch shuffling are random, so
    # without it every run gives different probabilities.
    random_state = 42,
    verbose = 1
)

clf1.fit(v1, c1_old['label'])

Iteration 1, loss = 0.39818276
Iteration 2, loss = 0.35097355
Iteration 3, loss = 0.33709768
Iteration 4, loss = 0.32372174
Iteration 5, loss = 0.31834743
Iteration 6, loss = 0.30123198
Iteration 7, loss = 0.28832314
Iteration 8, loss = 0.28640865
Iteration 9, loss = 0.27549390
Iteration 10, loss = 0.26946782
Iteration 11, loss = 0.25660425
Iteration 12, loss = 0.25597089
Iteration 13, loss = 0.25027705
Iteration 14, loss = 0.26675862
Iteration 15, loss = 0.23785877
Iteration 16, loss = 0.23361837
Iteration 17, loss = 0.22899692
Iteration 18, loss = 0.22355790
Iteration 19, loss = 0.23867769
Iteration 20, loss = 0.22129997
Iteration 21, loss = 0.22209691
Iteration 22, loss = 0.21568291
Iteration 23, loss = 0.20738741
Iteration 24, loss = 0.20109768
Iteration 25, loss = 0.19541929
Iteration 26, loss = 0.21171757
Iteration 27, loss = 0.21251098
Iteration 28, loss = 0.18878991
Iteration 29, loss = 0.18416842
Iteration 30, loss = 0.17919205
Iteration 31, loss = 0.17502174
Iteration 32, los

MLPClassifier(batch_size=50, hidden_layer_sizes=(128,), verbose=1)

In [115]:
# MLP trained on the C2 vectors; its probabilities are later used as the
# 'bert_proba' feature for C1.
clf2 = MLPClassifier(
    hidden_layer_sizes = (128,),
    batch_size = 50,
    solver = 'adam',
    # Fixed seed: weight initialisation and batch shuffling are random, so
    # without it every run gives different probabilities.
    random_state = 42,
    verbose = 1
)

clf2.fit(v2, c2_old['label'])

Iteration 1, loss = 0.28963045
Iteration 2, loss = 0.23539884
Iteration 3, loss = 0.21925378
Iteration 4, loss = 0.20706952
Iteration 5, loss = 0.19985772
Iteration 6, loss = 0.18956082
Iteration 7, loss = 0.18630909
Iteration 8, loss = 0.17956198
Iteration 9, loss = 0.17534934
Iteration 10, loss = 0.16988212
Iteration 11, loss = 0.16726029
Iteration 12, loss = 0.16236013
Iteration 13, loss = 0.15889342
Iteration 14, loss = 0.15642832
Iteration 15, loss = 0.15356882
Iteration 16, loss = 0.14868698
Iteration 17, loss = 0.14696297
Iteration 18, loss = 0.14379808
Iteration 19, loss = 0.14002244
Iteration 20, loss = 0.13480208
Iteration 21, loss = 0.13391103
Iteration 22, loss = 0.13007846
Iteration 23, loss = 0.12879745
Iteration 24, loss = 0.12386284
Iteration 25, loss = 0.11920692
Iteration 26, loss = 0.11859908
Iteration 27, loss = 0.11438356
Iteration 28, loss = 0.11697673
Iteration 29, loss = 0.11021783
Iteration 30, loss = 0.10827913
Iteration 31, loss = 0.10750992
Iteration 32, los

MLPClassifier(batch_size=50, hidden_layer_sizes=(128,), verbose=1)

In [116]:
# Cross-corpus scoring: each corpus gets its 'bert_proba' column from the
# classifier that was fitted on the OTHER corpus (clf1 was fit on v1, clf2 on v2).
# `[:,1]` selects the second predict_proba column, i.e. the probability of
# clf.classes_[1] (presumably label 1; confirm the labels are 0/1).
c2_old['bert_proba'] = clf1.predict_proba(v2)[:,1]
c1_old['bert_proba'] = clf2.predict_proba(v1)[:,1]

In [117]:
# Keep only the rows of the *_old frames that have a 'text_y_cleaned' value.
_c1_old = c1_old[c1_old['text_y_cleaned'].notna()]
_c2_old = c2_old[c2_old['text_y_cleaned'].notna()]

In [118]:
# Show up to 200 rows when a DataFrame is displayed.
pd.options.display.max_rows = 200
# Leftover inspection snippets, kept commented out:
# c1[['page','crop_diff','font_diff3','footer_diff','header_diff','label']][:200]
# c1[c1['font_diff3'] == 0]

# BERT experiments: train on one corpus, predict the other

### Bert Vectors only

In [141]:
# BERT vectors only: fit on C1, evaluate on C2.
X_train = v1
y_train = c1_old['label']
X_test = v2
y_test = c2_old['label']

clf = MLPClassifier(
    hidden_layer_sizes = (128,),
    batch_size = 50,
    solver = 'adam',
    # Fixed seed so the reported score can be reproduced.
    random_state = 42,
    verbose = 0
)

clf.fit(X_train, y_train)

true = y_test
preds = clf.predict(X_test)  # predict once and reuse below
# A copy is passed so that `preds` keeps the raw model output.
vb_truth, vb_pred = change_format(y_test.values), change_format(preds.copy())

score = MeanBcubed(vb_truth, vb_pred)  # computed once: stored and displayed
res.append(score)
types.append('bert only')
trainings.append('c1c2')
score

size    68.910383
P        0.678032
R        0.675426
F1       0.523359
dtype: float64

In [142]:
# features = ['font_diff2','crop_diff','text_d2v_sim_score']
X_train = v2
y_train = c2_old['label']
X_test = v1
y_test = c1_old['label']

clf = MLPClassifier(
    hidden_layer_sizes = (128,),
    batch_size = 50,
    solver = 'adam',
    verbose = 0
)

clf.fit(X_train, y_train)

true = y_test
preds = clf.predict(X_test)
vb_truth, vb_pred = change_format(y_test.values), change_format(clf.predict(X_test))
res.append(MeanBcubed(vb_truth, vb_pred))
types.append('bert only')
trainings.append('c2c1')
MeanBcubed(vb_truth, vb_pred)

size    34.380975
P        0.724989
R        0.655715
F1       0.545121
dtype: float64

### Visual + Bert_similarity i-1

In [143]:
# Two visual features + BERT similarity: fit on C1, evaluate on C2.
features = ['font_diff3','crop_diff','bert_sim']

X_train = c1_old[features]
y_train = c1_old['label']
X_test = c2_old[features]
y_test = c2_old['label']

model = LogisticRegression()

model.fit(X_train, y_train)

true = y_test
preds = model.predict(X_test)  # predict once and reuse below
# A copy is passed so that `preds` keeps the raw model output.
vb_truth, vb_pred = change_format(y_test.values), change_format(preds.copy())

score = MeanBcubed(vb_truth, vb_pred)  # computed once: stored and displayed
res.append(score)
types.append('visual + bert similarity')
trainings.append('c1c2')
score

size    68.910383
P        0.434505
R        0.833364
F1       0.355291
dtype: float64

In [144]:
# Two visual features + BERT similarity: fit on C2, evaluate on C1.
features = ['font_diff3','crop_diff','bert_sim']

X_train = c2_old[features]
y_train = c2_old['label']
X_test = c1_old[features]
y_test = c1_old['label']

clf = LogisticRegression()

clf.fit(X_train, y_train)

true = y_test
preds = clf.predict(X_test)  # predict once and reuse below
# A copy is passed so that `preds` keeps the raw model output.
vb_truth, vb_pred = change_format(y_test.values), change_format(preds.copy())

score = MeanBcubed(vb_truth, vb_pred)  # computed once: stored and displayed
res.append(score)
types.append('visual + bert similarity')
trainings.append('c2c1')
score


size    34.380975
P        0.150656
R        0.962881
F1       0.183027
dtype: float64

### Visual + Bert_proba

In [145]:
# Two visual features + BERT probability: fit on C1, evaluate on C2.
# NOTE(review): c1_old['bert_proba'] comes from clf2, which was fitted on the
# C2 labels that are evaluated here; confirm this cross-use is intended.
features = ['font_diff3','crop_diff','bert_proba']

X_train = c1_old[features]
y_train = c1_old['label']
X_test = c2_old[features]
y_test = c2_old['label']

model = LogisticRegression()

model.fit(X_train, y_train)

true = y_test
preds = model.predict(X_test)  # predict once and reuse below
# A copy is passed so that `preds` keeps the raw model output.
vb_truth, vb_pred = change_format(y_test.values), change_format(preds.copy())

score = MeanBcubed(vb_truth, vb_pred)  # computed once: stored and displayed
res.append(score)
types.append('visual + bert proba')
trainings.append('c1c2')
score


size    68.910383
P        0.548187
R        0.769882
F1       0.427405
dtype: float64

In [146]:
# Two visual features + BERT probability: fit on C2, evaluate on C1.
# NOTE(review): c2_old['bert_proba'] comes from clf1, which was fitted on the
# C1 labels that are evaluated here; confirm this cross-use is intended.
features = ['font_diff3','crop_diff','bert_proba']

X_train = c2_old[features]
y_train = c2_old['label']
X_test = c1_old[features]
y_test = c1_old['label']

clf = LogisticRegression()

clf.fit(X_train, y_train)

true = y_test
preds = clf.predict(X_test)  # predict once and reuse below
# A copy is passed so that `preds` keeps the raw model output.
vb_truth, vb_pred = change_format(y_test.values), change_format(preds.copy())

score = MeanBcubed(vb_truth, vb_pred)  # computed once: stored and displayed
res.append(score)
types.append('visual + bert proba')
trainings.append('c2c1')
score


size    34.380975
P        0.304108
R        0.969963
F1       0.372951
dtype: float64

## MLP - BERT + Visual

In [130]:
# Append the two visual features ('font_diff3', 'crop_diff') to every vector.
# The feature columns are extracted once as a float array instead of building
# a row Series via .iloc[i] for every page.
_c1_visual = c1_old[['font_diff3', 'crop_diff']].to_numpy(dtype=float)
_c2_visual = c2_old[['font_diff3', 'crop_diff']].to_numpy(dtype=float)

# Vectors and frame rows are paired purely by position, so they must have the
# same length; otherwise features would be attached to the wrong pages.
assert len(v1) == len(_c1_visual), 'v1 and c1_old are not row-aligned'
assert len(v2) == len(_c2_visual), 'v2 and c2_old are not row-aligned'

v1_plus = [np.append(vec, extra) for vec, extra in zip(tqdm(v1), _c1_visual)]
v2_plus = [np.append(vec, extra) for vec, extra in zip(tqdm(v2), _c2_visual)]

100%|██████████| 19101/19101 [00:03<00:00, 4997.64it/s]
100%|██████████| 16537/16537 [00:03<00:00, 5011.22it/s]


### Train C1, Predict C2

In [147]:
# features = ['font_diff2','crop_diff','text_d2v_sim_score']
X_train = v1_plus
y_train = c1_old['label']
X_test = v2_plus
y_test = c2_old['label']

clf = MLPClassifier(
    hidden_layer_sizes = (128,),
    batch_size = 50,
    solver = 'adam',
    verbose = 0
)

clf.fit(X_train, y_train)

true = y_test
preds = clf.predict(X_test)
vb_truth, vb_pred = change_format(y_test.values), change_format(clf.predict(X_test))
res.append(MeanBcubed(vb_truth, vb_pred))
types.append('bert appended + MLP')
trainings.append('c1c2')
MeanBcubed(vb_truth, vb_pred)


size    68.910383
P        0.657442
R        0.704239
F1       0.523575
dtype: float64

### Train C2, Predict C1

In [148]:
# features = ['font_diff2','crop_diff','text_d2v_sim_score']
X_train = v2_plus
y_train = c2_old['label']
X_test = v1_plus
y_test = c1_old['label']

clf = MLPClassifier(
    hidden_layer_sizes = (128,),
    batch_size = 50,
    solver = 'adam',
    verbose = 0
)

clf.fit(X_train, y_train)

true = y_test
preds = clf.predict(X_test)
vb_truth, vb_pred = change_format(y_test.values), change_format(clf.predict(X_test))
res.append(MeanBcubed(vb_truth, vb_pred))
types.append('bert appended + MLP')
trainings.append('c2c1')
MeanBcubed(vb_truth, vb_pred)


size    34.380975
P        0.783772
R        0.681577
F1       0.601636
dtype: float64

# Overview of results

In [150]:
# One row per experiment, indexed by (feature set, train/test direction).
# The three lists are parallel, so they must all have the same length.
overview_index = pd.MultiIndex.from_arrays([types, trainings])
b = pd.DataFrame(res).set_axis(overview_index, axis=0)
b

Unnamed: 0,Unnamed: 1,size,P,R,F1
visual only,c1c2,69.707824,0.44852,0.827935,0.369598
visual only,c2c1,34.403323,0.369691,0.942905,0.404325
visual + tfidf,c1c2,69.707824,0.462274,0.819096,0.379936
visual + tfidf,c2c1,34.403323,0.455809,0.942125,0.490227
visual + d2v,c1c2,69.707824,0.559806,0.768265,0.44668
visual + d2v,c2c1,34.403323,0.471147,0.905422,0.483325
bert only,c1c2,68.910383,0.678032,0.675426,0.523359
bert only,c2c1,34.380975,0.724989,0.655715,0.545121
visual + bert similarity,c1c2,68.910383,0.434505,0.833364,0.355291
visual + bert similarity,c2c1,34.380975,0.150656,0.962881,0.183027
