In [1]:
import pandas as pd
from ast import literal_eval
import numpy as np
from collections import defaultdict
from tqdm import tqdm
import pickle

In [23]:
def _read_indexed(path):
    """Read a CSV, using its first column as the index."""
    return pd.read_csv(path, index_col=0)


# Header/footer similarity scores
c1_hf = _read_indexed('sim_scores/c1_headerfooter.csv')
c2_hf = _read_indexed('sim_scores/c2_headerfooter.csv')
# doc2vec similarity scores
c1_d2v = _read_indexed('sim_scores/d2v_skip_c1.csv')
c2_d2v = _read_indexed('sim_scores/d2v_skip_c2.csv')
# tf-idf similarity scores
c1_tfidf = _read_indexed('sim_scores/tfidf_skip_c1.csv')
c2_tfidf = _read_indexed('sim_scores/tfidf_skip_c2.csv')
# Base page tables for the two corpora
c1 = _read_indexed('dfs/c1.csv')
c2 = _read_indexed('dfs/c2.csv')

In [24]:
def _attach_similarity_scores(pages, hf_scores, d2v_scores, tfidf_scores):
    """Left-join the header/footer, doc2vec and tf-idf score tables onto ``pages``.

    The header/footer table is joined on ('file_name', 'page'); the two text
    tables are joined on ('full_name', 'page'). Each score table has its
    'label' column (and, for the text tables, 'text_y_cleaned') dropped before
    joining, presumably to avoid suffixed duplicate columns.
    """
    merged = pages.merge(
        hf_scores.drop(columns=['label']),
        how='left',
        on=['file_name', 'page'],
    )
    for text_scores in (d2v_scores, tfidf_scores):
        merged = merged.merge(
            text_scores.drop(columns=['text_y_cleaned', 'label']),
            how='left',
            on=['full_name', 'page'],
        )
    return merged


c1 = _attach_similarity_scores(c1, c1_hf, c1_d2v, c1_tfidf)
c2 = _attach_similarity_scores(c2, c2_hf, c2_d2v, c2_tfidf)


# Evaluation

In [11]:
def change_format(y):
    """Convert a per-page boundary vector into a list of document lengths.

    Parameters
    ----------
    y : sequence of int
        One entry per page; a value equal to 1 marks the first page of a
        document. The first page is always treated as a document start,
        whatever value it holds.

    Returns
    -------
    list of int
        Length (in pages) of each consecutive document, e.g.
        [1, 0, 0, 1, 0] -> [3, 2]. An empty input yields an empty list.

    Notes
    -----
    The input is never modified: the forced "first page starts a document"
    marker is written to a private copy, so arrays backed by a DataFrame
    column (or read-only arrays) can be passed safely.
    """
    labels = list(y)
    if not labels:
        return []
    labels[0] = 1
    starts = [position for position, flag in enumerate(labels) if flag == 1]
    # Each document runs from its start up to the next start (or the end).
    bounds = starts + [len(labels)]
    return [stop - start for start, stop in zip(bounds, bounds[1:])]

In [12]:
def make_index(split):
    '''Turn a document-length vector into a page -> cluster lookup.

    Pages are numbered 0, 1, 2, ... in order, and each entry of ``split``
    claims that many consecutive pages. For example [1, 2, 1] describes three
    documents covering pages {0}, {1, 2} and {3}.

    Parameters
    ----------
    split : iterable of int
        Non-negative document lengths, in page order.

    Returns
    -------
    collections.defaultdict
        Maps every page number to the set of all page numbers in the same
        document (the page itself included). Each page gets its own set
        object. Missing keys default to an empty set.

    Raises
    ------
    ValueError
        If a document length is negative.
    '''
    out = defaultdict(set)
    start = 0
    for block_length in split:
        if block_length < 0:
            raise ValueError('document lengths must be non-negative')
        # Walk a running offset instead of repeatedly re-slicing a page list,
        # which copied the remaining pages once per document.
        stop = start + block_length
        block = range(start, stop)
        for page in block:
            out[page] = set(block)
        start = stop
    return out

In [13]:
def Bcubed(truth,pred):
    """Per-page BCubed precision, recall and F1 of a predicted segmentation.

    Parameters
    ----------
    truth, pred : sequence of int
        Document-length vectors (as accepted by ``make_index``) for the gold
        and predicted segmentation. Both must sum to the same page count.

    Returns
    -------
    pandas.DataFrame
        One row per page, index named 'PageNr', with columns
        'size' (pages in the page's true document), 'P', 'R' and 'F1'.

    Raises
    ------
    AssertionError
        If the two vectors do not cover the same number of pages.
    """
    assert sum(truth)==sum(pred), 'truth and pred must cover the same amount of pages'
    truth_index, pred_index = make_index(truth), make_index(pred)

    rows = {}
    for page, true_cluster in truth_index.items():
        pred_cluster = pred_index[page]
        overlap = len(true_cluster & pred_cluster)
        precision = overlap / len(pred_cluster)
        recall = overlap / len(true_cluster)
        rows[page] = {
            'size': len(true_cluster),
            'P': precision,
            'R': recall,
            'F1': (2 * precision * recall) / (precision + recall),
        }

    df = pd.DataFrame.from_dict(rows, orient='index')
    # The index label lives on the Index object; assigning `df.index_name`
    # would only attach an unrelated attribute to the frame.
    df.index.name = 'PageNr'
    return df


def MeanBcubed(truth,pred):
    """Average the per-page table from ``Bcubed`` column by column.

    Returns the result of ``DataFrame.mean()`` on that table, i.e. one mean
    value per column. The 'F1' entry is therefore the mean of the per-page
    F1 values, not an F1 computed from the mean P and R.
    """
    assert sum(truth)==sum(pred)  # same amount of pages
    per_page_scores = Bcubed(truth, pred)
    averaged = per_page_scores.mean()
    return averaged

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import defaultdict

# Keep only the pages whose cleaned text is present (non-null).
_c1 = c1[c1['text_y_cleaned'].notna()]
_c2 = c2[c2['text_y_cleaned'].notna()]

# Visual features Only

## Train C1, Predict C2

In [26]:
# Visual features only: fit on C1, evaluate on C2.
vis_features = ['font_diff3','crop_diff','header_diff','footer_diff']
model = LogisticRegression()
model.fit(_c1[vis_features], _c1['label'])

X_test = _c2[vis_features]
y_test = _c2['label']
true = y_test
# Run inference once and reuse the result.
preds = model.predict(X_test)
# Hand change_format private copies so that any in-place edit it makes cannot
# leak into the label column behind `y_test` or into `preds`.
vb_truth, vb_pred = change_format(y_test.values.copy()), change_format(preds.copy())
MeanBcubed(vb_truth, vb_pred)

size    69.707824
P        0.448520
R        0.827935
F1       0.369598
dtype: float64

## Train C2, Predict C1

In [27]:
# Visual features only: fit on C2, evaluate on C1.
vis_features = ['font_diff3','crop_diff','header_diff','footer_diff']
model = LogisticRegression()
model.fit(_c2[vis_features], _c2['label'])

X_test = _c1[vis_features]
y_test = _c1['label']
true = y_test
# Run inference once and reuse the result.
preds = model.predict(X_test)
# Hand change_format private copies so that any in-place edit it makes cannot
# leak into the label column behind `y_test` or into `preds`.
vb_truth, vb_pred = change_format(y_test.values.copy()), change_format(preds.copy())
MeanBcubed(vb_truth, vb_pred)

size    34.403323
P        0.369691
R        0.942905
F1       0.404325
dtype: float64

# TFIDF + visual results

## Train C1, Predict C2

In [28]:
# Visual features plus tf-idf similarity: fit on C1, evaluate on C2.
# (`vis_features` keeps its name although it now also holds a text feature.)
vis_features = ['font_diff3','crop_diff','header_diff','footer_diff','tfidf_sim']
model = LogisticRegression()
model.fit(_c1[vis_features], _c1['label'])

X_test = _c2[vis_features]
y_test = _c2['label']
true = y_test
# Run inference once and reuse the result.
preds = model.predict(X_test)
# Hand change_format private copies so that any in-place edit it makes cannot
# leak into the label column behind `y_test` or into `preds`.
vb_truth, vb_pred = change_format(y_test.values.copy()), change_format(preds.copy())
MeanBcubed(vb_truth, vb_pred)

size    69.707824
P        0.462274
R        0.819096
F1       0.379936
dtype: float64

## Train C2, Predict C1

In [29]:
# Visual features plus tf-idf similarity: fit on C2, evaluate on C1.
# (`vis_features` keeps its name although it now also holds a text feature.)
vis_features = ['font_diff3','crop_diff','header_diff','footer_diff', 'tfidf_sim']
model = LogisticRegression()
model.fit(_c2[vis_features], _c2['label'])

X_test = _c1[vis_features]
y_test = _c1['label']
true = y_test
# Run inference once and reuse the result.
preds = model.predict(X_test)
# Hand change_format private copies so that any in-place edit it makes cannot
# leak into the label column behind `y_test` or into `preds`.
vb_truth, vb_pred = change_format(y_test.values.copy()), change_format(preds.copy())
MeanBcubed(vb_truth, vb_pred)

size    34.403323
P        0.455809
R        0.942125
F1       0.490227
dtype: float64

# D2V + Visual

## Train C1, Predict C2

In [30]:
# Visual features plus doc2vec similarity: fit on C1, evaluate on C2.
# (`vis_features` keeps its name although it now also holds a text feature.)
vis_features = ['font_diff3','crop_diff','header_diff','footer_diff','d2v_sim']
model = LogisticRegression()
model.fit(_c1[vis_features], _c1['label'])

X_test = _c2[vis_features]
y_test = _c2['label']
true = y_test
# Run inference once and reuse the result.
preds = model.predict(X_test)
# Hand change_format private copies so that any in-place edit it makes cannot
# leak into the label column behind `y_test` or into `preds`.
vb_truth, vb_pred = change_format(y_test.values.copy()), change_format(preds.copy())
MeanBcubed(vb_truth, vb_pred)

size    69.707824
P        0.559806
R        0.768265
F1       0.446680
dtype: float64

## Train C2, Predict C1

In [31]:
# Visual features plus doc2vec similarity: fit on C2, evaluate on C1.
# (`vis_features` keeps its name although it now also holds a text feature.)
vis_features = ['font_diff3','crop_diff','header_diff','footer_diff', 'd2v_sim']
model = LogisticRegression()
model.fit(_c2[vis_features], _c2['label'])

X_test = _c1[vis_features]
y_test = _c1['label']
true = y_test
# Run inference once and reuse the result.
preds = model.predict(X_test)
# Hand change_format private copies so that any in-place edit it makes cannot
# leak into the label column behind `y_test` or into `preds`.
vb_truth, vb_pred = change_format(y_test.values.copy()), change_format(preds.copy())
MeanBcubed(vb_truth, vb_pred)

size    34.403323
P        0.471147
R        0.905422
F1       0.483325
dtype: float64

# BERT

In [2]:
# Page tables and BERT similarity scores used by the BERT experiments.
c1_old = pd.read_csv('dfs/c1_old.csv',index_col = 0)
c2_old = pd.read_csv('dfs/c2_old.csv',index_col = 0)
c1_bert = pd.read_csv('sim_scores/bert_skip_c1.csv',index_col = 0)
c2_bert = pd.read_csv('sim_scores/bert_skip_c2.csv',index_col = 0)

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles from a trusted source.
# Context managers close each file handle as soon as its pickle is read.
with open('pickles/c1_vectors.p','rb') as fh:
    v1 = pickle.load(fh)
with open('pickles/c2_vectors.p','rb') as fh:
    v2 = pickle.load(fh)

with open('pickles/c1_vectors_trunc.p','rb') as fh:
    v1_trunc = pickle.load(fh)
with open('pickles/c2_vectors_trunc.p','rb') as fh:
    v2_trunc = pickle.load(fh)

In [3]:
# Join the BERT score tables onto the page tables on ('full_name', 'page').
# This is an inner join (the pandas default), so pages without a matching
# score row are dropped.
c1_old = c1_old.merge(
    c1_bert.drop(columns=['text_y_cleaned', 'label']),
    how='inner',
    on=['full_name', 'page'],
)
c2_old = c2_old.merge(
    c2_bert.drop(columns=['text_y_cleaned', 'label']),
    how='inner',
    on=['full_name', 'page'],
)

In [5]:
from sklearn.neural_network import MLPClassifier
# MLP trained on the C1 vectors; its probabilities are later used as the
# `bert_proba` feature for C2.
# NOTE(review): assumes v1 rows line up one-to-one with c1_old rows — confirm.
clf1 = MLPClassifier(
    hidden_layer_sizes = (128,),
    batch_size = 50,
    solver = 'adam',
    verbose = 1,
    random_state = 0,  # fixed seed so repeated runs give the same model
)

clf1.fit(v1, c1_old['label'])

Iteration 1, loss = 0.40946184
Iteration 2, loss = 0.36183279
Iteration 3, loss = 0.34101073
Iteration 4, loss = 0.33127445
Iteration 5, loss = 0.31804524
Iteration 6, loss = 0.30892639
Iteration 7, loss = 0.30077472
Iteration 8, loss = 0.30326847
Iteration 9, loss = 0.29066891
Iteration 10, loss = 0.27566614
Iteration 11, loss = 0.27167211
Iteration 12, loss = 0.28626322
Iteration 13, loss = 0.26632670
Iteration 14, loss = 0.26244806
Iteration 15, loss = 0.25046571
Iteration 16, loss = 0.24937064
Iteration 17, loss = 0.24486053
Iteration 18, loss = 0.23642340
Iteration 19, loss = 0.23615901
Iteration 20, loss = 0.22873189
Iteration 21, loss = 0.22257550
Iteration 22, loss = 0.21612877
Iteration 23, loss = 0.21477054
Iteration 24, loss = 0.21301685
Iteration 25, loss = 0.20159179
Iteration 26, loss = 0.21875542
Iteration 27, loss = 0.20296074
Iteration 28, loss = 0.19578381
Iteration 29, loss = 0.21499388
Iteration 30, loss = 0.22900750
Iteration 31, loss = 0.19626225
Iteration 32, los

MLPClassifier(batch_size=50, hidden_layer_sizes=(128,), verbose=1)

In [6]:
# MLP trained on the C2 vectors; its probabilities are later used as the
# `bert_proba` feature for C1.
# NOTE(review): assumes v2 rows line up one-to-one with c2_old rows — confirm.
clf2 = MLPClassifier(
    hidden_layer_sizes = (128,),
    batch_size = 50,
    solver = 'adam',
    verbose = 1,
    random_state = 0,  # fixed seed so repeated runs give the same model
)

clf2.fit(v2, c2_old['label'])

Iteration 1, loss = 0.29521352
Iteration 2, loss = 0.23646435
Iteration 3, loss = 0.21727416
Iteration 4, loss = 0.21027434
Iteration 5, loss = 0.19552299
Iteration 6, loss = 0.19123976
Iteration 7, loss = 0.18425600
Iteration 8, loss = 0.17794381
Iteration 9, loss = 0.17570567
Iteration 10, loss = 0.17132961
Iteration 11, loss = 0.16588510
Iteration 12, loss = 0.16168129
Iteration 13, loss = 0.16047170
Iteration 14, loss = 0.15456410
Iteration 15, loss = 0.15122162
Iteration 16, loss = 0.14713748
Iteration 17, loss = 0.14600203
Iteration 18, loss = 0.14130635
Iteration 19, loss = 0.14005184
Iteration 20, loss = 0.13702810
Iteration 21, loss = 0.13365677
Iteration 22, loss = 0.13166521
Iteration 23, loss = 0.12912677
Iteration 24, loss = 0.12408592
Iteration 25, loss = 0.12340024
Iteration 26, loss = 0.12362981
Iteration 27, loss = 0.11717177
Iteration 28, loss = 0.11821284
Iteration 29, loss = 0.11102938
Iteration 30, loss = 0.11153758
Iteration 31, loss = 0.10972422
Iteration 32, los

MLPClassifier(batch_size=50, hidden_layer_sizes=(128,), verbose=1)

In [7]:
# Cross-corpus scoring: each corpus is scored by the classifier that was
# fitted on the other one. Column 1 of predict_proba is kept as the feature.
_c2_class_proba = clf1.predict_proba(v2)
c2_old['bert_proba'] = _c2_class_proba[:, 1]

_c1_class_proba = clf2.predict_proba(v1)
c1_old['bert_proba'] = _c1_class_proba[:, 1]

In [8]:
# Keep only the pages whose cleaned text is present (non-null).
_c1_old = c1_old[c1_old['text_y_cleaned'].notna()]
_c2_old = c2_old[c2_old['text_y_cleaned'].notna()]

In [49]:
# Show up to 200 rows when a frame is displayed.
pd.options.display.max_rows = 200
# Ad-hoc inspection snippets, kept disabled:
# c1[['page','crop_diff','font_diff3','footer_diff','header_diff','label']][:200]
# c1[c1['font_diff3'] == 0]

# Train on c1, predict c2

### Bert Vectors only

In [15]:
# BERT page vectors only: fit an MLP on C1, evaluate on C2.
X_train = v1
y_train = c1_old['label']
X_test = v2
y_test = c2_old['label']

clf = MLPClassifier(
    hidden_layer_sizes = (128,),
    batch_size = 50,
    solver = 'adam',
    verbose = 0,
    random_state = 0,  # fixed seed so repeated runs give the same score
)

clf.fit(X_train, y_train)

true = y_test
# Run inference once and reuse the result.
preds = clf.predict(X_test)
# Hand change_format private copies so that any in-place edit it makes cannot
# leak into the label column behind `y_test` or into `preds`.
vb_truth, vb_pred = change_format(y_test.values.copy()), change_format(preds.copy())
print(MeanBcubed(vb_truth,vb_pred))

size    68.910383
P        0.684255
R        0.693042
F1       0.537675
dtype: float64


### Visual + Bert_similarity i-1

In [53]:
# Visual features plus BERT similarity: fit on C1, evaluate on C2.
features = ['font_diff3','crop_diff','bert_sim']

X_train = c1_old[features]
y_train = c1_old['label']
X_test = c2_old[features]
y_test = c2_old['label']

model = LogisticRegression()

model.fit(X_train, y_train)

true = y_test
# Run inference once and reuse the result.
preds = model.predict(X_test)
# Hand change_format private copies so that any in-place edit it makes cannot
# leak into the label column behind `y_test` or into `preds`.
vb_truth, vb_pred = change_format(y_test.values.copy()), change_format(preds.copy())
print(MeanBcubed(vb_truth,vb_pred))

size    68.910383
P        0.434505
R        0.833364
F1       0.355291
dtype: float64


### Visual + Bert_proba

In [21]:
# Visual features plus the MLP's BERT probability: fit on C1, evaluate on C2.
features = ['font_diff3','crop_diff','bert_proba']

X_train = c1_old[features]
y_train = c1_old['label']
X_test = c2_old[features]
y_test = c2_old['label']

model = LogisticRegression()

model.fit(X_train, y_train)

true = y_test
# Run inference once and reuse the result.
preds = model.predict(X_test)
# Hand change_format private copies so that any in-place edit it makes cannot
# leak into the label column behind `y_test` or into `preds`.
vb_truth, vb_pred = change_format(y_test.values.copy()), change_format(preds.copy())
print(MeanBcubed(vb_truth,vb_pred))


size    68.910383
P        0.593553
R        0.724658
F1       0.431767
dtype: float64


# Train on c2, predict c1

### Bert vectors only

In [22]:
# BERT page vectors only: fit an MLP on C2, evaluate on C1.
X_train = v2
y_train = c2_old['label']
X_test = v1
y_test = c1_old['label']

clf = MLPClassifier(
    hidden_layer_sizes = (128,),
    batch_size = 50,
    solver = 'adam',
    verbose = 0,
    random_state = 0,  # fixed seed so repeated runs give the same score
)

clf.fit(X_train, y_train)

true = y_test
# Run inference once and reuse the result.
preds = clf.predict(X_test)
# Hand change_format private copies so that any in-place edit it makes cannot
# leak into the label column behind `y_test` or into `preds`.
vb_truth, vb_pred = change_format(y_test.values.copy()), change_format(preds.copy())
print(MeanBcubed(vb_truth,vb_pred))

size    34.380975
P        0.648183
R        0.725552
F1       0.537412
dtype: float64


### Visual + Bert_similarity i-1

In [34]:
# Visual features plus BERT similarity: fit on C2, evaluate on C1.
features = ['font_diff3','crop_diff','bert_sim']

X_train = c2_old[features]
y_train = c2_old['label']
X_test = c1_old[features]
y_test = c1_old['label']

clf = LogisticRegression()

clf.fit(X_train, y_train)

true = y_test
# Run inference once and reuse the result.
preds = clf.predict(X_test)
# Hand change_format private copies so that any in-place edit it makes cannot
# leak into the label column behind `y_test` or into `preds`.
vb_truth, vb_pred = change_format(y_test.values.copy()), change_format(preds.copy())
print(MeanBcubed(vb_truth,vb_pred))


size    34.380975
P        0.150656
R        0.962881
F1       0.183027
dtype: float64


### Visual + Bert_proba

In [35]:
# Visual features plus the MLP's BERT probability: fit on C2, evaluate on C1.
features = ['font_diff3','crop_diff','bert_proba']

X_train = c2_old[features]
y_train = c2_old['label']
X_test = c1_old[features]
y_test = c1_old['label']

clf = LogisticRegression()

clf.fit(X_train, y_train)

true = y_test
# Run inference once and reuse the result.
preds = clf.predict(X_test)
# Hand change_format private copies so that any in-place edit it makes cannot
# leak into the label column behind `y_test` or into `preds`.
vb_truth, vb_pred = change_format(y_test.values.copy()), change_format(preds.copy())
print(MeanBcubed(vb_truth,vb_pred))


size    34.380975
P        0.350439
R        0.963462
F1       0.420291
dtype: float64


## MLP - BERT + Visual

In [36]:
def _append_visual(vectors, pages):
    """Extend each vector with the 'font_diff3' and 'crop_diff' of its page.

    Entry ``i`` of ``vectors`` is paired with positional row ``i`` of
    ``pages``; the result is a list with one extended array per vector.
    """
    extended = []
    for row_pos in tqdm(range(len(vectors))):
        page_row = pages.iloc[row_pos]
        visual = [page_row['font_diff3'], page_row['crop_diff']]
        extended.append(np.append(vectors[row_pos], visual))
    return extended


v1_plus = _append_visual(v1, c1_old)
v2_plus = _append_visual(v2, c2_old)

100%|██████████| 19101/19101 [00:03<00:00, 5043.83it/s]
100%|██████████| 16537/16537 [00:03<00:00, 5085.18it/s]


### Train C1, Predict C2

In [38]:
# BERT vectors extended with visual features: fit an MLP on C1, evaluate on C2.
X_train = v1_plus
y_train = c1_old['label']
X_test = v2_plus
y_test = c2_old['label']

clf = MLPClassifier(
    hidden_layer_sizes = (128,),
    batch_size = 50,
    solver = 'adam',
    verbose = 0,
    random_state = 0,  # fixed seed so repeated runs give the same score
)

clf.fit(X_train, y_train)

true = y_test
# Run inference once and reuse the result.
preds = clf.predict(X_test)
# Hand change_format private copies so that any in-place edit it makes cannot
# leak into the label column behind `y_test` or into `preds`.
vb_truth, vb_pred = change_format(y_test.values.copy()), change_format(preds.copy())
print(MeanBcubed(vb_truth,vb_pred))


size    68.910383
P        0.633823
R        0.720345
F1       0.498019
dtype: float64


### Train C2, Predict C1

In [39]:
# BERT vectors extended with visual features: fit an MLP on C2, evaluate on C1.
X_train = v2_plus
y_train = c2_old['label']
X_test = v1_plus
y_test = c1_old['label']

clf = MLPClassifier(
    hidden_layer_sizes = (128,),
    batch_size = 50,
    solver = 'adam',
    verbose = 0,
    random_state = 0,  # fixed seed so repeated runs give the same score
)

clf.fit(X_train, y_train)

true = y_test
# Run inference once and reuse the result.
preds = clf.predict(X_test)
# Hand change_format private copies so that any in-place edit it makes cannot
# leak into the label column behind `y_test` or into `preds`.
vb_truth, vb_pred = change_format(y_test.values.copy()), change_format(preds.copy())
print(MeanBcubed(vb_truth,vb_pred))


size    34.380975
P        0.783197
R        0.715920
F1       0.631302
dtype: float64
