In [1]:
import pandas as pd
import sys
import os
sys.path.append(os.path.dirname(sys.path[0]) + '/src')
from ast import literal_eval
from datasets.ie_hyperion_dataset import find_word_bounds, clean_text

# Load the test split; the 'Stralci' and 'Repertori' columns are parsed with literal_eval while reading.
df = pd.read_csv('../data/processed/pipeline/test/ie_hyperion.csv', converters={'Stralci': literal_eval, 'Repertori': literal_eval})
# Uncomment the next line to work on the first 50 rows only.
#df = df.head(50)
# Run clean_text on the full text and on every span, so both go through the same cleaning.
df['Testo'] = df['Testo'].map(clean_text)
df['Stralci'] = df['Stralci'].map(lambda x: [clean_text(s) for s in x])
# Bounds of the reference spans inside the text, one list per row (computed by find_word_bounds).
df['Bounds'] = df.apply(lambda x: find_word_bounds(x['Stralci'], x['Testo']), axis=1).values.tolist()

In [2]:
def intersection(A, B):
    """Return how many positions two inclusive intervals have in common.

    Args:
        A: first interval, indexable as ``(start, end)`` with both ends inclusive.
        B: second interval, same layout as ``A``.

    Returns:
        ``end - start + 1`` of the overlapping part, or 0 when the intervals
        do not overlap.
    """
    lo = max(A[0], B[0])
    hi = min(A[1], B[1])
    # Both ends are inclusive, hence the +1; lo > hi means the intervals are disjoint.
    return 0 if lo > hi else hi - lo + 1


def C(pred_bound:tuple, gt_bound:tuple, pred_rep:str, gt_rep:str, norm_factor:int) -> float:
    """Score one predicted span against one reference span.

    Args:
        pred_bound: predicted span as ``(start, end)``.
        gt_bound: reference span as ``(start, end)``.
        pred_rep: label assigned to the predicted span.
        gt_rep: label of the reference span.
        norm_factor: divisor applied to the overlap size.

    Returns:
        0 when the two labels differ; otherwise the size of the overlap
        (as computed by ``intersection``) divided by ``norm_factor``.
    """
    if pred_rep == gt_rep:
        overlap = intersection(pred_bound, gt_bound)
        return overlap / norm_factor
    # A label mismatch contributes nothing, whatever the overlap is.
    return 0

def precision(pred_bounds:list, gt_bounds:list, pred_reps:list, gt_reps:list) -> float:
    """Average, over predicted spans, of the label-matched overlap with the reference spans.

    Every (predicted, reference) pair is scored with ``C``, using the length
    of the predicted span as the normalisation factor; the scores are summed
    and divided by the number of predicted spans.

    Args:
        pred_bounds: predicted spans as ``(start, end)`` pairs, ends inclusive.
        gt_bounds: reference spans, same layout.
        pred_reps: labels of the predicted spans, parallel to ``pred_bounds``.
        gt_reps: labels of the reference spans, parallel to ``gt_bounds``.

    Returns:
        The precision score, or 0.0 when there are no predicted spans
        (previously this case raised ``ZeroDivisionError``).
    """
    n_pred = len(pred_bounds)
    if n_pred == 0:
        # Nothing was predicted: define precision as 0 instead of dividing by zero.
        return 0.0

    curr_sum = 0
    for i, pred in enumerate(pred_bounds):
        # The predicted span's own length is the normaliser for all of its pairs.
        pred_len = pred[1] - pred[0] + 1
        for j, gt in enumerate(gt_bounds):
            curr_sum += C(pred, gt, pred_reps[i], gt_reps[j], pred_len)
    return curr_sum / n_pred

def recall(pred_bounds:list, gt_bounds:list, pred_reps:list, gt_reps:list) -> float:
    """Average, over reference spans, of the label-matched overlap with the predicted spans.

    Every (predicted, reference) pair is scored with ``C``, using the length
    of the reference span as the normalisation factor; the scores are summed
    and divided by the number of reference spans.

    Args:
        pred_bounds: predicted spans as ``(start, end)`` pairs, ends inclusive.
        gt_bounds: reference spans, same layout.
        pred_reps: labels of the predicted spans, parallel to ``pred_bounds``.
        gt_reps: labels of the reference spans, parallel to ``gt_bounds``.

    Returns:
        The recall score, or 0.0 when there are no reference spans
        (previously this case raised ``ZeroDivisionError``).
    """
    n_gt = len(gt_bounds)
    if n_gt == 0:
        # No reference spans: define recall as 0 instead of dividing by zero.
        return 0.0

    # Reference span lengths do not depend on the prediction, so compute them once.
    gt_lengths = [gt[1] - gt[0] + 1 for gt in gt_bounds]

    curr_sum = 0
    for i, pred in enumerate(pred_bounds):
        for j, gt in enumerate(gt_bounds):
            curr_sum += C(pred, gt, pred_reps[i], gt_reps[j], gt_lengths[j])
    return curr_sum / n_gt

def f1(prec:float, rec:float) -> float:
    """Combine precision and recall into their harmonic mean.

    Args:
        prec: precision value.
        rec: recall value.

    Returns:
        ``2 * prec * rec / (prec + rec)``, or 0 when either input is falsy
        (which also avoids a division by zero when both are 0).
    """
    if not (prec and rec):
        return 0
    return 2 * ((prec * rec)/(prec + rec))

def IoU(pred_bounds:list, gt_bounds:list, pred_reps:list, gt_reps:list) -> float:
    """Average, over predicted spans, of the label-matched intersection-over-union.

    Every (predicted, reference) pair is scored with ``C``. The normalisation
    factor is the length of the range going from the smaller start to the
    larger end of the two spans, which is the size of their union whenever
    they overlap (non-overlapping pairs score 0 anyway).

    Args:
        pred_bounds: predicted spans as ``(start, end)`` pairs, ends inclusive.
        gt_bounds: reference spans, same layout.
        pred_reps: labels of the predicted spans, parallel to ``pred_bounds``.
        gt_reps: labels of the reference spans, parallel to ``gt_bounds``.

    Returns:
        The summed pair scores divided by the number of predicted spans, or
        0.0 when there are no predicted spans (previously this case raised
        ``ZeroDivisionError``).
    """
    n_pred = len(pred_bounds)
    if n_pred == 0:
        # Nothing was predicted: define the score as 0 instead of dividing by zero.
        return 0.0

    curr_sum = 0
    for i, pred in enumerate(pred_bounds):
        for j, gt in enumerate(gt_bounds):
            # Length of the smallest range covering both spans (inclusive ends).
            cover_len = max(pred[1], gt[1]) - min(pred[0], gt[0]) + 1
            curr_sum += C(pred, gt, pred_reps[i], gt_reps[j], cover_len)
    return curr_sum / n_pred



In [3]:
def normalize(bounds:list, reps:list):
    """Fuse runs of consecutive spans that carry the same label.

    Args:
        bounds: spans as ``(start, end)`` pairs, in order.
        reps: labels parallel to ``bounds``.

    Returns:
        A two-element ``pd.Series``: the merged spans and their labels. A
        merged span keeps the start of the first span of the run and takes
        the end of the last one.
    """
    merged_bounds, merged_reps = [], []

    for idx, span in enumerate(bounds):
        label = reps[idx]
        if merged_reps and merged_reps[-1] == label:
            # Same label as the previous span: stretch the previous span's end.
            run_start = merged_bounds[-1][0]
            merged_bounds[-1] = (run_start, span[1])
            continue
        merged_bounds.append(span)
        merged_reps.append(label)

    return pd.Series([merged_bounds, merged_reps])

## segmenter ottimo + BERT

In [4]:
from models.bert_rep import BertRep

# Classify the reference spans directly, so this section measures the classifier alone.
bert_rep = BertRep()
df['Repertori_predetti'] = df['Stralci'].map(bert_rep.predict).values.tolist()

In [5]:
# Per-row metrics; the reference bounds are passed as both prediction and reference, so only the labels can differ.
df['Precision'] =  df.apply(lambda x: precision(x['Bounds'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)
df['Recall'] =  df.apply(lambda x: recall(x['Bounds'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)
df['F1'] =  df.apply(lambda x: f1(x['Precision'], x['Recall']), axis=1)
df['IoU'] =  df.apply(lambda x: IoU(x['Bounds'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)

# Means over rows, printed in the order precision, recall, F1, IoU.
print('OPT SEG + BERT')
print(df['Precision'].mean())
print(df['Recall'].mean())
print(df['F1'].mean())
print(df['IoU'].mean())

OPT SEG + BERT
0.41755630448791553
0.4175581330692732
0.4175572070638188
0.4175545675676947


In [6]:
# Merge consecutive spans that received the same predicted label (normalize returns the two new columns).
df[['Norm_bounds', 'Norm_rep']] =  df.apply(lambda x: normalize(x['Bounds'], x['Repertori_predetti']), axis=1)

# Score the merged spans against the original, unmerged reference spans.
df['Norm_precision'] =  df.apply(lambda x: precision(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)
df['Norm_recall'] =  df.apply(lambda x: recall(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)
df['Norm_f1'] =  df.apply(lambda x: f1(x['Norm_precision'], x['Norm_recall']), axis=1)
df['Norm_IoU'] =  df.apply(lambda x: IoU(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)

# Means over rows, printed in the order precision, recall, F1, IoU.
print('OPT SEG + BERT norm')
print(df['Norm_precision'].mean())
print(df['Norm_recall'].mean())
print(df['Norm_f1'].mean())
print(df['Norm_IoU'].mean())

OPT SEG + BERT norm
0.419067510965262
0.4175526557238897
0.4175703029401396
0.419067510965262


## NLTK + BERT

In [4]:
from models.nltk_segmenter import NLTKSegmenter


# Segment every text with the NLTK segmenter, then locate the predicted spans in the text with find_word_bounds.
nltk_seg = NLTKSegmenter()
df['Stralci_predetti'] = df['Testo'].map(nltk_seg.predict).values.tolist()
df['Bounds_predetti'] = df.apply(lambda x: find_word_bounds(x['Stralci_predetti'], x['Testo']), axis=1).values.tolist()

[nltk_data] Downloading package punkt to /Users/michele/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
from models.bert_rep import BertRep

# Classify the predicted spans; this overwrites the 'Repertori_predetti' column of any previous section.
bert_rep = BertRep()
df['Repertori_predetti'] = df['Stralci_predetti'].map(bert_rep.predict).values.tolist()

In [6]:
# Per-row metrics of the predicted spans and labels against the reference spans and labels.
df['Precision'] =  df.apply(lambda x: precision(x['Bounds_predetti'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)
df['Recall'] =  df.apply(lambda x: recall(x['Bounds_predetti'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)
df['F1'] =  df.apply(lambda x: f1(x['Precision'], x['Recall']), axis=1)
df['IoU'] =  df.apply(lambda x: IoU(x['Bounds_predetti'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)

# Means over rows, printed in the order precision, recall, F1, IoU.
print('NLTK + BERT')
print(df['Precision'].mean())
print(df['Recall'].mean())
print(df['F1'].mean())
print(df['IoU'].mean())

0.004718615576747079
0.0046154013551244055
0.004519172355641398
0.004012308739479754


In [None]:
# Merge consecutive predicted spans that received the same label (normalize returns the two new columns).
df[['Norm_bounds', 'Norm_rep']] =  df.apply(lambda x: normalize(x['Bounds_predetti'], x['Repertori_predetti']), axis=1)

# Score the merged spans against the reference spans.
df['Norm_precision'] =  df.apply(lambda x: precision(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)
df['Norm_recall'] =  df.apply(lambda x: recall(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)
df['Norm_f1'] =  df.apply(lambda x: f1(x['Norm_precision'], x['Norm_recall']), axis=1)
df['Norm_IoU'] =  df.apply(lambda x: IoU(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)

# Means over rows, printed in the order precision, recall, F1, IoU.
# The label names the NLTK pipeline of this section (it used to read 'BERT + BERT norm').
print('NLTK + BERT norm')
print(df['Norm_precision'].mean())
print(df['Norm_recall'].mean())
print(df['Norm_f1'].mean())
print(df['Norm_IoU'].mean())

## BERT + BERT

In [4]:
from models.bert_segmenter import BertSegmenter

# Segment every text with the BERT segmenter, then locate the predicted spans in the text with find_word_bounds.
bert_seg = BertSegmenter()
df['Stralci_predetti'] = df['Testo'].map(bert_seg.predict).values.tolist()
df['Bounds_predetti'] = df.apply(lambda x: find_word_bounds(x['Stralci_predetti'], x['Testo']), axis=1).values.tolist()

In [7]:
from models.bert_rep import BertRep

# Classify the predicted spans; this overwrites the 'Repertori_predetti' column of any previous section.
bert_rep = BertRep()
df['Repertori_predetti'] = df['Stralci_predetti'].map(bert_rep.predict).values.tolist()

In [8]:
# Per-row metrics of the predicted spans and labels against the reference spans and labels.
df['Precision'] =  df.apply(lambda x: precision(x['Bounds_predetti'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)
df['Recall'] =  df.apply(lambda x: recall(x['Bounds_predetti'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)
df['F1'] =  df.apply(lambda x: f1(x['Precision'], x['Recall']), axis=1)
df['IoU'] =  df.apply(lambda x: IoU(x['Bounds_predetti'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)

# Means over rows, printed in the order precision, recall, F1, IoU.
print('BERT + BERT')
print(df['Precision'].mean())
print(df['Recall'].mean())
print(df['F1'].mean())
print(df['IoU'].mean())

0.37369420684781746
0.36555002400034536
0.3623481486134643
0.33127296517234917


In [None]:
# Merge consecutive predicted spans that received the same label (normalize returns the two new columns).
df[['Norm_bounds', 'Norm_rep']] =  df.apply(lambda x: normalize(x['Bounds_predetti'], x['Repertori_predetti']), axis=1)

# Score the merged spans against the reference spans.
df['Norm_precision'] =  df.apply(lambda x: precision(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)
df['Norm_recall'] =  df.apply(lambda x: recall(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)
df['Norm_f1'] =  df.apply(lambda x: f1(x['Norm_precision'], x['Norm_recall']), axis=1)
df['Norm_IoU'] =  df.apply(lambda x: IoU(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)

# Means over rows, printed in the order precision, recall, F1, IoU.
print('BERT + BERT norm')
print(df['Norm_precision'].mean())
print(df['Norm_recall'].mean())
print(df['Norm_f1'].mean())
print(df['Norm_IoU'].mean())

0.009532639896573889
0.010532637982794465
0.009407292121459798
0.009437041903007072


## NLTK + CLS ottimo

In [None]:
from models.nltk_segmenter import NLTKSegmenter


# Segment every text with the NLTK segmenter, then locate the predicted spans in the text with find_word_bounds.
nltk_seg = NLTKSegmenter()
df['Stralci_predetti'] = df['Testo'].map(nltk_seg.predict).values.tolist()
df['Bounds_predetti'] = df.apply(lambda x: find_word_bounds(x['Stralci_predetti'], x['Testo']), axis=1).values.tolist()

[nltk_data] Downloading package punkt to /Users/michele/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import numpy as np
def optimal_rep(pred_bounds: list, gt_bounds:list, reps:list):
    """Give each predicted span the label of the reference span it overlaps most.

    Args:
        pred_bounds: predicted spans as ``(start, end)`` pairs.
        gt_bounds: reference spans as ``(start, end)`` pairs.
        reps: labels of the reference spans, parallel to ``gt_bounds``.

    Returns:
        One label per predicted span. ``np.argmax`` returns the first maximum,
        so ties (including no overlap at all) go to the earliest reference span.
    """
    return [
        reps[np.argmax([intersection(span, gt) for gt in gt_bounds])]
        for span in pred_bounds
    ]


# Oracle classifier: label each predicted span after the reference span it overlaps most.
df['Repertori_predetti'] = df.apply(lambda x: optimal_rep(x['Bounds_predetti'], x['Bounds'], x['Repertori']), axis=1).values.tolist()

In [None]:
# Per-row metrics of the predicted spans and labels against the reference spans and labels.
df['Precision'] =  df.apply(lambda x: precision(x['Bounds_predetti'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)
df['Recall'] =  df.apply(lambda x: recall(x['Bounds_predetti'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)
df['F1'] =  df.apply(lambda x: f1(x['Precision'], x['Recall']), axis=1)
df['IoU'] =  df.apply(lambda x: IoU(x['Bounds_predetti'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)

# Means over rows, printed in the order precision, recall, F1, IoU.
print('NLTK + CLS ottimo')
print(df['Precision'].mean())
print(df['Recall'].mean())
print(df['F1'].mean())
print(df['IoU'].mean())

0.9467603903230071
0.8812328602948418
0.9019201022407155
0.6179222972511701


In [None]:
# Merge consecutive predicted spans that received the same label (normalize returns the two new columns).
df[['Norm_bounds', 'Norm_rep']] =  df.apply(lambda x: normalize(x['Bounds_predetti'], x['Repertori_predetti']), axis=1)

# Score the merged spans against the reference spans.
df['Norm_precision'] =  df.apply(lambda x: precision(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)
df['Norm_recall'] =  df.apply(lambda x: recall(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)
df['Norm_f1'] =  df.apply(lambda x: f1(x['Norm_precision'], x['Norm_recall']), axis=1)
df['Norm_IoU'] =  df.apply(lambda x: IoU(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)

# Means over rows, printed in the order precision, recall, F1, IoU.
print('NLTK + CLS ottimo norm')
print(df['Norm_precision'].mean())
print(df['Norm_recall'].mean())
print(df['Norm_f1'].mean())
print(df['Norm_IoU'].mean())

0.9445165782946999
0.8812201295162867
0.9010714143714191
0.9390283325194846


## BERT + CLS ottimo

In [None]:
from models.bert_segmenter import BertSegmenter

# Segment every text with the BERT segmenter, then locate the predicted spans in the text with find_word_bounds.
bert_seg = BertSegmenter()
df['Stralci_predetti'] = df['Testo'].map(bert_seg.predict).values.tolist()
df['Bounds_predetti'] = df.apply(lambda x: find_word_bounds(x['Stralci_predetti'], x['Testo']), axis=1).values.tolist()

In [None]:
import numpy as np
def optimal_rep(pred_bounds: list, gt_bounds:list, reps:list):
    """Pick, for every predicted span, the label of the best-overlapping reference span.

    This repeats the definition from the 'NLTK + CLS ottimo' section, so the
    later definition is the one in effect once both cells have run.

    Args:
        pred_bounds: predicted spans as ``(start, end)`` pairs.
        gt_bounds: reference spans as ``(start, end)`` pairs.
        reps: labels of the reference spans, parallel to ``gt_bounds``.

    Returns:
        A list with one label per predicted span; on ties ``np.argmax``
        selects the earliest reference span.
    """
    chosen = []
    for span in pred_bounds:
        overlaps = [intersection(span, gt) for gt in gt_bounds]
        best_idx = np.argmax(overlaps)
        chosen.append(reps[best_idx])
    return chosen


# Oracle classifier: label each predicted span after the reference span it overlaps most.
df['Repertori_predetti'] = df.apply(lambda x: optimal_rep(x['Bounds_predetti'], x['Bounds'], x['Repertori']), axis=1).values.tolist()

In [None]:
# Per-row metrics of the predicted spans and labels against the reference spans and labels.
df['Precision'] =  df.apply(lambda x: precision(x['Bounds_predetti'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)
df['Recall'] =  df.apply(lambda x: recall(x['Bounds_predetti'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)
df['F1'] =  df.apply(lambda x: f1(x['Precision'], x['Recall']), axis=1)
df['IoU'] =  df.apply(lambda x: IoU(x['Bounds_predetti'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)

# Means over rows, printed in the order precision, recall, F1, IoU.
print('BERT + CLS ottimo')

print(df['Precision'].mean())
print(df['Recall'].mean())
print(df['F1'].mean())
print(df['IoU'].mean())

0.9375757679460751
0.842296912752431
0.8705571397106444
0.6313146682518852


In [None]:
# Merge consecutive predicted spans that received the same label (normalize returns the two new columns).
df[['Norm_bounds', 'Norm_rep']] =  df.apply(lambda x: normalize(x['Bounds_predetti'], x['Repertori_predetti']), axis=1)

# Score the merged spans against the reference spans.
df['Norm_precision'] =  df.apply(lambda x: precision(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)
df['Norm_recall'] =  df.apply(lambda x: recall(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)
df['Norm_f1'] =  df.apply(lambda x: f1(x['Norm_precision'], x['Norm_recall']), axis=1)
df['Norm_IoU'] =  df.apply(lambda x: IoU(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)

# Means over rows, printed in the order precision, recall, F1, IoU.
print('BERT + CLS ottimo norm')
print(df['Norm_precision'].mean())
print(df['Norm_recall'].mean())
print(df['Norm_f1'].mean())
print(df['Norm_IoU'].mean())

0.936950124028952
0.8420045995256696
0.8706209693315267
0.8993033577261793


## NLTK + CLS random

In [None]:
from models.nltk_segmenter import NLTKSegmenter


# Segment every text with the NLTK segmenter, then locate the predicted spans in the text with find_word_bounds.
nltk_seg = NLTKSegmenter()
df['Stralci_predetti'] = df['Testo'].map(nltk_seg.predict).values.tolist()
df['Bounds_predetti'] = df.apply(lambda x: find_word_bounds(x['Stralci_predetti'], x['Testo']), axis=1).values.tolist()

[nltk_data] Downloading package punkt to /Users/michele/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from random import randint
from random import seed
from datasets.hyperion_dataset import decode_labels

# Seed the random module so the random labels drawn below are the same on every run.
seed(1464)

def rand_cls(n:int) -> list:
    """Draw ``n`` random class ids and decode them into labels.

    Args:
        n: how many labels to draw.

    Returns:
        Whatever ``decode_labels`` returns for the list of ``n`` integers
        drawn uniformly from 0 to 22 inclusive.
    """
    label_ids = []
    for _ in range(n):
        # randint is inclusive on both ends, so ids range over 0..22.
        label_ids.append(randint(0,22))
    return decode_labels(label_ids)

# Random baseline: draw one random label for every predicted span of the row.
df['Repertori_predetti'] = df.apply(lambda x: rand_cls(len(x['Bounds_predetti'])), axis=1).values.tolist()

In [None]:
# Per-row metrics of the predicted spans and labels against the reference spans and labels.
df['Precision'] =  df.apply(lambda x: precision(x['Bounds_predetti'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)
df['Recall'] =  df.apply(lambda x: recall(x['Bounds_predetti'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)
df['F1'] =  df.apply(lambda x: f1(x['Precision'], x['Recall']), axis=1)
df['IoU'] =  df.apply(lambda x: IoU(x['Bounds_predetti'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)

# Means over rows, printed in the order precision, recall, F1, IoU.
print('NLTK + CLS random')
print(df['Precision'].mean())
print(df['Recall'].mean())
print(df['F1'].mean())
print(df['IoU'].mean())

0.044407641437046515
0.042256647929365884
0.040739674733738025
0.030087476034809615


In [None]:
# Merge consecutive predicted spans that received the same label (normalize returns the two new columns).
df[['Norm_bounds', 'Norm_rep']] =  df.apply(lambda x: normalize(x['Bounds_predetti'], x['Repertori_predetti']), axis=1)

# Score the merged spans against the reference spans.
df['Norm_precision'] =  df.apply(lambda x: precision(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)
df['Norm_recall'] =  df.apply(lambda x: recall(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)
df['Norm_f1'] =  df.apply(lambda x: f1(x['Norm_precision'], x['Norm_recall']), axis=1)
df['Norm_IoU'] =  df.apply(lambda x: IoU(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)

# Means over rows, printed in the order precision, recall, F1, IoU.
print('NLTK + CLS random norm')
print(df['Norm_precision'].mean())
print(df['Norm_recall'].mean())
print(df['Norm_f1'].mean())
print(df['Norm_IoU'].mean())

0.04440255948854607
0.042256647929365884
0.04057405105963676
0.0302209534449937


## BERT + CLS random

In [None]:
from models.bert_segmenter import BertSegmenter

# Segment every text with the BERT segmenter, then locate the predicted spans in the text with find_word_bounds.
bert_seg = BertSegmenter()
df['Stralci_predetti'] = df['Testo'].map(bert_seg.predict).values.tolist()
df['Bounds_predetti'] = df.apply(lambda x: find_word_bounds(x['Stralci_predetti'], x['Testo']), axis=1).values.tolist()

In [None]:
from random import randint
from random import seed
from datasets.hyperion_dataset import decode_labels

# Seed the random module so the random labels drawn below are the same on every run.
seed(1464)

def rand_cls(n:int) -> list:
    """Produce ``n`` randomly chosen labels.

    This repeats the definition from the 'NLTK + CLS random' section.

    Args:
        n: number of labels to generate.

    Returns:
        The result of ``decode_labels`` applied to ``n`` integers, each drawn
        with ``randint(0, 22)`` (both ends inclusive).
    """
    drawn_ids = list(randint(0,22) for _ in range(n))
    return decode_labels(drawn_ids)

# Random baseline: draw one random label for every predicted span of the row.
df['Repertori_predetti'] = df.apply(lambda x: rand_cls(len(x['Bounds_predetti'])), axis=1).values.tolist()

In [None]:
# Per-row metrics of the predicted spans and labels against the reference spans and labels.
df['Precision'] =  df.apply(lambda x: precision(x['Bounds_predetti'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)
df['Recall'] =  df.apply(lambda x: recall(x['Bounds_predetti'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)
df['F1'] =  df.apply(lambda x: f1(x['Precision'], x['Recall']), axis=1)
df['IoU'] =  df.apply(lambda x: IoU(x['Bounds_predetti'], x['Bounds'], x['Repertori_predetti'], x['Repertori']), axis=1)

# Means over rows, printed in the order precision, recall, F1, IoU.
print('BERT + CLS random')

print(df['Precision'].mean())
print(df['Recall'].mean())
print(df['F1'].mean())
print(df['IoU'].mean())

0.043717936953035906
0.04044134381853343
0.03915551941264637
0.02943148860330272


In [None]:
# Merge consecutive predicted spans that received the same label (normalize returns the two new columns).
df[['Norm_bounds', 'Norm_rep']] =  df.apply(lambda x: normalize(x['Bounds_predetti'], x['Repertori_predetti']), axis=1)

# Score the merged spans against the reference spans.
df['Norm_precision'] =  df.apply(lambda x: precision(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)
df['Norm_recall'] =  df.apply(lambda x: recall(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)
df['Norm_f1'] =  df.apply(lambda x: f1(x['Norm_precision'], x['Norm_recall']), axis=1)
df['Norm_IoU'] =  df.apply(lambda x: IoU(x['Norm_bounds'], x['Bounds'], x['Norm_rep'], x['Repertori']), axis=1)

# Means over rows, printed in the order precision, recall, F1, IoU.
print('BERT + CLS random norm')
print(df['Norm_precision'].mean())
print(df['Norm_recall'].mean())
print(df['Norm_f1'].mean())
print(df['Norm_IoU'].mean())

0.043453629536277356
0.04044134381853343
0.03903155818616637
0.029874160151226164


## RISULTATI da cluster
NLTK + BERT
0.2990602818274286
0.29751360751204853
0.2897168291168118
0.20986743884372597

NLTK + BERT norm
0.2881645483627048
0.29751360751204853
0.2828495318043233
0.23518775892093524

BERT + BERT
0.3175951768074362
0.3013790273821301
0.29867822523716175
0.2523689405834198

BERT + BERT norm
0.3147392277664459
0.3013577509082085
0.29698474595848295
0.2675877796407899

NLTK + CLS ottimo
0.9467603903230075
0.8812328602948415
0.9019201022407156
0.6179222972511671

NLTK + CLS ottimo norm
0.9445165782947008
0.8812201295162867
0.9010714143714188
0.9390283325194851

BERT + CLS ottimo
0.9463897708207414
0.8688017717594765
0.8917464576750034
0.6666620091429322

BERT + CLS ottimo norm
0.9470870050957066
0.8689860748196476
0.892520978540389
0.9293125056746196

NLTK + CLS random
0.044407641437046494
0.04225664792936588
0.04073967473373807
0.03008747603480959

NLTK + CLS random norm
0.04440255948854607
0.04225664792936588
0.040574051059636795
0.030220953444993686

BERT + CLS random
0.03947437441425662
0.038680462145539794
0.03676656673673848
0.027945803398262345

BERT + CLS random norm
0.039539754353300004
0.03867744216535087
0.03667237117688916
0.02827673305407843