In [1]:
%load_ext autoreload

In [33]:
import json
import pandas as pd
import numpy as np
import sys

# Never truncate cell contents when a DataFrame with long text columns is displayed.
pd.set_option('display.max_colwidth', None)

# Make project-local modules under ./src-py importable (presumably where
# `sbert_training` lives — confirm). The membership check keeps this cell
# idempotent: re-running it no longer appends a duplicate entry each time.
if './src-py' not in sys.path:
    sys.path.append('./src-py')

In [34]:
%autoreload
import sbert_training
from sklearn.metrics import precision_recall_fscore_support

In [35]:
from sentence_transformers import SentenceTransformer, InputExample, LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, BinaryClassificationEvaluator
from torch.utils.data import DataLoader
from sentence_transformers.evaluation import TripletEvaluator
from datetime import datetime
from sentence_transformers import util
from zipfile import ZipFile
from sentence_transformers.datasets import SentenceLabelDataset
from sentence_transformers.datasets import NoDuplicatesDataLoader

import logging

# Configure the root logger: INFO and above, timestamped messages, emitted
# through the LoggingHandler imported from sentence_transformers.
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# Logger for messages emitted by this notebook itself.
logger = logging.getLogger(__name__)

In [36]:
# Base location handed to sbert_training.train_model as its output argument;
# the training cells append a suffix such as '/task-A/validity/sbert/' to it.
# NOTE(review): the value already ends with '/', so those concatenations yield
# 'models//task-A/...'. Harmless on POSIX file systems, but confirm it is intended.
output_path = "../../data-ceph/arguana/argmining22-sharedtask/models/"

In [37]:
def _prefix_topic(frame):
    """Return a copy of ``frame`` whose 'Premise' reads '<topic> : <Premise>'.

    Uses vectorised string concatenation instead of a row-wise ``apply``; this
    also works on an empty frame. Note that a missing 'topic' or 'Premise'
    value propagates as NaN here rather than raising ``TypeError``.
    """
    return frame.assign(Premise=frame['topic'] + ' : ' + frame['Premise'])


def _binary_subset(frame, column):
    """Return the rows of ``frame`` where ``column`` is not 0, plus a 0/1 'label'.

    'label' is 1 where ``column`` equals 1 and 0 for every other remaining
    value. The result is an independent copy, so adding 'label' never touches
    ``frame`` itself.
    """
    subset = frame[frame[column] != 0].copy()
    subset['label'] = (subset[column] == 1).astype('int64')
    return subset


# Load the Task A splits and prepend the topic to every premise.
taska_training_df = _prefix_topic(pd.read_csv('../data/TaskA_train.csv'))
taska_valid_df = _prefix_topic(pd.read_csv('../data/TaskA_dev.csv'))

# Binary validity task: rows with Validity == 0 are excluded.
taska_validity_train_df = _binary_subset(taska_training_df, 'Validity')
taska_validity_valid_df = _binary_subset(taska_valid_df, 'Validity')

# Binary novelty task: rows with Novelty == 0 are excluded.
taska_novelty_train_df = _binary_subset(taska_training_df, 'Novelty')
taska_novelty_valid_df = _binary_subset(taska_valid_df, 'Novelty')

In [38]:
# Class balance of the binary novelty labels in the training split.
taska_novelty_train_df['label'].value_counts()

0    595
1    123
Name: label, dtype: int64

In [39]:
# Class balance of the binary validity labels in the training split.
taska_validity_train_df['label'].value_counts()

1    401
0    320
Name: label, dtype: int64

### Train and evaluate SBERT on Validity:

In [61]:
# Validity run 1: start from 'sentence-transformers/nli-roberta-large'.
# train_model returns the trained model and the evaluator used in the next cell;
# evaluation_steps=10 requests a dev evaluation every 10 training steps.
trained_model, evaluator = sbert_training.train_model(
    taska_validity_train_df,
    taska_validity_valid_df,
    output_path + '/task-A/validity/sbert/',
    'sentence-transformers/nli-roberta-large',
    num_epochs=10,
    train_batch_size=8,
    model_suffix='',
    max_seq_length=512,
    special_tokens=[],
    loss='ContrastiveLoss',
    sentence_transformer=False,
    evaluation_steps=10,
    lr=5e-6,
)

2022-07-04 17:04:31 - Use pytorch device: cuda
2022-07-04 17:04:31 - Read Triplet train dataset
Len of training: 721
Len of Dev: 199
Evaluating before start learning.....
2022-07-04 17:04:31 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 17:04:32 - Accuracy with Cosine-Similarity:           72.36	(Threshold: 0.6503)
2022-07-04 17:04:32 - F1 with Cosine-Similarity:                 80.69	(Threshold: 0.5985)
2022-07-04 17:04:32 - Precision with Cosine-Similarity:          70.91
2022-07-04 17:04:32 - Recall with Cosine-Similarity:             93.60
2022-07-04 17:04:32 - Average Precision with Cosine-Similarity:  82.57

2022-07-04 17:04:32 - Accuracy with Manhatten-Distance:           72.36	(Threshold: 646.3694)
2022-07-04 17:04:32 - F1 with Manhatten-Distance:                 80.56	(Threshold: 679.6707)
2022-07-04 17:04:32 - Precision with Manhatten-Distance:          71.17
2022-07-04 17:04:32 - Recall with Manhatten-Distance:             92.80
2022-07-04 17:04:32



Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:04:33 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 0 after 10 steps:
2022-07-04 17:04:34 - Accuracy with Cosine-Similarity:           72.36	(Threshold: 0.6569)
2022-07-04 17:04:34 - F1 with Cosine-Similarity:                 80.69	(Threshold: 0.6012)
2022-07-04 17:04:34 - Precision with Cosine-Similarity:          70.91
2022-07-04 17:04:34 - Recall with Cosine-Similarity:             93.60
2022-07-04 17:04:34 - Average Precision with Cosine-Similarity:  82.54

2022-07-04 17:04:34 - Accuracy with Manhatten-Distance:           72.36	(Threshold: 631.1085)
2022-07-04 17:04:34 - F1 with Manhatten-Distance:                 80.56	(Threshold: 677.6138)
2022-07-04 17:04:34 - Precision with Manhatten-Distance:          71.17
2022-07-04 17:04:34 - Recall with Manhatten-Distance:             92.80
2022-07-04 17:04:34 - Average Precision with Manhatten-Distance:  82.50

2022-07-04 17:04:34 - Accuracy with Euclidean-Distance:           72.36	(Threshold: 25.5618

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:05:14 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 1 after 10 steps:
2022-07-04 17:05:14 - Accuracy with Cosine-Similarity:           73.37	(Threshold: 0.7805)
2022-07-04 17:05:14 - F1 with Cosine-Similarity:                 80.00	(Threshold: 0.6735)
2022-07-04 17:05:14 - Precision with Cosine-Similarity:          67.03
2022-07-04 17:05:14 - Recall with Cosine-Similarity:             99.20
2022-07-04 17:05:14 - Average Precision with Cosine-Similarity:  81.97

2022-07-04 17:05:14 - Accuracy with Manhatten-Distance:           74.37	(Threshold: 519.8750)
2022-07-04 17:05:14 - F1 with Manhatten-Distance:                 80.61	(Threshold: 522.1368)
2022-07-04 17:05:14 - Precision with Manhatten-Distance:          76.81
2022-07-04 17:05:14 - Recall with Manhatten-Distance:             84.80
2022-07-04 17:05:14 - Average Precision with Manhatten-Distance:  82.11

2022-07-04 17:05:14 - Accuracy with Euclidean-Distance:           74.37	(Threshold: 20.3348

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:05:31 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 2 after 10 steps:
2022-07-04 17:05:31 - Accuracy with Cosine-Similarity:           73.37	(Threshold: 0.7771)
2022-07-04 17:05:31 - F1 with Cosine-Similarity:                 80.39	(Threshold: 0.6467)
2022-07-04 17:05:31 - Precision with Cosine-Similarity:          67.20
2022-07-04 17:05:31 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:05:31 - Average Precision with Cosine-Similarity:  82.23

2022-07-04 17:05:31 - Accuracy with Manhatten-Distance:           73.87	(Threshold: 531.7351)
2022-07-04 17:05:31 - F1 with Manhatten-Distance:                 80.88	(Threshold: 549.8300)
2022-07-04 17:05:31 - Precision with Manhatten-Distance:          74.83
2022-07-04 17:05:31 - Recall with Manhatten-Distance:             88.00
2022-07-04 17:05:31 - Average Precision with Manhatten-Distance:  82.33

2022-07-04 17:05:31 - Accuracy with Euclidean-Distance:           73.37	(Threshold: 20.459

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:06:37 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 3 after 10 steps:
2022-07-04 17:06:37 - Accuracy with Cosine-Similarity:           71.86	(Threshold: 0.7516)
2022-07-04 17:06:37 - F1 with Cosine-Similarity:                 80.94	(Threshold: 0.6968)
2022-07-04 17:06:37 - Precision with Cosine-Similarity:          69.54
2022-07-04 17:06:37 - Recall with Cosine-Similarity:             96.80
2022-07-04 17:06:37 - Average Precision with Cosine-Similarity:  82.80

2022-07-04 17:06:37 - Accuracy with Manhatten-Distance:           73.87	(Threshold: 537.7893)
2022-07-04 17:06:37 - F1 with Manhatten-Distance:                 80.94	(Threshold: 596.3022)
2022-07-04 17:06:37 - Precision with Manhatten-Distance:          69.54
2022-07-04 17:06:37 - Recall with Manhatten-Distance:             96.80
2022-07-04 17:06:37 - Average Precision with Manhatten-Distance:  82.97

2022-07-04 17:06:37 - Accuracy with Euclidean-Distance:           72.36	(Threshold: 20.1839

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:06:54 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 4 after 10 steps:
2022-07-04 17:06:54 - Accuracy with Cosine-Similarity:           72.36	(Threshold: 0.7002)
2022-07-04 17:06:54 - F1 with Cosine-Similarity:                 81.61	(Threshold: 0.6867)
2022-07-04 17:06:54 - Precision with Cosine-Similarity:          70.11
2022-07-04 17:06:54 - Recall with Cosine-Similarity:             97.60
2022-07-04 17:06:54 - Average Precision with Cosine-Similarity:  81.92

2022-07-04 17:06:54 - Accuracy with Manhatten-Distance:           72.86	(Threshold: 588.4943)
2022-07-04 17:06:54 - F1 with Manhatten-Distance:                 81.63	(Threshold: 588.4943)
2022-07-04 17:06:54 - Precision with Manhatten-Distance:          71.01
2022-07-04 17:06:54 - Recall with Manhatten-Distance:             96.00
2022-07-04 17:06:54 - Average Precision with Manhatten-Distance:  81.98

2022-07-04 17:06:54 - Accuracy with Euclidean-Distance:           72.36	(Threshold: 23.8145

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:07:11 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 5 after 10 steps:
2022-07-04 17:07:11 - Accuracy with Cosine-Similarity:           72.86	(Threshold: 0.7021)
2022-07-04 17:07:11 - F1 with Cosine-Similarity:                 81.76	(Threshold: 0.6983)
2022-07-04 17:07:11 - Precision with Cosine-Similarity:          70.76
2022-07-04 17:07:11 - Recall with Cosine-Similarity:             96.80
2022-07-04 17:07:11 - Average Precision with Cosine-Similarity:  81.43

2022-07-04 17:07:11 - Accuracy with Manhatten-Distance:           72.36	(Threshold: 578.6217)
2022-07-04 17:07:11 - F1 with Manhatten-Distance:                 80.94	(Threshold: 606.6852)
2022-07-04 17:07:11 - Precision with Manhatten-Distance:          69.54
2022-07-04 17:07:11 - Recall with Manhatten-Distance:             96.80
2022-07-04 17:07:11 - Average Precision with Manhatten-Distance:  81.66

2022-07-04 17:07:11 - Accuracy with Euclidean-Distance:           72.86	(Threshold: 23.8684

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:07:28 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 6 after 10 steps:
2022-07-04 17:07:28 - Accuracy with Cosine-Similarity:           72.86	(Threshold: 0.7197)
2022-07-04 17:07:28 - F1 with Cosine-Similarity:                 81.51	(Threshold: 0.7068)
2022-07-04 17:07:28 - Precision with Cosine-Similarity:          71.26
2022-07-04 17:07:28 - Recall with Cosine-Similarity:             95.20
2022-07-04 17:07:28 - Average Precision with Cosine-Similarity:  81.54

2022-07-04 17:07:28 - Accuracy with Manhatten-Distance:           71.86	(Threshold: 574.4922)
2022-07-04 17:07:28 - F1 with Manhatten-Distance:                 80.81	(Threshold: 598.5565)
2022-07-04 17:07:28 - Precision with Manhatten-Distance:          69.77
2022-07-04 17:07:28 - Recall with Manhatten-Distance:             96.00
2022-07-04 17:07:28 - Average Precision with Manhatten-Distance:  81.72

2022-07-04 17:07:28 - Accuracy with Euclidean-Distance:           72.36	(Threshold: 22.8861

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:07:45 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 7 after 10 steps:
2022-07-04 17:07:45 - Accuracy with Cosine-Similarity:           73.37	(Threshold: 0.7205)
2022-07-04 17:07:45 - F1 with Cosine-Similarity:                 81.66	(Threshold: 0.7122)
2022-07-04 17:07:45 - Precision with Cosine-Similarity:          71.95
2022-07-04 17:07:45 - Recall with Cosine-Similarity:             94.40
2022-07-04 17:07:45 - Average Precision with Cosine-Similarity:  81.72

2022-07-04 17:07:45 - Accuracy with Manhatten-Distance:           72.36	(Threshold: 583.9147)
2022-07-04 17:07:45 - F1 with Manhatten-Distance:                 81.10	(Threshold: 583.9147)
2022-07-04 17:07:45 - Precision with Manhatten-Distance:          71.08
2022-07-04 17:07:45 - Recall with Manhatten-Distance:             94.40
2022-07-04 17:07:45 - Average Precision with Manhatten-Distance:  81.76

2022-07-04 17:07:45 - Accuracy with Euclidean-Distance:           73.37	(Threshold: 23.3647

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:08:02 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 8 after 10 steps:
2022-07-04 17:08:02 - Accuracy with Cosine-Similarity:           73.37	(Threshold: 0.7206)
2022-07-04 17:08:02 - F1 with Cosine-Similarity:                 81.88	(Threshold: 0.7002)
2022-07-04 17:08:02 - Precision with Cosine-Similarity:          70.52
2022-07-04 17:08:02 - Recall with Cosine-Similarity:             97.60
2022-07-04 17:08:02 - Average Precision with Cosine-Similarity:  81.11

2022-07-04 17:08:02 - Accuracy with Manhatten-Distance:           71.36	(Threshold: 590.8840)
2022-07-04 17:08:02 - F1 with Manhatten-Distance:                 80.81	(Threshold: 596.6866)
2022-07-04 17:08:02 - Precision with Manhatten-Distance:          69.77
2022-07-04 17:08:02 - Recall with Manhatten-Distance:             96.00
2022-07-04 17:08:02 - Average Precision with Manhatten-Distance:  81.24

2022-07-04 17:08:02 - Accuracy with Euclidean-Distance:           72.86	(Threshold: 23.0579

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:08:19 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 9 after 10 steps:
2022-07-04 17:08:19 - Accuracy with Cosine-Similarity:           72.86	(Threshold: 0.7189)
2022-07-04 17:08:19 - F1 with Cosine-Similarity:                 81.88	(Threshold: 0.6998)
2022-07-04 17:08:19 - Precision with Cosine-Similarity:          70.52
2022-07-04 17:08:19 - Recall with Cosine-Similarity:             97.60
2022-07-04 17:08:19 - Average Precision with Cosine-Similarity:  81.07

2022-07-04 17:08:19 - Accuracy with Manhatten-Distance:           71.86	(Threshold: 590.9159)
2022-07-04 17:08:19 - F1 with Manhatten-Distance:                 81.08	(Threshold: 590.9159)
2022-07-04 17:08:19 - Precision with Manhatten-Distance:          70.18
2022-07-04 17:08:19 - Recall with Manhatten-Distance:             96.00
2022-07-04 17:08:19 - Average Precision with Manhatten-Distance:  81.25

2022-07-04 17:08:19 - Accuracy with Euclidean-Distance:           72.86	(Threshold: 23.1517

In [62]:
# Run the evaluator returned by train_model against the model trained in the
# previous cell. The cell's value is the single float the evaluator returns
# (which metric that float represents is not visible here — confirm).
trained_model.evaluate(evaluator)

2022-07-04 17:08:34 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 17:08:35 - Accuracy with Cosine-Similarity:           73.37	(Threshold: 0.7012)
2022-07-04 17:08:35 - F1 with Cosine-Similarity:                 82.15	(Threshold: 0.7012)
2022-07-04 17:08:35 - Precision with Cosine-Similarity:          70.93
2022-07-04 17:08:35 - Recall with Cosine-Similarity:             97.60
2022-07-04 17:08:35 - Average Precision with Cosine-Similarity:  81.16

2022-07-04 17:08:35 - Accuracy with Manhatten-Distance:           71.86	(Threshold: 586.9470)
2022-07-04 17:08:35 - F1 with Manhatten-Distance:                 81.08	(Threshold: 592.6179)
2022-07-04 17:08:35 - Precision with Manhatten-Distance:          70.18
2022-07-04 17:08:35 - Recall with Manhatten-Distance:             96.00
2022-07-04 17:08:35 - Average Precision with Manhatten-Distance:  81.27

2022-07-04 17:08:35 - Accuracy with Euclidean-Distance:           72.86	(Threshold: 23.1635)
2022-07-04 17:08:35 - F1

0.8135754642238722

In [63]:
# Validity run 2: same data and hyper-parameters, but starting from
# 'bert-large-uncased' and evaluating every 40 steps instead of every 10.
# NOTE(review): this passes the same output directory and an empty model_suffix
# as the nli-roberta-large run; whether the two runs overwrite each other
# depends on how train_model names its output — confirm.
trained_model, evaluator = sbert_training.train_model(
    taska_validity_train_df,
    taska_validity_valid_df,
    output_path + '/task-A/validity/sbert/',
    'bert-large-uncased',
    num_epochs=10,
    train_batch_size=8,
    model_suffix='',
    max_seq_length=512,
    special_tokens=[],
    loss='ContrastiveLoss',
    sentence_transformer=False,
    evaluation_steps=40,
    lr=5e-6,
)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


2022-07-04 17:08:42 - Use pytorch device: cuda
2022-07-04 17:08:42 - Read Triplet train dataset
Len of training: 721
Len of Dev: 199
Evaluating before start learning.....
2022-07-04 17:08:42 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 17:08:43 - Accuracy with Cosine-Similarity:           66.33	(Threshold: 0.8563)
2022-07-04 17:08:43 - F1 with Cosine-Similarity:                 78.10	(Threshold: 0.7898)
2022-07-04 17:08:43 - Precision with Cosine-Similarity:          64.74
2022-07-04 17:08:43 - Recall with Cosine-Similarity:             98.40
2022-07-04 17:08:43 - Average Precision with Cosine-Similarity:  78.67

2022-07-04 17:08:43 - Accuracy with Manhatten-Distance:           67.34	(Threshold: 143.3595)
2022-07-04 17:08:43 - F1 with Manhatten-Distance:                 77.50	(Threshold: 189.5037)
2022-07-04 17:08:43 - Precision with Manhatten-Distance:          63.59
2022-07-04 17:08:43 - Recall with Manhatten-Distance:             99.20
2022-07-04 17:08:43



Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:08:48 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 0 after 40 steps:
2022-07-04 17:08:49 - Accuracy with Cosine-Similarity:           67.34	(Threshold: 0.8354)
2022-07-04 17:08:49 - F1 with Cosine-Similarity:                 77.99	(Threshold: 0.7639)
2022-07-04 17:08:49 - Precision with Cosine-Similarity:          64.25
2022-07-04 17:08:49 - Recall with Cosine-Similarity:             99.20
2022-07-04 17:08:49 - Average Precision with Cosine-Similarity:  80.90

2022-07-04 17:08:49 - Accuracy with Manhatten-Distance:           65.33	(Threshold: 144.4313)
2022-07-04 17:08:49 - F1 with Manhatten-Distance:                 78.13	(Threshold: 196.6414)
2022-07-04 17:08:49 - Precision with Manhatten-Distance:          64.10
2022-07-04 17:08:49 - Recall with Manhatten-Distance:             100.00
2022-07-04 17:08:49 - Average Precision with Manhatten-Distance:  79.38

2022-07-04 17:08:49 - Accuracy with Euclidean-Distance:           64.32	(Threshold: 5.2181

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:09:54 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 1 after 40 steps:
2022-07-04 17:09:55 - Accuracy with Cosine-Similarity:           67.84	(Threshold: 0.8036)
2022-07-04 17:09:55 - F1 with Cosine-Similarity:                 78.03	(Threshold: 0.7518)
2022-07-04 17:09:55 - Precision with Cosine-Similarity:          66.11
2022-07-04 17:09:55 - Recall with Cosine-Similarity:             95.20
2022-07-04 17:09:55 - Average Precision with Cosine-Similarity:  81.89

2022-07-04 17:09:55 - Accuracy with Manhatten-Distance:           67.34	(Threshold: 161.1717)
2022-07-04 17:09:55 - F1 with Manhatten-Distance:                 78.57	(Threshold: 197.0680)
2022-07-04 17:09:55 - Precision with Manhatten-Distance:          66.12
2022-07-04 17:09:55 - Recall with Manhatten-Distance:             96.80
2022-07-04 17:09:55 - Average Precision with Manhatten-Distance:  79.66

2022-07-04 17:09:55 - Accuracy with Euclidean-Distance:           67.34	(Threshold: 6.6902)

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:10:08 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 2 after 40 steps:
2022-07-04 17:10:08 - Accuracy with Cosine-Similarity:           67.34	(Threshold: 0.7715)
2022-07-04 17:10:08 - F1 with Cosine-Similarity:                 78.37	(Threshold: 0.6659)
2022-07-04 17:10:08 - Precision with Cosine-Similarity:          64.43
2022-07-04 17:10:08 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:10:08 - Average Precision with Cosine-Similarity:  79.33

2022-07-04 17:10:08 - Accuracy with Manhatten-Distance:           68.34	(Threshold: 188.6583)
2022-07-04 17:10:08 - F1 with Manhatten-Distance:                 78.23	(Threshold: 204.8380)
2022-07-04 17:10:08 - Precision with Manhatten-Distance:          68.05
2022-07-04 17:10:08 - Recall with Manhatten-Distance:             92.00
2022-07-04 17:10:08 - Average Precision with Manhatten-Distance:  78.05

2022-07-04 17:10:08 - Accuracy with Euclidean-Distance:           67.34	(Threshold: 6.9632

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:10:21 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 3 after 40 steps:
2022-07-04 17:10:21 - Accuracy with Cosine-Similarity:           65.83	(Threshold: 0.8519)
2022-07-04 17:10:21 - F1 with Cosine-Similarity:                 78.37	(Threshold: 0.7115)
2022-07-04 17:10:21 - Precision with Cosine-Similarity:          64.43
2022-07-04 17:10:21 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:10:21 - Average Precision with Cosine-Similarity:  81.08

2022-07-04 17:10:21 - Accuracy with Manhatten-Distance:           68.34	(Threshold: 185.3054)
2022-07-04 17:10:21 - F1 with Manhatten-Distance:                 78.46	(Threshold: 217.6257)
2022-07-04 17:10:21 - Precision with Manhatten-Distance:          65.59
2022-07-04 17:10:21 - Recall with Manhatten-Distance:             97.60
2022-07-04 17:10:21 - Average Precision with Manhatten-Distance:  79.50

2022-07-04 17:10:21 - Accuracy with Euclidean-Distance:           67.84	(Threshold: 6.8596

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:10:34 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 4 after 40 steps:
2022-07-04 17:10:35 - Accuracy with Cosine-Similarity:           67.34	(Threshold: 0.7792)
2022-07-04 17:10:35 - F1 with Cosine-Similarity:                 78.62	(Threshold: 0.6850)
2022-07-04 17:10:35 - Precision with Cosine-Similarity:          64.77
2022-07-04 17:10:35 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:10:35 - Average Precision with Cosine-Similarity:  80.93

2022-07-04 17:10:35 - Accuracy with Manhatten-Distance:           69.35	(Threshold: 187.9810)
2022-07-04 17:10:35 - F1 with Manhatten-Distance:                 78.46	(Threshold: 241.3089)
2022-07-04 17:10:35 - Precision with Manhatten-Distance:          65.59
2022-07-04 17:10:35 - Recall with Manhatten-Distance:             97.60
2022-07-04 17:10:35 - Average Precision with Manhatten-Distance:  79.90

2022-07-04 17:10:35 - Accuracy with Euclidean-Distance:           69.35	(Threshold: 7.4255

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:10:48 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 5 after 40 steps:
2022-07-04 17:10:48 - Accuracy with Cosine-Similarity:           67.34	(Threshold: 0.7796)
2022-07-04 17:10:48 - F1 with Cosine-Similarity:                 78.98	(Threshold: 0.6984)
2022-07-04 17:10:48 - Precision with Cosine-Similarity:          65.61
2022-07-04 17:10:48 - Recall with Cosine-Similarity:             99.20
2022-07-04 17:10:48 - Average Precision with Cosine-Similarity:  81.05

2022-07-04 17:10:48 - Accuracy with Manhatten-Distance:           69.35	(Threshold: 200.9352)
2022-07-04 17:10:48 - F1 with Manhatten-Distance:                 78.85	(Threshold: 261.1823)
2022-07-04 17:10:48 - Precision with Manhatten-Distance:          65.78
2022-07-04 17:10:48 - Recall with Manhatten-Distance:             98.40
2022-07-04 17:10:48 - Average Precision with Manhatten-Distance:  79.92

2022-07-04 17:10:48 - Accuracy with Euclidean-Distance:           70.35	(Threshold: 8.0634)

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:11:01 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 6 after 40 steps:
2022-07-04 17:11:02 - Accuracy with Cosine-Similarity:           66.33	(Threshold: 0.8510)
2022-07-04 17:11:02 - F1 with Cosine-Similarity:                 78.73	(Threshold: 0.6919)
2022-07-04 17:11:02 - Precision with Cosine-Similarity:          65.26
2022-07-04 17:11:02 - Recall with Cosine-Similarity:             99.20
2022-07-04 17:11:02 - Average Precision with Cosine-Similarity:  79.57

2022-07-04 17:11:02 - Accuracy with Manhatten-Distance:           68.34	(Threshold: 211.3572)
2022-07-04 17:11:02 - F1 with Manhatten-Distance:                 78.59	(Threshold: 275.1057)
2022-07-04 17:11:02 - Precision with Manhatten-Distance:          65.43
2022-07-04 17:11:02 - Recall with Manhatten-Distance:             98.40
2022-07-04 17:11:02 - Average Precision with Manhatten-Distance:  79.95

2022-07-04 17:11:02 - Accuracy with Euclidean-Distance:           68.84	(Threshold: 8.2324)

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:11:15 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 7 after 40 steps:
2022-07-04 17:11:15 - Accuracy with Cosine-Similarity:           66.83	(Threshold: 0.8424)
2022-07-04 17:11:15 - F1 with Cosine-Similarity:                 78.73	(Threshold: 0.6843)
2022-07-04 17:11:15 - Precision with Cosine-Similarity:          65.26
2022-07-04 17:11:15 - Recall with Cosine-Similarity:             99.20
2022-07-04 17:11:15 - Average Precision with Cosine-Similarity:  78.75

2022-07-04 17:11:15 - Accuracy with Manhatten-Distance:           69.35	(Threshold: 219.8254)
2022-07-04 17:11:15 - F1 with Manhatten-Distance:                 79.10	(Threshold: 277.5305)
2022-07-04 17:11:15 - Precision with Manhatten-Distance:          66.13
2022-07-04 17:11:15 - Recall with Manhatten-Distance:             98.40
2022-07-04 17:11:15 - Average Precision with Manhatten-Distance:  79.32

2022-07-04 17:11:15 - Accuracy with Euclidean-Distance:           69.35	(Threshold: 8.5670)

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:11:28 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 8 after 40 steps:
2022-07-04 17:11:28 - Accuracy with Cosine-Similarity:           66.83	(Threshold: 0.7695)
2022-07-04 17:11:28 - F1 with Cosine-Similarity:                 78.48	(Threshold: 0.6772)
2022-07-04 17:11:28 - Precision with Cosine-Similarity:          64.92
2022-07-04 17:11:28 - Recall with Cosine-Similarity:             99.20
2022-07-04 17:11:28 - Average Precision with Cosine-Similarity:  78.41

2022-07-04 17:11:29 - Accuracy with Manhatten-Distance:           68.34	(Threshold: 221.9131)
2022-07-04 17:11:29 - F1 with Manhatten-Distance:                 78.98	(Threshold: 288.6568)
2022-07-04 17:11:29 - Precision with Manhatten-Distance:          65.61
2022-07-04 17:11:29 - Recall with Manhatten-Distance:             99.20
2022-07-04 17:11:29 - Average Precision with Manhatten-Distance:  79.28

2022-07-04 17:11:29 - Accuracy with Euclidean-Distance:           68.34	(Threshold: 8.0663)

Iteration:   0%|          | 0/91 [00:00<?, ?it/s]

2022-07-04 17:11:41 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 9 after 40 steps:
2022-07-04 17:11:42 - Accuracy with Cosine-Similarity:           66.33	(Threshold: 0.8582)
2022-07-04 17:11:42 - F1 with Cosine-Similarity:                 78.37	(Threshold: 0.6607)
2022-07-04 17:11:42 - Precision with Cosine-Similarity:          64.43
2022-07-04 17:11:42 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:11:42 - Average Precision with Cosine-Similarity:  78.50

2022-07-04 17:11:42 - Accuracy with Manhatten-Distance:           69.35	(Threshold: 223.6354)
2022-07-04 17:11:42 - F1 with Manhatten-Distance:                 78.98	(Threshold: 290.8111)
2022-07-04 17:11:42 - Precision with Manhatten-Distance:          65.61
2022-07-04 17:11:42 - Recall with Manhatten-Distance:             99.20
2022-07-04 17:11:42 - Average Precision with Manhatten-Distance:  79.56

2022-07-04 17:11:42 - Accuracy with Euclidean-Distance:           69.35	(Threshold: 8.7784

In [64]:
# Run the evaluator returned by train_model against the bert-large-uncased
# model trained in the previous cell; the cell's value is the float it returns.
trained_model.evaluate(evaluator)

2022-07-04 17:11:50 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 17:11:50 - Accuracy with Cosine-Similarity:           66.33	(Threshold: 0.8610)
2022-07-04 17:11:50 - F1 with Cosine-Similarity:                 78.37	(Threshold: 0.6616)
2022-07-04 17:11:50 - Precision with Cosine-Similarity:          64.43
2022-07-04 17:11:50 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:11:50 - Average Precision with Cosine-Similarity:  78.47

2022-07-04 17:11:50 - Accuracy with Manhatten-Distance:           69.35	(Threshold: 223.6111)
2022-07-04 17:11:50 - F1 with Manhatten-Distance:                 78.98	(Threshold: 291.0289)
2022-07-04 17:11:50 - Precision with Manhatten-Distance:          65.61
2022-07-04 17:11:50 - Recall with Manhatten-Distance:             99.20
2022-07-04 17:11:50 - Average Precision with Manhatten-Distance:  79.63

2022-07-04 17:11:50 - Accuracy with Euclidean-Distance:           68.84	(Threshold: 8.6984)
2022-07-04 17:11:50 - F1

0.7989021345899868

In [41]:
# NOTE(review): disabled experiment, kept for reference only. It repeats the
# validity training several times and collects the binary F1 of thresholded
# predictions from each run.
# It would not run as-is if uncommented:
#   * `n` is not defined anywhere in the visible part of this notebook;
#   * the scoring call uses the raw `Validity` column as ground truth, which
#     holds values other than 0/1, while predictions are presumably 0/1 —
#     the derived `label` column is likely what was intended (confirm).
# all_f1_scores = []
# for i in range(n):
#     trained_model, evaluator = sbert_training.train_model(taska_validity_train_df, taska_validity_valid_df, output_path + '/task-A/validity/sbert/', 
#             'sentence-transformers/nli-roberta-large', 
#             num_epochs=5, train_batch_size=32,
#             model_suffix='', max_seq_length=512, special_tokens=[], 
#             loss='ContrastiveLoss', sentence_transformer=False, evaluation_steps=10)
    
#     eval_df = sbert_training.predict_labels(taska_validity_valid_df, trained_model, 'Premise', 'Conclusion', 'pred_validity', 0.5)
    
#     precision, recall, f1, _ = precision_recall_fscore_support(taska_validity_valid_df.Validity.tolist(), taska_validity_valid_df.pred_validity.tolist(), average='binary')
#     print('Precision: {}, Recall {}, F1: {}'.format(precision, recall, f1))
#     all_f1_scores.append(f1)

### Train and evaluate SBERT on Novelty:

In [66]:
# Novelty run: start from 'sentence-transformers/nli-roberta-large', train on the
# novelty split, and request a dev evaluation every 40 training steps.
trained_model, evaluator = sbert_training.train_model(
    taska_novelty_train_df,
    taska_novelty_valid_df,
    output_path + '/task-A/novelty/sbert/',
    'sentence-transformers/nli-roberta-large',
    num_epochs=10,
    train_batch_size=8,
    model_suffix='',
    max_seq_length=512,
    special_tokens=[],
    loss='ContrastiveLoss',
    sentence_transformer=False,
    evaluation_steps=40,
    lr=5e-6,
)

2022-07-04 17:15:13 - Use pytorch device: cuda
2022-07-04 17:15:13 - Read Triplet train dataset
Len of training: 718
Len of Dev: 200
Evaluating before start learning.....
2022-07-04 17:15:13 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 17:15:14 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9650)
2022-07-04 17:15:14 - F1 with Cosine-Similarity:                 58.57	(Threshold: 0.3644)
2022-07-04 17:15:14 - Precision with Cosine-Similarity:          41.41
2022-07-04 17:15:14 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:15:14 - Average Precision with Cosine-Similarity:  36.74

2022-07-04 17:15:14 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 198.5149)
2022-07-04 17:15:14 - F1 with Manhatten-Distance:                 58.57	(Threshold: 853.4846)
2022-07-04 17:15:14 - Precision with Manhatten-Distance:          41.41
2022-07-04 17:15:14 - Recall with Manhatten-Distance:             100.00
2022-07-04 17:15:



Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:15:19 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 0 after 40 steps:
2022-07-04 17:15:19 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9649)
2022-07-04 17:15:19 - F1 with Cosine-Similarity:                 58.57	(Threshold: 0.3485)
2022-07-04 17:15:19 - Precision with Cosine-Similarity:          41.41
2022-07-04 17:15:19 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:15:19 - Average Precision with Cosine-Similarity:  36.79

2022-07-04 17:15:19 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 198.8578)
2022-07-04 17:15:19 - F1 with Manhatten-Distance:                 58.78	(Threshold: 862.2731)
2022-07-04 17:15:19 - Precision with Manhatten-Distance:          41.62
2022-07-04 17:15:19 - Recall with Manhatten-Distance:             100.00
2022-07-04 17:15:19 - Average Precision with Manhatten-Distance:  36.54

2022-07-04 17:15:19 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 7.971

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:15:57 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 1 after 40 steps:
2022-07-04 17:15:57 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9679)
2022-07-04 17:15:57 - F1 with Cosine-Similarity:                 59.12	(Threshold: 0.4194)
2022-07-04 17:15:57 - Precision with Cosine-Similarity:          42.19
2022-07-04 17:15:57 - Recall with Cosine-Similarity:             98.78
2022-07-04 17:15:57 - Average Precision with Cosine-Similarity:  36.83

2022-07-04 17:15:57 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 191.8530)
2022-07-04 17:15:57 - F1 with Manhatten-Distance:                 59.56	(Threshold: 800.8845)
2022-07-04 17:15:57 - Precision with Manhatten-Distance:          42.63
2022-07-04 17:15:57 - Recall with Manhatten-Distance:             98.78
2022-07-04 17:15:57 - Average Precision with Manhatten-Distance:  36.73

2022-07-04 17:15:57 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 7.5915)

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:16:10 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 2 after 40 steps:
2022-07-04 17:16:11 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9677)
2022-07-04 17:16:11 - F1 with Cosine-Similarity:                 58.78	(Threshold: 0.3745)
2022-07-04 17:16:11 - Precision with Cosine-Similarity:          41.62
2022-07-04 17:16:11 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:16:11 - Average Precision with Cosine-Similarity:  37.16

2022-07-04 17:16:11 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 191.9584)
2022-07-04 17:16:11 - F1 with Manhatten-Distance:                 59.09	(Threshold: 786.1006)
2022-07-04 17:16:11 - Precision with Manhatten-Distance:          42.86
2022-07-04 17:16:11 - Recall with Manhatten-Distance:             95.12
2022-07-04 17:16:11 - Average Precision with Manhatten-Distance:  37.07

2022-07-04 17:16:11 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 7.6792

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:16:53 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 3 after 40 steps:
2022-07-04 17:16:53 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9693)
2022-07-04 17:16:53 - F1 with Cosine-Similarity:                 58.99	(Threshold: 0.3821)
2022-07-04 17:16:53 - Precision with Cosine-Similarity:          41.84
2022-07-04 17:16:53 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:16:53 - Average Precision with Cosine-Similarity:  37.96

2022-07-04 17:16:53 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 188.1312)
2022-07-04 17:16:53 - F1 with Manhatten-Distance:                 59.40	(Threshold: 777.7501)
2022-07-04 17:16:53 - Precision with Manhatten-Distance:          42.93
2022-07-04 17:16:53 - Recall with Manhatten-Distance:             96.34
2022-07-04 17:16:53 - Average Precision with Manhatten-Distance:  38.10

2022-07-04 17:16:53 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 7.5244

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:17:33 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 4 after 40 steps:
2022-07-04 17:17:34 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9676)
2022-07-04 17:17:34 - F1 with Cosine-Similarity:                 58.78	(Threshold: 0.3440)
2022-07-04 17:17:34 - Precision with Cosine-Similarity:          41.62
2022-07-04 17:17:34 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:17:34 - Average Precision with Cosine-Similarity:  37.57

2022-07-04 17:17:34 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 193.7910)
2022-07-04 17:17:34 - F1 with Manhatten-Distance:                 59.00	(Threshold: 798.3619)
2022-07-04 17:17:34 - Precision with Manhatten-Distance:          43.02
2022-07-04 17:17:34 - Recall with Manhatten-Distance:             93.90
2022-07-04 17:17:34 - Average Precision with Manhatten-Distance:  37.70

2022-07-04 17:17:34 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 7.7420

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:18:35 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 5 after 40 steps:
2022-07-04 17:18:36 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9675)
2022-07-04 17:18:36 - F1 with Cosine-Similarity:                 58.61	(Threshold: 0.4076)
2022-07-04 17:18:36 - Precision with Cosine-Similarity:          41.88
2022-07-04 17:18:36 - Recall with Cosine-Similarity:             97.56
2022-07-04 17:18:36 - Average Precision with Cosine-Similarity:  38.54

2022-07-04 17:18:36 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 193.5102)
2022-07-04 17:18:36 - F1 with Manhatten-Distance:                 59.09	(Threshold: 804.0067)
2022-07-04 17:18:36 - Precision with Manhatten-Distance:          42.86
2022-07-04 17:18:36 - Recall with Manhatten-Distance:             95.12
2022-07-04 17:18:36 - Average Precision with Manhatten-Distance:  38.68

2022-07-04 17:18:36 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 7.7752)

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:19:41 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 6 after 40 steps:
2022-07-04 17:19:41 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9671)
2022-07-04 17:19:41 - F1 with Cosine-Similarity:                 58.91	(Threshold: 0.3526)
2022-07-04 17:19:41 - Precision with Cosine-Similarity:          41.97
2022-07-04 17:19:41 - Recall with Cosine-Similarity:             98.78
2022-07-04 17:19:41 - Average Precision with Cosine-Similarity:  38.34

2022-07-04 17:19:41 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 195.8055)
2022-07-04 17:19:41 - F1 with Manhatten-Distance:                 59.38	(Threshold: 780.5580)
2022-07-04 17:19:41 - Precision with Manhatten-Distance:          43.68
2022-07-04 17:19:41 - Recall with Manhatten-Distance:             92.68
2022-07-04 17:19:41 - Average Precision with Manhatten-Distance:  38.72

2022-07-04 17:19:41 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 7.8442)

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:20:20 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 7 after 40 steps:
2022-07-04 17:20:21 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9661)
2022-07-04 17:20:21 - F1 with Cosine-Similarity:                 58.82	(Threshold: 0.3936)
2022-07-04 17:20:21 - Precision with Cosine-Similarity:          42.11
2022-07-04 17:20:21 - Recall with Cosine-Similarity:             97.56
2022-07-04 17:20:21 - Average Precision with Cosine-Similarity:  38.59

2022-07-04 17:20:21 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 197.6780)
2022-07-04 17:20:21 - F1 with Manhatten-Distance:                 59.26	(Threshold: 838.0715)
2022-07-04 17:20:21 - Precision with Manhatten-Distance:          42.55
2022-07-04 17:20:21 - Recall with Manhatten-Distance:             97.56
2022-07-04 17:20:21 - Average Precision with Manhatten-Distance:  38.90

2022-07-04 17:20:21 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 7.9761)

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:20:34 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 8 after 40 steps:
2022-07-04 17:20:34 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9656)
2022-07-04 17:20:34 - F1 with Cosine-Similarity:                 59.12	(Threshold: 0.3663)
2022-07-04 17:20:34 - Precision with Cosine-Similarity:          42.19
2022-07-04 17:20:34 - Recall with Cosine-Similarity:             98.78
2022-07-04 17:20:34 - Average Precision with Cosine-Similarity:  38.78

2022-07-04 17:20:34 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 199.2551)
2022-07-04 17:20:34 - F1 with Manhatten-Distance:                 59.61	(Threshold: 768.9261)
2022-07-04 17:20:34 - Precision with Manhatten-Distance:          43.93
2022-07-04 17:20:34 - Recall with Manhatten-Distance:             92.68
2022-07-04 17:20:34 - Average Precision with Manhatten-Distance:  39.09

2022-07-04 17:20:34 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 8.0474)

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:20:47 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 9 after 40 steps:
2022-07-04 17:20:47 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9659)
2022-07-04 17:20:47 - F1 with Cosine-Similarity:                 58.91	(Threshold: 0.3610)
2022-07-04 17:20:47 - Precision with Cosine-Similarity:          41.97
2022-07-04 17:20:47 - Recall with Cosine-Similarity:             98.78
2022-07-04 17:20:47 - Average Precision with Cosine-Similarity:  38.93

2022-07-04 17:20:47 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 198.7454)
2022-07-04 17:20:47 - F1 with Manhatten-Distance:                 59.61	(Threshold: 768.0740)
2022-07-04 17:20:47 - Precision with Manhatten-Distance:          43.93
2022-07-04 17:20:47 - Recall with Manhatten-Distance:             92.68
2022-07-04 17:20:47 - Average Precision with Manhatten-Distance:  39.12

2022-07-04 17:20:47 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 8.0162)

In [67]:
# Re-run the dev-set evaluator on the model returned by the training cell above.
# The value displayed for this cell is whatever score `evaluate` returns
# (presumably the evaluator's main metric -- confirm against the evaluator that
# sbert_training.train_model constructs).
trained_model.evaluate(evaluator)

2022-07-04 17:20:55 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 17:20:55 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9660)
2022-07-04 17:20:55 - F1 with Cosine-Similarity:                 59.12	(Threshold: 0.3658)
2022-07-04 17:20:55 - Precision with Cosine-Similarity:          42.19
2022-07-04 17:20:55 - Recall with Cosine-Similarity:             98.78
2022-07-04 17:20:55 - Average Precision with Cosine-Similarity:  38.92

2022-07-04 17:20:55 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 198.4807)
2022-07-04 17:20:55 - F1 with Manhatten-Distance:                 59.61	(Threshold: 769.1613)
2022-07-04 17:20:55 - Precision with Manhatten-Distance:          43.93
2022-07-04 17:20:55 - Recall with Manhatten-Distance:             92.68
2022-07-04 17:20:55 - Average Precision with Manhatten-Distance:  39.14

2022-07-04 17:20:55 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 8.0059)
2022-07-04 17:20:55 - F1 

0.3967069974945594

In [68]:
# Novelty run with plain 'bert-large-uncased' as the base model; every other
# setting matches the nli-roberta-large novelty run (10 epochs, batch size 8,
# contrastive loss, evaluation_steps=40, lr 5e-6).
# NOTE(review): output directory and model_suffix are identical to that run --
# confirm that train_model derives distinct save paths for the two base models.
trained_model, evaluator = sbert_training.train_model(
    taska_novelty_train_df,
    taska_novelty_valid_df,
    f"{output_path}/task-A/novelty/sbert/",
    'bert-large-uncased',
    num_epochs=10,
    train_batch_size=8,
    model_suffix='',
    max_seq_length=512,
    special_tokens=[],
    loss='ContrastiveLoss',
    sentence_transformer=False,
    evaluation_steps=40,
    lr=5e-6,
)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


2022-07-04 17:21:03 - Use pytorch device: cuda
2022-07-04 17:21:03 - Read Triplet train dataset
Len of training: 718
Len of Dev: 200
Evaluating before start learning.....
2022-07-04 17:21:03 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 17:21:04 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9387)
2022-07-04 17:21:04 - F1 with Cosine-Similarity:                 59.21	(Threshold: 0.7807)
2022-07-04 17:21:04 - Precision with Cosine-Similarity:          42.05
2022-07-04 17:21:04 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:21:04 - Average Precision with Cosine-Similarity:  33.91

2022-07-04 17:21:04 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 84.4564)
2022-07-04 17:21:04 - F1 with Manhatten-Distance:                 59.85	(Threshold: 178.4751)
2022-07-04 17:21:04 - Precision with Manhatten-Distance:          42.71
2022-07-04 17:21:04 - Recall with Manhatten-Distance:             100.00
2022-07-04 17:21:0



Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:21:09 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 0 after 40 steps:
2022-07-04 17:21:10 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9132)
2022-07-04 17:21:10 - F1 with Cosine-Similarity:                 59.42	(Threshold: 0.6714)
2022-07-04 17:21:10 - Precision with Cosine-Similarity:          42.27
2022-07-04 17:21:10 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:21:10 - Average Precision with Cosine-Similarity:  34.16

2022-07-04 17:21:10 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 98.3448)
2022-07-04 17:21:10 - F1 with Manhatten-Distance:                 59.34	(Threshold: 194.8366)
2022-07-04 17:21:10 - Precision with Manhatten-Distance:          42.41
2022-07-04 17:21:10 - Recall with Manhatten-Distance:             98.78
2022-07-04 17:21:10 - Average Precision with Manhatten-Distance:  33.39

2022-07-04 17:21:10 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 3.8853)

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:21:45 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 1 after 40 steps:
2022-07-04 17:21:46 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.8436)
2022-07-04 17:21:46 - F1 with Cosine-Similarity:                 59.21	(Threshold: 0.3972)
2022-07-04 17:21:46 - Precision with Cosine-Similarity:          42.05
2022-07-04 17:21:46 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:21:46 - Average Precision with Cosine-Similarity:  34.22

2022-07-04 17:21:46 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 127.4807)
2022-07-04 17:21:46 - F1 with Manhatten-Distance:                 59.48	(Threshold: 252.1426)
2022-07-04 17:21:46 - Precision with Manhatten-Distance:          42.78
2022-07-04 17:21:46 - Recall with Manhatten-Distance:             97.56
2022-07-04 17:21:46 - Average Precision with Manhatten-Distance:  33.44

2022-07-04 17:21:46 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 5.0452

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:22:25 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 2 after 40 steps:
2022-07-04 17:22:25 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.8637)
2022-07-04 17:22:25 - F1 with Cosine-Similarity:                 59.42	(Threshold: 0.4277)
2022-07-04 17:22:25 - Precision with Cosine-Similarity:          42.27
2022-07-04 17:22:25 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:22:25 - Average Precision with Cosine-Similarity:  35.21

2022-07-04 17:22:25 - Accuracy with Manhatten-Distance:           59.00	(Threshold: 132.1508)
2022-07-04 17:22:25 - F1 with Manhatten-Distance:                 59.78	(Threshold: 265.2895)
2022-07-04 17:22:25 - Precision with Manhatten-Distance:          42.86
2022-07-04 17:22:25 - Recall with Manhatten-Distance:             98.78
2022-07-04 17:22:25 - Average Precision with Manhatten-Distance:  34.30

2022-07-04 17:22:25 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 4.7633

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:22:38 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 3 after 40 steps:
2022-07-04 17:22:39 - Accuracy with Cosine-Similarity:           59.00	(Threshold: 0.8590)
2022-07-04 17:22:39 - F1 with Cosine-Similarity:                 60.00	(Threshold: 0.4552)
2022-07-04 17:22:39 - Precision with Cosine-Similarity:          43.09
2022-07-04 17:22:39 - Recall with Cosine-Similarity:             98.78
2022-07-04 17:22:39 - Average Precision with Cosine-Similarity:  36.75

2022-07-04 17:22:39 - Accuracy with Manhatten-Distance:           59.00	(Threshold: 126.9702)
2022-07-04 17:22:39 - F1 with Manhatten-Distance:                 60.15	(Threshold: 271.0786)
2022-07-04 17:22:39 - Precision with Manhatten-Distance:          43.48
2022-07-04 17:22:39 - Recall with Manhatten-Distance:             97.56
2022-07-04 17:22:39 - Average Precision with Manhatten-Distance:  35.92

2022-07-04 17:22:39 - Accuracy with Euclidean-Distance:           59.00	(Threshold: 5.0383)

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:22:52 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 4 after 40 steps:
2022-07-04 17:22:52 - Accuracy with Cosine-Similarity:           59.00	(Threshold: 0.8621)
2022-07-04 17:22:52 - F1 with Cosine-Similarity:                 59.34	(Threshold: 0.4510)
2022-07-04 17:22:52 - Precision with Cosine-Similarity:          42.41
2022-07-04 17:22:52 - Recall with Cosine-Similarity:             98.78
2022-07-04 17:22:52 - Average Precision with Cosine-Similarity:  36.08

2022-07-04 17:22:52 - Accuracy with Manhatten-Distance:           59.50	(Threshold: 122.2577)
2022-07-04 17:22:52 - F1 with Manhatten-Distance:                 60.15	(Threshold: 280.9120)
2022-07-04 17:22:52 - Precision with Manhatten-Distance:          43.48
2022-07-04 17:22:52 - Recall with Manhatten-Distance:             97.56
2022-07-04 17:22:52 - Average Precision with Manhatten-Distance:  35.90

2022-07-04 17:22:52 - Accuracy with Euclidean-Distance:           59.50	(Threshold: 4.8390)

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:23:05 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 5 after 40 steps:
2022-07-04 17:23:06 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.8606)
2022-07-04 17:23:06 - F1 with Cosine-Similarity:                 59.42	(Threshold: 0.3777)
2022-07-04 17:23:06 - Precision with Cosine-Similarity:          42.27
2022-07-04 17:23:06 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:23:06 - Average Precision with Cosine-Similarity:  35.68

2022-07-04 17:23:06 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 136.9071)
2022-07-04 17:23:06 - F1 with Manhatten-Distance:                 59.93	(Threshold: 307.3293)
2022-07-04 17:23:06 - Precision with Manhatten-Distance:          43.24
2022-07-04 17:23:06 - Recall with Manhatten-Distance:             97.56
2022-07-04 17:23:06 - Average Precision with Manhatten-Distance:  34.47

2022-07-04 17:23:06 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 5.3853

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:24:11 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 6 after 40 steps:
2022-07-04 17:24:12 - Accuracy with Cosine-Similarity:           59.00	(Threshold: 0.8682)
2022-07-04 17:24:12 - F1 with Cosine-Similarity:                 59.64	(Threshold: 0.3623)
2022-07-04 17:24:12 - Precision with Cosine-Similarity:          42.49
2022-07-04 17:24:12 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:24:12 - Average Precision with Cosine-Similarity:  35.91

2022-07-04 17:24:12 - Accuracy with Manhatten-Distance:           59.00	(Threshold: 134.7777)
2022-07-04 17:24:12 - F1 with Manhatten-Distance:                 59.93	(Threshold: 322.0081)
2022-07-04 17:24:12 - Precision with Manhatten-Distance:          43.24
2022-07-04 17:24:12 - Recall with Manhatten-Distance:             97.56
2022-07-04 17:24:12 - Average Precision with Manhatten-Distance:  34.57

2022-07-04 17:24:12 - Accuracy with Euclidean-Distance:           59.00	(Threshold: 5.2864

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:25:15 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 7 after 40 steps:
2022-07-04 17:25:16 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.8646)
2022-07-04 17:25:16 - F1 with Cosine-Similarity:                 59.21	(Threshold: 0.3349)
2022-07-04 17:25:16 - Precision with Cosine-Similarity:          42.05
2022-07-04 17:25:16 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:25:16 - Average Precision with Cosine-Similarity:  35.26

2022-07-04 17:25:16 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 140.3504)
2022-07-04 17:25:16 - F1 with Manhatten-Distance:                 59.70	(Threshold: 337.9244)
2022-07-04 17:25:16 - Precision with Manhatten-Distance:          43.01
2022-07-04 17:25:16 - Recall with Manhatten-Distance:             97.56
2022-07-04 17:25:16 - Average Precision with Manhatten-Distance:  34.15

2022-07-04 17:25:16 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 5.5531

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:25:57 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 8 after 40 steps:
2022-07-04 17:25:57 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.8653)
2022-07-04 17:25:57 - F1 with Cosine-Similarity:                 59.12	(Threshold: 0.3502)
2022-07-04 17:25:57 - Precision with Cosine-Similarity:          42.19
2022-07-04 17:25:57 - Recall with Cosine-Similarity:             98.78
2022-07-04 17:25:57 - Average Precision with Cosine-Similarity:  35.82

2022-07-04 17:25:57 - Accuracy with Manhatten-Distance:           59.00	(Threshold: 143.1298)
2022-07-04 17:25:57 - F1 with Manhatten-Distance:                 59.48	(Threshold: 346.2112)
2022-07-04 17:25:57 - Precision with Manhatten-Distance:          42.78
2022-07-04 17:25:57 - Recall with Manhatten-Distance:             97.56
2022-07-04 17:25:57 - Average Precision with Manhatten-Distance:  34.60

2022-07-04 17:25:57 - Accuracy with Euclidean-Distance:           59.00	(Threshold: 5.6203)

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

2022-07-04 17:26:10 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 9 after 40 steps:
2022-07-04 17:26:11 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.8692)
2022-07-04 17:26:11 - F1 with Cosine-Similarity:                 59.34	(Threshold: 0.3593)
2022-07-04 17:26:11 - Precision with Cosine-Similarity:          42.41
2022-07-04 17:26:11 - Recall with Cosine-Similarity:             98.78
2022-07-04 17:26:11 - Average Precision with Cosine-Similarity:  35.80

2022-07-04 17:26:11 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 138.7182)
2022-07-04 17:26:11 - F1 with Manhatten-Distance:                 59.26	(Threshold: 354.4815)
2022-07-04 17:26:11 - Precision with Manhatten-Distance:          42.55
2022-07-04 17:26:11 - Recall with Manhatten-Distance:             97.56
2022-07-04 17:26:11 - Average Precision with Manhatten-Distance:  34.49

2022-07-04 17:26:11 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 5.4626)

In [69]:
# Re-run the dev-set evaluator on the bert-large-uncased novelty model trained above.
# The value displayed for this cell is whatever score `evaluate` returns
# (presumably the evaluator's main metric -- confirm against the evaluator that
# sbert_training.train_model constructs).
trained_model.evaluate(evaluator)

2022-07-04 17:26:43 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 17:26:43 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.8703)
2022-07-04 17:26:43 - F1 with Cosine-Similarity:                 59.34	(Threshold: 0.3593)
2022-07-04 17:26:43 - Precision with Cosine-Similarity:          42.41
2022-07-04 17:26:43 - Recall with Cosine-Similarity:             98.78
2022-07-04 17:26:43 - Average Precision with Cosine-Similarity:  35.71

2022-07-04 17:26:43 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 138.1816)
2022-07-04 17:26:43 - F1 with Manhatten-Distance:                 59.26	(Threshold: 354.8206)
2022-07-04 17:26:43 - Precision with Manhatten-Distance:          42.55
2022-07-04 17:26:43 - Recall with Manhatten-Distance:             97.56
2022-07-04 17:26:43 - Average Precision with Manhatten-Distance:  34.47

2022-07-04 17:26:43 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 5.4412)
2022-07-04 17:26:43 - F1 

0.46174072532721966

### Evaluation on the extra generated conclusions:

In [71]:
# Reload Task A with the training set that carries the extra generated
# conclusions; the dev set is still the original CSV.
# NOTE(review): pd.read_pickle can execute arbitrary code stored in the file --
# only load pickles that were produced by this project.
taska_training_df = pd.read_pickle('../data/TaskA_train_with_extra_conclusions.pkl')
taska_valid_df = pd.read_csv('../data/TaskA_dev.csv')

# Prefix every premise with its topic ("<topic> : <premise>").
# Column-wise concatenation replaces the former row-wise DataFrame.apply(axis=1).
taska_training_df['Premise'] = taska_training_df['topic'] + ' : ' + taska_training_df['Premise']
taska_valid_df['Premise'] = taska_valid_df['topic'] + ' : ' + taska_valid_df['Premise']

# Validity task: drop rows whose Validity is 0, then binarise
# (Validity == 1 -> 1, every other remaining value -> 0).
taska_validity_train_df = taska_training_df[taska_training_df.Validity != 0].copy()
taska_validity_valid_df = taska_valid_df[taska_valid_df.Validity != 0].copy()

taska_validity_train_df['label'] = (taska_validity_train_df.Validity == 1).astype('int64')
taska_validity_valid_df['label'] = (taska_validity_valid_df.Validity == 1).astype('int64')

# Novelty task: same filtering and binarisation, driven by the Novelty column.
taska_novelty_train_df = taska_training_df[taska_training_df.Novelty != 0].copy()
taska_novelty_valid_df = taska_valid_df[taska_valid_df.Novelty != 0].copy()

taska_novelty_train_df['label'] = (taska_novelty_train_df.Novelty == 1).astype('int64')
taska_novelty_valid_df['label'] = (taska_novelty_valid_df.Novelty == 1).astype('int64')

In [72]:
# Class balance of the binarised novelty labels in the augmented training set.
taska_novelty_train_df['label'].value_counts()

0    4345
1     123
Name: label, dtype: int64

In [73]:
# Class balance of the binarised validity labels in the augmented training set.
taska_validity_train_df['label'].value_counts()

1    4151
0     320
Name: label, dtype: int64

#### For Validity:

In [74]:
# Validity run on the training set with extra generated conclusions, using
# 'sentence-transformers/nli-roberta-large' as the base model and the contrastive
# loss (10 epochs, batch size 8, lr 5e-6). model_suffix='extra-conclusions' is
# passed so this run can be told apart from the earlier validity run;
# evaluation_steps=10 requests a dev-set evaluation every 10 training steps.
trained_model, evaluator = sbert_training.train_model(
    taska_validity_train_df,
    taska_validity_valid_df,
    f"{output_path}/task-A/validity/sbert/",
    'sentence-transformers/nli-roberta-large',
    num_epochs=10,
    train_batch_size=8,
    model_suffix='extra-conclusions',
    max_seq_length=512,
    special_tokens=[],
    loss='ContrastiveLoss',
    sentence_transformer=False,
    evaluation_steps=10,
    lr=5e-6,
)

2022-07-04 17:36:10 - Use pytorch device: cuda
2022-07-04 17:36:10 - Read Triplet train dataset
Len of training: 4471
Len of Dev: 199
Evaluating before start learning.....
2022-07-04 17:36:10 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 17:36:11 - Accuracy with Cosine-Similarity:           72.36	(Threshold: 0.6503)
2022-07-04 17:36:11 - F1 with Cosine-Similarity:                 80.69	(Threshold: 0.5985)
2022-07-04 17:36:11 - Precision with Cosine-Similarity:          70.91
2022-07-04 17:36:11 - Recall with Cosine-Similarity:             93.60
2022-07-04 17:36:11 - Average Precision with Cosine-Similarity:  82.57

2022-07-04 17:36:11 - Accuracy with Manhatten-Distance:           72.36	(Threshold: 646.3694)
2022-07-04 17:36:11 - F1 with Manhatten-Distance:                 80.56	(Threshold: 679.6707)
2022-07-04 17:36:11 - Precision with Manhatten-Distance:          71.17
2022-07-04 17:36:11 - Recall with Manhatten-Distance:             92.80
2022-07-04 17:36:1



Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 17:36:12 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 0 after 10 steps:
2022-07-04 17:36:13 - Accuracy with Cosine-Similarity:           72.36	(Threshold: 0.6510)
2022-07-04 17:36:13 - F1 with Cosine-Similarity:                 80.69	(Threshold: 0.5991)
2022-07-04 17:36:13 - Precision with Cosine-Similarity:          70.91
2022-07-04 17:36:13 - Recall with Cosine-Similarity:             93.60
2022-07-04 17:36:13 - Average Precision with Cosine-Similarity:  82.54

2022-07-04 17:36:13 - Accuracy with Manhatten-Distance:           72.36	(Threshold: 645.8988)
2022-07-04 17:36:13 - F1 with Manhatten-Distance:                 80.56	(Threshold: 679.3130)
2022-07-04 17:36:13 - Precision with Manhatten-Distance:          71.17
2022-07-04 17:36:13 - Recall with Manhatten-Distance:             92.80
2022-07-04 17:36:13 - Average Precision with Manhatten-Distance:  82.46

2022-07-04 17:36:13 - Accuracy with Euclidean-Distance:           72.86	(Threshold: 26.3614

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 17:38:42 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 1 after 10 steps:
2022-07-04 17:38:42 - Accuracy with Cosine-Similarity:           64.32	(Threshold: 0.8751)
2022-07-04 17:38:42 - F1 with Cosine-Similarity:                 77.88	(Threshold: 0.7943)
2022-07-04 17:38:42 - Precision with Cosine-Similarity:          63.78
2022-07-04 17:38:42 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:38:42 - Average Precision with Cosine-Similarity:  69.96

2022-07-04 17:38:42 - Accuracy with Manhatten-Distance:           64.32	(Threshold: 383.7397)
2022-07-04 17:38:42 - F1 with Manhatten-Distance:                 77.88	(Threshold: 496.3971)
2022-07-04 17:38:42 - Precision with Manhatten-Distance:          63.78
2022-07-04 17:38:42 - Recall with Manhatten-Distance:             100.00
2022-07-04 17:38:42 - Average Precision with Manhatten-Distance:  70.52

2022-07-04 17:38:42 - Accuracy with Euclidean-Distance:           64.32	(Threshold: 15.51

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 17:40:23 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 2 after 10 steps:
2022-07-04 17:40:23 - Accuracy with Cosine-Similarity:           64.32	(Threshold: 0.5457)
2022-07-04 17:40:23 - F1 with Cosine-Similarity:                 77.74	(Threshold: 0.5457)
2022-07-04 17:40:23 - Precision with Cosine-Similarity:          63.92
2022-07-04 17:40:23 - Recall with Cosine-Similarity:             99.20
2022-07-04 17:40:23 - Average Precision with Cosine-Similarity:  61.81

2022-07-04 17:40:23 - Accuracy with Manhatten-Distance:           64.32	(Threshold: 618.4559)
2022-07-04 17:40:23 - F1 with Manhatten-Distance:                 77.88	(Threshold: 671.2092)
2022-07-04 17:40:23 - Precision with Manhatten-Distance:          63.78
2022-07-04 17:40:23 - Recall with Manhatten-Distance:             100.00
2022-07-04 17:40:23 - Average Precision with Manhatten-Distance:  63.24

2022-07-04 17:40:23 - Accuracy with Euclidean-Distance:           64.32	(Threshold: 29.512

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 17:42:04 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 3 after 10 steps:
2022-07-04 17:42:05 - Accuracy with Cosine-Similarity:           63.82	(Threshold: 0.2786)
2022-07-04 17:42:05 - F1 with Cosine-Similarity:                 77.64	(Threshold: 0.2786)
2022-07-04 17:42:05 - Precision with Cosine-Similarity:          63.45
2022-07-04 17:42:05 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:42:05 - Average Precision with Cosine-Similarity:  60.34

2022-07-04 17:42:05 - Accuracy with Manhatten-Distance:           64.32	(Threshold: 738.5422)
2022-07-04 17:42:05 - F1 with Manhatten-Distance:                 77.88	(Threshold: 738.5422)
2022-07-04 17:42:05 - Precision with Manhatten-Distance:          63.78
2022-07-04 17:42:05 - Recall with Manhatten-Distance:             100.00
2022-07-04 17:42:05 - Average Precision with Manhatten-Distance:  60.74

2022-07-04 17:42:05 - Accuracy with Euclidean-Distance:           63.82	(Threshold: 37.54

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 17:43:45 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 4 after 10 steps:
2022-07-04 17:43:46 - Accuracy with Cosine-Similarity:           63.32	(Threshold: 0.1473)
2022-07-04 17:43:46 - F1 with Cosine-Similarity:                 77.40	(Threshold: 0.1473)
2022-07-04 17:43:46 - Precision with Cosine-Similarity:          63.13
2022-07-04 17:43:46 - Recall with Cosine-Similarity:             100.00
2022-07-04 17:43:46 - Average Precision with Cosine-Similarity:  59.51

2022-07-04 17:43:46 - Accuracy with Manhatten-Distance:           63.82	(Threshold: 715.9462)
2022-07-04 17:43:46 - F1 with Manhatten-Distance:                 77.50	(Threshold: 715.9462)
2022-07-04 17:43:46 - Precision with Manhatten-Distance:          63.59
2022-07-04 17:43:46 - Recall with Manhatten-Distance:             99.20
2022-07-04 17:43:46 - Average Precision with Manhatten-Distance:  59.82

2022-07-04 17:43:46 - Accuracy with Euclidean-Distance:           63.32	(Threshold: 40.864

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 17:45:27 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 5 after 10 steps:
2022-07-04 17:45:27 - Accuracy with Cosine-Similarity:           62.81	(Threshold: 0.1427)
2022-07-04 17:45:27 - F1 with Cosine-Similarity:                 76.88	(Threshold: 0.1427)
2022-07-04 17:45:27 - Precision with Cosine-Similarity:          63.08
2022-07-04 17:45:27 - Recall with Cosine-Similarity:             98.40
2022-07-04 17:45:27 - Average Precision with Cosine-Similarity:  59.84

2022-07-04 17:45:27 - Accuracy with Manhatten-Distance:           63.32	(Threshold: 785.0098)
2022-07-04 17:45:27 - F1 with Manhatten-Distance:                 77.40	(Threshold: 785.0098)
2022-07-04 17:45:27 - Precision with Manhatten-Distance:          63.13
2022-07-04 17:45:27 - Recall with Manhatten-Distance:             100.00
2022-07-04 17:45:27 - Average Precision with Manhatten-Distance:  60.39

2022-07-04 17:45:27 - Accuracy with Euclidean-Distance:           62.81	(Threshold: 40.950

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 17:47:08 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 6 after 10 steps:
2022-07-04 17:47:09 - Accuracy with Cosine-Similarity:           62.81	(Threshold: 0.0468)
2022-07-04 17:47:09 - F1 with Cosine-Similarity:                 77.02	(Threshold: 0.0468)
2022-07-04 17:47:09 - Precision with Cosine-Similarity:          62.94
2022-07-04 17:47:09 - Recall with Cosine-Similarity:             99.20
2022-07-04 17:47:09 - Average Precision with Cosine-Similarity:  56.76

2022-07-04 17:47:09 - Accuracy with Manhatten-Distance:           63.32	(Threshold: 756.9078)
2022-07-04 17:47:09 - F1 with Manhatten-Distance:                 77.40	(Threshold: 756.9078)
2022-07-04 17:47:09 - Precision with Manhatten-Distance:          63.13
2022-07-04 17:47:09 - Recall with Manhatten-Distance:             100.00
2022-07-04 17:47:09 - Average Precision with Manhatten-Distance:  57.49

2022-07-04 17:47:09 - Accuracy with Euclidean-Distance:           62.81	(Threshold: 43.442

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 17:48:49 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 7 after 10 steps:
2022-07-04 17:48:50 - Accuracy with Cosine-Similarity:           62.31	(Threshold: -0.0440)
2022-07-04 17:48:50 - F1 with Cosine-Similarity:                 76.78	(Threshold: -0.1617)
2022-07-04 17:48:50 - Precision with Cosine-Similarity:          62.63
2022-07-04 17:48:50 - Recall with Cosine-Similarity:             99.20
2022-07-04 17:48:50 - Average Precision with Cosine-Similarity:  56.14

2022-07-04 17:48:50 - Accuracy with Manhatten-Distance:           62.81	(Threshold: 654.7641)
2022-07-04 17:48:50 - F1 with Manhatten-Distance:                 77.02	(Threshold: 723.7513)
2022-07-04 17:48:50 - Precision with Manhatten-Distance:          62.94
2022-07-04 17:48:50 - Recall with Manhatten-Distance:             99.20
2022-07-04 17:48:50 - Average Precision with Manhatten-Distance:  57.08

2022-07-04 17:48:50 - Accuracy with Euclidean-Distance:           62.31	(Threshold: 45.46

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 17:50:30 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 8 after 10 steps:
2022-07-04 17:50:31 - Accuracy with Cosine-Similarity:           62.81	(Threshold: -0.0761)
2022-07-04 17:50:31 - F1 with Cosine-Similarity:                 77.02	(Threshold: -0.0761)
2022-07-04 17:50:31 - Precision with Cosine-Similarity:          62.94
2022-07-04 17:50:31 - Recall with Cosine-Similarity:             99.20
2022-07-04 17:50:31 - Average Precision with Cosine-Similarity:  56.15

2022-07-04 17:50:31 - Accuracy with Manhatten-Distance:           62.81	(Threshold: 741.2019)
2022-07-04 17:50:31 - F1 with Manhatten-Distance:                 77.02	(Threshold: 741.2019)
2022-07-04 17:50:31 - Precision with Manhatten-Distance:          62.94
2022-07-04 17:50:31 - Recall with Manhatten-Distance:             99.20
2022-07-04 17:50:31 - Average Precision with Manhatten-Distance:  56.94

2022-07-04 17:50:31 - Accuracy with Euclidean-Distance:           62.81	(Threshold: 46.17

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 17:52:11 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 9 after 10 steps:
2022-07-04 17:52:12 - Accuracy with Cosine-Similarity:           62.81	(Threshold: -0.0458)
2022-07-04 17:52:12 - F1 with Cosine-Similarity:                 77.02	(Threshold: -0.0458)
2022-07-04 17:52:12 - Precision with Cosine-Similarity:          62.94
2022-07-04 17:52:12 - Recall with Cosine-Similarity:             99.20
2022-07-04 17:52:12 - Average Precision with Cosine-Similarity:  56.82

2022-07-04 17:52:12 - Accuracy with Manhatten-Distance:           62.81	(Threshold: 750.4122)
2022-07-04 17:52:12 - F1 with Manhatten-Distance:                 77.02	(Threshold: 750.4122)
2022-07-04 17:52:12 - Precision with Manhatten-Distance:          62.94
2022-07-04 17:52:12 - Recall with Manhatten-Distance:             99.20
2022-07-04 17:52:12 - Average Precision with Manhatten-Distance:  57.51

2022-07-04 17:52:12 - Accuracy with Euclidean-Distance:           62.81	(Threshold: 45.61

In [76]:
# Run `evaluator` once more against the model currently bound to `trained_model`
# and display the returned score as the cell output.
# NOTE(review): both names are bound by an earlier training cell that is not part
# of this cell — presumably the preceding `sbert_training.train_model(...)` run;
# confirm by executing the notebook top to bottom.
# NOTE(review): the logged lines look like sentence-transformers'
# BinaryClassificationEvaluator, whose return value is (in the 2.x releases) the
# best average precision over the similarity measures — confirm for the installed version.
trained_model.evaluate(evaluator)

2022-07-04 19:20:25 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 19:20:25 - Accuracy with Cosine-Similarity:           63.32	(Threshold: -0.0233)
2022-07-04 19:20:25 - F1 with Cosine-Similarity:                 77.26	(Threshold: -0.0233)
2022-07-04 19:20:25 - Precision with Cosine-Similarity:          63.27
2022-07-04 19:20:25 - Recall with Cosine-Similarity:             99.20
2022-07-04 19:20:25 - Average Precision with Cosine-Similarity:  56.70

2022-07-04 19:20:25 - Accuracy with Manhatten-Distance:           62.81	(Threshold: 751.1327)
2022-07-04 19:20:25 - F1 with Manhatten-Distance:                 77.02	(Threshold: 751.1327)
2022-07-04 19:20:25 - Precision with Manhatten-Distance:          62.94
2022-07-04 19:20:25 - Recall with Manhatten-Distance:             99.20
2022-07-04 19:20:25 - Average Precision with Manhatten-Distance:  57.50

2022-07-04 19:20:25 - Accuracy with Euclidean-Distance:           62.81	(Threshold: 45.6930)
2022-07-04 19:20:25 - 

0.5801685385543429

In [75]:
# Class balance of the novelty training frame: number of rows per `label` value.
# In the recorded run this shows 4345 rows with label 0 versus 123 with label 1.
# NOTE(review): that is a very skewed training set — confirm it is intended.
taska_novelty_train_df['label'].value_counts()

0    4345
1     123
Name: label, dtype: int64

#### For Novelty:

In [77]:
# Train the novelty model via the project helper `sbert_training.train_model`,
# using the novelty train/dev frames and the hyper-parameters listed below.
# The call rebinds `trained_model` and `evaluator`, replacing whatever an earlier
# training cell stored under those names.
# NOTE(review): `output_path` already ends with '/', so the directory argument
# contains a doubled slash ("models//task-A/..."); left as-is to keep the saved
# location unchanged.
trained_model, evaluator = sbert_training.train_model(
    taska_novelty_train_df,
    taska_novelty_valid_df,
    output_path + '/task-A/novelty/sbert/',
    'sentence-transformers/nli-roberta-large',
    num_epochs=10,
    train_batch_size=8,
    model_suffix='extra-conclusions',
    max_seq_length=512,
    special_tokens=[],
    loss='ContrastiveLoss',
    sentence_transformer=False,
    evaluation_steps=10,
    lr=5e-6,
)

2022-07-04 19:22:08 - Use pytorch device: cuda
2022-07-04 19:22:08 - Read Triplet train dataset
Len of training: 4468
Len of Dev: 200
Evaluating before start learning.....
2022-07-04 19:22:08 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 19:22:08 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9650)
2022-07-04 19:22:08 - F1 with Cosine-Similarity:                 58.57	(Threshold: 0.3644)
2022-07-04 19:22:08 - Precision with Cosine-Similarity:          41.41
2022-07-04 19:22:08 - Recall with Cosine-Similarity:             100.00
2022-07-04 19:22:08 - Average Precision with Cosine-Similarity:  36.74

2022-07-04 19:22:08 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 198.5149)
2022-07-04 19:22:08 - F1 with Manhatten-Distance:                 58.57	(Threshold: 853.4846)
2022-07-04 19:22:08 - Precision with Manhatten-Distance:          41.41
2022-07-04 19:22:08 - Recall with Manhatten-Distance:             100.00
2022-07-04 19:22



Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 19:22:10 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 0 after 10 steps:
2022-07-04 19:22:10 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9650)
2022-07-04 19:22:10 - F1 with Cosine-Similarity:                 58.57	(Threshold: 0.3640)
2022-07-04 19:22:10 - Precision with Cosine-Similarity:          41.41
2022-07-04 19:22:10 - Recall with Cosine-Similarity:             100.00
2022-07-04 19:22:10 - Average Precision with Cosine-Similarity:  36.75

2022-07-04 19:22:10 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 198.5232)
2022-07-04 19:22:10 - F1 with Manhatten-Distance:                 58.57	(Threshold: 853.7286)
2022-07-04 19:22:10 - Precision with Manhatten-Distance:          41.41
2022-07-04 19:22:10 - Recall with Manhatten-Distance:             100.00
2022-07-04 19:22:10 - Average Precision with Manhatten-Distance:  36.57

2022-07-04 19:22:10 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 7.990

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 19:24:38 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 1 after 10 steps:
2022-07-04 19:24:39 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9598)
2022-07-04 19:24:39 - F1 with Cosine-Similarity:                 59.21	(Threshold: 0.1811)
2022-07-04 19:24:39 - Precision with Cosine-Similarity:          42.05
2022-07-04 19:24:39 - Recall with Cosine-Similarity:             100.00
2022-07-04 19:24:39 - Average Precision with Cosine-Similarity:  35.84

2022-07-04 19:24:39 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 213.7083)
2022-07-04 19:24:39 - F1 with Manhatten-Distance:                 59.21	(Threshold: 958.1305)
2022-07-04 19:24:39 - Precision with Manhatten-Distance:          42.05
2022-07-04 19:24:39 - Recall with Manhatten-Distance:             100.00
2022-07-04 19:24:39 - Average Precision with Manhatten-Distance:  35.71

2022-07-04 19:24:39 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 8.542

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 19:26:20 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 2 after 10 steps:
2022-07-04 19:26:21 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9585)
2022-07-04 19:26:21 - F1 with Cosine-Similarity:                 59.42	(Threshold: 0.2930)
2022-07-04 19:26:21 - Precision with Cosine-Similarity:          42.27
2022-07-04 19:26:21 - Recall with Cosine-Similarity:             100.00
2022-07-04 19:26:21 - Average Precision with Cosine-Similarity:  36.94

2022-07-04 19:26:21 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 214.5703)
2022-07-04 19:26:21 - F1 with Manhatten-Distance:                 59.21	(Threshold: 903.2573)
2022-07-04 19:26:21 - Precision with Manhatten-Distance:          42.05
2022-07-04 19:26:21 - Recall with Manhatten-Distance:             100.00
2022-07-04 19:26:21 - Average Precision with Manhatten-Distance:  37.09

2022-07-04 19:26:21 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 8.632

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 19:33:15 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 3 after 10 steps:
2022-07-04 19:33:16 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9628)
2022-07-04 19:33:16 - F1 with Cosine-Similarity:                 59.42	(Threshold: 0.2782)
2022-07-04 19:33:16 - Precision with Cosine-Similarity:          42.27
2022-07-04 19:33:16 - Recall with Cosine-Similarity:             100.00
2022-07-04 19:33:16 - Average Precision with Cosine-Similarity:  37.53

2022-07-04 19:33:16 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 207.9219)
2022-07-04 19:33:16 - F1 with Manhatten-Distance:                 59.21	(Threshold: 904.1401)
2022-07-04 19:33:16 - Precision with Manhatten-Distance:          42.05
2022-07-04 19:33:16 - Recall with Manhatten-Distance:             100.00
2022-07-04 19:33:16 - Average Precision with Manhatten-Distance:  37.70

2022-07-04 19:33:16 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 8.162

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 19:39:16 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 4 after 10 steps:
2022-07-04 19:39:17 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9589)
2022-07-04 19:39:17 - F1 with Cosine-Similarity:                 59.64	(Threshold: 0.2333)
2022-07-04 19:39:17 - Precision with Cosine-Similarity:          42.49
2022-07-04 19:39:17 - Recall with Cosine-Similarity:             100.00
2022-07-04 19:39:17 - Average Precision with Cosine-Similarity:  39.10

2022-07-04 19:39:17 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 218.7397)
2022-07-04 19:39:17 - F1 with Manhatten-Distance:                 59.15	(Threshold: 761.8339)
2022-07-04 19:39:17 - Precision with Manhatten-Distance:          48.09
2022-07-04 19:39:17 - Recall with Manhatten-Distance:             76.83
2022-07-04 19:39:17 - Average Precision with Manhatten-Distance:  39.33

2022-07-04 19:39:17 - Accuracy with Euclidean-Distance:           59.50	(Threshold: 28.809

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 19:41:00 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 5 after 10 steps:
2022-07-04 19:41:00 - Accuracy with Cosine-Similarity:           58.50	(Threshold: 0.9556)
2022-07-04 19:41:00 - F1 with Cosine-Similarity:                 59.34	(Threshold: 0.2846)
2022-07-04 19:41:00 - Precision with Cosine-Similarity:          42.41
2022-07-04 19:41:00 - Recall with Cosine-Similarity:             98.78
2022-07-04 19:41:00 - Average Precision with Cosine-Similarity:  39.03

2022-07-04 19:41:00 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 224.5653)
2022-07-04 19:41:00 - F1 with Manhatten-Distance:                 59.42	(Threshold: 936.5785)
2022-07-04 19:41:00 - Precision with Manhatten-Distance:          42.27
2022-07-04 19:41:00 - Recall with Manhatten-Distance:             100.00
2022-07-04 19:41:00 - Average Precision with Manhatten-Distance:  38.99

2022-07-04 19:41:00 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 8.8841

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 19:42:42 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 6 after 10 steps:
2022-07-04 19:42:43 - Accuracy with Cosine-Similarity:           59.00	(Threshold: 0.5819)
2022-07-04 19:42:43 - F1 with Cosine-Similarity:                 59.21	(Threshold: 0.2345)
2022-07-04 19:42:43 - Precision with Cosine-Similarity:          42.05
2022-07-04 19:42:43 - Recall with Cosine-Similarity:             100.00
2022-07-04 19:42:43 - Average Precision with Cosine-Similarity:  39.33

2022-07-04 19:42:43 - Accuracy with Manhatten-Distance:           59.00	(Threshold: 713.0913)
2022-07-04 19:42:43 - F1 with Manhatten-Distance:                 58.99	(Threshold: 956.0571)
2022-07-04 19:42:43 - Precision with Manhatten-Distance:          41.84
2022-07-04 19:42:43 - Recall with Manhatten-Distance:             100.00
2022-07-04 19:42:43 - Average Precision with Manhatten-Distance:  39.44

2022-07-04 19:42:43 - Accuracy with Euclidean-Distance:           59.00	(Threshold: 27.89

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 19:48:12 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 7 after 10 steps:
2022-07-04 19:48:12 - Accuracy with Cosine-Similarity:           59.00	(Threshold: 0.5577)
2022-07-04 19:48:12 - F1 with Cosine-Similarity:                 59.18	(Threshold: 0.3321)
2022-07-04 19:48:12 - Precision with Cosine-Similarity:          42.70
2022-07-04 19:48:12 - Recall with Cosine-Similarity:             96.34
2022-07-04 19:48:12 - Average Precision with Cosine-Similarity:  39.93

2022-07-04 19:48:12 - Accuracy with Manhatten-Distance:           59.50	(Threshold: 683.0934)
2022-07-04 19:48:12 - F1 with Manhatten-Distance:                 59.53	(Threshold: 761.0834)
2022-07-04 19:48:12 - Precision with Manhatten-Distance:          48.12
2022-07-04 19:48:12 - Recall with Manhatten-Distance:             78.05
2022-07-04 19:48:12 - Average Precision with Manhatten-Distance:  40.18

2022-07-04 19:48:12 - Accuracy with Euclidean-Distance:           59.00	(Threshold: 28.3010

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 19:51:39 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 8 after 10 steps:
2022-07-04 19:51:40 - Accuracy with Cosine-Similarity:           59.00	(Threshold: 0.5704)
2022-07-04 19:51:40 - F1 with Cosine-Similarity:                 59.18	(Threshold: 0.3359)
2022-07-04 19:51:40 - Precision with Cosine-Similarity:          42.70
2022-07-04 19:51:40 - Recall with Cosine-Similarity:             96.34
2022-07-04 19:51:40 - Average Precision with Cosine-Similarity:  40.39

2022-07-04 19:51:40 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 222.6154)
2022-07-04 19:51:40 - F1 with Manhatten-Distance:                 58.91	(Threshold: 945.8055)
2022-07-04 19:51:40 - Precision with Manhatten-Distance:          41.97
2022-07-04 19:51:40 - Recall with Manhatten-Distance:             98.78
2022-07-04 19:51:40 - Average Precision with Manhatten-Distance:  40.37

2022-07-04 19:51:40 - Accuracy with Euclidean-Distance:           59.00	(Threshold: 28.2771

Iteration:   0%|          | 0/559 [00:00<?, ?it/s]

2022-07-04 19:54:14 - Binary Accuracy Evaluation of the model on sts-dev dataset in epoch 9 after 10 steps:
2022-07-04 19:54:14 - Accuracy with Cosine-Similarity:           59.50	(Threshold: 0.5565)
2022-07-04 19:54:14 - F1 with Cosine-Similarity:                 59.12	(Threshold: 0.2403)
2022-07-04 19:54:14 - Precision with Cosine-Similarity:          42.19
2022-07-04 19:54:14 - Recall with Cosine-Similarity:             98.78
2022-07-04 19:54:14 - Average Precision with Cosine-Similarity:  40.48

2022-07-04 19:54:14 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 224.8474)
2022-07-04 19:54:14 - F1 with Manhatten-Distance:                 59.09	(Threshold: 770.4397)
2022-07-04 19:54:14 - Precision with Manhatten-Distance:          47.10
2022-07-04 19:54:14 - Recall with Manhatten-Distance:             79.27
2022-07-04 19:54:14 - Average Precision with Manhatten-Distance:  40.55

2022-07-04 19:54:14 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 8.8578)

In [78]:
# Evaluate the novelty model returned by the `sbert_training.train_model(...)` cell
# above with the `evaluator` returned alongside it; the returned score is shown as
# the cell output.
# NOTE(review): the logged lines look like sentence-transformers'
# BinaryClassificationEvaluator, whose return value is (in the 2.x releases) the
# best average precision over the similarity measures — confirm for the installed version.
trained_model.evaluate(evaluator)

2022-07-04 19:55:55 - Binary Accuracy Evaluation of the model on sts-dev dataset:
2022-07-04 19:55:56 - Accuracy with Cosine-Similarity:           60.00	(Threshold: 0.5639)
2022-07-04 19:55:56 - F1 with Cosine-Similarity:                 59.12	(Threshold: 0.2386)
2022-07-04 19:55:56 - Precision with Cosine-Similarity:          42.19
2022-07-04 19:55:56 - Recall with Cosine-Similarity:             98.78
2022-07-04 19:55:56 - Average Precision with Cosine-Similarity:  40.46

2022-07-04 19:55:56 - Accuracy with Manhatten-Distance:           58.50	(Threshold: 224.9925)
2022-07-04 19:55:56 - F1 with Manhatten-Distance:                 59.36	(Threshold: 768.8490)
2022-07-04 19:55:56 - Precision with Manhatten-Distance:          47.45
2022-07-04 19:55:56 - Recall with Manhatten-Distance:             79.27
2022-07-04 19:55:56 - Average Precision with Manhatten-Distance:  40.46

2022-07-04 19:55:56 - Accuracy with Euclidean-Distance:           58.50	(Threshold: 8.8632)
2022-07-04 19:55:56 - F1 

0.4106570104160372