In [1]:
import os
# Show the working directory: the relative '../data/...' paths used below are resolved against it.
os.getcwd()

'/Users/sandrinechausson/Documents/easyclaimsdetection/experiments'

In [2]:
import json
import math
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# Register tqdm's progress_* methods on pandas objects (the cells visible here call plain .apply).
tqdm.pandas()

# Experiments from paper

## Global variables

In [4]:
# Test set with annotations and pre-computed model outputs, one column per model.
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
test_df = pd.read_pickle('../data/testing.pkl')

In [5]:
# Label order of the multi-hot vectors. Index 0 ('0_0') is the slot get_multi_pred sets when
# no other class fires; CLASSES[1:] are matched against the first three characters of claim keys.
CLASSES = ['0_0', '1_1', '1_2', '1_3', '1_4', '1_6', '1_7']

In [6]:
# Claims mapping: keys start with a class code from CLASSES, values are used by the helpers
# below to index each row's score container. Read as UTF-8 explicitly so decoding does not
# depend on the platform's default encoding.
with open('../data/claims/claims.json', encoding='utf-8') as file:
    claims = json.load(file)

## Helper functions

In [7]:
def get_median(distr, start, end, step):
    """Estimate the median of a discretised distribution given as log-weights.

    Parameters
    ----------
    distr : array-like of float
        Log-weights, one per point of the grid ``np.arange(start, end, step)``.
    start, end, step : float
        Grid definition (``end`` itself is excluded by ``np.arange``).

    Returns
    -------
    float
        Mean of two estimates: the last grid point (ascending) whose cumulative
        weight is <= half of the total, and the last grid point (descending)
        whose cumulative weight counted from the top is < half of the total.
        When no grid point qualifies, ``start`` (resp. ``end``) is used instead.

    Raises
    ------
    ValueError
        If ``distr`` does not have exactly one entry per grid point. Previously
        this case was swallowed by ``except IndexError`` and silently produced
        the ``start``/``end`` fallback.
    """
    x = np.arange(start, end, step)
    exp_f = np.exp(np.asarray(distr, dtype=float))
    if exp_f.shape != x.shape:
        raise ValueError(
            'distr has shape {} but the grid np.arange({}, {}, {}) has shape {}'.format(
                exp_f.shape, start, end, step, x.shape))

    alpha = exp_f.sum() * 0.5

    # NOTE(review): the lower estimate uses <= while the upper one uses <; kept
    # as-is so existing thresholds are reproduced -- confirm this is intended.
    below_half = exp_f.cumsum() <= alpha
    median_low = x[below_half][-1] if below_half.any() else start

    above_half = exp_f[::-1].cumsum() < alpha
    median_high = x[::-1][above_half][-1] if above_half.any() else end

    return (median_low + median_high) / 2

def get_thresholds_from_record(record, start=0, end=1, step=0.01):
    """Turn a bisection record into per-claim thresholds.

    For every claim in ``record`` the last entry of its ``'distributions'``
    list is passed to ``get_median`` on the grid defined by ``start``, ``end``
    and ``step``.

    Returns a dict of the form ``{claim: {'threshold': median}}``.
    """
    return {
        claim: {'threshold': get_median(entry['distributions'][-1], start, end, step)}
        for claim, entry in record.items()
    }

def keep_only_top_claim(scores_dict, classes=None):
    """Keep, per class, only the highest-scoring claim and zero out the others.

    Parameters
    ----------
    scores_dict : dict
        Maps claim key -> score. A claim belongs to a class when its key starts
        with that class label.
    classes : iterable of str, optional
        Class labels to process. Defaults to ``CLASSES[1:]``.

    Returns
    -------
    dict
        For every claim of every processed class: its original score if it is
        the (first) top-scoring claim of its class, otherwise ``0``. Claims that
        match none of the classes are omitted.

    Notes
    -----
    The claims are taken from ``scores_dict`` itself rather than from the
    module-level ``claims`` mapping, so the function also works for extended
    or reduced claim sets. A class with no claims is skipped instead of
    making ``max()`` fail on an empty sequence.
    """
    if classes is None:
        classes = CLASSES[1:]

    new_dict = dict()
    for cl in classes:
        rel_claims = [t for t in scores_dict if t[:len(cl)] == cl]
        if not rel_claims:
            continue
        highest_t = max(rel_claims, key=scores_dict.get)
        for t in rel_claims:
            new_dict[t] = scores_dict[t] if t == highest_t else 0
    return new_dict

def get_multi_pred(row, column, claims_dict, thresholds=None, black_list=None, min_length=1, top_len=2,  min_avgs=None, top_claim_only=False):
    """Build the multi-hot prediction vector for a single row.

    Parameters
    ----------
    row : mapping-like
        ``row[column]`` holds the scores: either a container indexed by the
        values of ``claims_dict`` or a list whose first element is such a
        container.
    column : str
        Name of the score column to read.
    claims_dict : dict
        Maps claim key -> index into the score container. The first three
        characters of a claim key are compared with the class labels.
    thresholds : dict, optional
        ``{claim: {'threshold': value}}``; defaults to 0.0 for every claim.
    black_list : list, optional
        Claim keys to ignore.
    min_length : int
        Minimum number of claims of a class that must exceed their threshold.
    top_len : int
        Number of top scores summed; the sum is always divided by ``top_len``.
    min_avgs : dict, optional
        ``{key: {'threshold': value}}``. If the class label is a key, its value
        is the minimum average; otherwise the values of the top claims are
        summed and divided by ``top_len``. Defaults to 0.0 for every class.
    top_claim_only : bool
        If true, scores are first passed through ``keep_only_top_claim``.

    Returns
    -------
    list of int
        One flag per entry of ``CLASSES``: position 0 is 1 only when no other
        class fired; the remaining positions follow ``CLASSES[1:]``.
    """
    thresholds = {t: {'threshold': 0.0} for t in claims_dict} if thresholds is None else thresholds
    min_avgs = {c: {'threshold': 0.0} for c in CLASSES[1:]} if min_avgs is None else min_avgs
    black_list = [] if black_list is None else black_list

    # Scores may be wrapped in a one-element list; unwrap lazily per claim.
    wrapped = isinstance(row[column], list)
    zsl_scores = {
        claim: (row[column][0] if wrapped else row[column])[claims_dict[claim]]
        for claim in claims_dict
    }
    if top_claim_only:
        zsl_scores = keep_only_top_claim(zsl_scores)

    class_flags = []
    for label in CLASSES[1:]:
        candidates = {
            claim: score
            for claim, score in zsl_scores.items()
            if claim[:3] == label and claim not in black_list
        }
        ranked_claims = sorted(candidates, key=lambda claim: candidates[claim], reverse=True)[:top_len]
        ranked_scores = sorted(candidates.values(), reverse=True)[:top_len]

        if label in min_avgs:
            avg_threshold = min_avgs[label]['threshold']
        else:
            avg_threshold = sum([min_avgs[claim]['threshold'] for claim in ranked_claims]) / top_len

        passing = [claim for claim in candidates if candidates[claim] > thresholds[claim]['threshold']]
        fires = len(passing) >= min_length and (sum(ranked_scores) / top_len) > avg_threshold
        class_flags.append(1 if fires else 0)

    # Slot 0 is the "no class" label: set only when nothing else fired.
    none_flag = 0 if any(flag == 1 for flag in class_flags) else 1
    return [none_flag] + class_flags

## Section 4.1: Evaluation against baselines

### Baseline 1: Fine-tuned RoBERTa

In [8]:
# Per-class precision/recall/F1 of the stored 'RoBERTa' predictions against 'multi_annot'.
print(classification_report(test_df['multi_annot'].to_list(), test_df['RoBERTa'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         0_0       0.98      0.97      0.98      2395
         1_1       0.73      0.89      0.80        36
         1_2       0.56      0.42      0.48        12
         1_3       0.88      0.58      0.70        48
         1_4       0.68      0.81      0.74        70
         1_6       0.85      0.71      0.77        24
         1_7       0.41      0.65      0.50        34

   micro avg       0.95      0.95      0.95      2619
   macro avg       0.73      0.72      0.71      2619
weighted avg       0.96      0.95      0.95      2619
 samples avg       0.95      0.95      0.95      2619



### Baseline 2: Fine-tuned RoBERTa w/ limited training data (RoBERTa_MINI)

In [9]:
# Per-class precision/recall/F1 of the stored 'RoBERTa_MINI' predictions against 'multi_annot'.
print(classification_report(test_df['multi_annot'].to_list(), test_df['RoBERTa_MINI'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         0_0       0.94      0.99      0.96      2395
         1_1       0.56      0.75      0.64        36
         1_2       0.00      0.00      0.00        12
         1_3       0.00      0.00      0.00        48
         1_4       0.62      0.14      0.23        70
         1_6       0.92      0.46      0.61        24
         1_7       0.00      0.00      0.00        34

   micro avg       0.93      0.92      0.93      2619
   macro avg       0.43      0.33      0.35      2619
weighted avg       0.89      0.92      0.90      2619
 samples avg       0.93      0.93      0.93      2619



  _warn_prf(average, modifier, msg_start, len(result))


### Baseline 3: SBERT

In [10]:
# Bisection record for SBERT; read as UTF-8 explicitly rather than with the platform default.
with open('../data/bisection_records/SBERT.json', encoding='utf-8') as file:
    sbert_record = json.load(file)

In [11]:
# One threshold per claim from the last recorded distribution, on a grid over [-1, 1)
# (presumably because the scores are cosine similarities -- confirm).
sbert_thresholds = get_thresholds_from_record(sbert_record, start=-1, end=1)

In [12]:
# Multi-hot predictions from the 'sbert_cosine' scores; overwrites the shared 'pred_multi' column.
test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'sbert_cosine', claims, thresholds=sbert_thresholds), axis=1)

In [13]:
# Report for whatever is currently stored in 'pred_multi' (here: the SBERT predictions from the cell above).
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         0_0       0.96      0.96      0.96      2395
         1_1       0.41      0.92      0.57        36
         1_2       0.20      0.08      0.12        12
         1_3       0.64      0.15      0.24        48
         1_4       0.40      0.44      0.42        70
         1_6       0.52      0.96      0.68        24
         1_7       0.50      0.50      0.50        34

   micro avg       0.91      0.92      0.92      2619
   macro avg       0.52      0.57      0.50      2619
weighted avg       0.92      0.92      0.92      2619
 samples avg       0.92      0.93      0.92      2619



In [14]:
# Number of distinct texts stored across all claims of the SBERT record.
datapoints = [text for entry in sbert_record.values() for text in entry['texts']]
len(set(datapoints))

682

### Baseline 4: BART MNLI model with unique threshold (Zero-Shot Learning approach)

In [15]:
# Same fixed threshold (0.95) for every claim, i.e. no per-claim tuning.
artificial_thresholds = {k: {'threshold': 0.95} for k in claims}

In [16]:
# Multi-hot predictions from the 'FSL_BART' scores with the fixed thresholds; overwrites 'pred_multi'.
test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'FSL_BART', claims, thresholds=artificial_thresholds), axis=1)

In [17]:
# Report for the current 'pred_multi' column (fixed-threshold BART predictions from the cell above).
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         0_0       0.97      0.92      0.94      2395
         1_1       1.00      0.33      0.50        36
         1_2       0.53      0.75      0.62        12
         1_3       0.60      0.75      0.67        48
         1_4       0.75      0.47      0.58        70
         1_6       0.93      0.54      0.68        24
         1_7       0.10      0.65      0.17        34

   micro avg       0.88      0.89      0.88      2619
   macro avg       0.70      0.63      0.60      2619
weighted avg       0.94      0.89      0.91      2619
 samples avg       0.89      0.89      0.89      2619



### Ours 1: Few-shot NLI approach using BART MNLI

In [18]:
# Bisection record for BART; read as UTF-8 explicitly rather than with the platform default.
with open('../data/bisection_records/BART.json', encoding='utf-8') as file:
    bart_record = json.load(file)

In [19]:
# One threshold per claim from the last recorded distribution, on a grid over [0, 1) with the default step.
bart_thresholds = get_thresholds_from_record(bart_record, start=0, end=1)

In [20]:
# Multi-hot predictions from the 'FSL_BART' scores with per-claim thresholds; overwrites 'pred_multi'.
test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'FSL_BART', claims, thresholds=bart_thresholds), axis=1)

In [21]:
# Report for the current 'pred_multi' column (BART predictions from the cell above).
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         0_0       0.98      0.98      0.98      2395
         1_1       0.68      0.64      0.66        36
         1_2       0.82      0.75      0.78        12
         1_3       0.61      0.77      0.68        48
         1_4       0.68      0.70      0.69        70
         1_6       0.88      0.62      0.73        24
         1_7       0.51      0.65      0.57        34

   micro avg       0.95      0.95      0.95      2619
   macro avg       0.74      0.73      0.73      2619
weighted avg       0.95      0.95      0.95      2619
 samples avg       0.95      0.96      0.95      2619



In [22]:
# Number of distinct texts stored across all claims of the BART record.
datapoints = [text for entry in bart_record.values() for text in entry['texts']]
len(set(datapoints))

517

### Ours 2: Few-shot NLI approach using DistilBART MNLI

In [23]:
# Bisection record for DistilBART; read as UTF-8 explicitly rather than with the platform default.
with open('../data/bisection_records/DistilBART.json', encoding='utf-8') as file:
    distilbart_record = json.load(file)

In [24]:
# One threshold per claim from the last recorded distribution, on a grid over [0, 1) with the default step.
distilbart_thresholds = get_thresholds_from_record(distilbart_record, start=0, end=1)

In [25]:
# Multi-hot predictions from the 'FSL_DistilBART' scores; overwrites 'pred_multi'.
test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'FSL_DistilBART', claims, thresholds=distilbart_thresholds), axis=1)

In [26]:
# Report for the current 'pred_multi' column (DistilBART predictions from the cell above).
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         0_0       0.98      0.97      0.97      2395
         1_1       0.68      0.75      0.71        36
         1_2       0.48      0.92      0.63        12
         1_3       0.73      0.50      0.59        48
         1_4       0.58      0.71      0.64        70
         1_6       0.83      0.79      0.81        24
         1_7       0.35      0.65      0.46        34

   micro avg       0.94      0.94      0.94      2619
   macro avg       0.66      0.76      0.69      2619
weighted avg       0.95      0.94      0.94      2619
 samples avg       0.94      0.95      0.94      2619



In [27]:
# Number of distinct texts stored across all claims of the DistilBART record.
datapoints = [text for entry in distilbart_record.values() for text in entry['texts']]
len(set(datapoints))

596

### Ours 3: Few-shot NLI approach using DeBERTa MNLI

In [28]:
# Bisection record for DeBERTa; read as UTF-8 explicitly rather than with the platform default.
with open('../data/bisection_records/DeBERTa.json', encoding='utf-8') as file:
    deberta_record = json.load(file)

In [29]:
# One threshold per claim from the last recorded distribution, on a grid over [0, 1) with the default step.
deberta_thresholds = get_thresholds_from_record(deberta_record, start=0, end=1)

In [30]:
# Multi-hot predictions from the 'FSL_DeBERTa' scores; overwrites 'pred_multi'.
test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'FSL_DeBERTa', claims, thresholds=deberta_thresholds), axis=1)

In [31]:
# Report for the current 'pred_multi' column (DeBERTa predictions from the cell above).
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         0_0       0.98      0.93      0.95      2395
         1_1       0.64      0.64      0.64        36
         1_2       0.48      0.83      0.61        12
         1_3       0.52      0.52      0.52        48
         1_4       0.68      0.79      0.73        70
         1_6       0.65      0.71      0.68        24
         1_7       0.13      0.68      0.21        34

   micro avg       0.89      0.91      0.90      2619
   macro avg       0.58      0.73      0.62      2619
weighted avg       0.94      0.91      0.92      2619
 samples avg       0.90      0.91      0.90      2619



In [32]:
# Number of distinct texts stored across all claims of the DeBERTa record.
datapoints = [text for entry in deberta_record.values() for text in entry['texts']]
len(set(datapoints))

445

## Experiment 2: Changing classification requirements

#### A. Requiring at least 2 claims

In [33]:
# Reload the original BART record (bart_record is reassigned by other experiments); explicit UTF-8.
with open('../data/bisection_records/BART.json', encoding='utf-8') as file:
    bart_record = json.load(file)

In [34]:
# Per-claim thresholds from the record loaded in the previous cell (grid over [0, 1), default step).
bart_thresholds = get_thresholds_from_record(bart_record, start=0, end=1)

In [35]:
# min_length=2: a class fires only if at least two of its claims exceed their thresholds.
test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'FSL_BART', claims, thresholds=bart_thresholds, min_length=2), axis=1)

In [36]:
# Report for the current 'pred_multi' column (setting A from the cell above).
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         0_0       0.94      1.00      0.97      2395
         1_1       0.93      0.39      0.55        36
         1_2       1.00      0.08      0.15        12
         1_3       0.78      0.15      0.25        48
         1_4       0.89      0.36      0.51        70
         1_6       0.86      0.25      0.39        24
         1_7       0.60      0.09      0.15        34

   micro avg       0.94      0.93      0.94      2619
   macro avg       0.86      0.33      0.42      2619
weighted avg       0.93      0.93      0.92      2619
 samples avg       0.94      0.94      0.94      2619



#### B. Considering only top 1 claim ONLY during classification step (same thresholds as in initial set up)

In [37]:
# Reload the original BART record; read as UTF-8 explicitly rather than with the platform default.
with open('../data/bisection_records/BART.json', encoding='utf-8') as file:
    bart_record = json.load(file)

In [38]:
# Per-claim thresholds from the record loaded in the previous cell (grid over [0, 1), default step).
bart_thresholds = get_thresholds_from_record(bart_record, start=0, end=1)

In [39]:
# top_claim_only=True: within each class all scores except the highest are zeroed before thresholding.
test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'FSL_BART', claims, thresholds=bart_thresholds, top_claim_only=True), axis=1)

In [40]:
# Report for the current 'pred_multi' column (setting B from the cell above).
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         0_0       0.97      0.98      0.98      2395
         1_1       0.77      0.64      0.70        36
         1_2       0.86      0.50      0.63        12
         1_3       0.60      0.75      0.67        48
         1_4       0.70      0.69      0.69        70
         1_6       0.88      0.62      0.73        24
         1_7       0.47      0.53      0.50        34

   micro avg       0.95      0.95      0.95      2619
   macro avg       0.75      0.67      0.70      2619
weighted avg       0.95      0.95      0.95      2619
 samples avg       0.95      0.95      0.95      2619



#### C. Considering only top 1 claim ONLY during threshold-tuning step (new thresholds)

In [41]:
# Record from the "top 1 only" tuning run (replaces bart_record); explicit UTF-8.
with open('../data/bisection_records/BART_top_1_only.json', encoding='utf-8') as file:
    bart_record = json.load(file)

In [42]:
# Per-claim thresholds from the record loaded in the previous cell (grid over [0, 1), default step).
bart_thresholds = get_thresholds_from_record(bart_record, start=0, end=1)

In [43]:
# Default classification rule, but with the thresholds derived from the top-1-only record.
test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'FSL_BART', claims, thresholds=bart_thresholds), axis=1)

In [44]:
# Report for the current 'pred_multi' column (setting C from the cell above).
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         0_0       0.97      0.99      0.98      2395
         1_1       0.73      0.61      0.67        36
         1_2       0.89      0.67      0.76        12
         1_3       0.60      0.75      0.67        48
         1_4       0.80      0.64      0.71        70
         1_6       0.94      0.62      0.75        24
         1_7       0.52      0.47      0.49        34

   micro avg       0.95      0.96      0.95      2619
   macro avg       0.78      0.68      0.72      2619
weighted avg       0.95      0.96      0.95      2619
 samples avg       0.96      0.96      0.96      2619



#### D. Considering only top 1 claim during BOTH threshold-tuning and classification steps (new thresholds)

In [45]:
# Record from the "top 1 only" tuning run; read as UTF-8 explicitly rather than with the platform default.
with open('../data/bisection_records/BART_top_1_only.json', encoding='utf-8') as file:
    bart_record = json.load(file)

In [46]:
# Per-claim thresholds from the record loaded in the previous cell (grid over [0, 1), default step).
bart_thresholds = get_thresholds_from_record(bart_record, start=0, end=1)

In [47]:
# Top-1-only thresholds combined with top_claim_only=True at classification time.
test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'FSL_BART', claims, thresholds=bart_thresholds, top_claim_only=True), axis=1)

In [48]:
# Report for the current 'pred_multi' column (setting D from the cell above).
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         0_0       0.97      0.99      0.98      2395
         1_1       0.91      0.56      0.69        36
         1_2       0.86      0.50      0.63        12
         1_3       0.60      0.75      0.67        48
         1_4       0.84      0.59      0.69        70
         1_6       0.94      0.62      0.75        24
         1_7       0.53      0.47      0.50        34

   micro avg       0.95      0.95      0.95      2619
   macro avg       0.81      0.64      0.70      2619
weighted avg       0.95      0.95      0.95      2619
 samples avg       0.96      0.96      0.95      2619



#### E. Average of top 2 claims ONLY during classification step (same thresholds as in initial set up)

In [49]:
# Reload the original BART record; read as UTF-8 explicitly rather than with the platform default.
with open('../data/bisection_records/BART.json', encoding='utf-8') as file:
    bart_record = json.load(file)

In [50]:
# Per-claim thresholds from the record loaded in the previous cell; used as `min_avgs` in the next cell.
bart_thresholds = get_thresholds_from_record(bart_record, start=0, end=1)

In [51]:
# The per-claim thresholds are passed as `min_avgs`; `thresholds` keeps its default (0.0 per claim)
# and `top_len` its default (2).
# NOTE(review): the section title describes a top-2 average, but the argument set here is
# min_length=2, not top_len=2 -- confirm that min_length=2 is intended.
test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'FSL_BART', claims, min_length=2, min_avgs=bart_thresholds), axis=1)

In [52]:
# Report for the current 'pred_multi' column (setting E from the cell above).
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         0_0       0.95      1.00      0.97      2395
         1_1       0.90      0.53      0.67        36
         1_2       1.00      0.33      0.50        12
         1_3       0.85      0.23      0.36        48
         1_4       0.84      0.44      0.58        70
         1_6       0.86      0.25      0.39        24
         1_7       0.80      0.24      0.36        34

   micro avg       0.94      0.94      0.94      2619
   macro avg       0.88      0.43      0.55      2619
weighted avg       0.94      0.94      0.93      2619
 samples avg       0.95      0.94      0.94      2619



#### F. Average of top 2 claims during BOTH the threshold-tuning and classification steps (new thresholds)

In [53]:
# Record from the "top 2 average" tuning run (replaces bart_record); explicit UTF-8.
with open('../data/bisection_records/BART_top_2_avg.json', encoding='utf-8') as file:
    bart_record = json.load(file)

In [54]:
# Thresholds from the record loaded in the previous cell; used as `min_avgs` in the next cell.
bart_thresholds = get_thresholds_from_record(bart_record, start=0, end=1)

In [55]:
# The mean of the top 2 scores per class must exceed the minimum average taken from `min_avgs`;
# `thresholds` keeps its default (0.0 per claim).
test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'FSL_BART', claims, top_len=2, min_avgs=bart_thresholds), axis=1)

In [56]:
# Report for the current 'pred_multi' column (setting F from the cell above).
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         0_0       0.95      1.00      0.97      2395
         1_1       0.89      0.44      0.59        36
         1_2       1.00      0.33      0.50        12
         1_3       0.80      0.33      0.47        48
         1_4       0.82      0.39      0.52        70
         1_6       0.75      0.12      0.21        24
         1_7       0.86      0.18      0.29        34

   micro avg       0.94      0.94      0.94      2619
   macro avg       0.87      0.40      0.51      2619
weighted avg       0.94      0.94      0.93      2619
 samples avg       0.94      0.94      0.94      2619



## Experiment 3: Using negated claims

In [57]:
# Negated claims; normalise_nli_scores looks them up by the same keys as `claims`.
# Read as UTF-8 explicitly so decoding does not depend on the platform's default encoding.
with open('../data/claims/claims_neg.json', encoding='utf-8') as file:
    claims_neg = json.load(file)

In [58]:
def normalise_nli_scores(row):
    """Suppress the score of every claim whose negated version scores higher.

    Reads ``row['FSL_BART']`` (indexed by the values of ``claims``) and
    ``row['FSL_BART_neg']`` (indexed by the values of ``claims_neg``).

    Returns a dict keyed by the values of ``claims``: the positive score, or
    ``0.0`` when the score of the negated claim under the same key is strictly
    greater.
    """
    pos_scores = {k: row['FSL_BART'][claims[k]] for k in claims}
    neg_scores = {k: row['FSL_BART_neg'][claims_neg[k]] for k in claims_neg}
    return {
        claims[k]: (0.0 if neg_scores[k] > pos_score else pos_score)
        for k, pos_score in pos_scores.items()
    }

In [59]:
# New score column: positive scores with entries zeroed where the negated claim scored higher.
test_df['FSL_BART_pos_and_neg'] = test_df.apply(normalise_nli_scores, axis=1)

In [60]:
# Reload the original BART record; read as UTF-8 explicitly rather than with the platform default.
with open('../data/bisection_records/BART.json', encoding='utf-8') as file:
    bart_record = json.load(file)

In [61]:
# Per-claim thresholds from the record loaded in the previous cell (grid over [0, 1), default step).
bart_thresholds = get_thresholds_from_record(bart_record, start=0, end=1)

In [62]:
# Same rule and thresholds as the BART run, applied to the 'FSL_BART_pos_and_neg' scores.
test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'FSL_BART_pos_and_neg', claims, thresholds=bart_thresholds), axis=1)

In [63]:
# Report for the current 'pred_multi' column (negated-claims setting from the cell above).
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         0_0       0.97      0.98      0.98      2395
         1_1       0.70      0.58      0.64        36
         1_2       0.82      0.75      0.78        12
         1_3       0.60      0.75      0.67        48
         1_4       0.68      0.70      0.69        70
         1_6       0.94      0.62      0.75        24
         1_7       0.52      0.65      0.58        34

   micro avg       0.95      0.95      0.95      2619
   macro avg       0.75      0.72      0.73      2619
weighted avg       0.95      0.95      0.95      2619
 samples avg       0.95      0.96      0.95      2619



## Experiment 4: Adding more claims

In [78]:
# Additional claims; read as UTF-8 explicitly rather than with the platform default.
with open('../data/claims/claims_extras.json', encoding='utf-8') as file:
    claims_extras = json.load(file)

# Merged mapping; on duplicate keys the entry from claims_extras wins.
claims_w_extras = {**claims, **claims_extras}

In [80]:
# Original BART record plus the record for the extra claims, both read as UTF-8 explicitly.
with open('../data/bisection_records/BART.json', encoding='utf-8') as file:
    bart_record = json.load(file)

with open('../data/bisection_records/BART_extras.json', encoding='utf-8') as file:
    bart_record_extras = json.load(file)

# Merged record; on duplicate keys the entry from bart_record_extras wins.
bart_record_w_extras = {**bart_record, **bart_record_extras}

In [81]:
# Per-claim thresholds for the merged record (original + extra claims), grid over [0, 1), default step.
bart_extras_thresholds = get_thresholds_from_record(bart_record_w_extras, start=0, end=1)

In [82]:
# Predictions from the 'FSL_BART_extras' scores using the merged claims and thresholds; overwrites 'pred_multi'.
test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'FSL_BART_extras', claims_w_extras, thresholds=bart_extras_thresholds), axis=1)

In [83]:
# Report for the current 'pred_multi' column (extra-claims setting from the cell above).
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         0_0       0.98      0.96      0.97      2395
         1_1       0.66      0.69      0.68        36
         1_2       0.48      0.92      0.63        12
         1_3       0.62      0.85      0.72        48
         1_4       0.55      0.83      0.66        70
         1_6       0.77      0.71      0.74        24
         1_7       0.37      0.68      0.47        34

   micro avg       0.93      0.95      0.94      2619
   macro avg       0.63      0.81      0.70      2619
weighted avg       0.95      0.95      0.94      2619
 samples avg       0.94      0.95      0.94      2619



## Experiment 5: Changing the value of p for the Probabilistic Bisection

### p = 0.6

In [12]:
# BART record for the p = 0.6 run (replaces bart_record); explicit UTF-8.
with open('../data/bisection_records/BART_0-6.json', encoding='utf-8') as file:
    bart_record = json.load(file)

In [13]:
# Per-claim thresholds from the record loaded in the previous cell (grid over [0, 1), default step).
bart_thresholds = get_thresholds_from_record(bart_record, start=0, end=1)

In [14]:
# `claims` is passed explicitly: get_multi_pred requires `claims_dict` as its third positional
# argument, and omitting it raises TypeError on a fresh kernel.
test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'FSL_BART', claims, thresholds=bart_thresholds), axis=1)

In [15]:
# Report for the current 'pred_multi' column (p = 0.6 thresholds from the cells above).
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         0_0       0.98      0.91      0.94      2395
         1_1       0.68      0.64      0.66        36
         1_2       0.46      0.92      0.61        12
         1_3       0.61      0.77      0.68        48
         1_4       0.62      0.73      0.67        70
         1_6       0.83      0.62      0.71        24
         1_7       0.10      0.65      0.17        34

   micro avg       0.88      0.89      0.88      2619
   macro avg       0.61      0.75      0.64      2619
weighted avg       0.94      0.89      0.91      2619
 samples avg       0.89      0.89      0.89      2619



### p = 0.8

In [16]:
# BART record for the p = 0.8 run (replaces bart_record); explicit UTF-8.
with open('../data/bisection_records/BART_0-8.json', encoding='utf-8') as file:
    bart_record = json.load(file)

In [17]:
# Per-claim thresholds from the record loaded in the previous cell (grid over [0, 1), default step).
bart_thresholds = get_thresholds_from_record(bart_record, start=0, end=1)

In [18]:
# `claims` is passed explicitly: get_multi_pred requires `claims_dict` as its third positional
# argument, and omitting it raises TypeError on a fresh kernel.
test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'FSL_BART', claims, thresholds=bart_thresholds), axis=1)

In [19]:
# Report for the current 'pred_multi' column (p = 0.8 thresholds from the cells above).
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         0_0       0.97      0.98      0.98      2395
         1_1       0.64      0.64      0.64        36
         1_2       0.83      0.83      0.83        12
         1_3       0.61      0.77      0.68        48
         1_4       0.70      0.69      0.69        70
         1_6       0.88      0.62      0.73        24
         1_7       0.50      0.56      0.53        34

   micro avg       0.95      0.95      0.95      2619
   macro avg       0.73      0.73      0.73      2619
weighted avg       0.95      0.95      0.95      2619
 samples avg       0.95      0.95      0.95      2619



### p = 0.9

In [20]:
# BART record for the p = 0.9 run (replaces bart_record); explicit UTF-8.
with open('../data/bisection_records/BART_0-9.json', encoding='utf-8') as file:
    bart_record = json.load(file)

In [21]:
# Per-claim thresholds from the record loaded in the previous cell (grid over [0, 1), default step).
bart_thresholds = get_thresholds_from_record(bart_record, start=0, end=1)

In [22]:
# `claims` is passed explicitly: get_multi_pred requires `claims_dict` as its third positional
# argument, and omitting it raises TypeError on a fresh kernel.
test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'FSL_BART', claims, thresholds=bart_thresholds), axis=1)

In [23]:
# Report for the current 'pred_multi' column (p = 0.9 thresholds from the cells above).
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         0_0       0.97      0.97      0.97      2395
         1_1       0.64      0.69      0.67        36
         1_2       0.69      0.75      0.72        12
         1_3       0.57      0.71      0.63        48
         1_4       0.82      0.66      0.73        70
         1_6       0.88      0.62      0.73        24
         1_7       0.35      0.74      0.48        34

   micro avg       0.94      0.95      0.94      2619
   macro avg       0.70      0.73      0.70      2619
weighted avg       0.95      0.95      0.95      2619
 samples avg       0.95      0.95      0.95      2619

