# Classifiers Agreement

### How often do different classifiers make the same prediction?


Classifiers:
+ LR with traditional features
+ LR with all features (argument-mining + traditional)
+ LR with argument-mining features
+ DistilBERT
+ BERT
+ RoBERTa
+ DeBERTa
+ BERT-large

In [1]:
import os
import json
import pandas as pd
import numpy as np
import pprint as pp

import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from datasets import load_from_disk

## Transformers

Make predictions

In [2]:
# Paths are resolved relative to the project root, taken to be the grandparent
# of the current working directory.
PROJECT_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
SPLITS_DIR = os.path.join(PROJECT_DIR, "classification/split_datasets/coqa")
MODELS_DIR = os.path.join(PROJECT_DIR, "classification/models")

# Saved transformer checkpoints, keyed by the short model name used in this notebook.
models_paths_map = {
    'distilbert': os.path.join(MODELS_DIR, "distilbert-base-uncased_13091207"),
    'bert': os.path.join(MODELS_DIR, "bert_14102004"),
    'bert-large': os.path.join(MODELS_DIR, "bert-large_14101938"),
    'roberta': os.path.join(MODELS_DIR, "roberta_14102014"),
    'deberta': os.path.join(MODELS_DIR, "deberta_14102242")
}

# Dataset splits previously saved to disk.
raw_dataset = load_from_disk(SPLITS_DIR)

# Prefer the second GPU (cuda:1), but fall back to cuda:0 on single-GPU machines
# (where 'cuda:1' is an invalid device) and to the CPU when CUDA is unavailable.
if not torch.cuda.is_available():
    device = torch.device('cpu')
elif torch.cuda.device_count() > 1:
    device = torch.device('cuda:1')
else:
    device = torch.device('cuda:0')

In [3]:
# Gold labels as ints (0 when the label equals False, 1 otherwise), one list per split
gold_labels = {
    split_name: [int(not (label == False)) for label in raw_dataset[split_name]['label']]
    for split_name in ('validation', 'test')
}

In [4]:
# Display the summary (features and number of rows) of the test split
test_split = raw_dataset['test']
test_split

Dataset({
    features: ['text', 'label', 'pandas_idx'],
    num_rows: 1000
})

In [5]:
def init_pipe_and_predict(path_to_existing_model):
    '''
    Load a saved sequence-classification model and predict two dataset splits.

    The tokenizer and model are loaded from `path_to_existing_model`, the model
    is moved to the notebook-level `device`, and a "text-classification"
    pipeline is run over the 'text' column of the 'validation' and 'test'
    splits of the notebook-level `raw_dataset`.

    Predictions are returned in the order of the instances in each split
    (no re-sorting is done here).

    Return a dict {'validation': [int, ...], 'test': [int, ...]} where each int
    is parsed from the part of the predicted label after its last underscore.
    '''
    tokenizer = AutoTokenizer.from_pretrained(path_to_existing_model)
    model = AutoModelForSequenceClassification.from_pretrained(path_to_existing_model)
    model.to(device)

    pipe = pipeline("text-classification",
        tokenizer=tokenizer,
        model=model,
        # Pass `top_k=None` to get confidence scores for all labels; it replaces
        # the deprecated `return_all_scores=True` (`top_k=1` replaces
        # `return_all_scores=False`).
    )
    pipe.device = device

    # Predict each split with its own texts. The validation predictions were
    # previously computed on the test texts by mistake.
    preds = {}
    for split_name in ('validation', 'test'):
        split_preds = pipe(raw_dataset[split_name]['text'])
        preds[split_name] = [
            int(pred['label'].split('_')[-1]) for pred in split_preds
        ]

    return preds

In [6]:
# Run every transformer checkpoint once, in the same order as before
(
    distilbert_preds,
    bert_preds,
    bert_large_preds,
    roberta_preds,
    deberta_preds,
) = (
    init_pipe_and_predict(models_paths_map[model_name])
    for model_name in ('distilbert', 'bert', 'bert-large', 'roberta', 'deberta')
)

In [11]:
# Print DistilBERT's predictions for the test split in a compact, wide layout
distilbert_test_preds = distilbert_preds['test']
pp.pprint(distilbert_test_preds, compact=True, width=120)

[1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1,
 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1

In [12]:
# Sanity check: pandas_idx matches with order in responses-fe (and features file).
# For the train and test splits, print the position and the content of every
# instance whose pandas_idx is at most MAX_PANDAS_IDX_TO_SHOW.
MAX_PANDAS_IDX_TO_SHOW = 5

for split_name in ('train', 'test'):
    for i, instance in enumerate(raw_dataset[split_name]):
        # `instance` is already row i, so there is no need to index the split again.
        if instance['pandas_idx'] <= MAX_PANDAS_IDX_TO_SHOW:
            print(i)
            pp.pprint(instance, compact=True, width=120)

3404
{'label': False,
 'pandas_idx': 2,
 'text': 'First, if the choker is not in a jewelry box or boutique, it must be somewhere else. So, we need to consider '
         'other possible locations. \n'
         '\n'
         'Some options could be:\n'
         '\n'
         "- Clothing stores: Some clothing stores may sell accessories like chokers, so it's worth checking out.\n"
         '- Online marketplaces: You can search for chokers on online marketplaces like Amazon, Etsy, or eBay.\n'
         '- Second-hand stores: You may be able to find a unique choker at a second-hand store or thrift shop.\n'
         "- Craft stores: If you're feeling creative, you can make your own choker by purchasing supplies at a craft "
         'store.\n'
         '\n'
         'So, the answer would be none of the above options listed in the question.'}
6521
{'label': True,
 'pandas_idx': 1,
 'text': 'Sammy wants to go where the people are. This means he is looking for a place where there are a lot of '

## LR

Load predictions

In [43]:
PREDS_DIR = os.path.join(PROJECT_DIR, "classification/preds")

# Naming used in the keys and file names below:
#   arg:   features from argument mining
#   all:   arg + traditional features
#   col:   select by removing collinear features
#   kbest: select by kbest (chi2 test)
#   rfe:   select by recursive feature elimination
lr_pred_files = {
    "lr_arg": "preds_12091031_all_arg.json",
    "lr_all-col-rfe": "preds_12091031_col-rfe_all.json",  # best accuracy
    "lr_all-rfe": "preds_12091031_rfe_all.json",
    "lr_all_kbest-rfe": "preds_12091031_kbest-rfe-ensemble_all.json",
}

# Full path of each LR predictions file, keyed by model name
preds_map = {
    model_name: os.path.join(PREDS_DIR, file_name)
    for model_name, file_name in lr_pred_files.items()
}

In [14]:
from collections import OrderedDict

def make_preds_comparable(lr_preds, dataset_split):
    '''
    Reorder LR predictions so that they follow the order of `dataset_split`
    (and therefore match the order of predictions of the transformer models).

    lr_preds: dict {pandas_idx as str: bool pred}
    dataset_split: iterable of instances, each with a 'pandas_idx' entry

    Return a list of predictions as ints with exactly one entry per instance
    of `dataset_split`, even if a pandas_idx occurs more than once.

    Raise KeyError if an instance's pandas_idx is missing from `lr_preds`.
    '''
    # Build the list directly: going through a dict keyed by pandas_idx would
    # collapse repeated ids and break the positional alignment with the split.
    return [int(lr_preds[str(instance['pandas_idx'])]) for instance in dataset_split]

In [44]:
def _load_lr_test_preds(model_name):
    '''
    Read the LR predictions file registered under `model_name` in preds_map
    and return the predictions aligned to the order of the test split.
    '''
    # Use a context manager so the file handle is closed after reading
    with open(preds_map[model_name]) as preds_file:
        return make_preds_comparable(json.load(preds_file), raw_dataset['test'])

lr_preds_arg = _load_lr_test_preds("lr_arg")

lr_preds_all_col_rfe = _load_lr_test_preds("lr_all-col-rfe")
lr_preds_all_rfe = _load_lr_test_preds("lr_all-rfe")
lr_preds_all_kbest_rfe = _load_lr_test_preds("lr_all_kbest-rfe")

In [45]:
# Sanity check: same number of instances in all lists
# (all lengths are equal exactly when the set of lengths has a single element)
list_lengths = {
    len(lr_preds_all_col_rfe),
    len(gold_labels['test']),
    len(distilbert_preds['test']),
}
len(list_lengths) == 1

True

In [46]:
# Sanity check: same accuracy as in stats/
# Proves that the order issue is fixed
from sklearn.metrics import accuracy_score

# Arguments follow scikit-learn's (y_true, y_pred) order; accuracy is symmetric,
# so the value is the same either way.
lr_all_col_rfe_accuracy = accuracy_score(gold_labels['test'], lr_preds_all_col_rfe)
lr_all_col_rfe_accuracy

0.722

## Cohen's Kappa

It calculates the agreement between two raters who each classify N items into C mutually exclusive categories. The formula for Cohen's Kappa is:

$$\kappa = \frac{p_o - p_e}{1 - p_e}$$

where:
- $p_o$ is the relative observed agreement among raters (identical to accuracy), and
- $p_e$ is the hypothetical probability of chance agreement.


Cohen’s Kappa score ranges from -1 to +1.

+ < 0: Less agreement than expected by chance
+ 0: Agreement equivalent to chance
+ 0.01 - 0.20: Slight agreement
+ 0.21 - 0.40: Fair agreement
+ 0.41 - 0.60: Moderate agreement
+ 0.61 - 0.80: Substantial agreement
+ 0.81 - 1.00: Almost perfect agreement


Cohen's Kappa has several advantages over accuracy when comparing two sets of predictions:

1. **Accounting for chance**: Cohen's Kappa takes into account the possibility of the agreement occurring by chance. This is not the case with accuracy, which can give a misleadingly high score if one class is much more common than the other.

2. **More robust for imbalanced classes**: If the classes in your dataset are imbalanced, accuracy can be misleading. For example, if 95% of your data is of class A, a model that always predicts A will have an accuracy of 95%, but it's not a good model. Cohen's Kappa is more robust in this situation.

3. **Comparing different models**: Cohen's Kappa is especially useful when you want to compare the predictions of different models or of different runs of the same model. It measures how consistently the models/runs agree with each other, beyond the agreement expected by chance.

4. **Inter-rater reliability**: Cohen's Kappa is often used in situations where you want to measure inter-rater reliability, i.e., how much homogeneity or consensus exists in the ratings given by various judges.

In [47]:
from sklearn.metrics import cohen_kappa_score

# Agreement between DistilBERT and the LR model marked as best accuracy, on the test split
distilbert_vs_lr_kappa = cohen_kappa_score(distilbert_preds['test'], lr_preds_all_col_rfe)
distilbert_vs_lr_kappa

0.46819496076001654

In [48]:
# lists of pandas_idx on which the models agree/disagree
instances_with_agreement = []
instances_with_disagreement = []

for position, instance in enumerate(raw_dataset['test']):
    same_prediction = distilbert_preds['test'][position] == lr_preds_all_col_rfe[position]
    # Pick the bucket first, then record the instance id in it
    target_list = instances_with_agreement if same_prediction else instances_with_disagreement
    target_list.append(instance['pandas_idx'])

In [49]:
# (number of agreements, number of disagreements)
tuple(len(ids) for ids in (instances_with_agreement, instances_with_disagreement))

(794, 206)

In [68]:
def get_agreement_stats(preds1, preds2, dataset_split, gold=None):
    '''
    Compare two lists of predictions made on the same dataset split.

    preds1, preds2: predictions, in the same order as `dataset_split`
    dataset_split: sized iterable of instances, each with a 'pandas_idx' entry
    gold: gold labels in the same order as the predictions;
          defaults to gold_labels['test'] when not given

    Return a dict:
    {
        'cohens_kappa': float, # rounded to 3 decimals
        'agreement': int, # len(instances_with_agreement)
        'disagreement': int, # len(instances_with_disagreement)
        'accuracy_1': float, # accuracy of preds1 against gold, rounded to 3 decimals
        'accuracy_2': float, # accuracy of preds2 against gold, rounded to 3 decimals
        'instances_with_agreement': list, # pandas_idx values
        'instances_with_disagreement': list, # pandas_idx values
    }

    Raise ValueError if preds1, preds2 and dataset_split differ in length.
    '''
    if gold is None:
        gold = gold_labels['test']

    # Fail loudly instead of silently comparing only a prefix of the predictions
    if not (len(preds1) == len(preds2) == len(dataset_split)):
        raise ValueError(
            "preds1, preds2 and dataset_split must have the same length, got "
            + ", ".join(str(len(seq)) for seq in (preds1, preds2, dataset_split))
        )

    instances_with_agreement = []
    instances_with_disagreement = []
    for pred1, pred2, instance in zip(preds1, preds2, dataset_split):
        if pred1 == pred2:
            instances_with_agreement.append(instance['pandas_idx'])
        else:
            instances_with_disagreement.append(instance['pandas_idx'])

    # Key order is kept stable: it is the order written to the JSON file
    return {
        'cohens_kappa': round(cohen_kappa_score(preds1, preds2), 3),
        'agreement': len(instances_with_agreement),
        'disagreement': len(instances_with_disagreement),
        # scikit-learn's signature is accuracy_score(y_true, y_pred)
        'accuracy_1': round(accuracy_score(gold, preds1), 3),
        'accuracy_2': round(accuracy_score(gold, preds2), 3),
        'instances_with_agreement': instances_with_agreement,
        'instances_with_disagreement': instances_with_disagreement,
    }

In [69]:
# Agreement stats for all pairs of models (test split).
# Pairs are generated from the dicts below instead of being written out by hand.
# The insertion order of these dicts determines the order of the result keys.
transformer_test_preds = {
    'distilbert': distilbert_preds['test'],
    'bert': bert_preds['test'],
    'bert-large': bert_large_preds['test'],
    'roberta': roberta_preds['test'],
    'deberta': deberta_preds['test'],
}

# LR models trained on all features
lr_all_test_preds = {
    'lr_all-col-rfe': lr_preds_all_col_rfe,
    'lr_all-rfe': lr_preds_all_rfe,
    'lr_all_kbest-rfe': lr_preds_all_kbest_rfe,
}

# LR model trained on arg features
lr_arg_test_preds = {
    'lr_arg': lr_preds_arg,
}


def _stats_across(first_models, second_models):
    '''
    Agreement stats for every (model from first_models, model from second_models)
    pair, keyed by '<name1>x<name2>'.
    '''
    return {
        name1 + 'x' + name2: get_agreement_stats(preds1, preds2, raw_dataset['test'])
        for name1, preds1 in first_models.items()
        for name2, preds2 in second_models.items()
    }


def _stats_within(models):
    '''
    Agreement stats for every unordered pair of distinct models in `models`,
    keyed by '<name1>x<name2>' with name1 listed before name2 in `models`.
    '''
    named_preds = list(models.items())
    return {
        name1 + 'x' + name2: get_agreement_stats(preds1, preds2, raw_dataset['test'])
        for position, (name1, preds1) in enumerate(named_preds)
        for name2, preds2 in named_preds[position + 1:]
    }


all_agreement_stats = {

    'transformerxlr': {
        # all features
        **_stats_across(transformer_test_preds, lr_all_test_preds),
        # arg features
        **_stats_across(transformer_test_preds, lr_arg_test_preds),
    },

    'transformerxtransformer': _stats_within(transformer_test_preds),

    'lrxlr': {
        **_stats_within(lr_all_test_preds),
        **_stats_across(lr_all_test_preds, lr_arg_test_preds),
    }
}

In [70]:
# Save to file
ANALYSIS_DIR = os.path.join(PROJECT_DIR, "classification/analysis")

# Create the output directory if it does not exist yet, so that writing the
# file does not fail on a fresh checkout.
os.makedirs(ANALYSIS_DIR, exist_ok=True)

with open(os.path.join(ANALYSIS_DIR, "agreement_stats.json"), 'w') as f:
    json.dump(all_agreement_stats, f, indent=2)

In [71]:
# Reload the saved stats; the context manager closes the file after reading
with open(os.path.join(ANALYSIS_DIR, "agreement_stats.json")) as f:
    agreement_stats = json.load(f)

# One row per transformer x LR pair; the per-instance lists are left out of the table
transformerxlr_agreement_df = pd.DataFrame.from_dict(
    agreement_stats['transformerxlr'],
    orient='index', 
    columns=['cohens_kappa', 'agreement', 'disagreement', 'accuracy_1', 'accuracy_2']
)
transformerxlr_agreement_df.sort_values(by=['cohens_kappa'], ascending=False)

Unnamed: 0,cohens_kappa,agreement,disagreement,accuracy_1,accuracy_2
distilbertxlr_all-rfe,0.471,785,215,0.804,0.719
distilbertxlr_all-col-rfe,0.468,794,206,0.804,0.722
debertaxlr_all-rfe,0.456,779,221,0.81,0.719
debertaxlr_all-col-rfe,0.453,788,212,0.81,0.722
robertaxlr_all-rfe,0.44,772,228,0.813,0.719
bert-largexlr_all-rfe,0.436,775,225,0.814,0.719
bert-largexlr_all-col-rfe,0.435,786,214,0.814,0.722
bertxlr_all-rfe,0.435,773,227,0.814,0.719
bertxlr_all-col-rfe,0.429,782,218,0.814,0.722
robertaxlr_all-col-rfe,0.426,777,223,0.813,0.722


In [74]:
# One row per transformer x transformer pair, highest kappa first
transformer_pair_columns = ['cohens_kappa', 'agreement', 'disagreement', 'accuracy_1', 'accuracy_2']
transformer_pair_stats = agreement_stats['transformerxtransformer']

transformerxtransformer_agreement_df = pd.DataFrame.from_dict(
    transformer_pair_stats, orient='index', columns=transformer_pair_columns
)
transformerxtransformer_agreement_df.sort_values(by=['cohens_kappa'], ascending=False)

Unnamed: 0,cohens_kappa,agreement,disagreement,accuracy_1,accuracy_2
bertxbert-large,0.955,984,16,0.814,0.814
bertxdeberta,0.939,978,22,0.814,0.81
bert-largexdeberta,0.933,976,24,0.814,0.81
robertaxdeberta,0.933,975,25,0.813,0.81
bertxroberta,0.931,975,25,0.814,0.813
bert-largexroberta,0.931,975,25,0.814,0.813
distilbertxbert-large,0.9,964,36,0.804,0.814
distilbertxbert,0.89,960,40,0.804,0.814
distilbertxdeberta,0.881,956,44,0.804,0.81
distilbertxroberta,0.868,951,49,0.804,0.813


In [75]:
# One row per LR x LR pair, highest kappa first
lr_pair_columns = ['cohens_kappa', 'agreement', 'disagreement', 'accuracy_1', 'accuracy_2']
lr_pair_stats = agreement_stats['lrxlr']

lrxlr_agreement_df = pd.DataFrame.from_dict(
    lr_pair_stats, orient='index', columns=lr_pair_columns
)
lrxlr_agreement_df.sort_values(by=['cohens_kappa'], ascending=False)

Unnamed: 0,cohens_kappa,agreement,disagreement,accuracy_1,accuracy_2
lr_all-col-rfexlr_all-rfe,0.735,889,111,0.722,0.719
lr_all-rfexlr_all_kbest-rfe,0.719,880,120,0.719,0.695
lr_all-col-rfexlr_all_kbest-rfe,0.682,869,131,0.722,0.695
lr_all-col-rfexlr_arg,0.377,774,226,0.722,0.63
lr_all_kbest-rfexlr_arg,0.364,761,239,0.695,0.63
lr_all-rfexlr_arg,0.334,743,257,0.719,0.63
