## Link to the video
https://drive.google.com/file/d/1oDJYwqzMEUjub6QMQZptKvJU-SRbFJf2/view?usp=sharing
## Link to GitHub repository
https://github.com/pradeep1018/conf_matrix_and_calibration

In [None]:
# all the required functions for the assignment are located in this repo
# this repo is the forked version of original implementation with few modifications to run the code
! git clone https://github.com/pradeep1018/conf_matrix_and_calibration

Cloning into 'conf_matrix_and_calibration'...
remote: Enumerating objects: 76, done.[K
remote: Counting objects: 100% (76/76), done.[K
remote: Compressing objects: 100% (63/63), done.[K
remote: Total 76 (delta 31), reused 42 (delta 12), pack-reused 0[K
Unpacking objects: 100% (76/76), done.


In [None]:
# install dependencies
! pip install attrdict
! pip install deepdish
! pip3 install pyro-ppl
! pip install uncertainty-calibration

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting attrdict
  Downloading attrdict-2.0.1-py2.py3-none-any.whl (9.9 kB)
Installing collected packages: attrdict
Successfully installed attrdict-2.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting deepdish
  Downloading deepdish-0.3.7-py2.py3-none-any.whl (37 kB)
Installing collected packages: deepdish
Successfully installed deepdish-0.3.7
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyro-ppl
  Downloading pyro_ppl-1.8.2-py3-none-any.whl (722 kB)
[K     |████████████████████████████████| 722 kB 4.1 MB/s 
Collecting pyro-api>=0.1.1
  Downloading pyro_api-0.1.2-py3-none-any.whl (11 kB)
Installing collected packages: pyro-api, pyro-ppl
Successfully installed pyro-api-0.1.2 pyro-ppl-1.8.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/

In [None]:
# change working directory
import os
os.chdir('/content/conf_matrix_and_calibration')

In [None]:
# import necessary libraries
from data_utils import *
from utils import *
from combination_methods import *
from tqdm.auto import tqdm
import torch
from torch import nn
from sklearn.model_selection import train_test_split
from metrics import *
import csv
import numpy as np
import pandas as pd
import random
from statistics import mode
from torch.nn import ReLU
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor

# Task 1 #

Reproduces the results in the paper. The performance is compared with Appendix C of the paper

In [None]:
def _run_experiment(y_h=None, model_probs=None, y_true=None, **kwargs):
  seed = kwargs.pop('seed', 0)
  n_runs = kwargs.pop('n_runs', 25)
  test_size = kwargs.pop('test_size', 0.3)
  calibration_methods = kwargs.pop('calibration_methods', ['none'])

  acc_data = []
  for i in tqdm(range(n_runs), leave=False, desc='Runs'):
    y_h_tr, y_h_te, model_probs_tr, model_probs_te, y_true_tr, y_true_te = train_test_split(
        y_h, model_probs, y_true, test_size=test_size, random_state=i * seed)
    acc_h = get_acc(y_h_te, y_true_te)
    acc_m = get_acc(np.argmax(model_probs_te, axis=1), y_true_te)

    _acc_data = [acc_h, acc_m]
    _cal_data = []
    for calibration_method in calibration_methods:
      if calibration_method == 'confusion':
        combiner = DoubleConfusionCombiner()
        combiner.fit(model_probs_tr, y_h_tr, y_true_tr)
      else:
        combiner = OracleCombiner(calibration_method=calibration_method)
        combiner.fit(model_probs_tr, y_h_tr, y_true_tr)

      y_comb_te = combiner.combine(model_probs_te, y_h_te)
      acc_comb = get_acc(y_comb_te, y_true_te)
      _acc_data.append(acc_comb)

    acc_data += [_acc_data]

  err_data = np.array(acc_data)
  err_data = np.subtract(1,err_data)
  mean_err = np.round((100 * np.mean(err_data, axis = 0)), 2)
  std_err = np.round((100 * np.std(err_data, axis = 0)), 2)

  return list(mean_err), list(std_err)

In [None]:
def run_experiment_cifar10(experiment_args=None, seed=0, calibration_methods=None):
  model_names = ['resnet-110', 'densenet-bc-L190-k40']
  mean_errs = []
  std_errs = []
  for model_name in tqdm(model_names, desc='Models', leave=True):
    human_counts, model_probs, y_true = load_CIFAR10H(model_name)
    y_h = simulate_single_human(human_counts, seed=seed)

    mean_err, std_err = _run_experiment(y_h=y_h, model_probs=model_probs, y_true=y_true, **experiment_args)
    mean_errs.append(mean_err)
    std_errs.append(std_err)

  df_mean = pd.DataFrame(mean_errs)
  df_mean.columns = ['human', 'model'] + [f'comb {cal_m}' for cal_m in calibration_methods]
  df_mean.index = model_names

  df_std = pd.DataFrame(std_errs)
  df_std.columns = ['human', 'model'] + [f'comb {cal_m}' for cal_m in calibration_methods]
  df_std.index = model_names

  df_mean = df_mean.style.set_caption("Error rates")
  display(df_mean)
  df_std = df_std.style.set_caption('One standard deviation of error rates')
  display(df_std)

In [None]:
def task1():
  seed = 9658
  torch.manual_seed(seed)
  np.random.seed(seed)

  calibration_methods = ['confusion', 'none', 'temperature scaling', 'ensemble temperature scaling', 'imax binning']

  args = {'n_runs': 25,
          'test_size': 0.3,
          'calibration_methods': calibration_methods,
          'seed': seed
          }

  run_experiment_cifar10(experiment_args=args, seed=seed, calibration_methods=calibration_methods)

In [None]:
task1()

Models:   0%|          | 0/2 [00:00<?, ?it/s]

Runs:   0%|          | 0/25 [00:00<?, ?it/s]

(7000, 10)
(7000,)
(7000,)
<class 'numpy.float64'>
<class 'numpy.int64'>
<class 'numpy.int64'>


KeyboardInterrupt: ignored

### Observations:
The results obtained above are almost identical to the results shown in Appendix C of the paper. The experimentation is done on ResNet-110 and DenseNet-BC and the dataset chosen for the task is CIFAR-10H. The mean and standard deviation errors were calculated by training the model 25 times separately.

### Challenges Faced
It was initially harder to get started with the implementation. It took me time to understand the codebase and the different driver functions present in it. I tried to go through the codebase and read the paper alternatively. Honestly, I still can't say that I'm completely thorough with all the math used in the paper, but I have invested enough time to develop a decent understanding of the approach in the paper. I struggled a bit to integrate the github repository with the colab file so that I don't end up copying all the code from the repo and make the notebook look clumsy. The final solution for this issue ended up being quite simple. I learned how to read a research paper and understand its implementation to a good level and also learned how to understand large codebases especially in the context of machine learning and deep learning.

### Hyperparameters Used:
Hyperparameters mainly used in this experiment are seed value, size of test data, number of runs and the calibration methods. In addition, there are also more hyperparameters specific to the calibrators which can impact the accuracy. For instance, temperature scaling and ensemble temperature scaling has temperature as a hyperparameter. 

# Task 2 #

In [None]:
def simulate_three_humans(human_counts, seed=0):
  rng = np.random.default_rng(seed)
  n_rows = human_counts.shape[0]
  n_classes = human_counts.shape[1]

  human_labels= np.empty(shape=(n_rows))
  for row in range(n_rows):
      temp = []
      for i in range(n_classes):
          temp += [i] * int(human_counts[row, i])
      rng.shuffle(temp)

      if len(set(temp[:3])) == 3:
        human_labels[row] = random.choice(temp[:3])
      else:
        human_labels[row] = mode(temp[:3])

  human_labels = np.reshape(human_labels, (n_rows, ))
  return human_labels.astype(int)

In [None]:
def run_experiment_cifar10_multiple(experiment_args=None, seed=0, calibration_methods=None):
  model_names = ['resnet-110', 'densenet-bc-L190-k40']
  mean_errs = []
  std_errs = []
  for model_name in tqdm(model_names, desc='Models', leave=True):
    # Load data
    human_counts, model_probs, y_true = load_CIFAR10H(model_name)
    y_h = simulate_three_humans(human_counts, seed=seed)

    mean_err, std_err = _run_experiment(y_h=y_h, model_probs=model_probs, y_true=y_true, **experiment_args)
    mean_errs.append(mean_err)
    std_errs.append(std_err)

  df_mean = pd.DataFrame(mean_errs)
  df_mean.columns = ['human', 'model'] + [f'comb {cal_m}' for cal_m in calibration_methods]
  df_mean.index = model_names

  df_std = pd.DataFrame(std_errs)
  df_std.columns = ['human', 'model'] + [f'comb {cal_m}' for cal_m in calibration_methods]
  df_std.index = model_names

  df_mean = df_mean.style.set_caption("Error rates")
  display(df_mean)
  df_std = df_std.style.set_caption('One standard deviation of error rates')
  display(df_std)

In [None]:
def task2():
  seed = 9658
  torch.manual_seed(seed)
  np.random.seed(seed)

  calibration_methods = ['confusion', 'none', 'temperature scaling', 'ensemble temperature scaling', 'imax binning']

  args = {'n_runs': 25,
          'test_size': 0.3,
          'calibration_methods': calibration_methods,
          'seed': seed
          }

  run_experiment_cifar10_multiple(experiment_args=args, seed=seed, calibration_methods=calibration_methods)

In [None]:
task2()

Models:   0%|          | 0/2 [00:00<?, ?it/s]

Runs:   0%|          | 0/25 [00:00<?, ?it/s]

Runs:   0%|          | 0/25 [00:00<?, ?it/s]

Unnamed: 0,human,model,comb confusion,comb none,comb temperature scaling,comb ensemble temperature scaling,comb imax binning
resnet-110,2.04,6.1,2.26,2.22,1.65,1.65,1.66
densenet-bc-L190-k40,2.02,3.25,1.99,1.67,1.51,1.53,1.47


Unnamed: 0,human,model,comb confusion,comb none,comb temperature scaling,comb ensemble temperature scaling,comb imax binning
resnet-110,0.2,0.37,0.17,0.22,0.19,0.18,0.18
densenet-bc-L190-k40,0.22,0.29,0.22,0.18,0.17,0.18,0.18


### Observations:
This approach has performed better than the previous approach where there was only one human decision maker. This suggests that having more humans to predict increases the accuracy of predictions. The error by three human system is less than half the error by one human system(2.02 vs 4.62). The deviation of errors has been the same for both though. The same trend is observed for different calibrators i.e better accuracy and less error is achieved by 3 human system. This further proves that team work always outperforms individual work!!!
### Use Cases:
It can be used in many AI systems especially when AI is used to assist and guide humans. For instance, an AI model can be used to help a hiring committee to select candidates for jobs. It can be used by doctors to assist them during surgeries. We can see above that when the right calibrator is used, a combination of a group of humans and a model can outperform everything else. The model will come to the rescue when humans may go wrong in some cases due to their own biases. 

## Is there a better mathematical method to incorporate multiple humans?
In the original CIFAR-10H dataset, there are about 50 predictions made by humans. We could take advantage of the larger dataset, by taking the most likely class chosen by 50 humans instead of just 3 humans. When we consider only 3 humans, there is more chance of selecting random predictions, which occurs when all 3 predictions are different, than when we consider 50 humans. We will most likely get a most likely class which reduces the randomness in selection. Thus, having 50 humans to make decisions is even better than having just 3 humans to predict. This becomes more true and prominent in real life cases where it is important to have diverse set of people in the selection committee. It is easier to achieve diversity with a team of 50 humans than 3 humans.


# Task 3 #

In [None]:
class TeamModel(nn.Module):
  def __init__(self, epochs=10):
    super(TeamModel, self).__init__()
    self.model = nn.Sequential(
        nn.Linear(11, 20),                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
        nn.ReLU(),
        nn.Linear(20, 10),
        nn.Sigmoid()
    )
    self.epochs = epochs

  def fit(self, model_probs, y_h, y_true):
    y_h = np.reshape(y_h, (y_h.shape[0], 1))
    y_true = np.reshape(y_true, (y_true.shape[0], 1))
    x = np.concatenate((model_probs, y_h), axis = 1)
    y = y_true

    traindata = np.concatenate((x, y), axis=1)
    dataloader = DataLoader(traindata, batch_size=64, shuffle=True) 

    optimizer = Adam(self.model.parameters(), lr=0.01)
    criterion = CrossEntropyLoss()
    for epoch in range(self.epochs):
      for data  in dataloader:
        x = data[:,:-1]
        x = x.type(torch.FloatTensor)
        y = data[:,-1]
        y = y.type(torch.LongTensor)

        optimizer.zero_grad()
        y_hat = self.model(x)
        loss = criterion(y_hat, y)
        loss.backward()
        optimizer.step()
      
  def combine(self, model_probs, y_h):
    y_h = np.reshape(y_h, (y_h.shape[0], 1))
    x = np.concatenate((model_probs, y_h), axis = 1)
    x = torch.from_numpy(x)
    x = x.type(torch.FloatTensor)
    y_pred = self.model(x)
    y_pred = y_pred.detach().numpy()
    y_pred = np.argmax(y_pred, axis=1)
    return y_pred

In [None]:
def _run_experiment_nn(y_h=None, model_probs=None, y_true=None, calibration_methods=None, **kwargs):
  seed = kwargs.pop('seed', 0)
  n_runs = kwargs.pop('n_runs', 25)
  test_size = kwargs.pop('test_size', 0.3)

  acc_data = []
  for i in tqdm(range(n_runs), leave=False, desc='Runs'):
    # Train/test split
    y_h_tr, y_h_te, model_probs_tr, model_probs_te, y_true_tr, y_true_te = train_test_split(
        y_h, model_probs, y_true, test_size=test_size, random_state=i * seed)

    acc_h = get_acc(y_h_te, y_true_te)
    acc_m = get_acc(np.argmax(model_probs_te, axis=1), y_true_te)

    _acc_data = [acc_h, acc_m]
    _cal_data = []
    for calibration_method in calibration_methods:
      if calibration_method == 'confusion':
        combiner = DoubleConfusionCombiner()
        combiner.fit(model_probs_tr, y_h_tr, y_true_tr)
      elif calibration_method == 'neural networks':
        combiner = TeamModel()
        combiner.fit(model_probs_tr, y_h_tr, y_true_tr)
      else:
        combiner = OracleCombiner(calibration_method=calibration_method)
        combiner.fit(model_probs_tr, y_h_tr, y_true_tr)

      y_comb_te = combiner.combine(model_probs_te, y_h_te)
      acc_comb = get_acc(y_comb_te, y_true_te)
      _acc_data.append(acc_comb)

    acc_data += [_acc_data]

  err_data = np.array(acc_data)
  err_data = np.subtract(1,err_data)
  mean_err = np.round((100 * np.mean(err_data, axis = 0)), 2)
  std_err = np.round((100 * np.std(err_data, axis = 0)), 2)

  return list(mean_err), list(std_err)

In [None]:
def run_experiment_cifar10_nn(experiment_args=None, seed=0, calibration_methods=None):
  model_names = ['resnet-110', 'densenet-bc-L190-k40']
  mean_errs = []
  std_errs = []
  for model_name in tqdm(model_names, desc='Models', leave=True):
    # Load data
    human_counts, model_probs, y_true = load_CIFAR10H(model_name)
    y_h = simulate_single_human(human_counts, seed=seed)

    mean_err, std_err = _run_experiment_nn(y_h=y_h, model_probs=model_probs, y_true=y_true, calibration_methods=calibration_methods, **experiment_args)
    mean_errs.append(mean_err)
    std_errs.append(std_err)

  df_mean = pd.DataFrame(mean_errs)
  df_mean.columns = ['human', 'model'] + [f'comb {cal_m}' for cal_m in calibration_methods]
  df_mean.index = model_names

  df_std = pd.DataFrame(std_errs)
  df_std.columns = ['human', 'model'] + [f'comb {cal_m}' for cal_m in calibration_methods]
  df_std.index = model_names

  df_mean = df_mean.style.set_caption("Error rates")
  display(df_mean)
  df_std = df_std.style.set_caption('One standard deviation of error rates')
  display(df_std)

In [None]:
def task3():
  seed = 9658
  torch.manual_seed(seed)
  np.random.seed(seed)

  calibration_methods = ['confusion', 'none', 'temperature scaling', 'ensemble temperature scaling', 'imax binning', 'neural networks']

  args = {'n_runs': 25,
          'test_size': 0.3,
          'seed': seed
          }

  run_experiment_cifar10_nn(experiment_args=args, seed=seed, calibration_methods=calibration_methods)

In [None]:
task3()

Models:   0%|          | 0/2 [00:00<?, ?it/s]

Runs:   0%|          | 0/25 [00:00<?, ?it/s]

Runs:   0%|          | 0/25 [00:00<?, ?it/s]

Unnamed: 0,human,model,comb confusion,comb none,comb temperature scaling,comb ensemble temperature scaling,comb imax binning,comb neural networks
resnet-110,4.62,6.1,4.71,3.05,2.85,2.82,2.85,5.37
densenet-bc-L190-k40,4.62,3.25,3.39,2.22,2.03,2.17,2.04,3.25


Unnamed: 0,human,model,comb confusion,comb none,comb temperature scaling,comb ensemble temperature scaling,comb imax binning,comb neural networks
resnet-110,0.32,0.37,0.36,0.23,0.22,0.23,0.22,0.37
densenet-bc-L190-k40,0.32,0.29,0.31,0.21,0.17,0.17,0.18,0.32


### Observations:
When compared with the other calibrators, neural networks performed worse than the rest. I tried with different architectures of neural network, different epochs and different learning rates. However, this was the best result I could achieve. In this case, feature engineering wins over neural networks. The accuracy achieved by the neural network calibrator is only slightly better and falls short in comparison to other calibrators. So, team model wasn't the best approach human and model predictions.