# Calculate calibration curve points using scikit-learn

### Calculate calibration curve datapoints for subsequent plotting in R

Shubhayu Bhattacharyay
<br>

## I. Initialization

### Import necessary packages

In [1]:
# Fundamental methods
import os
import re
import sys
import time
import glob
import random
import warnings
import itertools
import numpy as np
import pandas as pd
import pickle as cp
import seaborn as sns
from scipy import stats
from pathlib import Path
import matplotlib.pyplot as plt
warnings.filterwarnings(action="ignore")

# SciKit-Learn methods
from sklearn.calibration import calibration_curve

## II. Calculate probability calibration curve points

### Loop through result files and calculate probability calibration points

In [111]:
# Compile calibration-curve points for every optimally tuned model.
#
# For each row of the optimal tuning results, the matching prediction file is
# read from every imputation directory, and a one-vs-rest calibration curve
# (5 uniform-width bins) is computed for each observed true label. All points
# are written to a single CSV.

# Extract dataframe of optimal tuning results
opt_tuning_results = pd.read_csv('../results/detection_results/regression_results/optimal_tuning_results.csv')

# Imputation directories. glob returns paths in arbitrary, OS-dependent order,
# so sort them to make the 'imp' index below reproducible between runs.
# NOTE(review): 'imp' is the 1-based position in this sorted list, not a number
# parsed from the directory name -- confirm this matches downstream expectations.
imp_dirs = sorted(glob.glob('../results/detection_results/regression_results/imp*/'))

# Prediction-file prefix for each supported model type
model_file_prefixes = {
    'mnlr': 'mnlr_mdl_',
    'mnlr.smote': 'SMOTE_mnlr_mdl_',
    'polr': 'polr_mdl_',
    'polr.smote': 'SMOTE_polr_mdl_',
}

calib_columns = ['prob_true', 'prob_pred', 'obs_window', 'model', 'imp', 'GCSm']

# Collect per-curve dataframes and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
calib_pieces = []
for curr_model, curr_dim, curr_window in zip(opt_tuning_results['model'],
                                             opt_tuning_results['maxAUROCdim'],
                                             opt_tuning_results['obs.window']):
    # Fail loudly on an unrecognised model instead of silently reusing the
    # file name left over from the previous row.
    if curr_model not in model_file_prefixes:
        raise ValueError('Unrecognised model type in optimal tuning results: ' + repr(curr_model))
    model_file_name = model_file_prefixes[curr_model] + str(curr_dim) + '_preds.csv'

    # Whole-number windows are formatted without a decimal part (e.g. 3.0 -> '3')
    if curr_window % 1 == 0:
        curr_obs_window = str(int(curr_window))
    else:
        curr_obs_window = str(curr_window)

    for imp_idx, imp_dir in enumerate(imp_dirs, start=1):
        curr_mdl_results = pd.read_csv(os.path.join(imp_dir, 'detection_window_' + curr_obs_window, model_file_name))
        uniq_labels = np.sort(curr_mdl_results['true.labels'].unique())
        for curr_label in uniq_labels:
            # One-vs-rest indicator: 1 where the true label equals the current label
            is_curr_label = (curr_mdl_results['true.labels'] == curr_label).astype(int).to_numpy()

            # Predicted-probability column for the current label (first match is used)
            label_prefix = 'GCSm=' + str(curr_label)
            curr_label_prob_name = [col for col in curr_mdl_results if col.startswith(label_prefix)]
            if not curr_label_prob_name:
                raise ValueError('No column starting with ' + repr(label_prefix) + ' in ' + model_file_name + ' under ' + imp_dir)

            # Clip predictions into [0, 1] before binning
            curr_label_probs = np.clip(curr_mdl_results[curr_label_prob_name[0]].to_numpy(), 0, 1)
            prob_true, prob_pred = calibration_curve(is_curr_label, curr_label_probs, n_bins=5, strategy='uniform')

            calib_pieces.append(pd.DataFrame({'prob_true': prob_true,
                                              'prob_pred': prob_pred,
                                              'obs_window': curr_obs_window,
                                              'model': curr_model,
                                              'imp': imp_idx,
                                              'GCSm': curr_label}))

# With nothing to compile, still emit a CSV carrying the expected header
if calib_pieces:
    calib_df = pd.concat(calib_pieces, ignore_index=True)
else:
    calib_df = pd.DataFrame(columns=calib_columns)
calib_df.to_csv('../results/detection_results/regression_results/compiled_calibration_curves.csv', index=False)