In [None]:
"""
Cell For Papermill Parameters
"""

# --- Input files -----------------------------------------------------------
input_data = "to_ngboost.csv"          # tabular dataset read with pd.read_csv
in_file_MRI = "uncertainty_MRI.csv"    # '.pickle' is appended to this name before loading
in_file_US = "uncertainty_US.csv"      # '.pickle' is appended to this name before loading

# --- Dataset settings ------------------------------------------------------
Label_Column = "label_clas"            # name of the target column in input_data
NUM_CLASSES = 2                        # passed as n_classes to the Monte Carlo helper
test_size = 0.2                        # fraction of rows intended for the held-out test split

# --- Model settings --------------------------------------------------------
estimators = 1000                      # n_estimators of the RandomForestClassifier

In [None]:
import os
import random
import sys
import gc
import copy

import numpy as np
import pandas as pd
import pickle

import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [None]:
# Resolve the on-disk names of the pickled uncertainty lists (parameter + '.pickle').
mri_pickle_path = in_file_MRI + '.pickle'
us_pickle_path = in_file_US + '.pickle'

# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles that were produced by a trusted upstream step.
with open(mri_pickle_path, 'rb') as f:
    mri_combined_uncertainties = pickle.load(f)

with open(us_pickle_path, 'rb') as f:
    us_combined_uncertainties = pickle.load(f)

### 5. Load RandomForest Model + Make Predictions / Uncertainty Estimates

This section follows a format similar to the DNN model sections, with slight modifications.

#### 5.1 Load RF Model Parameters

#### 5.2 Load RF Data Parameters

In [None]:
# Load the tabular dataset named by the papermill parameter `input_data`.
df = pd.read_csv(input_data)

# Remove the columns that are not used as Random Forest features.
df.drop(["name","smoking_status","occupation","Model-MRI-DNN","Model-US-DNN"],axis=1, inplace=True)

# Column means used to impute missing numeric values.
age_mean = df["age"].mean()
size_mean = df["size"].mean()
weight_mean = df["weight"].mean()
psa_mean = df["PSA"].mean()

# Impute the four numeric columns, then drop every row that still has a missing value.
# The index is intentionally NOT reset: a later cell looks rows up by their original labels.
df = df.fillna({"age":age_mean, "size":size_mean, "weight":weight_mean, "PSA":psa_mean})
df.dropna(inplace=True)

# Collapse the two "no answer" ethnicity values into a single "Unknown" category.
df.loc[df["ethnic_grp"].isin(["Patient Refused", "Unknown [3]"]), "ethnic_grp"] = "Unknown"

# `Label_Column` comes from the papermill parameters cell; it is deliberately not
# re-assigned here so that an injected parameter value is honoured.
df[Label_Column] = df[Label_Column].astype(int)

# Features = everything except the label, with categorical columns one-hot encoded.
features_df = df.drop(Label_Column,axis=1,inplace=False)
features_one_hot_df = pd.get_dummies(features_df)

# Labels are kept as a single-column dataframe.
labels_df = df[[Label_Column]]

In [None]:
# Preview the first five rows of the one-hot encoded feature table.
features_one_hot_df.head()

In [None]:
# Features are kept as a list of row lists (not an ndarray) because a later cell
# matches dataframe rows against the rows of X_test.
features_np = features_one_hot_df.values.tolist()
labels_np = labels_df.values.ravel()

# Hold out `test_size` (papermill parameter) of the rows; the fixed random_state
# keeps the split identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    features_np, labels_np, random_state=0, test_size=test_size
)

#### 5.3 Load RF Model

In [None]:
# Build the forest with `estimators` trees on all available cores and fit it on the
# training split in a single expression (fit returns the fitted estimator itself).
clf = RandomForestClassifier(
    n_estimators=estimators,
    n_jobs=-1,
    verbose=0,
).fit(X_train, Y_train)

#### 5.4 Make Uncertainty Estimates on the Data

In [None]:
## existing section to make predictions on input data ##
# Hard class predictions and per-class probabilities for the held-out rows.
prediction = clf.predict(X_test)
probability = clf.predict_proba(X_test)

# Tabulate the probabilities with one named column per class.
probability_df = pd.DataFrame(probability, columns=['Prob_0','Prob_1'])
probability_df

In [None]:
## new section to prep uncertainty estimates ##
def get_input_rf_monte_carlo_predictions(data, forward_passes,
                                      model, n_classes, n_samples,
                                     uncertainty_list, variable_col_index):
    """ Function to get the monte-carlo samples and uncertainty estimates
    through multiple forward passes in a RF model, coming from a variable
    and its uncertainty estimates

    On every forward pass, zero-mean Gaussian noise is added to one feature
    column of every row (the standard deviation of the noise is that row's
    uncertainty estimate) and the model's class probabilities are recorded.
    The spread of those probabilities across passes is returned.

    Parameters
    ----------
    data : list
        data as list that is fed into the RF model. Each entry should represent
        a single data point / individual. Only the first
        ``len(uncertainty_list)`` entries are used; ``data`` is not modified.
    forward_passes : int
        number of monte-carlo samples/forward passes
    model : SKLearn RF Model
        Random Forest classifier model (anything exposing ``predict_proba``)
    n_classes : int
        number of classes in the dataset
    n_samples : int
        number of samples in the test set; must equal ``len(uncertainty_list)``
    uncertainty_list : list
        list of uncertainty estimates associated with the input variable,
        one per row of ``data`` and in the same order
    variable_col_index : int
        int representing the column position of the variable with uncertainty

    Returns
    -------
    numpy.ndarray
        array of shape (n_samples, n_classes) holding the standard deviation
        of the predicted class probabilities across the forward passes

    Raises
    ------
    ValueError
        if ``n_samples`` does not match the number of uncertainty estimates
    IndexError
        if ``data`` has fewer rows than ``uncertainty_list``
    """
    n_rows = len(uncertainty_list)

    if n_samples != n_rows:
        raise ValueError(
            "n_samples ({}) must equal len(uncertainty_list) ({})".format(n_samples, n_rows)
        )
    if len(data) < n_rows:
        raise IndexError(
            "data has {} rows but {} uncertainty estimates were given".format(len(data), n_rows)
        )
    if n_rows == 0:
        # nothing to perturb: keep the (n_samples, n_classes) output contract
        return np.empty((0, n_classes))

    # One numeric matrix for all rows; copied on every pass so neither the caller's
    # data nor earlier passes are affected by the injected noise.
    base_rows = np.array(data[:n_rows], dtype=np.float64)

    # Per-row standard deviations for the noise distribution.
    std_per_row = torch.as_tensor(np.asarray(uncertainty_list, dtype=np.float64))
    zero_mean = torch.zeros_like(std_per_row)

    # Pre-allocated result buffer - shape (forward_passes, n_samples, n_classes)
    noise_predictions = np.empty((forward_passes, n_rows, n_classes))

    # create forward_passes number of monte carlo predictions
    for pass_index in range(forward_passes):
        # one independent noise draw per row, std = that row's uncertainty estimate
        noise = torch.normal(mean=zero_mean, std=std_per_row).numpy()

        noisy_rows = base_rows.copy()
        noisy_rows[:, variable_col_index] += noise

        # a single batched call per pass instead of one call per row
        noise_predictions[pass_index] = model.predict_proba(noisy_rows)

    # Calculating variance across multiple forward passes
    variance = np.var(noise_predictions, axis=0) # shape (n_samples, n_classes)

    return (variance**0.5)

def get_rf_model_uncertainty(clf, X):
    '''Estimate model uncertainty from the disagreement between trees.

    Every estimator in ``clf.estimators_`` predicts class probabilities for
    all rows of ``X``; the standard deviation of those probabilities across
    the estimators is returned, one value per (row, class).'''

    # collect one probability matrix per tree
    # -> overall shape [num_trees, num_examples, num_classes]
    tree_probabilities = []
    for estimator in clf.estimators_:
        tree_probabilities.append(estimator.predict_proba(X))

    # variance across trees, then its square root to get a standard deviation
    spread_squared = np.var(tree_probabilities, axis=0)

    return spread_squared**0.5

In [None]:
# Sanity check: size of the test split and of the two loaded uncertainty lists,
# printed one per line.
print(
    len(X_test),
    len(mri_combined_uncertainties),
    len(us_combined_uncertainties),
    sep="\n",
)

In [None]:
# Re-read the dataset, this time keeping the "name" column so that patient
# numbers can be recovered for each row.
df_extra = pd.read_csv(input_data)

df_extra.drop(["smoking_status","occupation","Model-MRI-DNN","Model-US-DNN"],axis=1, inplace=True)

# Column means used to impute missing numeric values.
age_mean = df_extra["age"].mean()
size_mean = df_extra["size"].mean()
weight_mean = df_extra["weight"].mean()
psa_mean = df_extra["PSA"].mean()

# Impute the four numeric columns, then drop rows that still contain missing values.
# The index is not reset, so row labels still refer to the rows of the CSV.
df_extra= df_extra.fillna({"age":age_mean, "size":size_mean, "weight":weight_mean, "PSA":psa_mean})
df_extra.dropna(inplace=True)

# Collapse the two "no answer" ethnicity values into a single "Unknown" category.
df_extra.loc[df_extra.ethnic_grp =="Patient Refused","ethnic_grp"] = "Unknown"
df_extra.loc[df_extra.ethnic_grp =="Unknown [3]","ethnic_grp"] = "Unknown"

# `Label_Column` comes from the papermill parameters cell; it is deliberately not
# re-assigned here so that an injected parameter value is honoured.
df_extra[Label_Column] = df_extra[Label_Column].astype(int)

features_df_extra = df_extra.drop(Label_Column,axis=1,inplace=False)
# Only "ethnic_grp" is one-hot encoded here, so "name" stays a plain string column.
features_one_hot_df_extra = pd.get_dummies(features_df_extra, columns = ["ethnic_grp"])

# Series mapping each row label to its patient number, taken from the last
# four characters of "name" and converted to int
df_patient_nums = features_one_hot_df_extra["name"].str[-4:].astype(int)

df_patient_nums

In [None]:
## new section to find and match uncertainties in the same order as X_test ##

model_scores = 'dnn_model_scores-MRI.csv'
model_scores2 = 'dnn_model_scores-US.csv'

# Index every row of features_one_hot_df by its feature values so that each X_test
# row can be traced back to its position in the dataframe. A list of positions is
# kept per key so that duplicated feature rows are consumed one at a time.
positions_by_features = {}
for position, features in enumerate(features_one_hot_df.values.tolist()):
    positions_by_features.setdefault(tuple(features), []).append(position)

# X_test as a pandas df, with rows in the SAME ORDER as X_test. train_test_split
# shuffles the rows, so filtering the dataframe in its own order would pair every
# uncertainty with the wrong test row.
X_test_positions = []
for features in X_test:
    candidates = positions_by_features.get(tuple(features))
    if not candidates:
        raise ValueError("An X_test row could not be found in features_one_hot_df")
    X_test_positions.append(candidates.pop(0))
X_test_patients = features_one_hot_df.iloc[X_test_positions]

# import dataframe that has MRI results + patient number
mri_df = pd.read_csv(model_scores)
mri_df = mri_df[['patient','predicted','prob']]
mri_df = mri_df.add_suffix('_MRI')
mri_df["patient_num"] = mri_df["patient_MRI"].str[-4:].astype(int)
# import dataframe that has US results + patient number
us_df = pd.read_csv(model_scores2)
us_df = us_df[['patient','predicted','prob']]
us_df = us_df.add_suffix('_US')
us_df["patient_num"] = us_df["patient_US"].str[-4:].astype(int)

# For each patient number, the row positions (in file order) of its MRI / US scores.
mri_rows_by_patient = {}
for mri_index, mri_patient_number in enumerate(mri_df["patient_num"]):
    mri_rows_by_patient.setdefault(int(mri_patient_number), []).append(mri_index)
us_rows_by_patient = {}
for us_index, us_patient_number in enumerate(us_df["patient_num"]):
    us_rows_by_patient.setdefault(int(us_patient_number), []).append(us_index)

# put indices of uncertainty lists in correct order, corresponding to the appropriate
# entry in X_test. Exactly one (MRI, US) pair is taken per test row: the first pair
# not already used by an earlier row of the same patient.
mri_indices_ordered = []
us_indices_ordered = []
for X_test_index in X_test_patients.index.values:
    # .loc[[label]].iloc[0] takes the first entry even if the label is duplicated
    patient_number = int(df_patient_nums.loc[[X_test_index]].iloc[0])
    mri_candidates = mri_rows_by_patient.get(patient_number)
    us_candidates = us_rows_by_patient.get(patient_number)
    if not mri_candidates or not us_candidates:
        # a silent skip would shift every later uncertainty onto the wrong row
        raise ValueError(
            "No unused MRI/US score row for patient number {}".format(patient_number)
        )
    mri_indices_ordered.append(mri_candidates.pop(0))
    us_indices_ordered.append(us_candidates.pop(0))

# get subset of uncertainties corresponding to X_test
# in correct order
mri_uncertainty_subset = [mri_combined_uncertainties[i] for i in mri_indices_ordered]
us_uncertainty_subset = [us_combined_uncertainties[i] for i in us_indices_ordered]

In [None]:
## new section to make input uncertainty estimates ##

# Column positions of the two DNN probability features that carry uncertainty.
mri_col_index = features_one_hot_df.columns.get_loc("prob_MRI")
us_col_index = features_one_hot_df.columns.get_loc("prob_US")

# 100 Monte Carlo passes with noise injected into the MRI probability column.
rf_mri_uncertainties = get_input_rf_monte_carlo_predictions(
    X_test,
    100,
    clf,
    NUM_CLASSES,
    len(mri_uncertainty_subset),
    mri_uncertainty_subset,
    mri_col_index,
)
print("MRI done.")

# Same procedure for the US probability column.
rf_us_uncertainties = get_input_rf_monte_carlo_predictions(
    X_test,
    100,
    clf,
    NUM_CLASSES,
    len(us_uncertainty_subset),
    us_uncertainty_subset,
    us_col_index,
)

In [None]:
# Display the Monte Carlo input-uncertainty estimates obtained for the US probability feature.
rf_us_uncertainties

In [None]:
## new section to make model uncertainty estimates ##

# Per-row, per-class spread of the individual trees' predictions on the test split.
rf_model_uncertainties = get_rf_model_uncertainty(clf=clf, X=X_test)

rf_model_uncertainties

In [None]:
## new section to process uncertainty estimates ##

# Add the three uncertainty sources in quadrature: sum of squares, then square root.
rf_sum_of_squares = (
    rf_model_uncertainties**2
    + rf_mri_uncertainties**2
    + rf_us_uncertainties**2
)
rf_combined_uncertainties = rf_sum_of_squares**0.5

In [None]:
# Display the combined (model + MRI input + US input) uncertainty for every row and class.
rf_combined_uncertainties

In [None]:
# With two classes the uncertainty of one class equals that of the other,
# so only the first class column is kept.
rf_combined_uncertainties_clean = list(rf_combined_uncertainties[:, 0])

# one value per individual / data point
rf_combined_uncertainties_clean

In [None]:
# Wrap the per-row uncertainties in a single-column dataframe and write it to disk.
# Note that this re-binds the name `df` used earlier for the input dataset.
df = pd.DataFrame(data=rf_combined_uncertainties_clean)
df.to_csv(path_or_buf="uncertainty_RF.csv")