# Part 8: Leave One Out with Random Forest

In [1]:
%%javascript
// Replace OutputArea's _should_scroll with a function that returns false for
// every output, regardless of the `lines` argument.
// NOTE(review): presumably this stops the classic notebook front end from
// collapsing long outputs into a scroll box -- confirm for the Jupyter version in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [2]:
# Run some setup code for this notebook.
import glob
import matplotlib.pyplot as plt
import numpy as np
import os
import ipywidgets as widgets

from ipywidgets import VBox, HBox, Layout


from sklearn import preprocessing
from sklearn.utils import shuffle

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from utils.checkbox import *
from utils.data_utils import *
#from utils.data_processing import *
from utils.svm_modeling import *
from utils.model_eval import *
from __future__ import print_function

# This is a bit of magic to make matplotlib figures appear inline in the
# notebook rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (15.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

%load_ext autoreload
%autoreload 2

In [3]:
# Load data.
# `path` is handed to load_features (a star-imported project helper).
# NOTE(review): presumably a dataset directory relative to the notebook's
# working directory -- confirm. `feature_dict` is reused by the cells below.
path = 'dataset_new'
feature_dict = load_features(path, dont_show=True)
# Uncomment to print the details of every loaded feature.
#show_feature_details(feature_dict)

Feature dict loaded.



In [14]:
# Select some features.
# Set use_all to True to pre-check every feature index instead of a preset.
use_all = False

# Named presets of feature indices; exactly one of them is chosen below.
level_1 = [7, 41, 25, 60, 16, 17, 23, 28, 30, 31, 38, 40, 42, 43, 44, 46, 47, 48, 52, 56, 61, 62, 63, 64, 65, 66]
level_2 = [7, 41, 25]
cui_statistic = [7, 11, 15, 22, 23, 25, 29, 35, 41, 44, 55, 62]
cui_1 = [7, 22, 23, 29, 41, 55]
cui_2 = [7, 12, 15, 22, 23, 25, 35, 41, 44, 55, 62]
cui_3 = [7, 22, 23, 41, 52, 55]
gao_1 = [7, 22, 23, 25, 29, 41, 55]
gao_asm_1 = [7, 15, 16, 23, 25, 28, 38, 39, 40, 41, 42, 49]  #108
gao_asm_2 = [7, 13, 15, 16, 18, 23, 25, 27, 28, 29, 38, 39, 40, 41, 42, 49] #108

group_1 = [7, 23, 41]
group_2 = [7, 23, 41, 29, 55, 22]
group_3 = [7, 23, 41, 29, 55, 22, 25, 15, 35]

include_feature_groups = []
exclude_feature_index = []
# Use the group_3 preset unless use_all requests indices 1 .. len(feature_dict) - 1.
include_feature_index = np.arange(1, len(feature_dict)) if use_all else group_3

feature_pre_selected = pre_select_feature(
    include_feature_groups,
    include_feature_index,
    exclude_feature_index,
    dont_show=True,
)
precheck_boxes = generate_precheck_boxes(feature_pre_selected, feature_dict, dont_show=True)

# Build the checkbox widgets and show them as this cell's output.
hbox = gen_checkbox(precheck_boxes, feature_dict)
HBox(hbox)

In [15]:
# Turn on / off log.
# use_log is forwarded as `log=` to the review/evaluation helpers; set it to
# False to switch that off.
use_log = True

# Load data: read the checkbox state, then load the matching feature matrix.
checked_features = review_checkbox(hbox, dont_show=False, log=use_log)
X = load_using_features(feature_dict, checked_features, dont_show=True)

# One integer index per row of X.
mask_array = np.arange(X.shape[0])


Checked features:
  [7, 15, 22, 23, 25, 29, 35, 41, 55]
Loading (9) features, done.


## Leave one out test for Random Forest

## Method I: Random Forest Classifier on Sarcopenia

In [16]:
# Method I: leave-one-out evaluation of a Random Forest classifier that
# predicts the sarcopenia label directly from the selected features.
X = load_using_features(feature_dict, checked_features, dont_show=True)
asm, asm_h2, sarcopenia, gender, height_squared, patient_id = load_asm(), load_asm_over_h2(), load_sarcopenia(), load_gender(), load_height_squared(), load_index()

# Allocate the prediction buffer after X is (re)loaded so it always matches
# the data used in the loop below.
result_val_RFC = np.zeros(X.shape[0])

for i in range(X.shape[0]):
    # Train on every sample except i; sample i is the single validation row.
    mask = np.delete(mask_array, i)
    X_train, X_val = X[mask], X[i].reshape(1, -1)
    sarcopenia_train = sarcopenia[mask]
    patient_id_train = patient_id[mask]  # only needed by the optional diagnostics below

    # A fresh model per fold; the fixed random_state keeps runs reproducible.
    clf_RFC = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
    clf_RFC.fit(X_train, sarcopenia_train)

    # predict() returns a length-1 array; index it explicitly because
    # assigning a 1-element array to a scalar slot is deprecated in NumPy.
    result_val_RFC[i] = clf_RFC.predict(X_val)[0]

# Optional diagnostics on the training split of the last fold.
#observe_prediction_SVC(clf_RFC, X_train, sarcopenia_train, patient_id_train, dont_show=False, log=use_log, setname='Training')
#result_train_RFC = clf_RFC.predict(X_train)
#eval_classifier(result_train_RFC, sarcopenia_train, show_detail=True, log=use_log, setname='Training')

# Compare the held-out predictions of all folds against the true labels.
eval_classifier(result_val_RFC, sarcopenia, show_detail=True, log=use_log, setname='Validation')

ppvs, npvs, sensitivitys, specificitys = eval_classifier_k_fold(result_val_RFC, sarcopenia)

print("------------------------ Overall Statistics --------------------------")
print("ppv: %.3f, \nnpv: %.3f, \nsensitivity: %.3f, \nspecificity: %.3f\n" %
      (np.mean(ppvs), np.mean(npvs), np.mean(sensitivitys), np.mean(specificitys)))

#_ = load_using_features(feature_dict, checked_features, dont_show=False)
#print("\nFeature importance: \n", clf_RFC.feature_importances_)

Loading (9) features, done.

Evaluating Validation set:
Positive: 37, Negative: 95
TP: 18, FP: 8, TN: 87, FN: 19
Correct: 105(132), Precision: 0.692, Recall: 0.486, Specificity: 0.916, F1-Score: 0.571

------------------------ Overall Statistics --------------------------
ppv: 0.692, 
npv: 0.821, 
sensitivity: 0.486, 
specificity: 0.916



## Method II: Random Forest Regressor on ASM/h2

In [17]:
# Method II: leave-one-out evaluation of a Random Forest regressor that
# predicts ASM/h2; the predictions are then turned into sarcopenia labels by
# eval_sarcopenia_asm_h2_leave_one_out using gender.
# NOTE(review): the labelling rule lives in the utils helper -- confirm there.
X = load_using_features(feature_dict, checked_features, dont_show=True)
asm, asm_h2, sarcopenia, gender, height_squared, patient_id = load_asm(), load_asm_over_h2(), load_sarcopenia(), load_gender(), load_height_squared(), load_index()

# Allocate the prediction buffer after X is (re)loaded so it always matches
# the data used in the loop below.
result_val_asm_h2_RFR = np.zeros(X.shape[0])

for i in range(X.shape[0]):
    # Train on every sample except i; sample i is the single validation row.
    mask = np.delete(mask_array, i)
    X_train, X_val = X[mask], X[i].reshape(1, -1)
    asm_h2_train = asm_h2[mask]
    # The splits below are only needed by the optional diagnostics further down.
    sarcopenia_train = sarcopenia[mask]
    gender_train = gender[mask]
    patient_id_train = patient_id[mask]

    # max_features=1.0 (use all features) is what 'auto' meant for regressors;
    # the 'auto' spelling is rejected by current scikit-learn releases.
    clf_RFR_asmh2 = RandomForestRegressor(n_estimators=100, max_features=1.0, max_depth=5, min_samples_split=2, random_state=0)
    clf_RFR_asmh2.fit(X_train, asm_h2_train)

    # predict() returns a length-1 array; index it explicitly because
    # assigning a 1-element array to a scalar slot is deprecated in NumPy.
    result_val_asm_h2_RFR[i] = clf_RFR_asmh2.predict(X_val)[0]

# Optional diagnostics on the training split of the last fold.
#observe_prediction_asm_h2_SVR(clf_RFR_asmh2, X_train, asm_h2_train, gender_train, sarcopenia_train, patient_id_train, dont_show=False, log=use_log, setname='Training')
#result_train = eval_sarcopenia_asm_h2(clf_RFR_asmh2, X_train, gender_train, sarcopenia_train)
#eval_classifier(result_train, sarcopenia_train, show_detail=True, log=use_log, setname='Training')

# Convert the predicted ASM/h2 values to labels, then score them.
result_val = eval_sarcopenia_asm_h2_leave_one_out(result_val_asm_h2_RFR, gender, sarcopenia)
eval_classifier(result_val, sarcopenia, show_detail=True, log=use_log, setname='Validation')

ppvs, npvs, sensitivitys, specificitys = eval_classifier_k_fold(result_val, sarcopenia)

print("------------------------ Overall Statistics --------------------------")
print("ppv: %.3f, \nnpv: %.3f, \nsensitivity: %.3f, \nspecificity: %.3f\n" %
      (np.mean(ppvs), np.mean(npvs), np.mean(sensitivitys), np.mean(specificitys)))

#_ = load_using_features(feature_dict, checked_features, dont_show=False)
#print("\nFeature importance: \n", clf_RFR_asmh2.feature_importances_)

Loading (9) features, done.

Evaluating Validation set:
Positive: 37, Negative: 95
TP: 16, FP: 8, TN: 87, FN: 21
Correct: 103(132), Precision: 0.667, Recall: 0.432, Specificity: 0.916, F1-Score: 0.525

------------------------ Overall Statistics --------------------------
ppv: 0.667, 
npv: 0.806, 
sensitivity: 0.432, 
specificity: 0.916



## Method III: Random Forest Regressor on ASM

In [9]:
# Method III: leave-one-out evaluation of a Random Forest regressor that
# predicts ASM; the predictions are then turned into sarcopenia labels by
# eval_sarcopenia_asm_leave_one_out using gender and height squared.
# NOTE(review): the labelling rule lives in the utils helper -- confirm there.
X = load_using_features(feature_dict, checked_features, dont_show=True)
asm, asm_h2, sarcopenia, gender, height_squared, patient_id = load_asm(), load_asm_over_h2(), load_sarcopenia(), load_gender(), load_height_squared(), load_index()

# Allocate the prediction buffer after X is (re)loaded so it always matches
# the data used in the loop below.
result_val_asm_RFR = np.zeros(X.shape[0])

for i in range(X.shape[0]):
    # Train on every sample except i; sample i is the single validation row.
    mask = np.delete(mask_array, i)
    X_train, X_val = X[mask], X[i].reshape(1, -1)
    asm_train = asm[mask]
    # The splits below are only needed by the optional diagnostics further down.
    sarcopenia_train = sarcopenia[mask]
    gender_train = gender[mask]
    height_squared_train = height_squared[mask]
    patient_id_train = patient_id[mask]

    # Fixed random_state (as in Methods I and II) so the reported metrics are
    # reproducible; random_state=None gave different forests on every run.
    clf_RFR_asm = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=0)
    clf_RFR_asm.fit(X_train, asm_train)

    # predict() returns a length-1 array; index it explicitly because
    # assigning a 1-element array to a scalar slot is deprecated in NumPy.
    result_val_asm_RFR[i] = clf_RFR_asm.predict(X_val)[0]

# Optional diagnostics on the training split of the last fold.
#observe_prediction_asm_SVR(clf_RFR_asm, X_train, asm_train, gender_train, height_squared_train, sarcopenia_train, patient_id_train, dont_show=False, log=use_log, setname='Training')
#result_train = eval_sarcopenia_asm(clf_RFR_asm, X_train, gender_train, height_squared_train, sarcopenia_train)
#eval_classifier(result_train, sarcopenia_train, show_detail=False, log=use_log, setname='Training')

# Convert the predicted ASM values to labels, then score them.
result_val = eval_sarcopenia_asm_leave_one_out(result_val_asm_RFR, gender, height_squared, sarcopenia)
eval_classifier(result_val, sarcopenia, show_detail=True, log=use_log, setname='Validation')

ppvs, npvs, sensitivitys, specificitys = eval_classifier_k_fold(result_val, sarcopenia)

print("------------------------ Overall Statistics --------------------------")
print("ppv: %.3f, \nnpv: %.3f, \nsensitivity: %.3f, \nspecificity: %.3f\n" %
      (np.mean(ppvs), np.mean(npvs), np.mean(sensitivitys), np.mean(specificitys)))

#_ = load_using_features(feature_dict, checked_features, dont_show=False)
#print("\nFeature importance: \n", clf_RFR_asm.feature_importances_)


Loading (3) features, done.

Evaluating Validation set:
Positive: 37, Negative: 95
TP: 16, FP: 13, TN: 82, FN: 21
Correct: 98(132), Precision: 0.552, Recall: 0.432, Specificity: 0.863, F1-Score: 0.485

------------------------ Overall Statistics --------------------------
ppv: 0.552, 
npv: 0.796, 
sensitivity: 0.432, 
specificity: 0.863



## Measurement Index
$$Precision = \frac{True\ Positive}{True\ Positive + False\ Positive}$$
$$Recall = \frac{True\ Positive}{True\ Positive + False\ Negative}$$
$$F1\_Score = \frac{2 \times Precision \times Recall}{Precision + Recall}$$

$$PPV = \frac{True\ Positive}{True\ Positive + False\ Positive}$$
$$NPV = \frac{True\ Negative}{True\ Negative + False\ Negative}$$
$$Sensitivity = \frac{True\ Positive}{True\ Positive + False\ Negative}$$
$$Specificity = \frac{True\ Negative}{True\ Negative + False\ Positive}$$


|       |          | Actual   | Class  |
| :---  | ---      | ---      |    --- |
|       |          | Positive |Negative|
|Predict|Positive  | TP       | FP     |
| Class |Negative  | FN       | TN     |
