# Part IV Random Forest Method

In [7]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [8]:
# Run some setup code for this notebook.
import glob
import matplotlib.pyplot as plt
import numpy as np
import os
import ipywidgets as widgets

from ipywidgets import VBox, HBox, Layout


from sklearn import preprocessing
from sklearn.utils import shuffle

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from utils.checkbox import *
from utils.data_utils import *
#from utils.data_processing import *
from utils.svm_modeling import *
from utils.model_eval import *
from __future__ import print_function

# This is a bit of magic to make matplotlib figures appear inline in the
# notebook rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (15.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Load data.
path = 'dataset_new'
feature_dict = load_features(path, dont_show=True)
#show_feature_details(feature_dict)

Feature dict loaded.



In [10]:
# Select some features.
use_all = False
#use_all = True

level_1 = [7, 41, 25, 60, 16, 17, 23, 28, 30, 31, 38, 40, 42, 43, 44, 46, 47, 48, 52, 56, 61, 62, 63, 64, 65, 66]
level_2 = [7, 41, 25]
cui_statistic = [7, 11, 15, 22, 23, 25, 29, 35, 41, 44, 55, 62]
cui_1 = [7, 22, 23, 29, 41, 55]
cui_2 = [7, 12, 15, 22, 23, 25, 35, 41, 44, 55, 62]
cui_3 = [7, 22, 23, 41, 52, 55]
include_feature_groups = []
include_feature_index = cui_3
exclude_feature_index = []

if use_all:
    include_feature_index = np.arange(1, len(feature_dict), 1)

feature_pre_selected = pre_select_feature(include_feature_groups, include_feature_index, exclude_feature_index, dont_show=True)
precheck_boxes = generate_precheck_boxes(feature_pre_selected, feature_dict, dont_show=True)

hbox = gen_checkbox(precheck_boxes, feature_dict)
HBox(hbox)

In [14]:
# Turn on / off log.
#use_log = False
use_log = True

# Load data.
checked_features = review_checkbox(hbox, dont_show=False, log=use_log)
X = load_using_features(feature_dict, checked_features, dont_show=True)
asm, asm_h2, sarcopenia, gender, height_squared, patient_id = load_asm(), load_asm_over_h2(), load_sarcopenia(), load_gender(), load_height_squared(), load_index()

# Random shuffle. Comment this line if you want to keep the shuffling order.
shuffle_index = np.random.permutation(X.shape[0])

# Data Rescaling.
scaler = set_scaler()
#X_normalized = scaler.fit_transform(X)

# Split dataset.
num_train = 112
num_val = 20
num_test = 0
asm_train, asm_val, asm_test = shuffle_feature(asm, shuffle_index, num_train, num_val, num_test)
asm_h2_train, asm_h2_val, asm_h2_test = shuffle_feature(asm_h2, shuffle_index, num_train, num_val, num_test)
sarcopenia_train, sarcopenia_val, sarcopenia_test = shuffle_feature(sarcopenia, shuffle_index, num_train, num_val, num_test)
gender_train, gender_val, gender_test = shuffle_feature(gender, shuffle_index, num_train, num_val, num_test)
height_squared_train, height_squared_val, height_squared_test = shuffle_feature(height_squared, shuffle_index, num_train, num_val, num_test)
patient_id_train, patient_id_val, patient_id_test = shuffle_feature(patient_id, shuffle_index, num_train, num_val, num_test)
X_train, X_val, X_test = shuffle_feature(X, shuffle_index, num_train, num_val, num_test)
#X_train, X_val, X_test = scaler.fit_transform(X_train), scaler.transform(X_val), scaler.transform(X_test)


Checked features:
  [7, 22, 23, 41, 52, 55]
Loading (6) features, done.


## Method I: Random Forest Classifier on Sarcopenia

In [15]:
# Train SVC.
clf_RFC = RandomForestClassifier(n_estimators=80, max_depth=None, random_state=0)
clf_RFC.fit(X_train, sarcopenia_train)
print(clf_RFC.feature_importances_)

# Observe Model.
observe_prediction_SVC(clf_RFC, X_train, sarcopenia_train, patient_id_train, dont_show=False, log=use_log, setname='Training')
result_train_RFC = clf_RFC.predict(X_train)
eval_classifier(result_train_RFC, sarcopenia_train, show_detail=True, log=use_log, setname='Training')

observe_prediction_SVC(clf_RFC, X_val, sarcopenia_val, patient_id_val, dont_show=False, log=use_log, setname='Validation')
result_val_RFC = clf_RFC.predict(X_val)
eval_classifier(result_val_RFC, sarcopenia_val, show_detail=True, log=use_log, setname='Validation')



[0.21540563 0.27216855 0.08111419 0.3519862  0.04044598 0.03887945]

Observing Training Set:
All correct.

Evaluating Training set:
Positive: 30, Negative: 82
TP: 30, FP: 0, TN: 82, FN: 0
Correct: 112(112), Precision: 1.000, Recall: 1.000, Specificity: 1.000, F1-Score: 1.000


Observing Validation Set:
Truth:  1, Predicted: -1, Patient id: 101
Truth: -1, Predicted:  1, Patient id:  59
Truth:  1, Predicted: -1, Patient id:  65
Truth:  1, Predicted: -1, Patient id:  42
Truth: -1, Predicted:  1, Patient id:  24

Evaluating Validation set:
Positive: 7, Negative: 13
TP: 4, FP: 2, TN: 11, FN: 3
Correct: 15(20), Precision: 0.667, Recall: 0.571, Specificity: 0.846, F1-Score: 0.615



## Method II: Random Forest Regressor on asm/h2 (appendicular skeletal muscle mass / squared height)

In [16]:
# Train RFR on asm/h2.
clf_RFR_asmh2 = RandomForestRegressor(n_estimators=80, max_depth=None, random_state=0)
clf_RFR_asmh2.fit(X_train, asm_h2_train)
print(clf_RFR_asmh2.feature_importances_)

# Training Set:
observe_prediction_asm_h2_SVR(clf_RFR_asmh2, X_train, asm_h2_train, gender_train, sarcopenia_train, patient_id_train, dont_show=False, log=use_log, setname='Training')
result_train_asm_h2_RFR = eval_sarcopenia_asm_h2(clf_RFR_asmh2, X_train, gender_train, sarcopenia_train)
eval_classifier(result_train_asm_h2_RFR, sarcopenia_train, show_detail=True, log=use_log, setname='Training')

# Test Set:
observe_prediction_asm_h2_SVR(clf_RFR_asmh2, X_val, asm_h2_val, gender_val, sarcopenia_val, patient_id_val, dont_show=False, log=use_log, setname='Validation')
result_val_asm_h2_RFR = eval_sarcopenia_asm_h2(clf_RFR_asmh2, X_val, gender_val, sarcopenia_val)
eval_classifier(result_val_asm_h2_RFR, sarcopenia_val, show_detail=True, log=use_log, setname='Validation')


[0.13245893 0.13871995 0.30860145 0.38228352 0.01822559 0.01971056]

Training Set:
Truth: 5.36, Predicted: 5.87, Error:   9.51%, Gender:  2, GT:  1, Pred: -1, Correct:  0, Patient_id:  84
Truth: 4.25, Predicted: 4.68, Error:  10.24%, Gender:  2, GT:  1, Pred:  1, Correct:  1, Patient_id: 102
Truth: 5.58, Predicted: 6.60, Error:  18.29%, Gender:  1, GT:  1, Pred:  1, Correct:  1, Patient_id:  20
Truth: 5.24, Predicted: 5.69, Error:   8.57%, Gender:  2, GT:  1, Pred: -1, Correct:  0, Patient_id: 107
Truth: 6.97, Predicted: 7.15, Error:   2.54%, Gender:  1, GT:  1, Pred: -1, Correct:  0, Patient_id:  47
Truth: 5.36, Predicted: 5.57, Error:   3.98%, Gender:  2, GT:  1, Pred: -1, Correct:  0, Patient_id: 106
Truth: 4.84, Predicted: 5.34, Error:  10.19%, Gender:  2, GT:  1, Pred:  1, Correct:  1, Patient_id:  94
Truth: 4.85, Predicted: 5.68, Error:  17.26%, Gender:  1, GT:  1, Pred:  1, Correct:  1, Patient_id:  58
Truth: 4.87, Predicted: 5.47, Error:  12.36%, Gender:  2, GT:  1, Pred: -1, C

## Method III: Random Forest Regressor on asm (appendicular skeletal muscle mass)

In [17]:
# Train RFR on asm.
clf_RFR = RandomForestRegressor(n_estimators=200, max_depth=None, random_state=0)
clf_RFR.fit(X_train, asm_train)
print(clf_RFR.feature_importances_)

# Training Set.
observe_prediction_asm_SVR(clf_RFR, X_train, asm_train, gender_train, height_squared_train, sarcopenia_train, patient_id_train, dont_show=False, log=use_log, setname='Training')
result_train_asm_RFR = eval_sarcopenia_asm(clf_RFR, X_train, gender_train, height_squared_train, sarcopenia_train)
eval_classifier(result_train_asm_RFR, sarcopenia_train, show_detail=True, log=use_log, setname='Training')

# Test Set.
observe_prediction_asm_SVR(clf_RFR, X_val, asm_val, gender_val, height_squared_val, sarcopenia_val, patient_id_val, dont_show=False, log=use_log, setname='Validation')
result_val_asm_RFR = eval_sarcopenia_asm(clf_RFR, X_val, gender_val, height_squared_val, sarcopenia_val)
eval_classifier(result_val_asm_RFR, sarcopenia_val, show_detail=True, log=use_log, setname='Validation')


[0.097945   0.08904295 0.57218915 0.21525303 0.01096135 0.01460852]

Training Set:
Truth: 16.66, Pred: 18.16, ASM/h2: 7.01, Error:   9.05%, Gender:  1, GT:  1, Pred: -1, Correct:  0, Patient_id:  50
Truth: 14.58, Pred: 15.19, ASM/h2: 5.58, Error:   4.17%, Gender:  2, GT:  1, Pred: -1, Correct:  0, Patient_id:  84
Truth: 10.20, Pred: 11.48, ASM/h2: 4.78, Error:  12.53%, Gender:  2, GT:  1, Pred:  1, Correct:  1, Patient_id: 102
Truth: 17.09, Pred: 19.49, ASM/h2: 6.36, Error:  14.02%, Gender:  1, GT:  1, Pred:  1, Correct:  1, Patient_id:  20
Truth: 12.75, Pred: 13.64, ASM/h2: 5.60, Error:   6.96%, Gender:  2, GT:  1, Pred: -1, Correct:  0, Patient_id: 107
Truth: 19.67, Pred: 20.43, ASM/h2: 7.24, Error:   3.87%, Gender:  1, GT:  1, Pred: -1, Correct:  0, Patient_id:  47
Truth: 19.02, Pred: 19.80, ASM/h2: 7.10, Error:   4.08%, Gender:  1, GT:  1, Pred: -1, Correct:  0, Patient_id:  27
Truth: 17.76, Pred: 19.52, ASM/h2: 7.44, Error:   9.88%, Gender:  1, GT:  1, Pred: -1, Correct:  0, Patie

## Measurement Index
$$Precision = \frac{True\ Positive}{True\ Positve + False\ Positive}$$
$$Recall = \frac{True\ Positive}{True\ Positive + False\ Negative}$$
$$Specificity = \frac{True\ Negative}{True\ Negative + False\ Positive}$$
$$F1\_Score = \frac{2 \times Precision \times Recall}{Precision + Recall}$$


|       |          | Actual   | Class  |
| :---  | ---      | ---      |    --- |
|       |          | Positive |Negative|
|Predict|Positive  | TP       | FP     |
| Class |Negative  | FN       | TN     |
