# Part III: Predict Sarcopenia with Machine Learning

In [1]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [2]:
# Run some setup code for this notebook.
import glob
import matplotlib.pyplot as plt
import numpy as np
import os
import ipywidgets as widgets


import torch
import torch.utils.data
import torch.nn.functional as F
from torch import autograd
from torch import optim
from torch import nn
from torch.autograd import Variable
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

import argparse

from ipywidgets import VBox, HBox, Layout
from sklearn import svm
from sklearn import neighbors
from sklearn import preprocessing
from sklearn.utils import shuffle

from utils.checkbox import *
from utils.data_utils import *
from utils.data_processing import *
from utils.svm_modeling import *
from utils.model_eval import *
from utils.neural_net import *

from __future__ import print_function

# This is a bit of magic to make matplotlib figures appear inline in the
# notebook rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (15.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

%load_ext autoreload
%autoreload 2

In [3]:
# Load data.
# `path` is handed straight to load_features (a project helper brought in by
# the star-imports of the setup cell). NOTE(review): presumably a dataset
# directory relative to the notebook's working directory -- confirm.
# The resulting `feature_dict` is consumed by the feature-selection and
# feature-loading cells below.
path = 'dataset_new'
feature_dict = load_features(path, dont_show=True)
# Uncomment to inspect the loaded features:
#show_feature_details(feature_dict)

Feature dict loaded.



In [5]:
# Select some features.
# Set use_all = True to ignore the presets below and pre-check every feature.
use_all = False

# Alternative presets of feature indices. Only the preset assigned to
# `include_feature_index` below takes effect; the others are kept so the
# selection can be switched by editing a single line.
level1 = [7, 41, 25, 60, 16, 17, 23, 28, 30, 38, 40, 42, 43, 44, 46, 47, 48, 52, 56]
level2 = [7, 41, 25]
level3 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 45, 46, 47, 48, 49, 50, 52, 60]
level4 = [7, 16, 17, 23, 25, 28, 30, 31, 38, 40, 41, 42, 43, 44, 46, 47, 48, 52, 60, 61, 62, 63, 64, 65, 66]

include_feature_groups = []  # e.g. [1, 2, 3, 4, 5, 6, 7, 8, 9]
include_feature_index = level4
exclude_feature_index = []

if use_all:
    # Indices 1..len(feature_dict) inclusive, i.e. one per loaded feature.
    include_feature_index = np.arange(1, len(feature_dict) + 1, 1)

# Turn the group/index choices into the initial checked state of the widgets.
feature_pre_selected = pre_select_feature(include_feature_groups, include_feature_index, exclude_feature_index, dont_show=True)
precheck_boxes = generate_precheck_boxes(feature_pre_selected, feature_dict, dont_show=True)

# `hbox` is read back by the next cell (review_checkbox). The HBox must stay
# the last expression of the cell so the notebook displays the widgets.
hbox = gen_checkbox(precheck_boxes, feature_dict)
HBox(hbox)

In [6]:
# Load data.
# Read back whichever checkboxes are ticked and load the matching feature columns.
checked_features = review_checkbox(hbox, dont_show=False, log=True, to_file='nn_log')
X = load_using_features(feature_dict, checked_features, dont_show=True)

asm, asm_h2, sarcopenia, gender, height_squared, patient_id = load_asm(), load_asm_over_h2(), load_sarcopenia(), load_gender(), load_height_squared(), load_index()
# Keep rows 0..131 of every array.
# NOTE(review): 132 is hard-coded; presumably the number of usable patients -- confirm.
select_patient = range(0, 132, 1)
X = X[select_patient, :]
asm, asm_h2, sarcopenia, gender, height_squared, patient_id = asm[select_patient], asm_h2[select_patient], sarcopenia[select_patient], gender[select_patient], height_squared[select_patient], patient_id[select_patient]

# Random shuffle, drawn from a dedicated seeded generator so that the
# train/val/test partition (and therefore every metric reported below) is the
# same on each Restart & Run All. Change `split_seed` to draw a different split.
split_seed = 0
shuffle_index = np.random.RandomState(split_seed).permutation(X.shape[0])

# Data Rescaling.
scaler = preprocessing.StandardScaler()

# Validation and test sets each take 2.5/16 of the samples; training gets the rest.
num_test = int(len(asm) * 2.5 / 16)
num_val = num_test
num_train = len(asm) - num_test - num_val
print(len(asm), num_train, num_val, num_test)


# Apply the same permutation to every array so rows stay aligned across
# features, targets and metadata.
asm_train, asm_val, asm_test = shuffle_feature(asm, shuffle_index, num_train, num_val, num_test)
asm_h2_train, asm_h2_val, asm_h2_test = shuffle_feature(asm_h2, shuffle_index, num_train, num_val, num_test)
sarcopenia_train, sarcopenia_val, sarcopenia_test = shuffle_feature(sarcopenia, shuffle_index, num_train, num_val, num_test)
gender_train, gender_val, gender_test = shuffle_feature(gender, shuffle_index, num_train, num_val, num_test)
height_squared_train, height_squared_val, height_squared_test = shuffle_feature(height_squared, shuffle_index, num_train, num_val, num_test)
patient_id_train, patient_id_val, patient_id_test = shuffle_feature(patient_id, shuffle_index, num_train, num_val, num_test)
X_train, X_val, X_test = shuffle_feature(X, shuffle_index, num_train, num_val, num_test)
# Fit the scaler on the training split only, then reuse its statistics for
# validation and test so no information leaks from the held-out data.
X_train, X_val, X_test = scaler.fit_transform(X_train), scaler.transform(X_val), scaler.transform(X_test)

train_dataset = SarcopeniaDataset(X_train, asm_train, asm_h2_train, sarcopenia_train,
                                 height_squared_train, patient_id_train, gender_train,
                                 transform=transforms.Compose([ToTensor()]))
val_dataset = SarcopeniaDataset(X_val, asm_val, asm_h2_val, sarcopenia_val,
                                 height_squared_val, patient_id_val, gender_val,
                                 transform=transforms.Compose([ToTensor()]))
test_dataset = SarcopeniaDataset(X_test, asm_test, asm_h2_test, sarcopenia_test,
                                 height_squared_test, patient_id_test, gender_test,
                                 transform=transforms.Compose([ToTensor()]))


# Training uses mini-batches of 40 in a fixed order (shuffle=False); the
# validation and test loaders each deliver their whole split as one batch.
train_loader = DataLoader(train_dataset, batch_size=40,
                        shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=num_val,
                        shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=num_test,
                        shuffle=False)


Checked features:
  [7, 16, 17, 23, 25, 28, 30, 31, 38, 40, 41, 42, 43, 44, 46, 47, 48, 52, 60, 61, 62, 63, 64, 65, 66]
Loading (25) features, done.
132 92 20 20


## Method IV: Neural network on asm/h2

In [7]:
# Set some params for neural network training.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
lr = 5e-4
num_epochs = 1000
log_every = 100  # report the training loss once every `log_every` epochs

# Seed torch's RNG so the weight initialisation (and hence the trained model)
# is repeatable across kernel restarts.
torch.manual_seed(0)

# Drop any model left over from a previous run of this cell before building a
# fresh one with one input per selected feature.
net_asm_h2 = None
net_asm_h2 = Net(len(checked_features)).to(device).double()
criterion = nn.MSELoss().to(device)
optimizer = optim.SGD(net_asm_h2.parameters(), lr=lr, momentum=0.9)

# Show which device was selected (CUDA when available, otherwise CPU) and the
# network architecture.
print(device)
print(net_asm_h2)

# Train neural network to regress data['asm_h2'] from data['X'].
for epoch in range(num_epochs):  # loop over the dataset multiple times

    running_loss = 0.0  # sum of squared errors over the samples seen this epoch
    num_seen = 0        # number of samples seen this epoch
    for batch_idx, data in enumerate(train_loader, 0):
        inputs, labels = data['X'], data['asm_h2']
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = net_asm_h2(inputs).reshape(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # `data` is a dict, so len(data) is its number of keys, not the batch
        # size; count samples from the flattened predictions instead.
        # MSELoss averages over the batch, so weight by batch size to recover
        # the sum and get a correct epoch mean when the last batch is smaller.
        batch_size = outputs.size(0)
        running_loss += loss.item() * batch_size
        num_seen += batch_size
    if epoch % log_every == log_every - 1:    # print every `log_every` epochs
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.10f}'.format(
                epoch+1, num_seen, len(train_loader.dataset),
                100. * num_seen / max(len(train_loader.dataset), 1),
                running_loss / max(num_seen, 1)))
print('Finished Training')

cpu
Net(
  (fc1): Linear(in_features=25, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=1, bias=True)
)
Finished Training


In [8]:
# Observe result asm_h2.
# Run the trained asm/h2 regressor over each split in turn and report the
# classification metrics plus the per-patient prediction details.
dataset_list = [train_dataset, val_dataset, test_dataset]
set_names = ['Training', 'Validation', 'Test']
for i, dataset in enumerate(dataset_list):
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        inputs = dataset[:]['X'].to(device)
        outputs = net_asm_h2(inputs).reshape(-1)
    predicted_asm_h2 = outputs.cpu().numpy()

    # Convert the regressed values into sarcopenia labels (project helper).
    result_asm_h2 = eval_sarcopenia_asm_h2_nn(
        predicted_asm_h2,
        dataset[:]['gender'],
        dataset[:]['sarcopenia'],
    )
    eval_classifier(
        result_asm_h2,
        dataset[:]['sarcopenia'],
        show_detail=True,
        log=False,
        setname=set_names[i],
    )
    observe_prediction_asm_h2_nn(
        predicted_asm_h2,
        dataset[:]['asm_h2'],
        dataset[:]['gender'],
        dataset[:]['sarcopenia'],
        dataset[:]['patient_id'],
        dont_show=False,
        log=True,
        setname=set_names[i],
    )


Evaluating Training set:
Positive: 26, Negative: 66
TP: 26, FP: 0, TN: 66, FN: 0
Correct: 92(92), Precision: 1.000, Recall:, 1.000, F1-Score: 1.000

Observing Training Set:
All correct.

Evaluating Validation set:
Positive: 6, Negative: 14
TP: 1, FP: 3, TN: 11, FN: 5
Correct: 12(20), Precision: 0.250, Recall:, 0.167, F1-Score: 0.200

Observing Validation Set:
Truth: 7.10, Pred: 5.37, Error: -24.37%, Gender:  2, GT: -1, Pred:  1, Correct:  0, Patient_id: 103
Truth: 6.77, Pred: 7.47, Error:  10.44%, Gender:  1, GT:  1, Pred: -1, Correct:  0, Patient_id:  23
Truth: 6.09, Pred: 4.64, Error: -23.80%, Gender:  2, GT: -1, Pred:  1, Correct:  0, Patient_id: 126
Truth: 5.91, Pred: 9.41, Error:  59.26%, Gender:  2, GT: -1, Pred: -1, Correct:  1, Patient_id: 104
Truth: 5.44, Pred: 5.07, Error:  -6.76%, Gender:  2, GT: -1, Pred:  1, Correct:  0, Patient_id:  93
Truth: 5.36, Pred: 5.52, Error:   3.05%, Gender:  2, GT:  1, Pred: -1, Correct:  0, Patient_id: 106
Truth: 6.85, Pred: 5.62, Error: -17.9

## Method V: Neural network on asm

In [34]:
# Set some params for neural network training.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
lr = 1e-4
num_epochs = 400
log_every = 100  # report the training loss once every `log_every` epochs

# Seed torch's RNG so the weight initialisation (and hence the trained model)
# is repeatable across kernel restarts.
torch.manual_seed(0)

# Drop any model left over from a previous run of this cell before building a
# fresh one with one input per selected feature.
net = None
net = Net(len(checked_features)).to(device).double()
criterion = nn.MSELoss().to(device)
optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9)

# Train neural network to regress data['asm'] from data['X'].
for epoch in range(num_epochs):  # loop over the dataset multiple times
    running_loss = 0.0  # sum of squared errors over the samples seen this epoch
    num_seen = 0        # number of samples seen this epoch
    for batch_idx, data in enumerate(train_loader, 0):
        inputs, labels = data['X'], data['asm']
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = net(inputs).reshape(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # `data` is a dict, so len(data) is its number of keys, not the batch
        # size; count samples from the flattened predictions instead.
        # MSELoss averages over the batch, so weight by batch size to recover
        # the sum and get a correct epoch mean when the last batch is smaller.
        batch_size = outputs.size(0)
        running_loss += loss.item() * batch_size
        num_seen += batch_size
    if epoch % log_every == log_every - 1:    # print every `log_every` epochs
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.10f}'.format(
                epoch+1, num_seen, len(train_loader.dataset),
                100. * num_seen / max(len(train_loader.dataset), 1),
                running_loss / max(num_seen, 1)))

print('Finished Training')

Finished Training


In [37]:
# Observe result asm.
# Run the trained asm regressor over each split, print predictions next to the
# ground truth, then score the derived sarcopenia classification.
dataset_list = [train_dataset, val_dataset, test_dataset]
set_names = ['Train', 'Validation', 'Test']
for i, dataset in enumerate(dataset_list):
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        inputs = dataset[:]['X'].to(device)
        outputs = net(inputs).reshape(-1)
    predicted_asm = outputs.cpu().numpy()

    print(set_names[i] + ' set:')
    print("Predicted:")
    print(predicted_asm)
    print("Ground truth:")
    print(dataset[:]['asm'].cpu().numpy())

    # Convert the regressed values into sarcopenia labels (project helper);
    # this variant also receives the height-squared values.
    result_asm = eval_sarcopenia_asm_nn(
        predicted_asm,
        dataset[:]['gender'],
        dataset[:]['height_squared'].cpu().numpy(),
        dataset[:]['sarcopenia'],
    )
    eval_classifier(
        result_asm,
        dataset[:]['sarcopenia'],
        show_detail=True,
        log=False,
        setname=set_names[i],
    )

Train set:
Predicted:
[21.51604286 18.76183458 15.11692752 23.49700909 18.62776534 21.80235034
 19.59790015 19.83081822 16.23207669 16.36752272 11.86420155 21.85897932
 17.49568971 11.66279535 13.27013715 22.4389429  16.07962757 13.01848611
 20.00859028 24.12797361 23.3324172  17.46294856 12.76198634 16.14692013
 16.88586018 20.03140751 19.66865723 22.92517148 14.03313084 14.95010851
 15.51702528 17.15974731 12.01655337 23.0866153  14.08385433 26.16185111
 16.20405545 15.88693947 14.45186425 24.40819023 21.85756808 18.27150885
 14.2375261  14.62191746 16.89733194 13.21182835 16.63676616 24.87231345
 18.88581743 24.95500087 22.464218   10.20455594 17.14273478 16.17055621
 17.1443673  20.74439733 18.65065771 18.76246945 17.7818623  19.84684773
 18.38222695 15.73360951 16.24394595 21.5387256  14.31557607 16.56460929
 17.26984221 20.26633385 13.16346538 14.55529087 17.65561206 23.65031072
 15.98034013 12.81167569 24.38316317 15.97383244 16.99009533 13.41891345
 24.33116476 26.63814459]
Gro