In this tutorial, we will analyze Lipoma dataset from the [WORC Database](https://github.com/MStarmans91/WORCDatabase/tree/development).

More details on the dataset as well as the original analysis performed by their authors can be found here:

`Starmans, M. P. A. et al. (2021). The WORC* database: MRI and CT scans, segmentations, and clinical labels for 932 patients from six radiomics studies. Submitted, preprint available from https://doi.org/10.1101/2021.08.19.21262238`

`The experiments are described in the following paper: Starmans, M. P. A. et al. (2021). Reproducible radiomics through automated machine learning validated on twelve clinical applications. Submitted, preprint available from https://arxiv.org/abs/2108.08618.`

In [None]:
# In case you haven't installed AutoRadiomics
!pip install autorad

In [1]:
from autorad.external.download_WORC import download_WORCDatabase
from pathlib import Path
import pandas as pd

# Set where we will save our data and results
base_dir = Path.cwd() / "autorad_tutorial"
data_dir = base_dir / "data"
result_dir = base_dir / "results"

%load_ext autoreload
%autoreload 2


download_WORCDatabase(
    dataset="Liver",
    data_folder=data_dir,
    n_subjects=25,
    )

KeyboardInterrupt: 

In [2]:
!ls $data_dir

[1m[36mLiver-006[m[m  [1m[36mLiver-030[m[m  [1m[36mLiver-038[m[m  [1m[36mLiver-122[m[m  [1m[36mLiver-147[m[m  [1m[36mLiver-166[m[m  [1m[36mLiver-185[m[m
[1m[36mLiver-008[m[m  [1m[36mLiver-031[m[m  [1m[36mLiver-067[m[m  [1m[36mLiver-124[m[m  [1m[36mLiver-154[m[m  [1m[36mLiver-168[m[m  labels.csv
[1m[36mLiver-011[m[m  [1m[36mLiver-035[m[m  [1m[36mLiver-079[m[m  [1m[36mLiver-128[m[m  [1m[36mLiver-156[m[m  [1m[36mLiver-180[m[m
[1m[36mLiver-023[m[m  [1m[36mLiver-037[m[m  [1m[36mLiver-107[m[m  [1m[36mLiver-146[m[m  [1m[36mLiver-159[m[m  [1m[36mLiver-184[m[m


In [3]:
from autorad.data.utils import get_paths_with_separate_folder_per_case

paths_df = get_paths_with_separate_folder_per_case(data_dir, relative=True)
paths_df

Unnamed: 0,ID,image_path,segmentation_path
0,Liver-035,Liver-035/image.nii.gz,Liver-035/segmentation.nii.gz
1,Liver-067,Liver-067/image.nii.gz,Liver-067/segmentation.nii.gz
2,Liver-146,Liver-146/image.nii.gz,Liver-146/segmentation.nii.gz
3,Liver-184,Liver-184/image.nii.gz,Liver-184/segmentation.nii.gz
4,Liver-124,Liver-124/image.nii.gz,Liver-124/segmentation.nii.gz
5,Liver-185,Liver-185/image.nii.gz,Liver-185/segmentation.nii.gz
6,Liver-147,Liver-147/image.nii.gz,Liver-147/segmentation.nii.gz
7,Liver-122,Liver-122/image.nii.gz,Liver-122/segmentation.nii.gz
8,Liver-107,Liver-107/image.nii.gz,Liver-107/segmentation.nii.gz
9,Liver-154,Liver-154/image.nii.gz,Liver-154/segmentation.nii.gz


In [4]:
from autorad.data.dataset import ImageDataset
from autorad.feature_extraction.extractor import FeatureExtractor

image_dataset = ImageDataset(
    paths_df,
    ID_colname="ID",
    root_dir=data_dir,
)
extractor = FeatureExtractor(image_dataset)
feature_df = extractor.run()

25it [00:38,  1.53s/it]


In [5]:
feature_df.head()

Unnamed: 0,ID,image_path,segmentation_path,diagnostics_Versions_PyRadiomics,diagnostics_Versions_Numpy,diagnostics_Versions_SimpleITK,diagnostics_Versions_PyWavelet,diagnostics_Versions_Python,diagnostics_Configuration_Settings,diagnostics_Configuration_EnabledImageTypes,...,wavelet-LLL_gldm_LargeDependenceLowGrayLevelEmphasis,wavelet-LLL_gldm_LowGrayLevelEmphasis,wavelet-LLL_gldm_SmallDependenceEmphasis,wavelet-LLL_gldm_SmallDependenceHighGrayLevelEmphasis,wavelet-LLL_gldm_SmallDependenceLowGrayLevelEmphasis,wavelet-LLL_ngtdm_Busyness,wavelet-LLL_ngtdm_Coarseness,wavelet-LLL_ngtdm_Complexity,wavelet-LLL_ngtdm_Contrast,wavelet-LLL_ngtdm_Strength
0,Liver-035,/Users/p.woznicki/git/AutoRadiomics/examples/a...,/Users/p.woznicki/git/AutoRadiomics/examples/a...,v3.0.1,1.22.1,2.1.1.2,1.2.0,3.10.4,"{'minimumROIDimensions': 1, 'minimumROISize': ...","{'Original': {}, 'LoG': {'sigma': [3.0, 5.0]},...",...,0.490335565711314,0.0092392563097354,0.1049786317887887,17.108519817338614,0.0019064162301314,2.195837937351879,0.0010111456662735,426.9975349731685,0.0619020314532844,0.284974468309886
1,Liver-067,/Users/p.woznicki/git/AutoRadiomics/examples/a...,/Users/p.woznicki/git/AutoRadiomics/examples/a...,v3.0.1,1.22.1,2.1.1.2,1.2.0,3.10.4,"{'minimumROIDimensions': 1, 'minimumROISize': ...","{'Original': {}, 'LoG': {'sigma': [3.0, 5.0]},...",...,0.1726950112519562,0.0084378080208024,0.2387784244404366,83.97021323093149,0.0023728396471364,0.7591978668706331,0.001202498835943,4314.0961725974375,0.0530871088832545,3.480993082112052
2,Liver-146,/Users/p.woznicki/git/AutoRadiomics/examples/a...,/Users/p.woznicki/git/AutoRadiomics/examples/a...,v3.0.1,1.22.1,2.1.1.2,1.2.0,3.10.4,"{'minimumROIDimensions': 1, 'minimumROISize': ...","{'Original': {}, 'LoG': {'sigma': [3.0, 5.0]},...",...,0.5235105253912582,0.0144098607235655,0.1017286419590698,9.821812246039771,0.0024608166414348,1.9618138907602145,0.0021490398335429,178.82845259100122,0.0331384950258136,0.3598787334834973
3,Liver-184,/Users/p.woznicki/git/AutoRadiomics/examples/a...,/Users/p.woznicki/git/AutoRadiomics/examples/a...,v3.0.1,1.22.1,2.1.1.2,1.2.0,3.10.4,"{'minimumROIDimensions': 1, 'minimumROISize': ...","{'Original': {}, 'LoG': {'sigma': [3.0, 5.0]},...",...,0.3036898583747069,0.006330218697061,0.107550038020076,33.0708290664619,0.0014813588608183,0.8257035290466124,0.0016166914766237,829.755720150696,0.0456811806461254,1.1242349494954111
4,Liver-124,/Users/p.woznicki/git/AutoRadiomics/examples/a...,/Users/p.woznicki/git/AutoRadiomics/examples/a...,v3.0.1,1.22.1,2.1.1.2,1.2.0,3.10.4,"{'minimumROIDimensions': 1, 'minimumROISize': ...","{'Original': {}, 'LoG': {'sigma': [3.0, 5.0]},...",...,0.0183362123564789,0.0012331934199789,0.2252979003070128,298.00432747313664,0.0006073601929044,0.105600506809663,0.0032715611247683,2602.6922196709124,0.0431363001913646,5.548545298631775


In [6]:
label_df = pd.read_csv(data_dir / "labels.csv")
label_df.head()

Unnamed: 0,patient_ID,diagnosis
0,Liver-185,1
1,Liver-166,0
2,Liver-180,0
3,Liver-067,0
4,Liver-035,0


In [7]:
from autorad.data.dataset import FeatureDataset

merged_feature_df = feature_df.merge(label_df, left_on="ID",
    right_on="patient_ID", how="left")
feature_dataset = FeatureDataset(
    merged_feature_df,
    target="diagnosis",
    ID_colname="ID"
)

Split the data into training/validation/test sets:

In [8]:
splits_path = result_dir / "splits.json"
feature_dataset.split(method="train_val_test", save_path=splits_path)

{'split_type': '60% train + 20% validation + 20% test',
 'train': ['Liver-122',
  'Liver-154',
  'Liver-166',
  'Liver-147',
  'Liver-011',
  'Liver-159',
  'Liver-031',
  'Liver-030',
  'Liver-006',
  'Liver-128',
  'Liver-037',
  'Liver-180',
  'Liver-124',
  'Liver-035',
  'Liver-008'],
 'val': ['Liver-067', 'Liver-107', 'Liver-079', 'Liver-038', 'Liver-184'],
 'test': ['Liver-023', 'Liver-146', 'Liver-168', 'Liver-156', 'Liver-185']}

In [11]:
from autorad.training.trainer import Trainer
from autorad.models.classifier import MLClassifier

models = MLClassifier.initialize_default_sklearn_models()
print(models)

[Random Forest, Logistic Regression, SVM, XGBoost]


In [12]:
trainer = Trainer(
    dataset=feature_dataset,
    models=models,
    result_dir=result_dir,
    experiment_name="Liver_detection",
)
trainer.run_auto_preprocessing(oversampling=False)

In [12]:
trainer.set_optimizer("optuna", n_trials=10)
trainer.run(auto_preprocess=True)

  mlfc = MLflowCallback(
[32m[I 2022-06-03 11:59:14,282][0m A new study created in memory with name: Liver_detection[0m
[32m[I 2022-06-03 11:59:14,486][0m Trial 0 finished with value: 0.5 and parameters: {'feature_selection_method': 'anova', 'oversampling_method': None, 'model': 'XGBoost', 'xgb_lambda': 0.014570271595538578, 'xgb_alpha': 0.00021300368327705253, 'colsample_bytree': 0.5136940145553204, 'xgb_subsample': 0.47454241292069554, 'xgb_booster': 'gbtree', 'xgb_max_depth': 5, 'xgb_min_child_weight': 8, 'xgb_eta': 2.883592210117951e-07, 'xgb_gamma': 2.53287670148807e-07, 'xgb_grow_policy': 'lossguide'}. Best is trial 0 with value: 0.5.[0m
2022/06/03 11:59:14 INFO mlflow.tracking.fluent: Experiment with name 'Liver_detection' does not exist. Creating a new experiment.
[32m[I 2022-06-03 11:59:14,872][0m Trial 1 finished with value: 0.5 and parameters: {'feature_selection_method': 'lasso', 'oversampling_method': None, 'model': 'Logistic Regression', 'lr_penalty': 'l1', 'lr_C'

In [16]:
from autorad.utils import io
from autorad.visualization import plotly_utils
from autorad.training.trainer import Inferrer

best_params = io.load_json(result_dir / "best_params.json")
inferrer = Inferrer(params=best_params, result_dir=result_dir)
inferrer.fit_eval(feature_dataset, result_name="test")

results = pd.read_csv(result_dir / "test.csv")
plotly_utils.plot_roc_curve(results.y_true, results.y_pred)


Function plot_roc_curve is deprecated; Function :func:`plot_roc_curve` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: :meth:`sklearn.metric.RocCurveDisplay.from_predictions` or :meth:`sklearn.metric.RocCurveDisplay.from_estimator`.



TypeError: plot_roc_curve() missing 1 required positional argument: 'y'

In [17]:
results

Unnamed: 0,y_true,y_pred
0,1,0.487579
1,1,0.31535
2,0,0.423619
3,0,0.306637
4,0,0.465888
