# Examples of feature set usage:

In [1]:
import os,sys
# Add the parent directory (as an absolute path) to the module search path.
# NOTE(review): presumably this is where features_set.py lives — confirm against the repo layout.
sys.path.append(os.path.abspath(".."))
from features_set import features_set
import pandas as pd

<b>Structure of a module</b>
  
![](../res/images/module.png)

## Binary classes

In [2]:
# Settings for the binary-outcome example; unpacked into features_set as keyword arguments.
parameters = dict(
    feature_path="../data/features/extracted_features_full.xlsx",  # csv/xls file with features
    outcome_path="../data/features/extended_clinical_df.xlsx",  # csv/xls file with outcome
    patient_column='Patient',  # name of the patient-id column
    patient_in_outcome_column='PatientID',  # name of the patient-id column in the clinical data file
    outcome_column='1yearsurvival',  # name of the outcome column
)

In [None]:
# NOTE(review): builds an empty DataFrame whose result is never assigned;
# this looks like a leftover scratch cell — consider deleting it.
pd.DataFrame([])

In [3]:
# Initialize the feature set from the settings collected in `parameters`.
# The summary printed below reports the number of observations, the class labels and the class balance.
fs = features_set(**parameters)

Number of observations: 149
Class labels: ['0' '1']
Classes balance: [0.4228187919463087, 0.5771812080536913]


In [4]:
# Exclude patients with unknown outcome (in case any are present).
# The summary printed afterwards still shows 149 observations, so nothing was dropped for this outcome.
fs.handle_nan(axis=0)

Number of observations: 149
Class labels: ['0' '1']
Classes balance: [0.4228187919463087, 0.5771812080536913]


In [5]:
# Preview the first 5 rows of the feature/outcome table; the last column shown is the outcome.
# NOTE(review): `_feature_outcome_dataframe` is a private attribute, read here for inspection only.
fs._feature_outcome_dataframe.head(5)

Unnamed: 0_level_0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MeshVolume,original_shape_MinorAxisLength,...,wavelet-LLL_gldm_HighGrayLevelEmphasis,wavelet-LLL_gldm_LargeDependenceEmphasis,wavelet-LLL_gldm_LargeDependenceHighGrayLevelEmphasis,wavelet-LLL_gldm_LargeDependenceLowGrayLevelEmphasis,wavelet-LLL_gldm_LowGrayLevelEmphasis,wavelet-LLL_gldm_SmallDependenceEmphasis,wavelet-LLL_gldm_SmallDependenceHighGrayLevelEmphasis,wavelet-LLL_gldm_SmallDependenceLowGrayLevelEmphasis,ROI,1yearsurvival
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LUNG1-001_20180209_CT_2_GTV-1_mask,0.732658,0.548834,46.151744,84.090588,95.336247,83.186537,95.425364,96.10411,155379.5,61.609664,...,14462.536758,33.609142,548084.071741,0.002759,0.000139,0.267761,2959.571494,5.8e-05,GTV-1_mask,1
LUNG1-002_000000_GTV-1_mask,0.878035,0.755488,70.110114,92.801132,116.931604,101.833197,104.316825,125.674182,358446.791667,81.482668,...,13208.125493,55.600107,832176.248523,0.004497,0.000162,0.188931,1733.836805,5.3e-05,GTV-1_mask,0
LUNG1-002_20180526_CT_1_GTV-1_mask,0.878035,0.755488,70.110114,92.801132,116.931604,101.833197,104.316825,125.674182,358446.791667,81.482668,...,13208.125493,55.600107,832176.248523,0.004497,0.000162,0.188931,1733.836805,5.3e-05,GTV-1_mask,0
LUNG1-003_000000_GTV-1_mask,0.544631,0.356597,25.559022,71.674815,56.639209,83.528438,62.26556,84.011904,34987.0,39.036358,...,9142.646956,17.909008,209143.444093,0.002673,0.000367,0.40293,2838.784544,0.000191,GTV-1_mask,0
LUNG1-003_20180209_CT_1_GTV-1_mask,0.544631,0.356597,25.559022,71.674815,56.639209,83.528438,62.26556,84.011904,34987.0,39.036358,...,9142.646956,17.909008,209143.444093,0.002673,0.000367,0.40293,2838.784544,0.000191,GTV-1_mask,0


In [6]:
# Visualize the distribution of feature values in the classes (in .html report).
# Only the first 100 entries of `fs._feature_column` are passed in.
fs.plot_distribution(fs._feature_column[:100])

<b>Example of plotted distributions of feature values in classes:</b>
![](../res/images/distr.png)

In [7]:
# Visualize the mutual (Spearman) correlation coefficient matrix of the features (in .html report).
# Restricted to entries 15..29 of `fs._feature_column`.
fs.plot_correlation_matrix(fs._feature_column[15:30])

<b>Example of feature correlation matrix:</b>
![](../res/images/corr.png)

In [8]:
# Visualize Bonferroni-corrected Mann-Whitney p-values for the binary-class test (in .html report).
# Only the first 100 entries of `fs._feature_column` are passed in.
fs.plot_MW_p(fs._feature_column[:100])

<b>Example of Mann-Whitney p-values:</b>
![](../res/images/p_MW.png)

In [9]:
# Visualize univariate ROC curves (in .html report).
# Only the first 100 entries of `fs._feature_column` are passed in.
fs.plot_univariate_roc(fs._feature_column[:100])

<b>Example of univariate feature ROC:</b>
![](../res/images/roc.png)

In [10]:
# Calculate basic statistics for each feature (in .xlsx):
# number of NaN, mean, std, min, max; if applicable: MW p-value, univariate ROC AUC, volume correlation.
# `volume_feature` names the feature passed in as the volume reference.
fs.calculate_basic_stats(volume_feature='original_shape_VoxelVolume')

In [15]:
# Check the Excel table of per-feature statistics.
# The table shown below has the columns NaN, Mean, Std, Min, Max, p_MW_corrected, univar_auc and volume_corr.
print('Basic statistics for each feature')
pd.read_excel('../data/features/extracted_features_full_basic_stats.xlsx')

Basic statistics for each feature


Unnamed: 0.1,Unnamed: 0,NaN,Mean,Std,Min,Max,p_MW_corrected,univar_auc,volume_corr
0,original_shape_Elongation,0,0.720328,0.161721,0.062127,0.974104,1.000000,0.517996,0.037515
1,original_shape_Flatness,0,0.559677,0.154895,0.047315,0.856767,1.000000,0.516611,0.099032
2,original_shape_LeastAxisLength,0,32.055804,15.983620,6.643777,85.495660,0.117975,0.686877,0.973238
3,original_shape_MajorAxisLength,0,61.595666,35.336125,13.611433,240.822486,0.018917,0.705795,0.842114
4,original_shape_Maximum2DDiameterColumn,0,63.064956,33.057474,15.620499,157.632484,0.161563,0.681525,0.950909
...,...,...,...,...,...,...,...,...,...
1213,wavelet-LLL_gldm_LargeDependenceLowGrayLevelEm...,0,1.646810,8.820031,0.001036,79.946280,1.000000,0.524640,0.070593
1214,wavelet-LLL_gldm_LowGrayLevelEmphasis,0,0.007969,0.039767,0.000069,0.369746,0.471623,0.515319,-0.805811
1215,wavelet-LLL_gldm_SmallDependenceEmphasis,0,0.313546,0.177561,0.009452,0.755977,1.000000,0.654393,-0.634677
1216,wavelet-LLL_gldm_SmallDependenceHighGrayLevelE...,0,2431.019809,1077.764252,0.093515,5302.913608,1.000000,0.619970,-0.274001


In [16]:
# Volume analysis using 'original_shape_VoxelVolume' as the volume feature.
# The example figures that follow show a volume precision-recall curve and volume Spearman correlation coefficients.
fs.volume_analysis(volume_feature='original_shape_VoxelVolume')

<b>Example of volume precision-recall curve:</b>
![](../res/images/vol_prc.png)

<b>Example of volume Spearman correlation coefficients:</b>
![](../res/images/vol_corr.png)

## Multi-class

In [17]:
# Settings for the multi-class example: same files and id columns, but the outcome is now 'Overall.Stage'.
parameters = dict(
    feature_path="../data/features/extracted_features_full.xlsx",  # csv/xls file with features
    outcome_path="../data/features/extended_clinical_df.xlsx",  # csv/xls file with outcome
    patient_column='Patient',  # name of the patient-id column
    patient_in_outcome_column='PatientID',  # name of the patient-id column in the clinical data file
    outcome_column='Overall.Stage',  # name of the outcome column
)

In [18]:
# Initialize the feature set again, this time with the multi-class outcome from `parameters`.
# The printed class labels below include 'nan', i.e. at least one observation has no stage recorded.
fs = features_set(**parameters)

Number of observations: 149
Class labels: ['I' 'II' 'IIIa' 'IIIb' 'nan']
Classes balance: [0.24161073825503357, 0.09395973154362416, 0.2348993288590604, 0.4228187919463087, 0.006711409395973154]


In [19]:
# Check the patient(s) with unknown outcome: keep only the rows whose 'Overall.Stage' value is null.
# NOTE(review): `_feature_outcome_dataframe` is a private attribute, read here for inspection only.
fs._feature_outcome_dataframe[fs._feature_outcome_dataframe['Overall.Stage'].isnull()]

Unnamed: 0_level_0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MeshVolume,original_shape_MinorAxisLength,...,wavelet-LLL_gldm_HighGrayLevelEmphasis,wavelet-LLL_gldm_LargeDependenceEmphasis,wavelet-LLL_gldm_LargeDependenceHighGrayLevelEmphasis,wavelet-LLL_gldm_LargeDependenceLowGrayLevelEmphasis,wavelet-LLL_gldm_LowGrayLevelEmphasis,wavelet-LLL_gldm_SmallDependenceEmphasis,wavelet-LLL_gldm_SmallDependenceHighGrayLevelEmphasis,wavelet-LLL_gldm_SmallDependenceLowGrayLevelEmphasis,ROI,Overall.Stage
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LUNG1-272_000000_GTV-1_mask,0.713259,0.487347,14.2573,29.254915,25.495098,31.400637,35.608988,37.868192,5713.416667,20.86634,...,11955.686684,27.442472,349332.116971,0.002377,0.00028,0.233726,2319.671592,0.000215,GTV-1_mask,


In [20]:
# Exclude patients with unknown outcome.
# The summary printed afterwards shows 148 observations and no 'nan' class label.
fs.handle_nan(axis=0)

Number of observations: 148
Class labels: ['I' 'II' 'IIIa' 'IIIb']
Classes balance: [0.24324324324324326, 0.0945945945945946, 0.23648648648648649, 0.42567567567567566]


In [21]:
# Visualize the distribution of feature values across all classes (in .html report).
# Only the first 100 entries of `fs._feature_column` are passed in.
fs.plot_distribution(fs._feature_column[:100])

<b>Example of plotted distributions of feature values in classes:</b>
![](../res/images/distr_multiclass.png)

In [None]:
# Visualize the distribution of feature values in 2 selected classes, 'I' and 'IIIb' (in .html report).
# Only the first 100 entries of `fs._feature_column` are passed in.
fs.plot_distribution(fs._feature_column[:100], ['I', 'IIIb'])

In [None]:
# Visualize the mutual (Spearman) correlation coefficient matrix of the features (in .html report).
# Only the first 100 entries of `fs._feature_column` are passed in.
fs.plot_correlation_matrix(fs._feature_column[:100])

In [None]:
# Visualize Bonferroni-corrected Mann-Whitney p-values (in .html report).
# The outcome is multi-class here, so the two classes to compare, 'I' and 'IIIb', are passed explicitly.
fs.plot_MW_p(fs._feature_column[:100], ['I', 'IIIb'])

In [None]:
# Visualize univariate ROC curves (in .html report).
# The outcome is multi-class here, so the two classes to compare, 'I' and 'IIIb', are passed explicitly.
fs.plot_univariate_roc(fs._feature_column[:100], ['I', 'IIIb'])

In [None]:
# Calculate basic statistics for each feature (in .xlsx):
# number of NaN, mean, std, min, max, volume correlation.
# `volume_feature` names the feature passed in as the volume reference.
fs.calculate_basic_stats(volume_feature='original_shape_VoxelVolume')

In [None]:
# Check the Excel table of per-feature statistics.
# The path now matches the one read in the binary-class section, which uses the same feature file
# (extracted_features_full.xlsx); the previous 'features_basic_stats.xlsx' did not match it.
# NOTE(review): presumably calculate_basic_stats names its output after the feature file — confirm.
print('Basic statistics for each feature')
pd.read_excel('../data/features/extracted_features_full_basic_stats.xlsx')

In [None]:
# Volume analysis using 'original_shape_VoxelVolume' as the volume feature (same call as in the binary-class section).
fs.volume_analysis(volume_feature='original_shape_VoxelVolume')