# Examples of feature set usage:

In [1]:
from features_set import features_set
import pandas as pd

## Binary classes

In [2]:
# set up the parameters
parameters = {
    'feature_path': "./data/features/features.xlsx", # path to csv/xls file with features
    'outcome_path': "./data/features/extended_clinical_df.xlsx", #path to csv/xls file with outcome
    'patient_column': 'Patient', # name of column with patient id
    'patient_in_outcome_column': 'PatientID', # name of column with patient id in clinical data file
    'outcome_column': '1yearsurvival' # name of outcome column
}

In [3]:
# initialize feature set
fs = features_set(**parameters)

Number of observations: 149
Class labels: ['0' '1']
Classes balance: [0.4228187919463087, 0.5771812080536913]


In [4]:
# excluding patients with unknown outcome (in case they are represented)
fs.handle_nan(axis=0)

Number of observations: 149
Class labels: ['0' '1']
Classes balance: [0.4228187919463087, 0.5771812080536913]


In [5]:
# checking patients' names
# print ('Patients names: ', fs._patient_name)

In [6]:
fs._feature_outcome_dataframe

Unnamed: 0_level_0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MeshVolume,original_shape_MinorAxisLength,...,wavelet-LLL_gldm_HighGrayLevelEmphasis,wavelet-LLL_gldm_LargeDependenceEmphasis,wavelet-LLL_gldm_LargeDependenceHighGrayLevelEmphasis,wavelet-LLL_gldm_LargeDependenceLowGrayLevelEmphasis,wavelet-LLL_gldm_LowGrayLevelEmphasis,wavelet-LLL_gldm_SmallDependenceEmphasis,wavelet-LLL_gldm_SmallDependenceHighGrayLevelEmphasis,wavelet-LLL_gldm_SmallDependenceLowGrayLevelEmphasis,ROI,1yearsurvival
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LUNG1-001_20180209_CT_2_GTV-1_mask,0.732658,0.548834,46.151744,84.090588,95.336247,83.186537,95.425364,96.104110,155379.500000,61.609664,...,14462.536758,33.609142,548084.071741,0.002759,0.000139,0.267761,2959.571494,0.000058,GTV-1_mask,1
LUNG1-002_000000_GTV-1_mask,0.878035,0.755488,70.110114,92.801132,116.931604,101.833197,104.316825,125.674182,358446.791667,81.482668,...,13208.125493,55.600107,832176.248523,0.004497,0.000162,0.188931,1733.836805,0.000053,GTV-1_mask,0
LUNG1-002_20180526_CT_1_GTV-1_mask,0.878035,0.755488,70.110114,92.801132,116.931604,101.833197,104.316825,125.674182,358446.791667,81.482668,...,13208.125493,55.600107,832176.248523,0.004497,0.000162,0.188931,1733.836805,0.000053,GTV-1_mask,0
LUNG1-003_000000_GTV-1_mask,0.544631,0.356597,25.559022,71.674815,56.639209,83.528438,62.265560,84.011904,34987.000000,39.036358,...,9142.646956,17.909008,209143.444093,0.002673,0.000367,0.402930,2838.784544,0.000191,GTV-1_mask,0
LUNG1-003_20180209_CT_1_GTV-1_mask,0.544631,0.356597,25.559022,71.674815,56.639209,83.528438,62.265560,84.011904,34987.000000,39.036358,...,9142.646956,17.909008,209143.444093,0.002673,0.000367,0.402930,2838.784544,0.000191,GTV-1_mask,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LUNG1-342_000000_GTV-1_mask,0.974104,0.705922,31.213709,44.216933,57.245087,55.659680,55.659680,60.183054,38986.875000,43.071892,...,8810.961714,10.006351,104482.381761,0.001182,0.000208,0.412647,3056.038922,0.000126,GTV-1_mask,0
LUNG1-343_000000_GTV-1_mask,0.849862,0.612734,40.937354,66.810979,82.377181,79.246451,71.400280,88.600226,100987.458333,56.780113,...,7974.212295,49.051197,402459.929585,0.006028,0.000158,0.124182,838.463963,0.000049,GTV-1_mask,0
LUNG1-345_000000_GTV-1_mask,0.851225,0.478765,28.770221,60.092622,72.801099,64.815122,67.268120,74.155243,58463.250000,51.152343,...,9674.701245,36.001759,394648.688745,0.003500,0.000198,0.259454,1811.133205,0.000108,GTV-1_mask,1
LUNG1-347_000000_GTV-1_mask,0.551685,0.447612,13.158653,29.397432,21.260292,34.481879,24.041631,36.124784,3131.208333,16.218112,...,5185.837607,3.649573,25163.108895,0.001458,0.000878,0.663799,2953.583448,0.000784,GTV-1_mask,1


In [7]:
# visualization of feature values distribution in classes (in .html report)
fs.plot_distribution(fs._feature_column[:100])

Example of plotted distributions of feature values in classes:
![](./data/images/distr.png)

In [8]:
# visualization of feature mutual (Spearman) correlation coefficient matrix (in .html report)
fs.plot_correlation_matrix(fs._feature_column[:100])

Example of feature correlation matrix:
![](./data/images/corr.png)

In [9]:
# visualization of Mann-Whitney Bonferroni corrected p-values for binary classes test (in .html report)
fs.plot_MW_p(fs._feature_column[:100])

Example of Mann-Whitney p-values:
![](./data/images/p_MW.png)

In [10]:
# visualization of univariate ROC-curves (in .html report)
fs.plot_univariate_roc(fs._feature_column[:100])

Example of univariate feature ROC:
![](./data/images/roc.png)

In [11]:
# calculation of basic statistics for each feature (in .xlsx):
# number of NaN, mean, std, min, max; if applicable: MW-p, univariate ROC AUC, volume correlation
fs.calculate_basic_stats(volume_feature='original_shape_VoxelVolume')

In [12]:
# checking the excel table
print('Basic statistics for each feature')
pd.read_excel('./data/features/features_basic_stats.xlsx')

Basic statistics for each feature


Unnamed: 0.1,Unnamed: 0,NaN,Mean,Std,Min,Max,p_MW_corrected,univar_auc,volume_corr
0,original_shape_Elongation,0,0.720328,0.161721,0.062127,0.974104,1.000000,0.533869,0.037515
1,original_shape_Flatness,0,0.559677,0.154895,0.047315,0.856767,1.000000,0.532484,0.099032
2,original_shape_LeastAxisLength,0,32.055804,15.983620,6.643777,85.495660,0.117975,0.686877,0.973238
3,original_shape_MajorAxisLength,0,61.595666,35.336125,13.611433,240.822486,0.018917,0.694168,0.842114
4,original_shape_Maximum2DDiameterColumn,0,63.064956,33.057474,15.620499,157.632484,0.161563,0.658269,0.950909
...,...,...,...,...,...,...,...,...,...
1213,wavelet-LLL_gldm_LargeDependenceLowGrayLevelEm...,0,1.646810,8.820031,0.001036,79.946280,1.000000,0.540513,0.070593
1214,wavelet-LLL_gldm_LowGrayLevelEmphasis,0,0.007969,0.039767,0.000069,0.369746,0.471623,0.531192,-0.805811
1215,wavelet-LLL_gldm_SmallDependenceEmphasis,0,0.313546,0.177561,0.009452,0.755977,1.000000,0.654393,-0.634677
1216,wavelet-LLL_gldm_SmallDependenceHighGrayLevelE...,0,2431.019809,1077.764252,0.093515,5302.913608,1.000000,0.619970,-0.274001


In [13]:
# volume analysis
fs.volume_analysis(volume_feature='original_shape_VoxelVolume')

Example of volume precision-recall curve:
![](./data/images/vol_prc.png)

Example of volume Spearman correlation coefficients:
![](./data/images/vol_corr.png)

## Multi-class

In [2]:
parameters = {
    'feature_path': "./data/features/features.xlsx", # path to csv/xls file with features
    'outcome_path': "./data/features/extended_clinical_df.xlsx", #path to csv/xls file with outcome
    'patient_column': 'Patient', # name of column with patient id
    'patient_in_outcome_column': 'PatientID', # name of column with patient id in clinical data file
    'outcome_column': 'Overall.Stage' # name of outcome column
}

In [3]:
fs = features_set(**parameters)

Number of observations: 149
Class labels: ['I' 'II' 'IIIa' 'IIIb' 'nan']
Classes balance: [0.24161073825503357, 0.09395973154362416, 0.2348993288590604, 0.4228187919463087, 0.006711409395973154]


In [4]:
fs._feature_outcome_dataframe[fs._feature_outcome_dataframe['Overall.Stage'].isnull()]

Unnamed: 0_level_0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MeshVolume,original_shape_MinorAxisLength,...,wavelet-LLL_gldm_HighGrayLevelEmphasis,wavelet-LLL_gldm_LargeDependenceEmphasis,wavelet-LLL_gldm_LargeDependenceHighGrayLevelEmphasis,wavelet-LLL_gldm_LargeDependenceLowGrayLevelEmphasis,wavelet-LLL_gldm_LowGrayLevelEmphasis,wavelet-LLL_gldm_SmallDependenceEmphasis,wavelet-LLL_gldm_SmallDependenceHighGrayLevelEmphasis,wavelet-LLL_gldm_SmallDependenceLowGrayLevelEmphasis,ROI,Overall.Stage
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LUNG1-272_000000_GTV-1_mask,0.713259,0.487347,14.2573,29.254915,25.495098,31.400637,35.608988,37.868192,5713.416667,20.86634,...,11955.686684,27.442472,349332.116971,0.002377,0.00028,0.233726,2319.671592,0.000215,GTV-1_mask,


In [5]:
fs.handle_nan(axis=0)

Number of observations: 148
Class labels: ['I' 'II' 'IIIa' 'IIIb']
Classes balance: [0.24324324324324326, 0.0945945945945946, 0.23648648648648649, 0.42567567567567566]


In [6]:
fs.plot_distribution(fs._feature_column[:100])

In [7]:
fs.plot_distribution(fs._feature_column[:100], ['I', 'IIIb'])

In [8]:
fs.plot_correlation_matrix(fs._feature_column[:100])

In [9]:
fs.plot_MW_p(fs._feature_column[:100], ['I', 'IIIb'])

In [10]:
fs.plot_univariate_roc(fs._feature_column[:100], ['I', 'IIIb'])

In [11]:
fs.calculate_basic_stats(volume_feature='original_shape_VoxelVolume')

In [12]:
fs.volume_analysis(volume_feature='original_shape_VoxelVolume')