# Examples of feature set usage:

In [1]:
from features_set import features_set
import pandas as pd

Feature analysis requires a dataset with a sufficiently large number of observations, and it is desirable for the dataset to include outcomes. Currently, feature analysis is implemented for binary classes only.
  
Demo radiomic feature set: an open dataset from  
https://www.kaggle.com/kmader/qbi-radiomics-analysis
* A. Lucchi, Y. Li and P. Fua, Learning for Structured Prediction Using Approximate Subgradient Descent with Working Sets, Conference on Computer Vision and Pattern Recognition, 2013.  
* A. Lucchi, K. Smith, R. Achanta, G. Knott, P. Fua, Supervoxel-Based Segmentation of Mitochondria in EM Image Stacks with Learned Shape Features, IEEE Transactions on Medical Imaging, Vol. 30, Nr. 11, October 2011.  

In [2]:
# Configuration of the demo feature set, collected in one place so it can be
# unpacked into the feature set constructor.
parameters = dict(
    # path to the csv/xls file that holds the features
    feature_path='./data/demo_feature_set/em_radiomics.csv',
    # name of the column with the patient id
    patient_column='N',
    # name of the column with the outcome
    outcome_column='positive_score',
    # names of the columns to be excluded
    feature_column_to_drop=['score'],
)

In [3]:
# Initialize the feature set: the `parameters` dict is unpacked into keyword
# arguments of the `features_set` constructor.
fs = features_set(**parameters)

In [4]:
# Check the class labels stored on the feature set; for the demo data these
# are the two labels 'BG' and 'FG' (see the output of this cell).
fs._class_label

array(['BG', 'FG'], dtype='<U2')

In [5]:
# Check the patients' names (identifiers) stored on the feature set.
# NOTE(review): this displays the whole list (185 entries for the demo data);
# consider showing only a slice to keep the notebook output short.
fs._patient_name

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [6]:
# Preview the first 10 rows of the feature dataframe. In the demo data it is
# indexed by the patient column 'N' and ends with the outcome column
# 'positive_score'.
fs._feature_dataframe.head(10)

Unnamed: 0_level_0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxis,original_shape_MajorAxis,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MinorAxis,original_shape_Sphericity,...,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength,positive_score
N,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.749269,0.749269,17.281975,23.065125,23.600847,23.600847,19.79899,27.440845,17.281975,0.828666,...,0.128568,3.693791,0.012889,116134.1736,24.508673,0.001831,6.405953,0.017401,0.022679,BG
1,0.749269,0.749269,17.281975,23.065125,23.600847,23.600847,19.79899,27.440845,17.281975,0.828666,...,0.088495,4.090172,0.014222,95832.80859,25.455287,0.001811,6.3498,0.020359,0.022882,BG
2,0.749269,0.749269,17.281975,23.065125,23.600847,23.600847,19.79899,27.440845,17.281975,0.828666,...,0.072565,3.681307,0.012222,82551.05785,19.409929,0.00239,5.691306,0.029934,0.028021,BG
3,0.749269,0.749269,17.281975,23.065125,23.600847,23.600847,19.79899,27.440845,17.281975,0.828666,...,0.09336,3.769816,0.011556,101857.5562,17.3126,0.002117,8.688507,0.018861,0.037807,BG
4,0.749269,0.749269,17.281975,23.065125,23.600847,23.600847,19.79899,27.440845,17.281975,0.828666,...,0.068987,3.931658,0.014444,85360.73136,13.252829,0.001987,9.085562,0.017549,0.038477,BG
5,0.749269,0.749269,17.281975,23.065125,23.600847,23.600847,19.79899,27.440845,17.281975,0.828666,...,0.110411,3.993786,0.011111,91307.2,15.802137,0.002416,8.382285,0.023481,0.038965,BG
6,0.749269,0.749269,17.281975,23.065125,23.600847,23.600847,19.79899,27.440845,17.281975,0.828666,...,0.052561,4.68068,0.015778,68507.64412,12.439642,0.002212,8.40844,0.019849,0.043132,BG
7,0.749269,0.749269,17.281975,23.065125,23.600847,23.600847,19.79899,27.440845,17.281975,0.828666,...,0.070335,3.676369,0.017333,96749.54635,19.654297,0.001619,6.547911,0.012873,0.027357,BG
8,0.749269,0.749269,17.281975,23.065125,23.600847,23.600847,19.79899,27.440845,17.281975,0.828666,...,0.060115,3.885068,0.017778,75076.6125,18.570744,0.001949,6.050662,0.020541,0.02781,BG
9,0.749269,0.749269,17.281975,23.065125,23.600847,23.600847,19.79899,27.440845,17.281975,0.828666,...,0.069916,3.968565,0.014444,107494.2698,18.121052,0.001742,6.125324,0.019934,0.029706,BG


In [7]:
# Delete the columns that contain only one unique value.
# Judging by the dataframe previews before and after this call, `fs` is
# modified in place: the leading constant `original_shape_*` columns are no
# longer shown afterwards.
fs.handle_constant()

In [8]:
# Preview the first 10 rows of the feature dataframe again, now that the
# constant columns have been removed.
fs._feature_dataframe.head(10)

Unnamed: 0_level_0,original_firstorder_10Percentile,original_firstorder_90Percentile,original_firstorder_Energy,original_firstorder_Entropy,original_firstorder_InterquartileRange,original_firstorder_Kurtosis,original_firstorder_Maximum,original_firstorder_MeanAbsoluteDeviation,original_firstorder_Mean,original_firstorder_Median,...,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength,positive_score
N,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,97.0,145.0,65670461,1.68714,26.0,2.71832,182,14.948511,119.398,117.0,...,0.128568,3.693791,0.012889,116134.1736,24.508673,0.001831,6.405953,0.017401,0.022679,BG
1,98.0,151.0,70036585,1.792212,27.0,2.757777,186,15.945234,123.160222,122.0,...,0.088495,4.090172,0.014222,95832.80859,25.455287,0.001811,6.3498,0.020359,0.022882,BG
2,100.0,168.0,84903445,2.076328,43.0,2.009284,197,21.890909,134.946889,136.0,...,0.072565,3.681307,0.012222,82551.05785,19.409929,0.00239,5.691306,0.029934,0.028021,BG
3,101.0,163.0,76723903,1.968922,33.0,2.636773,200,18.893517,128.478444,125.0,...,0.09336,3.769816,0.011556,101857.5562,17.3126,0.002117,8.688507,0.018861,0.037807,BG
4,120.0,179.0,105447879,1.928108,33.0,2.723455,206,18.239565,151.444667,154.0,...,0.068987,3.931658,0.014444,85360.73136,13.252829,0.001987,9.085562,0.017549,0.038477,BG
5,101.0,171.0,84009856,2.110784,44.0,2.01582,200,22.441494,134.095111,131.5,...,0.110411,3.993786,0.011111,91307.2,15.802137,0.002416,8.382285,0.023481,0.038965,BG
6,118.0,184.0,108581259,2.044812,35.0,2.553462,210,20.272453,153.346889,156.0,...,0.052561,4.68068,0.015778,68507.64412,12.439642,0.002212,8.40844,0.019849,0.043132,BG
7,147.0,188.0,128974232,1.512482,22.0,3.419434,213,12.803641,168.536889,170.0,...,0.070335,3.676369,0.017333,96749.54635,19.654297,0.001619,6.547911,0.012873,0.027357,BG
8,135.0,190.0,124297606,1.828117,31.0,2.554571,211,17.377415,164.864,168.0,...,0.060115,3.885068,0.017778,75076.6125,18.570744,0.001949,6.050662,0.020541,0.02781,BG
9,143.0,191.0,130607156,1.711976,23.0,4.786458,212,15.587686,169.085333,173.0,...,0.069916,3.968565,0.014444,107494.2698,18.121052,0.001742,6.125324,0.019934,0.029706,BG


In [9]:
# Visualize the distribution of the feature values in each of the two classes
# (the plots go into an .html report).
fs.plot_binary_distribution()

![Example of distributions visualization](./data/demo_feature_set/distr.png)

In [10]:
# Visualize the matrix of mutual (Spearman) correlation coefficients between
# the features (the plot goes into an .html report).
fs.plot_correlation_matrix()

![Example of correlation visualization](./data/demo_feature_set/corr.png)

In [11]:
# Visualize the Bonferroni-corrected p-values of the Mann-Whitney test between
# the two classes (the plot goes into an .html report).
fs.plot_MW_p()

![Example of Mann-Whitney p-values visualization](./data/demo_feature_set/p_MW.png)

In [12]:
# Visualize the univariate ROC curves (the plots go into an .html report).
fs.plot_univariate_roc()

![Example of ROC-curves visualization](./data/demo_feature_set/roc.png)

In [13]:
# Calculate basic statistics for each feature and save them to a .csv file:
# number of NaN values, mean, std, min, max and, if applicable, the
# Mann-Whitney p-value, the univariate ROC AUC and the volume correlation.
fs.calculate_basic_stats()