Extract features to be used in model training

In [1]:
# import modules
import numpy as np
import csv
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# load the dataset and use the image id as the row index
data = pd.read_csv('../eda/data_3_targets.csv').set_index('image_id')

# every column except the three label columns is kept as a candidate feature
features = data.drop(columns=["melanoma", "seborrheic_keratosis", "neither"])
features.head()

Unnamed: 0_level_0,age_approximate,sex,red_mode,green_mode,blue_mode,red_median,green_median,blue_median,red_iqr,green_iqr,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ISIC_0000000,55,female,65,51,57,94,79,84,67,65,...,5.098648,0.109615,2.625431,0.50818,11.428232,10.780367,0.012018,0.329654,0.006513,0.046969
ISIC_0000001,30,female,87,53,46,116,72,55,86,58,...,4.441031,0.140831,2.777913,0.575211,2.318644,4.577623,0.02156,0.80848,0.00602,0.12474
ISIC_0000002,60,female,162,120,114,161,131,131,26,52,...,2.709728,0.168455,1.932127,0.540123,2.526498,0.231924,0.54776,0.007574,1.2e-05,0.941026
ISIC_0000003,30,male,182,128,51,161,110,71,60,57,...,2.58114,0.216727,2.214528,0.542617,2.665591,3.318969,0.040507,0.119264,0.000637,0.080628
ISIC_0000004,80,male,192,132,171,192,128,157,36,44,...,18.402358,0.129868,4.275156,0.564438,2.312946,0.506435,0.043551,3.153283,0.021736,2.213424


In [3]:
# Function to extract the features with correlation over a threshold
def correlation(dataset, threshold):
    """Return the names of columns strongly correlated with an earlier column.

    The pairwise correlation matrix is computed with ``DataFrame.corr``
    (numeric columns only). A column is flagged when the absolute value of
    its correlation with *any* column positioned before it in the matrix is
    strictly greater than ``threshold``. The first column of every correlated
    group is therefore never flagged.

    Note that a column is compared against all earlier columns, including
    ones that are themselves flagged.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table of candidate features; non-numeric columns are ignored.
    threshold : float
        Cut-off applied to the absolute correlation coefficient.

    Returns
    -------
    set
        Names of the flagged columns (empty if nothing exceeds the threshold).
    """
    corr_matrix = dataset.corr(numeric_only=True)

    # We are interested in the absolute coefficient value. NaN coefficients
    # (e.g. from constant columns) compare as False and are never flagged.
    exceeds = corr_matrix.abs().to_numpy() > threshold

    # Keep only the strictly lower triangle: row i is compared with columns
    # j < i, which skips the diagonal and avoids counting each pair twice.
    has_earlier_match = np.tril(exceeds, k=-1).any(axis=1)

    return set(corr_matrix.columns[has_earlier_match])

# names of the columns whose absolute correlation with an earlier column exceeds 0.9
corr_features = correlation(features, 0.9)

# keep every column that was not flagged
best_features = features.drop(columns=corr_features)

In [4]:
# report how many columns remain, followed by their names
print(f'Total number of features: {len(best_features.columns)}\n', best_features.columns)

Total number of features: 45
 Index(['age_approximate', 'sex', 'red_mode', 'green_mode', 'blue_mode',
       'red_median', 'red_iqr', 'green_iqr', 'blue_iqr',
       'original_firstorder_10Percentile', 'original_firstorder_90Percentile',
       'original_firstorder_Energy', 'original_firstorder_Entropy',
       'original_firstorder_InterquartileRange',
       'original_firstorder_Kurtosis', 'original_firstorder_Maximum',
       'original_firstorder_Minimum', 'original_firstorder_Range',
       'original_firstorder_Skewness', 'original_firstorder_Variance',
       'original_glcm_Autocorrelation', 'original_glcm_ClusterProminence',
       'original_glcm_ClusterShade', 'original_glcm_Contrast',
       'original_glcm_Correlation', 'original_glcm_Idmn', 'original_glcm_Imc1',
       'original_glcm_Imc2', 'original_glcm_MCC',
       'original_gldm_DependenceEntropy',
       'original_gldm_DependenceNonUniformity',
       'original_gldm_DependenceNonUniformityNormalized',
       'original_gldm

In [5]:
# Downcast numeric dtypes (smaller types reduce memory use and speed up later steps)

# convert all 64bit floats to 32bit floats
float64_cols = best_features.select_dtypes(np.float64).columns
best_features[float64_cols] = best_features[float64_cols].astype(np.float32)

# convert 64bit ints to 16bit ints, but only for columns whose values all fit:
# an integer-to-integer astype() can wrap out-of-range values silently instead
# of raising, so columns that do not fit are left as int64
int16_limits = np.iinfo(np.int16)
int64_values = best_features.select_dtypes(np.int64)
fits_int16 = (int64_values.min() >= int16_limits.min) & (int64_values.max() <= int16_limits.max)
int16_cols = fits_int16[fits_int16].index
best_features[int16_cols] = best_features[int16_cols].astype(np.int16)

best_features.dtypes.unique()

array([dtype('O'), dtype('int16'), dtype('float32')], dtype=object)

In [6]:
# preview the first five rows of the selected features
best_features.head(n=5)

Unnamed: 0_level_0,age_approximate,sex,red_mode,green_mode,blue_mode,red_median,red_iqr,green_iqr,blue_iqr,original_firstorder_10Percentile,...,original_glrlm_LongRunEmphasis,original_glrlm_LongRunHighGrayLevelEmphasis,original_glrlm_LongRunLowGrayLevelEmphasis,original_glszm_SizeZoneNonUniformityNormalized,original_glszm_ZoneEntropy,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ISIC_0000000,55,female,65,51,57,94,67,65,60,169.600006,...,3.003552,17.415056,0.604887,0.44833,2.625431,10.780367,0.012018,0.329654,0.006513,0.046969
ISIC_0000001,30,female,87,53,46,116,86,58,40,112.0,...,1.455006,9.019741,0.299423,0.455025,2.777913,4.577623,0.02156,0.80848,0.00602,0.12474
ISIC_0000002,60,female,162,120,114,161,26,52,67,157.0,...,1.49971,6.014107,0.380191,0.415954,1.932127,0.231924,0.54776,0.007574,1.2e-05,0.941026
ISIC_0000003,30,male,182,128,51,161,60,57,48,176.0,...,1.505587,5.842686,0.429752,0.434164,2.214528,3.318969,0.040507,0.119264,0.000637,0.080628
ISIC_0000004,80,male,192,132,171,192,36,44,55,26.0,...,1.548163,50.350262,0.268533,0.432653,4.275156,0.506435,0.043551,3.153283,0.021736,2.213423


In [7]:
# inspect the melanoma column (one of the three columns excluded from `features`)
data.loc[:, 'melanoma']

image_id
ISIC_0000000    0
ISIC_0000001    0
ISIC_0000002    1
ISIC_0000003    0
ISIC_0000004    1
               ..
ISIC_0015220    0
ISIC_0015233    0
ISIC_0015260    0
ISIC_0015284    1
ISIC_0015295    0
Name: melanoma, Length: 2000, dtype: int64