In [1]:
### FEATURE SELECTION ON THREE SEPARATE DATASETS ###

# import modules
import pandas as pd
import numpy as np

In [2]:
# Read the three dataset variants from disk
fifty_neither_vs_mel_keratosis = pd.read_csv('../dataset_versions/50neither_vs_mel_keratosis.csv')
mel_vs_keratosis = pd.read_csv('../dataset_versions/mel_vs_keratosis.csv')
neither_vs_melkeratosis = pd.read_csv('../dataset_versions/neither_vs_melkeratosis.csv')

# Sanity check: number of columns in each frame, printed on one line
print(*(
    frame.shape[1]
    for frame in (fifty_neither_vs_mel_keratosis, mel_vs_keratosis, neither_vs_melkeratosis)
))

108 107 107


In [3]:
# Separate the feature columns from the target (label) columns of each dataset
fnmk_features = fifty_neither_vs_mel_keratosis.drop(
    columns=["melanoma", "seborrheic_keratosis", "neither"]
)
mk_features = mel_vs_keratosis.drop(columns=["melanoma", "seborrheic_keratosis"])
nm_features = neither_vs_melkeratosis.drop(columns=["mel_keratosis", "neither"])

# Sanity check: number of feature columns left in each frame
print(*(frame.shape[1] for frame in (fnmk_features, mk_features, nm_features)))

105 105 105


In [4]:
# Display the column names left in mk_features once its two target columns are dropped
mk_features.columns.unique()

Index(['image_id', 'age_approximate', 'sex', 'red_mode', 'green_mode',
       'blue_mode', 'red_median', 'green_median', 'blue_median', 'red_iqr',
       ...
       'original_glszm_SmallAreaHighGrayLevelEmphasis',
       'original_glszm_SmallAreaLowGrayLevelEmphasis',
       'original_glszm_ZoneEntropy', 'original_glszm_ZonePercentage',
       'original_glszm_ZoneVariance', 'original_ngtdm_Busyness',
       'original_ngtdm_Coarseness', 'original_ngtdm_Complexity',
       'original_ngtdm_Contrast', 'original_ngtdm_Strength'],
      dtype='object', length=105)

In [5]:
# Function to extract the features with correlation over a threshold
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    The pairwise correlation matrix of the numeric columns of ``dataset`` is
    computed with ``DataFrame.corr``. A column is reported when the absolute value
    of its correlation with at least one column positioned before it in that
    matrix is strictly greater than ``threshold``. The first column of each
    correlated pair is therefore never reported because of that pair.

    Note that a column is compared against *every* earlier column, including
    earlier columns that are themselves reported.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are examined; non-numeric columns are ignored.
    threshold : float
        Cut-off applied to the absolute correlation coefficient (strict ``>``).

    Returns
    -------
    set
        Names of the columns exceeding the threshold. Empty when nothing
        qualifies. NaN correlations (e.g. from constant columns) never qualify.
    """
    corr_matrix = dataset.corr(numeric_only=True)
    abs_corr = corr_matrix.abs().to_numpy(dtype=float)

    # Strict lower triangle: entry (i, j) is True only for j < i, so each pair is
    # inspected once and the diagonal (self-correlation of 1.0) is skipped.
    below_diagonal = np.tri(abs_corr.shape[0], k=-1, dtype=bool)

    # Comparisons against NaN evaluate to False, so undefined correlations are ignored.
    exceeds = (abs_corr > threshold) & below_diagonal

    return set(corr_matrix.columns[exceeds.any(axis=1)])

In [6]:
# Absolute-correlation cut-off shared by all three datasets; kept in one place so
# the three calls below cannot drift apart.
CORR_THRESHOLD = 0.9

# extract correlated features
fnmk_corr_features = correlation(fnmk_features, CORR_THRESHOLD)
mk_corr_features = correlation(mk_features, CORR_THRESHOLD)
nm_corr_features = correlation(nm_features, CORR_THRESHOLD)

# drop correlated features (the sets are converted to lists and passed by keyword
# so it is explicit that column labels are being removed)
fnmk_best_features = fnmk_features.drop(columns=list(fnmk_corr_features))
mk_best_features = mk_features.drop(columns=list(mk_corr_features))
nm_best_features = nm_features.drop(columns=list(nm_corr_features))

In [7]:
# Build the final frames: remove the two demographic columns from the selected
# features, then attach the target columns of the matching source dataset
# (joined on the row index).
fifty_neither_vs_mel_keratosis_features_targets = (
    fnmk_best_features
    .drop(columns=['age_approximate', 'sex'])
    .join(fifty_neither_vs_mel_keratosis[["melanoma", "seborrheic_keratosis", "neither"]])
)
mel_vs_keratosis_features_targets = (
    mk_best_features
    .drop(columns=['age_approximate', 'sex'])
    .join(mel_vs_keratosis[["melanoma", "seborrheic_keratosis"]])
)
neither_vs_melkeratosis_features_targets = (
    nm_best_features
    .drop(columns=['age_approximate', 'sex'])
    .join(neither_vs_melkeratosis[["mel_keratosis", "neither"]])
)

In [8]:
# save the three feature+target frames as CSV.
# index=False is passed to every call so that no file gets an extra unnamed
# index column and all three outputs share the same layout.
# NOTE(review): the first filename ends in "feature_targets" while the other two
# end in "features_targets"; kept as-is in case other code reads that path - confirm.
fifty_neither_vs_mel_keratosis_features_targets.to_csv('./fifty_neither_vs_mel_keratosis_feature_targets.csv', index=False)
mel_vs_keratosis_features_targets.to_csv('./mel_vs_keratosis_features_targets.csv', index=False)
neither_vs_melkeratosis_features_targets.to_csv('./neither_vs_melkeratosis_features_targets.csv', index=False)