In [1]:
## This scripts serves as the extractor for all features used in this project
## it will iterate through each image in the dataset and extract features


In [2]:
# import modules
import pandas as pd
import numpy as np
import cv2 as cv
import matplotlib.pyplot as plt
%matplotlib inline 
from radiomics import featureextractor
from IPython.display import clear_output
from iteration_utilities import flatten, deepflatten



In [3]:
# Load training data csv. The original file contains the images names and patient data
training_data_csv = pd.read_csv('../merged_training_data.csv')
training_data_csv.head()

Unnamed: 0,image_id,age_approximate,sex,melanoma,seborrheic_keratosis
0,ISIC_0000000,55,female,0.0,0.0
1,ISIC_0000001,30,female,0.0,0.0
2,ISIC_0000002,60,female,1.0,0.0
3,ISIC_0000003,30,male,0.0,0.0
4,ISIC_0000004,80,male,1.0,0.0


In [4]:
## image and mask loading

def img_mask_load(img_name_load):
    
    ''' 
    
    Function that takes the image name 
    from the Dataframe, searches this name in the image directory
    and loads it, along with the corresponding mask. Also, applies
    the mask to perform the segmentation along with histogram normalization
    
    '''
    
    im_path = '../../ISIC-2017_Training_Data/' ## CHANGE THIS VARIABLE WITH THE PATH WHERE YOU HAVE THE IMAGES
    mask_path = '../../ISIC-2017_Training_Part1_GroundTruth/' ## CHANGE THIS VARIABLE WITH THE PATH WHERE THE MAKS ARE
    img = cv.imread(im_path + img_name_load + '.jpg',-1)
    img_rgb = cv.cvtColor(img, cv.COLOR_BGR2RGB)
    mask = cv.imread(mask_path + img_name_load + '_segmentation.png', -1)
    output_array = np.zeros((img.shape[0],img.shape[1])) # Array to store normalized image
    img_segmented = cv.normalize(cv.bitwise_and(img_rgb, img_rgb, mask=mask),  output_array, 0, 255, cv.NORM_MINMAX)
    
    # return the segmented image for color channel features, and bgr image and mask for pyradiomics
    return img_segmented, img, mask


In [5]:
## function for RGB features extraction

def features_rgb(img_name):
    '''
    This function extracts the following rgb features:
    color channel mode, 
    '''
    
    
    # segment the image using the segmentation function
    segmented, _, _, = img_mask_load(img_name)
    
    # Tuple with each color channel
    color = ('r','g','b')
    
    
    #  Declare list to store the most repeated pixel intensity in each channel
    mode_color_channel = []
    bp = []
    medians = []
    iqrs = []
    

    # Iterate through each color channel and obtain histogram
    for i,col in enumerate(color):   
        histr = cv.calcHist([segmented],[i],None,[256],[1,256]) # the range should be [1,256] to exclude black background    
        mode_color_channel.append(list(histr.flatten()).index(max(histr.flatten()))) # append mode to mode list
        bp.append(segmented[:,:,i].flatten()[segmented[:,:,i].flatten().nonzero()])
        
    for i in range(len(bp)):
        medians.append(round(np.median(bp[i]))) # get median
        q3, q1 = np.percentile(bp[i], [75,25]) # get 3rd quartile and 1st quartile
        iqr = round(q3 - q1) # calculate quartile diference
        iqrs.append(iqr) # append to interquartile range array
                    
    return mode_color_channel, medians, iqrs


In [6]:
## function for Radiomics features extraction

def features_radiomics(img_name):
    
    # Load image path
    _, img, mask = img_mask_load(img_name) # segment the image using img_mask_load function
    img_gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
    
    im, label = '../pyradiomicsdir/img_gray_1.jpg' , '../pyradiomicsdir/mask_1.jpg'
    # save grayscale image and mask as jpg
    cv.imwrite(im, img_gray) 
    cv.imwrite(label, mask)
    
  
    # extract features
    extractor = featureextractor.RadiomicsFeatureExtractor()
    result = extractor.execute(im, label)
    values = list(result.values())
    keys = list(result.keys())
    v = []
    k = []
    
    for i in deepflatten(values[22:]):
        v.append(i.tolist())
    
    for l in keys[22:]:   
        k.append(l)       
        
    return v, k

In [7]:
## function to iterate through the images, extract the features and append them to the dataframe

def feature_extraction():
    # columns for rgb features dataframe
    columns = ['image_id','red_mode', 'green_mode', 'blue_mode', 'red_median','green_median', 'blue_median', 'red_iqr', 'green_iqr', 'blue_iqr']
    
    # create dataframe with rgb columns
    features_dataframe = pd.DataFrame(columns = columns)
    
    
    # iterate through each image, extract features and append to dataframe
    for index, image in enumerate(training_data_csv['image_id']):  
        feat_rgb = features_rgb(image)   # extract rgb features with features_rgb function built previously
        feat_radio, c = features_radiomics(image)   
        col = columns[1:] + c
        data = list(deepflatten([feat_rgb, feat_radio]))
        features_dataframe = features_dataframe.append(pd.DataFrame([data], columns = col, index=[index]));
        features_dataframe.at[index, columns[0]] = image
        print(str(index+1) + '/' + str(len(training_data_csv['image_id']))+ ' images processed...')
        clear_output(wait=True)
    return features_dataframe

In [None]:
## Hit it! Extract the features 
## CAUTION: This might take a while
all_features = feature_extraction()
final_table = training_data_csv.set_index('image_id').join(all_features.set_index('image_id'), how='inner', on='image_id')
final_table.to_csv('data.csv')

Shape features are only available 3D input (for 2D input, use shape2D). Found 2D input
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated


42/2000 images processed...


  features_dataframe = features_dataframe.append(pd.DataFrame([data], columns = col, index=[index]));
