# Imports

In [21]:
# imports, including functions provided with the starter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# additional library, to be install using pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

if 'notebooks' in os.getcwd():
    os.chdir('..')

from groupXY_functions import *


# Defining functions and preprocessing

In [2]:
# Return smallest and largest x, smallest and largest y that contains a white pixel, so that we can crop it to measure asymmetry
def get_extremes(image):
    non_zeros = np.nonzero(image)
    return min(non_zeros[1]), max(non_zeros[1]), min(non_zeros[0]), max(non_zeros[0])

In [17]:
# Crops both the image and the mask to the actual size of the mask, saves to files
# Returns cropped image and mask in the array
def crop(image_id):
    path = 'data/example_image'
    path_cropped = path + '_cropped'
    path_mask = 'data/example_segmentation'
    path_mask_cropped = path_mask + '_cropped'
    if os.path.exists(path_cropped + image_id + '.jpg'):
        image = plt.imread(path_cropped + '/' + image_id + '.jpg')
        mask = plt.imread(path_mask_cropped + '/' + image_id + '.png')
        return image, mask
    if not os.path.exists(path_cropped):
        os.mkdir(path_cropped)
        os.mkdir(path_mask_cropped)
    image = plt.imread(path + '/' + image_id + '.jpg')
    mask = plt.imread(path_mask + '/' + image_id + '_segmentation.png')
    min_x, max_x, min_y, max_y = get_extremes(mask)
    mask = mask[min_y:max_y+1, min_x:max_x+1]
    plt.imsave(path_mask_cropped + '/' + image_id + '.png', mask, cmap=plt.cm.gray)
    image_cropped = image.copy()[min_y:max_y+1, min_x:max_x+1, :]
    to_black = mask==0
    image_cropped[to_black] = (0,0,0)
    plt.imsave(path_cropped + '/' + image_id + '.jpg', image_cropped)
    return image, mask

In [4]:
# Assymetry function
# 0 is a perfect circle, 1 has all the quarters of an image completely different
def asymmetry_score(image):

    height, width = np.shape(image)
    if width % 2:
        # Subparts need to be the same size to compare them
        left_half = image[:, 0:width//2+1]
    else:
        left_half = image[:, 0:width//2]
    # "folding" on vertical axis
    right_half = np.flip(image[:, width//2:], 1)
    
    if height % 2:
        top_half = image[0:height//2+1, :]
    else:
        top_half = image[0:height//2, :]
    # "folding" on hotizontal axis
    bottom_half = np.flip(image[height//2:, :], 0)
    
    hor_diff = np.where(left_half != right_half, 1, 0)
    ver_diff = np.where(top_half != bottom_half, 1, 0)
    return round((np.sum(hor_diff) + np.sum(ver_diff)) / (width * height),3)

In [5]:
# Border function, compactness
# Due to inherit limitations of digital images, for some masks returend compactness value is below 1, which is wrong.
# Nonetheless, such an imprecission is acceptable given that multiple variables are considred in diagnoses 
def get_compactness(area, perimeter):
    return round(perimeter**2 / (4 * np.pi * area),3)

In [6]:
def colour_extraction (cropped_image):
    
    # split the image into its respective RGB components
    r, g, b = np.array_split(cropped_image, 3, 2)
    
    return r,g,b

def colour_variance_avg(r,g,b):
    return np.mean((np.var(r[r>0]),np.var(g[g>0]),np.var(b[b>0])))

In [7]:
def sensitivity(tp, fn):
    return tp / (tp + fn)

def specificity(tn, fp):
    return tn / (tn + fp)

In [12]:
# Import a data frame with file names and associated diseases
df_conditions = pd.read_csv('data/example_ground_truth.csv')

# A new data frame is created, which will store results of ABC tests 
df_evaluations = pd.DataFrame(df_conditions['image_id'])
for name in ['asymmetry', 'border', 'colour']:
    df_evaluations[name] = 0.0
    
for i in range(df_evaluations.shape[0]):
    image, mask = crop(df_evaluations.at[i, 'image_id'])
    df_evaluations.at[i, 'asymmetry'] = asymmetry_score(mask)
    area, perimeter = measure_area_perimeter(mask)
    df_evaluations.at[i, 'border'] = get_compactness(area, perimeter)
    r,g,b = colour_extraction(image)
    df_evaluations.at[i,'colour'] = colour_variance_avg(r,g,b)

# Standarisation of features' values

In [None]:
#data before standarisation
fig, axs = plt.subplots(3,figsize=(10,12))
axs[0].hist(df_evaluations['asymmetry'])
axs[1].hist(df_evaluations['border'])
axs[2].hist(df_evaluations['colour'])
fig.tight_layout()


In [None]:
df_evaluations

In [14]:
for name in ['asymmetry', 'border', 'colour']:
    mean = np.mean(df_evaluations[name])
    sd = np.std(df_evaluations[name])
    for i in range(df_evaluations.shape[0]):
        df_evaluations.at[i, name] = (df_evaluations.at[i, name] - mean) / sd

In [15]:
df_evaluations

Unnamed: 0,image_id,asymmetry,border,colour
0,ISIC_0001769,-0.023781,0.578983,-0.174316
1,ISIC_0001852,-0.709402,-0.476142,0.245641
2,ISIC_0001871,0.612156,-0.442752,-0.137239
3,ISIC_0003462,-1.365212,-1.012609,0.392070
4,ISIC_0003539,2.172190,-0.311418,0.478796
...,...,...,...,...
145,ISIC_0015443,-0.977688,-0.039845,-0.709203
146,ISIC_0015445,3.702415,2.646940,1.660420
147,ISIC_0015483,-1.245974,0.550045,0.628639
148,ISIC_0015496,0.035838,0.652441,4.570101


In [None]:
#data after standarisation
fig, axs = plt.subplots(3,figsize=(10,12))
axs[0].hist(df_evaluations['asymmetry'])
axs[1].hist(df_evaluations['border'])
axs[2].hist(df_evaluations['colour'])
fig.tight_layout()

In [None]:
##Loading data#

In [19]:
df = df_conditions.drop('seborrheic_keratosis', axis=1).merge(df_evaluations, on='image_id').drop('image_id', axis=1)
print(df.head())

   melanoma  asymmetry    border    colour
0       0.0  -0.023781  0.578983 -0.174316
1       0.0  -0.709402 -0.476142  0.245641
2       0.0   0.612156 -0.442752 -0.137239
3       0.0  -1.365212 -1.012609  0.392070
4       0.0   2.172190 -0.311418  0.478796


In [22]:
# Split dataset to select feature and evaluate the classifier
x = df[['asymmetry', 'border', 'colour']]
y = df['melanoma']

# X_train and y_train used for training, X_val and y_val are for predicting and validating the ML model
x_train, x_val, y_train, y_val = train_test_split(
        x, y, stratify=y, test_size=0.4, random_state=0)
sm = SMOTE(random_state=0)
x_train, y_train = sm.fit_resample(x_train, y_train)

In [25]:
accuracy = dict()

for n_neighbors in range(1, 10):
    # Train a classifier
    knn = KNeighborsClassifier(n_neighbors=n_neighbors) # other hyperparameters possible
    knn_trained = knn.fit(x_train, y_train)
    y_val_knn = knn_trained.predict(x_val)
    # Accuracy - often used in ML but not suitable for medical imaging in general
    tn, fp, fn, tp = confusion_matrix(y_val, y_val_knn).ravel()
    accuracy[n_neighbors] = [np.sum(y_val_knn == y_val) / np.size(y_val) * 100, sensitivity(tp, fn), specificity(tn, fp)]
    
accuracy

{1: [65.0, 0.25, 0.75],
 2: [68.33333333333333, 0.25, 0.7916666666666666],
 3: [63.33333333333333, 0.5, 0.6666666666666666],
 4: [65.0, 0.3333333333333333, 0.7291666666666666],
 5: [65.0, 0.5, 0.6875],
 6: [63.33333333333333, 0.3333333333333333, 0.7083333333333334],
 7: [58.333333333333336, 0.4166666666666667, 0.625],
 8: [63.33333333333333, 0.3333333333333333, 0.7083333333333334],
 9: [63.33333333333333, 0.3333333333333333, 0.7083333333333334]}

# Scatters

In [None]:
k = pd.read_csv("ManualDataAnalysis.csv")
name = k[0:51]
asymmetry = name.loc[:,"Asymmetry"]
border = name.loc[:,"Border"]
Color = name.loc[:,"Color"]
Filename = name.loc[:,"Filename"]
x = range(1,52)


In [None]:
plt.figure(figsize=(12, 12), dpi=100)

plt.subplot(2,3,1)
plt.scatter(x,asymmetry,)
plt.title("Asymetry")

plt.subplot(2,3,2)
plt.scatter(x, border)
plt.title("Border")

plt.subplot(2,3,3)
plt.scatter(x, Color)
plt.title("Color")

plt.suptitle("Manual Analysis")

