In [39]:
import os
import pandas as pd
import numpy as np

# Default packages for the minimum example
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GroupKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler


import pickle #for saving/loading trained classifiers


In [2]:
# Finding our files - make sure to follow the same file/folder structure.
file_data = '.' + os.sep + 'data' + os.sep +'metadata.csv'
path_image = '.' + os.sep + 'data' + os.sep + 'images' + os.sep + 'imgs_part_1'

# read the metadata csv and find the diagnostic labels.
df = pd.read_csv(file_data)

# Find the features from the feature extraction.
file_features = 'features/features.csv'
feature_names = ['file_name','asymmetry','color','blue-white_veil']

# Load up the features in a separate dataframe to filter our metadata.
df_features = pd.read_csv(file_features)

# our_list for all images in the metadata that we also have in our features.csv
our_list = list(np.array(df_features["file_name"]))
filtered_data = df[df["img_id"].isin(our_list)]
label = np.array(filtered_data['diagnostic'])
image_id = list(filtered_data['img_id'])

In [3]:
# Make the dataset
x = np.array(df_features[feature_names[1:]])

y = np.zeros(len(label))  # Initialize the labels array with zeros
y[label == 'BCC'] = 1     # Set BCC samples to 1
y[label == 'SCC'] = 2     # Set SCC samples to 2
y[label == 'MEL'] = 3     # Set MEL samples to 3
patient_id = filtered_data['patient_id']



'''
# IGNORE - old Y
y =  np.int_(filtered_label == 'MEL', 'BCC', 'SCC')   #now True means healthy nevus, False means something else

''' 

"\n# IGNORE - old Y\ny =  np.int_(filtered_label == 'MEL', 'BCC', 'SCC')   #now True means healthy nevus, False means something else\n\n"

In [4]:
patient_id

0       PAT_1516
1         PAT_46
2       PAT_1545
3       PAT_1989
4        PAT_684
          ...   
1178     PAT_931
1179     PAT_678
1180     PAT_810
1181      PAT_83
1182    PAT_1220
Name: patient_id, Length: 1081, dtype: object

In [5]:
x_train_data, x_test_data, y_train_data, y_test_data, patient_id_train_data, patient_id_test_data = train_test_split(
    x, y, patient_id, test_size=0.2, train_size=0.8, random_state=42, shuffle=True, stratify=y)

In [6]:
#Prepare cross-validation -
# GroupKFold makes sure patients with the same ID will not be split between the training and validation sets.

num_folds = 5
group_kfold = GroupKFold(n_splits=num_folds)
group_kfold.get_n_splits(x_train_data, y_train_data, patient_id_train_data)

5

In [35]:
# Our classifiers are defined here. We use K-NN

classifiers = [
    KNeighborsClassifier(1),
    KNeighborsClassifier(5)
]
num_classifiers = len(classifiers)  


In [77]:
# set up np arrays for the eventual accuracy- and F1-scores.
acc_val = np.empty([num_folds,num_classifiers])
f1_val = np.empty([num_folds, num_classifiers])
roc_auc_val = np.empty([num_folds, num_classifiers])
specificity_val = np.empty([num_folds, num_classifiers])

f1_list = []
# Splits up our data into training and validation sets at a 80/20 ratio. The group_kfold does training across folds,
# with a default of 5 folds it will give us 5 outputs.
for i, (train_index, val_index) in enumerate(group_kfold.split(x_train_data, y_train_data, patient_id_train_data)):
    
    # x_train = 80%
    # y_train = truth for 80%
    # x_val = 20%
    # y_val = truth for 20%
    x_train = x[train_index,:]
    y_train = y[train_index]
    x_val = x[val_index,:]
    y_val = y[val_index]
    
    # Initialize StandardScaler
    scaler = StandardScaler()
 
    # Standardize features
    x_train_scaled = scaler.fit_transform(x_train)
    x_val_scaled = scaler.transform(x_val)
    
    
    specificity = []
    sensitivity = []
    
    for j, clf in enumerate(classifiers): 
        # Train the classifier with the 80%.
        clf.fit(x_train_scaled, y_train)
        
        # Predict labels for validation data
        y_pred = clf.predict(x_val_scaled)
    
        # Evaluate accuracy score (mostly useless)
        accuracy = accuracy_score(y_val, y_pred)
        acc_val[i, j] = accuracy
        
        # Evaluate F1 score (Shows us the ratio of false positives and true negatives)
        f1 = f1_score(y_val, y_pred, average="macro")  # Use macro-average for multi-class classification (We have 4 classes -> the 3 skin cancers and neither of them.)
        f1_val[i, j] = f1
        
        # Calculate ROC AUC score
        roc_auc = roc_auc_score(y_val, clf.predict_proba(x_val_scaled), multi_class='ovr')
        roc_auc_val[i, j] = roc_auc
        
        # Calculate specificity
        c1, c2, c3, c4 = confusion_matrix(y_val, y_pred)
        
        tn = c1[0] # true negative
        tp = c2[1] + c3[2] + c4[3] # true positive
        fp = np.sum(c1[1:]) + np.sum(c2[2:]) + c3[3] + c3[1] + np.sum(c4[1:2]) # false positive
        fn = c2[0] + c3[0] + c4[0] # false negative
        
        specificity.append(tn/(tn+fp))
        specificity_avg = np.mean(specificity)
        
        sensitivity.append(tp/(tp+fn))
        sensitivity_avg = np.mean(sensitivity)
        
        
        # Print our scores - Classifier 1 is K-NN(1) and Classifier 2 is K-NN(5):
        print(f"F1 score (Fold {i + 1}, Classifier {j + 1}): {f1}")
        print(f"ROC AUC score (Fold {i + 1}, Classifier {j + 1}): {roc_auc}")
        print(f"Specificity (Fold {i + 1}, Classifier {j + 1}): {specificity}")
        print(f"Sensitivity (Fold {i + 1}, Classifier {j + 1}): {sensitivity}")  
        
        
        f1_list.append(f1)
        


F1 score (Fold 1, Classifier 1): 0.2771048698468053
ROC AUC score (Fold 1, Classifier 1): 0.5225883837874823
Specificity (Fold 1, Classifier 1): [0.5]
Sensitivity (Fold 1, Classifier 1): [0.3968253968253968]
F1 score (Fold 1, Classifier 2): 0.22566722268557132
ROC AUC score (Fold 1, Classifier 2): 0.532763842478447
Specificity (Fold 1, Classifier 2): [0.5, 0.6979166666666666]
Sensitivity (Fold 1, Classifier 2): [0.3968253968253968, 0.2077922077922078]
F1 score (Fold 2, Classifier 1): 0.29800591595267234
ROC AUC score (Fold 2, Classifier 1): 0.5396652044781591
Specificity (Fold 2, Classifier 1): [0.46153846153846156]
Sensitivity (Fold 2, Classifier 1): [0.4927536231884058]
F1 score (Fold 2, Classifier 2): 0.24898989898989898
ROC AUC score (Fold 2, Classifier 2): 0.5272483269614651
Specificity (Fold 2, Classifier 2): [0.46153846153846156, 0.6082474226804123]
Sensitivity (Fold 2, Classifier 2): [0.4927536231884058, 0.32894736842105265]
F1 score (Fold 3, Classifier 1): 0.3090828440147053
R

In [69]:
print(testing)


[[65 14  0  0]
 [48 26  0  0]
 [12  6  0  0]
 [ 1  0  0  0]]


In [68]:
c1[0] # true negative

c2[1] + c3[2] + c4[3] # true positive

c0[1:] + c1[2:] + c2[3] + c2[1] + c3[1:2] # false positive

c1[0] + c2[0] + c3[0] # false negative



1

In [9]:
#Average over all folds
average_acc = np.mean(acc_val,axis=0) 
   
print('Classifier 1 average accuracy={:.3f} '.format(average_acc[0]))
print('Classifier 2 average accuracy={:.3f} '.format(average_acc[1]))

Classifier 1 average accuracy=0.451 
Classifier 2 average accuracy=0.501 


In [10]:
# FINAL PART - To be expanded on. Basically this is the data we will want to compare to the external pictures
# If we want to include all the pictures here, we no longer have any "true" set to compare our results to. 
# We should consider creating a 3-way split, so we have a final, unused picture set to work on our model.

# Let's say you now decided to use the 5-NN 
classifier = KNeighborsClassifier(n_neighbors = 5)

#It will be tested on external data, so we can try to maximize the use of our available data by training on 
#ALL of x and y
classifier = classifier.fit(x,y)

#This is the classifier you need to save using pickle, add this to your zip file submission
filename = 'groupXY_classifier.sav'
pickle.dump(classifier, open(filename, 'wb'))