In [3]:
import os
import pandas as pd
import numpy as np

# Default packages for the minimum example
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC


import pickle #for saving/loading trained classifiers


In [4]:
# Finding our files - make sure to follow the same file/folder structure.
file_data = '.' + os.sep + 'data' + os.sep +'metadata.csv'
path_image = '.' + os.sep + 'data' + os.sep + 'images' + os.sep + 'imgs_part_1'

# read the metadata csv and find the diagnostic labels.
df = pd.read_csv(file_data)

# Find the features from the feature extraction.
file_features = 'features/features.csv'
feature_names = ['file_name','asymmetry','color','blue-white_veil']

# Load up the features in a separate dataframe to filter our metadata.
df_features = pd.read_csv(file_features)

# our_list for all images in the metadata that we also have in our features.csv
our_list = list(np.array(df_features["file_name"]))
filtered_data = df[df["img_id"].isin(our_list)]
label = np.array(filtered_data['diagnostic'])
image_id = list(filtered_data['img_id'])

Unnamed: 0,patient_id,lesion_id,smoke,drink,background_father,background_mother,age,pesticide,gender,skin_cancer_history,...,diameter_2,diagnostic,itch,grew,hurt,changed,bleed,elevation,img_id,biopsed
0,PAT_1516,1765,,,,,8,,,,...,,NEV,False,False,False,False,False,False,PAT_1516_1765_530.png,False
1,PAT_46,881,False,False,POMERANIA,POMERANIA,55,False,FEMALE,True,...,5.0,BCC,True,True,False,True,True,True,PAT_46_881_939.png,True
2,PAT_1545,1867,,,,,77,,,,...,,ACK,True,False,False,False,False,False,PAT_1545_1867_547.png,False
3,PAT_1989,4061,,,,,75,,,,...,,ACK,True,False,False,False,False,False,PAT_1989_4061_934.png,False
4,PAT_684,1302,False,True,POMERANIA,POMERANIA,79,False,MALE,True,...,5.0,BCC,True,True,False,False,True,True,PAT_684_1302_588.png,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1152,PAT_1761,3329,,,,,70,,,,...,,SEK,False,False,False,False,False,False,PAT_1761_3329_837.png,False
1153,PAT_564,1079,True,False,BRAZIL,BRAZIL,73,False,FEMALE,False,...,8.0,BCC,True,True,False,True,True,True,PAT_564_1079_430.png,True
1154,PAT_747,1409,False,False,UNK,BRAZIL,55,False,MALE,True,...,10.0,ACK,True,True,False,False,True,True,PAT_747_1409_116.png,True
1155,PAT_566,179,False,False,ITALY,BRAZIL,54,False,MALE,False,...,3.0,ACK,True,False,False,False,False,False,PAT_566_179_23.png,False


In [6]:
# Make the dataset, you can select different classes (see task 0)
x = np.array(df_features[feature_names[1:]])

y = np.zeros(len(label))  # Initialize the labels array with zeros
y[label == 'BCC'] = 1     # Set BCC samples to 1
y[label == 'SCC'] = 2     # Set SCC samples to 2
y[label == 'MEL'] = 3     # Set MEL samples to 3
patient_id = filtered_data['patient_id']



'''
# IGNORE - old Y
y =  np.int_(filtered_label == 'MEL', 'BCC', 'SCC')   #now True means healthy nevus, False means something else

'''

In [8]:
#Prepare cross-validation -
# GroupKFold makes sure patients with the same ID will not be split between the training and validation sets.

num_folds = 5
group_kfold = GroupKFold(n_splits=num_folds)
group_kfold.get_n_splits(x, y, patient_id)

5

In [10]:
# Our classifiers are defined here. We use K-NN

classifiers = [
    KNeighborsClassifier(1),
    KNeighborsClassifier(5)
]
num_classifiers = len(classifiers)  


In [12]:
# set up np arrays for the eventual accuracy- and F1-scores.
acc_val = np.empty([num_folds,num_classifiers])
f1_val = np.empty([num_folds, num_classifiers])

# Splits up our data into training and validation sets at a 80/20 ratio. The group_kfold does training across folds,
# with a default of 5 folds it will give us 5 outputs.
for i, (train_index, val_index) in enumerate(group_kfold.split(x, y, patient_id)):
    
    # x_train = 80%
    # y_train = truth for 80%
    # x_val = 20%
    # y_val = truth for 20%
    x_train = x[train_index,:]
    y_train = y[train_index]
    x_val = x[val_index,:]
    y_val = y[val_index]
    
    
    
    for j, clf in enumerate(classifiers): 
        # Train the classifier with the 80%.
        clf.fit(x_train, y_train)
        
        # Predict labels for validation data
        y_pred = clf.predict(x_val)
    
        # Evaluate accuracy score (mostly useless)
        accuracy = accuracy_score(y_val, y_pred)
        acc_val[i, j] = accuracy
        
        # Evaluate F1 score (Shows us the ratio of false positives and true negatives)
        f1 = f1_score(y_val, y_pred, average="macro")  # Use macro-average for multi-class classification (We have 4 classes -> the 3 skin cancers and neither of them.)
        f1_val[i, j] = f1
        
        # Print our scores - Classifier 1 is K-NN(1) and Classifier 2 is K-NN(5):
        print(f"F1 score (Fold {i + 1}, Classifier {j + 1}): {f1}")


F1 score (Fold 1, Classifier 1): 0.268974358974359
F1 score (Fold 1, Classifier 2): 0.23748612652608211
F1 score (Fold 2, Classifier 1): 0.21566635709310428
F1 score (Fold 2, Classifier 2): 0.23515016982696374
F1 score (Fold 3, Classifier 1): 0.24162550169805497
F1 score (Fold 3, Classifier 2): 0.2551558296877972
F1 score (Fold 4, Classifier 1): 0.25230366173872293
F1 score (Fold 4, Classifier 2): 0.23582995951417007
F1 score (Fold 5, Classifier 1): 0.25819911029298975
F1 score (Fold 5, Classifier 2): 0.23251046142612405


In [15]:
#Average over all folds
average_acc = np.mean(acc_val,axis=0) 
   
print('Classifier 1 average accuracy={:.3f} '.format(average_acc[0]))
print('Classifier 2 average accuracy={:.3f} '.format(average_acc[1]))

Classifier 1 average accuracy=0.425 
Classifier 2 average accuracy=0.482 


In [14]:
# FINAL PART - To be expanded on. Basically this is the data we will want to compare to the external pictures
# If we want to include all the pictures here, we no longer have any "true" set to compare our results to. 
# We should consider creating a 3-way split, so we have a final, unused picture set to work on our model.

# Let's say you now decided to use the 5-NN 
classifier = KNeighborsClassifier(n_neighbors = 5)

#It will be tested on external data, so we can try to maximize the use of our available data by training on 
#ALL of x and y
classifier = classifier.fit(x,y)

#This is the classifier you need to save using pickle, add this to your zip file submission
filename = 'groupXY_classifier.sav'
pickle.dump(classifier, open(filename, 'wb'))