In [87]:
import os
import pandas as pd
import numpy as np

# Default packages for the minimum example
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score


import pickle #for saving/loading trained classifiers


In [56]:
# Locate and read the lesion metadata table
file_data = os.path.join('.', 'data', 'metadata.csv')
df = pd.read_csv(file_data)
label = np.array(df['diagnostic'])

# Folder that holds the raw images
path_image = os.path.join('.', 'data', 'images', 'imgs_part_1')

# Location and column layout of the precomputed feature table
file_features = 'features/features.csv'
feature_names = ['file_name', 'asymmetry', 'color', 'blue-white_veil']

# Read the features - remember the example features are not informative
df_features = pd.read_csv(file_features)

# Keep only the metadata rows whose image also appears in the feature table
our_list = df_features["file_name"].tolist()
has_features = df["img_id"].isin(our_list)
filtered_data = df[has_features]
filtered_label = np.array(filtered_data['diagnostic'])
image_id = filtered_data['img_id'].tolist()
filtered_data

Unnamed: 0,patient_id,lesion_id,smoke,drink,background_father,background_mother,age,pesticide,gender,skin_cancer_history,...,diameter_2,diagnostic,itch,grew,hurt,changed,bleed,elevation,img_id,biopsed
0,PAT_1516,1765,,,,,8,,,,...,,NEV,False,False,False,False,False,False,PAT_1516_1765_530.png,False
1,PAT_46,881,False,False,POMERANIA,POMERANIA,55,False,FEMALE,True,...,5.0,BCC,True,True,False,True,True,True,PAT_46_881_939.png,True
2,PAT_1545,1867,,,,,77,,,,...,,ACK,True,False,False,False,False,False,PAT_1545_1867_547.png,False
3,PAT_1989,4061,,,,,75,,,,...,,ACK,True,False,False,False,False,False,PAT_1989_4061_934.png,False
4,PAT_684,1302,False,True,POMERANIA,POMERANIA,79,False,MALE,True,...,5.0,BCC,True,True,False,False,True,True,PAT_684_1302_588.png,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1152,PAT_1761,3329,,,,,70,,,,...,,SEK,False,False,False,False,False,False,PAT_1761_3329_837.png,False
1153,PAT_564,1079,True,False,BRAZIL,BRAZIL,73,False,FEMALE,False,...,8.0,BCC,True,True,False,True,True,True,PAT_564_1079_430.png,True
1154,PAT_747,1409,False,False,UNK,BRAZIL,55,False,MALE,True,...,10.0,ACK,True,True,False,False,True,True,PAT_747_1409_116.png,True
1155,PAT_566,179,False,False,ITALY,BRAZIL,54,False,MALE,False,...,3.0,ACK,True,False,False,False,False,False,PAT_566_179_23.png,False


In [112]:
# Make the dataset, considering all three types of cancerous skin lesions.
#
# Features come from df_features (keyed by file_name) while labels and patient
# ids come from filtered_data (keyed by img_id). Join the two tables on the
# image name so that row k of x, y and patient_id always describes the same
# image; slicing the tables independently is only correct when both CSVs
# happen to list the images in exactly the same order.
aligned = df_features[feature_names].merge(
    filtered_data[['img_id', 'patient_id', 'diagnostic']],
    left_on='file_name',
    right_on='img_id',
    how='inner',  # inner join preserves the row order of df_features
)

x = np.array(aligned[feature_names[1:]])
aligned_label = np.array(aligned['diagnostic'])

# Multi-class target: 0 = any other diagnosis, 1 = BCC, 2 = SCC, 3 = MEL
y = np.zeros(len(aligned_label))  # Initialize the labels array with zeros
y[aligned_label == 'BCC'] = 1     # Set BCC samples to 1
y[aligned_label == 'SCC'] = 2     # Set SCC samples to 2
y[aligned_label == 'MEL'] = 3     # Set MEL samples to 3

# Grouping key for patient-wise cross-validation (same row order as x and y)
patient_id = aligned['patient_id']


array([0., 1., 0., ..., 0., 0., 3.])

In [128]:
# Cross-validation setup: folds are formed per patient, so every image of a
# given patient ends up in the same fold
num_folds = 5
group_kfold = GroupKFold(n_splits=num_folds)

# Report how many splits the splitter will produce for this data
group_kfold.get_n_splits(X=x, y=y, groups=patient_id)


5

In [129]:
# Echo the splitter so its configuration shows up in the cell output
group_kfold

GroupKFold(n_splits=5)

In [130]:
# Candidate models: one k-nearest-neighbours classifier per value of k
classifiers = [KNeighborsClassifier(n_neighbors) for n_neighbors in (1, 5)]
num_classifiers = len(classifiers)

In [131]:
# Score tables: one row per fold, one column per classifier.
# The storage is left uninitialised; each entry is expected to be written
# during cross-validation before it is read.
scores_shape = (num_folds, num_classifiers)
acc_val = np.empty(scores_shape)
f1_val = np.empty(scores_shape)

In [133]:
# Cross-validated evaluation: every classifier is fitted and scored on each fold
fold_splits = group_kfold.split(x, y, patient_id)
for i, (train_index, val_index) in enumerate(fold_splits):
    # Slice out this fold's training and validation partitions
    x_train, y_train = x[train_index], y[train_index]
    x_val, y_val = x[val_index], y[val_index]

    for j, clf in enumerate(classifiers):
        # Fit on the training part, then predict labels for the held-out part
        y_pred = clf.fit(x_train, y_train).predict(x_val)

        # Record accuracy for this (fold, classifier) pair
        accuracy = accuracy_score(y_val, y_pred)
        acc_val[i, j] = accuracy

        # Record the macro-averaged F1 score (multi-class target)
        f1 = f1_score(y_val, y_pred, average="macro")
        f1_val[i, j] = f1

        print(f"F1 score (Fold {i + 1}, Classifier {j + 1}): {f1}")

# Per-classifier averages over the folds (axis 0 indexes the folds)
average_accuracy = acc_val.mean(axis=0)
average_f1 = f1_val.mean(axis=0)

print('Average accuracy for all classifiers:', average_accuracy)
print('Average F1 score for all classifiers:', average_f1)


F1 score (Fold 1, Classifier 1): 0.266312166580623
F1 score (Fold 1, Classifier 2): 0.23379281537176277
F1 score (Fold 2, Classifier 1): 0.21566635709310428
F1 score (Fold 2, Classifier 2): 0.23515016982696374
F1 score (Fold 3, Classifier 1): 0.24162550169805497
F1 score (Fold 3, Classifier 2): 0.2551558296877972
F1 score (Fold 4, Classifier 1): 0.2470047327284867
F1 score (Fold 4, Classifier 2): 0.23582995951417007
F1 score (Fold 5, Classifier 1): 0.2557528671846097
F1 score (Fold 5, Classifier 2): 0.23599910394265233
Average accuracy for all classifiers: [0.42089202 0.48209319]
Average F1 score for all classifiers: [0.24527233 0.23918558]


In [136]:
# Let's say you now decided to use the 5-NN
classifier = KNeighborsClassifier(n_neighbors=5)

# It will be tested on external data, so we can try to maximize the use of our
# available data by training on ALL of x and y
classifier = classifier.fit(x, y)

# This is the classifier you need to save using pickle, add this to your zip file submission.
# The context manager closes the file handle as soon as the dump finishes, so
# the pickle is fully flushed to disk and no descriptor is left open.
filename = 'groupXY_classifier.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [138]:
# Echo the fitted estimator so its configuration shows up in the cell output
classifier