In [82]:
import os
import pandas as pd
import numpy as np

# Default packages for the minimum example
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score, f1_score


import pickle #for saving/loading trained classifiers


In [64]:
#Where are the files
file_data = '.' + os.sep + 'data' + os.sep +'metadata.csv'
df = pd.read_csv(file_data)
label = np.array(df['diagnostic'])

path_image = '.' + os.sep + 'data' + os.sep + 'images' + os.sep + 'imgs_part_1'

our_list = list(os.listdir(path_image))
filtered_data = df[df["img_id"].isin(our_list)]
filtered_label = np.array(filtered_data['diagnostic'])
image_id = list(filtered_data['img_id'])
filtered_data

Unnamed: 0,patient_id,lesion_id,smoke,drink,background_father,background_mother,age,pesticide,gender,skin_cancer_history,...,diameter_2,diagnostic,itch,grew,hurt,changed,bleed,elevation,img_id,biopsed
976,PAT_330,1440,False,False,ITALY,ITALY,72,False,MALE,False,...,4.0,SCC,True,UNK,False,UNK,False,True,PAT_330_1440_532.png,True
983,PAT_373,2598,False,False,BRAZIL,GERMANY,80,False,MALE,False,...,5.0,SEK,False,True,False,False,False,True,PAT_373_2598_514.png,False
984,PAT_1221,766,,,,,54,,,,...,,SEK,True,True,False,False,False,True,PAT_1221_766_430.png,False
988,PAT_150,1799,True,True,POMERANIA,POMERANIA,50,True,FEMALE,True,...,6.0,SCC,True,UNK,True,UNK,False,True,PAT_150_1799_644.png,True
989,PAT_192,296,False,False,ITALY,ITALY,74,True,MALE,False,...,10.0,SCC,True,True,False,True,False,False,PAT_192_296_824.png,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1185,PAT_901,1716,False,True,GERMANY,GERMANY,60,True,MALE,False,...,10.0,BCC,False,True,False,True,False,True,PAT_901_1716_497.png,True
1188,PAT_467,904,False,False,POMERANIA,POMERANIA,46,True,FEMALE,False,...,5.0,BCC,True,True,False,False,False,True,PAT_467_904_535.png,True
1342,PAT_270,417,False,False,POMERANIA,POMERANIA,90,False,FEMALE,False,...,7.0,MEL,False,True,False,True,False,False,PAT_270_417_257.png,True
1348,PAT_801,1518,False,False,ITALY,ITALY,43,False,FEMALE,False,...,8.0,MEL,False,True,False,False,False,False,PAT_801_1518_574.png,True


In [52]:
#Where did we store the features?
file_features = 'features/features.csv'
feature_names = ['file_name','asymmetry','color','blue-white_veil']

# Load the features - remember the example features are not informative
df_features = pd.read_csv(file_features)

In [53]:
df_features

Unnamed: 0,file_name,asymmetry,color,blue-white_veil
0,PAT_330_1440_532.png,0.336,2,0.160917
1,PAT_373_2598_514.png,0.192,3,0.004871
2,PAT_1221_766_430.png,0.259,2,0.000000
3,PAT_150_1799_644.png,0.519,3,0.000000
4,PAT_192_296_824.png,0.359,2,0.023872
...,...,...,...,...
119,PAT_901_1716_497.png,0.178,1,0.000000
120,PAT_467_904_535.png,0.335,3,0.001336
121,PAT_270_417_257.png,0.716,3,0.000000
122,PAT_801_1518_574.png,0.297,2,0.000000


In [70]:
# Make the dataset, you can select different classes (see task 0)
x = np.array(df_features[feature_names[1:]])
y =  np.int_(filtered_label == 'MEL')   #now True means healthy nevus, False means something else
patient_id = filtered_data['patient_id']


In [73]:
x

array([[3.36000000e-01, 2.00000000e+00, 1.60917031e-01],
       [1.92000000e-01, 3.00000000e+00, 4.87130930e-03],
       [2.59000000e-01, 2.00000000e+00, 0.00000000e+00],
       [5.19000000e-01, 3.00000000e+00, 0.00000000e+00],
       [3.59000000e-01, 2.00000000e+00, 2.38715591e-02],
       [1.27000000e-01, 3.00000000e+00, 1.19453319e-02],
       [1.50000000e-01, 2.00000000e+00, 0.00000000e+00],
       [4.77000000e-01, 2.00000000e+00, 1.33020752e-02],
       [8.55000000e-01, 2.00000000e+00, 0.00000000e+00],
       [1.17000000e-01, 2.00000000e+00, 0.00000000e+00],
       [1.48000000e-01, 0.00000000e+00, 2.40613176e-02],
       [3.58000000e-01, 3.00000000e+00, 0.00000000e+00],
       [4.30000000e-01, 2.00000000e+00, 1.58764785e-01],
       [9.96000000e-01, 2.00000000e+00, 0.00000000e+00],
       [1.26200000e+00, 1.00000000e+00, 0.00000000e+00],
       [1.30000000e-01, 1.00000000e+00, 0.00000000e+00],
       [6.21000000e-01, 4.00000000e+00, 0.00000000e+00],
       [6.84000000e-01, 3.00000

In [56]:
patient_id

976      PAT_330
983      PAT_373
984     PAT_1221
988      PAT_150
989      PAT_192
          ...   
1185     PAT_901
1188     PAT_467
1342     PAT_270
1348     PAT_801
1424     PAT_622
Name: patient_id, Length: 124, dtype: object

In [57]:
#Prepare cross-validation - images from the same patient must always stay together
num_folds = 5
group_kfold = GroupKFold(n_splits=num_folds)
group_kfold.get_n_splits(x, y, patient_id)


5

In [58]:
group_kfold

GroupKFold(n_splits=5)

In [59]:
#Different classifiers to test out
classifiers = [
    KNeighborsClassifier(1),
    KNeighborsClassifier(5)
]
num_classifiers = len(classifiers)     

In [85]:
acc_val = np.empty([num_folds,num_classifiers])
f1_val = np.empty([num_folds, num_classifiers])

In [109]:
for i, (train_index, val_index) in enumerate(group_kfold.split(x, y, patient_id)):
    
    x_train = x[train_index,:]
    y_train = y[train_index]
    x_val = x[val_index,:]
    y_val = y[val_index]
    
    
    for j, clf in enumerate(classifiers): 
        
        #Train the classifier
        clf.fit(x_train,y_train)
    
        #Evaluate your metric of choice (accuracy is probably not the best choice)
        acc_val[i,j] = accuracy_score(y_val, clf.predict(x_val))
        f1_val[i,j] = f1_score(y_val, clf.predict(x_val), average="binary")


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [75]:
#Average over all folds
average_acc = np.mean(acc_val,axis=0) 
   
print('Classifier 1 average accuracy={:.3f} '.format(average_acc[0]))
print('Classifier 2 average accuracy={:.3f} '.format(average_acc[1]))



Classifier 1 average accuracy=0.960 
Classifier 2 average accuracy=0.976 


In [76]:
#Let's say you now decided to use the 5-NN 
classifier = KNeighborsClassifier(n_neighbors = 5)

#It will be tested on external data, so we can try to maximize the use of our available data by training on 
#ALL of x and y
classifier = classifier.fit(x,y)

#This is the classifier you need to save using pickle, add this to your zip file submission
filename = 'groupXY_classifier.sav'
pickle.dump(classifier, open(filename, 'wb'))