In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import matplotlib.pyplot as plt
import time
from keras.preprocessing.image import ImageDataGenerator

In [None]:
# metadata_path = './skin-cancer-mnist-ham10000/HAM10000_metadata.csv'
# metadat = pd.read_csv(metadata_path)
# num_samples = len(metadat)
# metadat = metadat.sample(frac=1,random_state=12).reset_index(drop=True) #shuffling data

# #metadat = metadat['image_id'].add('.jpg')
# metadat.head()

In [None]:
from keras.applications.vgg19 import VGG19
from keras.preprocessing.image import ImageDataGenerator
from keras import optimizers
from keras.models import Sequential, Model 
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D
from keras import backend as k 
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping

# Spatial size every image is resized to before it reaches the network.
img_width = 224
img_height = 224

# Root folders handed to the directory-based generators.
train_data_dir = "./train"
val_data_dir = "./val"

# Number of images per generator batch.
batch_size = 1

# VGG19 without its classification head, loaded with ImageNet weights;
# it is used purely as a fixed feature extractor.
vgg = VGG19(
    include_top=False,
    weights="imagenet",
    input_shape=(img_width, img_height, 3),
)

In [None]:
# Build the train and validation generators.
# Only the training stream gets random augmentation (flip / zoom / shear).
train_datagen = ImageDataGenerator(
    rescale=1./255,
    horizontal_flip=True,
    zoom_range=0.2,
    shear_range=0.2,
)

# Validation images are only rescaled. Randomly distorting them would make
# the evaluation features (and every metric derived from them) noisy and
# non-reproducible, so no augmentation is applied here.
val_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode="categorical",
    shuffle=True,
    seed=12,
)

val_generator = val_datagen.flow_from_directory(
    val_data_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode="categorical",
    shuffle=True,
    seed=12,
)

In [None]:
# Accumulators for the extracted features and their one-hot labels.
# Each name gets its own, independent empty list.
train_feat, train_lbl = [], []
val_feat, val_lbl = [], []

In [None]:
# Start from empty lists so that re-running this cell does not append a
# second copy of every training sample.
train_feat = []
train_lbl = []

# One full pass over the training generator (len() is the number of batches).
for _ in range(len(train_generator)):
    batch = next(train_generator)
    images, labels = batch[0], batch[1]
    feature = vgg.predict(images)
    # One flattened feature vector per image. For batch_size == 1 this is
    # identical to feature.flatten(); for larger batches it keeps the samples
    # separate instead of fusing the whole batch into a single vector.
    train_feat.extend(feature.reshape(len(feature), -1))
    # Keep each label as a (1, n_classes) row so the stacked labels stay
    # 3-D, which is what the later np.argmax(..., axis=2) calls expect.
    train_lbl.extend(labels[:, np.newaxis, :])
# for i in range(len(train_generator)):
#     feature = vgg.predict(train_generator[i][0])
#     train_feat.append(feature.flatten())
#     train_lbl.append(train_generator[i][1])

In [None]:
# Start from empty lists so that re-running this cell does not append a
# second copy of every validation sample.
val_feat = []
val_lbl = []

# One full pass over the validation generator (len() is the number of batches).
for _ in range(len(val_generator)):
    batch = next(val_generator)
    images, labels = batch[0], batch[1]
    feature = vgg.predict(images)
    # One flattened feature vector per image. For batch_size == 1 this is
    # identical to feature.flatten(); for larger batches it keeps the samples
    # separate instead of fusing the whole batch into a single vector.
    val_feat.extend(feature.reshape(len(feature), -1))
    # Keep each label as a (1, n_classes) row so the stacked labels stay
    # 3-D, which is what the later np.argmax(..., axis=2) calls expect.
    val_lbl.extend(labels[:, np.newaxis, :])
# for i in range(len(val_generator)):
#     feature = vgg.predict(val_generator[i][0])
#     val_feat.append(feature.flatten())
#     val_lbl.append(val_generator[i][1])

Confusion Matrix

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix with true labels on the rows and predicted labels
        on the columns.
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool, default False
        If True, every row is divided by its sum before printing/plotting.
        Rows whose sum is zero are left as all zeros instead of becoming NaN.
    title : str, optional
        Plot title; a default depending on `normalize` is used when omitted.
    cmap : matplotlib colormap, default plt.cm.Blues
        Colormap passed to `imshow`.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; empty rows keep the
        # zeros from `out`, which avoids NaN cells and a NaN colour threshold.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # Show one tick per row/column and label the ticks with the class names.
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the x tick labels so long class names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Integer format only for integer matrices; 'd' raises ValueError on
    # floats, so float matrices (normalized or not) use two decimals.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

# Short class codes used as axis labels on the confusion-matrix plots.
# NOTE(review): presumably in the same order as the generators' class
# indices - verify against train_generator.class_indices.
class_names = np.array([
    'akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc',
])

# Print numpy floats with two decimals from here on.
np.set_printoptions(precision=2)

# # Plot non-normalized confusion matrix
# plot_confusion_matrix(cm, classes=class_names,
#                       title='Confusion matrix, without normalization')

KMEANS

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix

# One cluster per class.
n_classes = len(class_names)

kmeans = KMeans(n_clusters=n_classes, random_state=0).fit(train_feat)

# Labels were collected as a list of (1, n_classes) one-hot rows, so the
# class index sits on axis 2; ravel() yields a flat vector of class ids.
y_tr = np.argmax(train_lbl, axis=2).ravel()
yhat_tr = kmeans.predict(train_feat)

# count[true_class, cluster] = number of training samples in that pair.
count = np.zeros((n_classes, n_classes))
np.add.at(count, (y_tr, yhat_tr), 1)

# Index is the cluster id, value is the most frequent true class in it.
# Several clusters may map to the same class.
mapping = np.argmax(count, axis=0)

# Column-wise softmax of the counts, used to inspect runner-up classes when
# several clusters share the same majority class. The column maximum is
# subtracted first: mathematically a no-op, but it keeps np.exp from
# overflowing to inf (and the ratio from becoming NaN) for large counts.
shifted = count - count.max(axis=0, keepdims=True)
count1 = np.exp(shifted) / np.sum(np.exp(shifted), axis=0)
np.set_printoptions(precision=2)
# Manually picked one-to-one mapping after looking at the training
# predictions. Kept for reference; the scoring below uses `mapping`.
mapping1 = np.array([6, 5, 1, 4, 0, 2, 3])

kmeans_val_pred = kmeans.predict(val_feat)

y_val = np.argmax(val_lbl, axis=2).ravel()

# Translate cluster ids into class ids before comparing with the truth.
kmeans_val_lbl = mapping[kmeans_val_pred]
acc = float(np.mean(kmeans_val_lbl == y_val)) if len(y_val) else 0
print('accuracy = ' + str(acc))

# Build the matrix from the mapped class labels (not raw cluster ids) and
# pin the label set so rows/columns line up with `class_names`.
kmeans_cm = confusion_matrix(y_val, kmeans_val_lbl,
                             labels=np.arange(n_classes))
# Plot normalized confusion matrix
plot_confusion_matrix(kmeans_cm, classes=class_names, normalize=True,
                      title='K means confusion matrix')
plt.savefig('kmeans_conf_mat.png')
plt.show()

KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix  
# NOTE(review): k is 5 here, while the figure below is saved as
# '10nn_conf_mat.png'. The file name is kept unchanged in case something
# else refers to it - confirm which value was intended.
knn = KNeighborsClassifier(n_neighbors=5)

# y_tr / y_val come from the K-Means cell and may be (N, 1) columns;
# scikit-learn expects 1-D targets, so flatten them explicitly.
knn.fit(train_feat, np.ravel(y_tr))
y_val_flat = np.ravel(y_val)

yhat = knn.predict(val_feat)

# Pin the label set so the matrix always has one row/column per entry of
# `class_names`, even if a class is missing from the validation data.
class_ids = np.arange(len(class_names))
knn_cm = confusion_matrix(y_val_flat, yhat, labels=class_ids)
print(knn_cm)
# Accuracy = correctly classified samples (diagonal) / all samples.
knn_acc = np.sum(np.diag(knn_cm))/np.sum(knn_cm)
print('accuracy = ' + str(knn_acc))
print(classification_report(y_val_flat, yhat,
                            labels=class_ids, target_names=class_names))

# Plot normalized confusion matrix
plot_confusion_matrix(knn_cm, classes=class_names, normalize=True,
                      title='KNN confusion matrix')
plt.savefig('10nn_conf_mat.png')
plt.show()

SVM

In [None]:
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_class_weight,compute_sample_weight

# Labels were collected as a list of (1, n_classes) one-hot rows, so the
# class index sits on axis 2; ravel() yields a flat vector of class ids.
y_tr = np.argmax(train_lbl, axis=2).ravel()
# y_val comes from the K-Means cell and may be an (N, 1) column.
y_val_flat = np.ravel(y_val)

# Balanced per-class weights, kept for inspection only. `classes` and `y`
# are passed by keyword because current scikit-learn rejects them as
# positional arguments.
class_weights = compute_class_weight('balanced',
                                     classes=np.unique(y_tr), y=y_tr)

# class_weight='balanced' already re-weights every sample by its class
# frequency. Passing a balanced sample_weight to fit() as well would apply
# the same correction twice, so only class_weight is used.
svc = SVC(class_weight='balanced')
svc.fit(train_feat, y_tr)

yhat = svc.predict(val_feat)

# Pin the label set so the matrix always has one row/column per entry of
# `class_names`, even if a class is missing from the validation data.
class_ids = np.arange(len(class_names))
svc_cm = confusion_matrix(y_val_flat, yhat, labels=class_ids)
print(svc_cm)
# Accuracy = correctly classified samples (diagonal) / all samples.
svc_acc = np.sum(np.diag(svc_cm))/np.sum(svc_cm)
print('accuracy = ' + str(svc_acc))
print(classification_report(y_val_flat, yhat,
                            labels=class_ids, target_names=class_names))

# Plot normalized confusion matrix
plot_confusion_matrix(svc_cm, classes=class_names, normalize=True,
                      title='SVM confusion matrix')
plt.savefig('svc_conf_mat.png')
plt.show()