1. Libraries & Loading the Data

In [13]:
# data science libraries
import pandas as pd
import cv2
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(1000)

# keras and tf
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import optimizers

# other tf imports
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# print validation statement
print("all resources loaded")

all resources loaded


In [14]:
df = pd.read_csv('data/HAM10000_metadata.csv')
df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [15]:
df['dx'].value_counts().to_dict()

{'nv': 6705,
 'mel': 1113,
 'bkl': 1099,
 'bcc': 514,
 'akiec': 327,
 'vasc': 142,
 'df': 115}

In [16]:
class_weight = {0: 1 / (df['dx'].value_counts().to_dict()['nv'] / df['dx'].count()),
    1: 1 / (df['dx'].value_counts().to_dict()['mel'] / df['dx'].count()),
    2: 1 / (df['dx'].value_counts().to_dict()['bkl'] / df['dx'].count()),
    3: 1 / (df['dx'].value_counts().to_dict()['bcc'] / df['dx'].count()),
    4: 1 / (df['dx'].value_counts().to_dict()['akiec'] / df['dx'].count()),
    5: 1 / (df['dx'].value_counts().to_dict()['vasc'] / df['dx'].count()),
    6: 1 / (df['dx'].value_counts().to_dict()['df'] / df['dx'].count())
}
class_weight

{0: 1.493661446681581,
 1: 8.998203054806828,
 2: 9.112829845313922,
 3: 19.48443579766537,
 4: 30.62691131498471,
 5: 70.52816901408451,
 6: 87.08695652173913}

In [17]:
sum = 0
for i in class_weight:
    sum += class_weight[i]
sum

for weight in class_weight:
    class_weight[weight] /= sum
class_weight

{0: 0.0065704208816761995,
 1: 0.03958191555403316,
 2: 0.04008614377765142,
 3: 0.08570947862186559,
 4: 0.13472376761969088,
 5: 0.31024416909604874,
 6: 0.383084104449034}

In [18]:
(df['dx'].count() - df['dx'].value_counts().to_dict()['df']) / df['dx'].count()

0.9885172241637543

In [19]:
df['image_id'] = df['image_id'].apply(lambda x: '{}.jpg'.format(x))

In [20]:
df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419.jpg,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030.jpg,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769.jpg,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661.jpg,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633.jpg,bkl,histo,75.0,male,ear


In [21]:
from sklearn.utils import shuffle
df_shuffled = shuffle(df)

In [22]:
datagen = ImageDataGenerator(
    rescale=1.0/255.0,
    horizontal_flip = True,
    vertical_flip = True,
    width_shift_range = 0.1,
    height_shift_range = 0.1,
    channel_shift_range = 0,
    zoom_range = 0.2,
    rotation_range = 20,
    validation_split=0.2
)

In [23]:
# datagen.fit()

In [24]:
# the different dataframes created with the ImageDatagenerator
train_df = datagen.flow_from_dataframe(
    df_shuffled,
    directory = 'data/HAM10000_images/',
    x_col = 'image_id',
    y_col = 'dx',
    subset='training',
    class_mode='sparse',
    classes=['nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df']
)
val_df = datagen.flow_from_dataframe(
    df_shuffled,
    directory = 'data/HAM10000_images/',
    x_col = 'image_id',
    y_col = 'dx',
    subset='validation',
    class_mode='sparse',
    classes=['nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df']
)

Found 8012 validated image filenames belonging to 7 classes.
Found 2003 validated image filenames belonging to 7 classes.


In [25]:
# calculating weights
weight = [0,0,0,0,0,0,0]
for x, y in train_df:
    for i in y:
        i = i.astype(int)
        weight[i] = weight[i] + 1
weight

KeyboardInterrupt: 

In [None]:
for x, y in train_df:
    print(x.shape)
    print(y[0])
    break

for x, y in val_df:
    print(x.shape)
    print(y[0])
    break

(32, 256, 256, 3)
5.0
(32, 256, 256, 3)
5.0


In [None]:
class CNN(tf.keras.Model):
    def __init__(self, out=7):
        # the model variables
        super().__init__()
        self.conv1 = Conv2D(64, kernel_size=3, activation='relu')
        self.maxpool1 = MaxPool2D(pool_size=(2,2))
        self.dropout1 = Dropout(0.1)
        self.conv2 = Conv2D(64, kernel_size=3, activation='relu')
        self.maxpool2 = MaxPool2D(pool_size=(2,2))
        self.dropout2 = Dropout(0.1)
        self.conv3 = Conv2D(64, kernel_size=3, activation='relu')
        self.maxpool3 = MaxPool2D(pool_size=(2,2))
        self.dropout3 = Dropout(0.1)

        # for the dnn
        self.flatten = Flatten()
        self.dense1 = Dense(512, activation='relu')
        self.dense2 = Dense(128, activation='relu')
        self.dense3 = Dense(out, activation='softmax')

    def call(self, x):
        x = self.conv1(x)
        x = self.maxpool1(x)
        x = self.dropout1(x)
        x = self.conv2(x)
        x = self.maxpool2(x)
        x = self.dropout2(x)
        x = self.conv3(x)
        x = self.maxpool3(x)
        x = self.dropout3(x)
        x = self.flatten(x)
        x = self.dense1(x)
        x = self.dense2(x)
        x = self.dense3(x)
        return x

    # def model(self):
    #     x = Input(shape=(256, 256, 3))
    #     return Model([x], self.call(x))

In [None]:
# creating model
model = CNN(7)
# model.build(input_shape=[None, 256, 256, 3])
# model.summary()

In [None]:
from tensorflow.keras.optimizers import SGD, Adam

# adam = optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-7)
optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
# sgd = SGD(lr = 0.01)
model.compile(
    optimizer=optimizer,
    # loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

  super(SGD, self).__init__(name, **kwargs)


In [48]:
model.fit(
    train_df,
    validation_data=val_df,
    epochs=5,
    verbose=1,
    class_weight=class_weight
)

Epoch 1/5


  return dispatch_target(*args, **kwargs)




KeyboardInterrupt: 