This is a starter attempt at basic data exploration and a basic convolutional neural network architecture for our problem.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from tqdm import tqdm
import glob
import cv2
from numpy.random import seed
seed(42)
from tensorflow import set_random_seed
set_random_seed(42)

In [None]:
# Load the two metadata tables: image ids for both sets, plus the diagnosis label for train.
train, test = (
    pd.read_csv(csv_file)
    for csv_file in ('../input/train.csv', '../input/test.csv')
)

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Sanity check: number of files in the train image folder vs. number of rows in train.csv.
train_images = os.listdir('../input/train_images/')
print(f"{len(train_images)} {len(train)}")

In [None]:
# Sanity check: number of files in the test image folder vs. number of rows in test.csv.
test_images = os.listdir('../input/test_images/')
print(f"{len(test_images)} {len(test)}")

In [None]:
# Image folders and the id columns of both tables.
train_path = '../input/train_images/'
test_path = '../input/test_images/'

train_ids = train['id_code'].values
test_ids = test['id_code'].values

# Every id maps to "<folder>/<id>.png"; store the full path next to the id.
train_paths = np.array(
    [os.path.join(train_path, train_id + '.png') for train_id in train_ids]
)
train['path'] = train_paths

In [None]:
# Confirm the new 'path' column on the training table.
train.head()

In [None]:
# Same "<folder>/<id>.png" mapping for the test table.
test_paths = np.array(
    [os.path.join(test_path, test_id + '.png') for test_id in test_ids]
)
test['path'] = test_paths

In [None]:
# Confirm the new 'path' column on the test table.
test.head()

The preprocessing function below crops each image using an approximate estimate of the bright region's extent, so that the surrounding black pixels along the width and height can be removed.

In [None]:
def _bright_extent(line, threshold):
    """Return (first, last) index along axis 0 of `line` whose summed value exceeds `threshold`."""
    line = np.asarray(line)
    length = line.shape[0]
    if length == 0:
        # Nothing to scan; mirror the "start = 0, end = len - 1" convention.
        return 0, -1
    # Collapse everything but the first axis (e.g. colour channels) into one sum
    # per position. reshape(length, -1) also lets 1-D (single-channel) lines through.
    sums = line.reshape(length, -1).sum(axis=1)
    hits = np.flatnonzero(sums > threshold)
    if hits.size == 0:
        # No position is brighter than the threshold (e.g. an all-black line):
        # fall back to the full extent instead of scanning past the array bounds.
        return 0, length - 1
    return int(hits[0]), int(hits[-1])


def find_radius(mid_pixels,mid_y_pixels,threshold_x,threshold_y):
    """Find the bright extent of an image along its middle row and middle column.

    Every position of a scan line is reduced to the sum of its values, and the
    first and last positions whose sum is strictly greater than the threshold
    are reported.

    Args:
        mid_pixels: Scan line whose first axis runs along x.
        mid_y_pixels: Scan line whose first axis runs along y.
        threshold_x: Cut-off applied to the sums of `mid_pixels`.
        threshold_y: Cut-off applied to the sums of `mid_y_pixels`.

    Returns:
        Tuple ``(start_x, end_x, start_y, end_y)`` of plain ints. The end
        indices are inclusive. When no position of a line exceeds its
        threshold, that line's full extent ``(0, len - 1)`` is returned.
    """
    start_x, end_x = _bright_extent(mid_pixels, threshold_x)
    start_y, end_y = _bright_extent(mid_y_pixels, threshold_y)
    return start_x,end_x,start_y,end_y
    
    
    
def preprocess_image(img):
    """Crop an image to its bright region and scale the crop back to the input size.

    The middle row and middle column are each thresholded at their own mean;
    `find_radius` turns them into crop bounds, and the crop is resized so the
    output has the same height and width as the input.

    Args:
        img: Image array indexed as ``img[row, column, ...]``.

    Returns:
        The cropped image resized to the input's height and width, or `img`
        unchanged when the detected bounds describe an empty crop.
    """
    height, width = img.shape[:2]
    # Row index comes from the height, column index from the width. The previous
    # single `shape[1] // 2` was only correct for square images.
    mid_pixels = img[height // 2, :]
    mid_y_pixels = img[:, width // 2]
    threshold_x = np.mean(mid_pixels)
    threshold_y = np.mean(mid_y_pixels)
    startx,endx,starty,endy = find_radius(mid_pixels,mid_y_pixels,threshold_x,threshold_y)
    if endx <= startx or endy <= starty:
        # An empty slice would make cv2.resize raise; keep the image as it is.
        return img
    # cv2.resize takes dsize as (width, height), i.e. (shape[1], shape[0]).
    # NOTE: the end indices are used as exclusive slice bounds, as before.
    return cv2.resize(img[starty:endy,startx:endx],(width,height))
    

In [None]:
# One training image, before (left) and after (right) the crop.
fig1, axs1 = plt.subplots(1, 2)
img = cv2.cvtColor(
    cv2.resize(cv2.imread('../input/train_images/9e2ba2b979f1.png'), (150, 150)),
    cv2.COLOR_BGR2RGB,
)
axs1[0].imshow(img)

img = preprocess_image(img)
axs1[1].imshow(img)


In [None]:
# Three training images, one per row: original on the left, cropped on the right.
fig, axs = plt.subplots(3, 2, figsize=(8, 12))
train_paths = train['path'].values
for (ax_before, ax_after), path in zip(axs, train_paths[150:153]):
    img = cv2.cvtColor(cv2.resize(cv2.imread(path), (150, 150)), cv2.COLOR_BGR2RGB)
    ax_before.imshow(img)
    ax_before.set_title('Original')

    img = preprocess_image(img)
    print (img.shape,np.amin(img),np.amax(img))
    ax_after.imshow(img)
    ax_after.set_title('After Preprocessing')

fig.suptitle('Some Train images',fontsize=15)

In [None]:
# First three test images, shown without the crop.
fig, axs = plt.subplots(1, 3, figsize=(10, 4))
test_paths = test['path'].values
for ax, path in zip(axs, test_paths[:3]):
    img = cv2.cvtColor(cv2.resize(cv2.imread(path), (150, 150)), cv2.COLOR_BGR2RGB)
    print (img.shape,np.amin(img),np.amax(img))
    ax.imshow(img)

fig.suptitle('Some Test images',fontsize=15)

The image distributions of the train and test images are different.
The test images appear to be zoomed-in views, while the train images also contain noise in the form of black background in the corners.

Now we will visualize the class distributions in train set.

In [None]:
# Number of training images per diagnosis class.
train['diagnosis'].value_counts().plot(kind='bar')

In [None]:
# Label array for the whole training table (used for class weights and stratification).
y_train = train.loc[:, 'diagnosis'].values

Compute class weights to penalize misclassification of the less frequent classes more heavily.

In [None]:
from collections import Counter
def get_class_weights(y):
    """Map every label in `y` to ``majority_count / label_count``, rounded to 2 decimals.

    The most frequent label gets weight 1.0; rarer labels get proportionally
    larger weights.
    """
    label_counts = Counter(y)
    largest = max(label_counts.values())
    weights = {}
    for label, n_samples in label_counts.items():
        weights[label] = round(float(largest) / float(n_samples), 2)
    return weights

# Per-class weights computed from the labels of the full training table; displayed for inspection.
class_weights = get_class_weights(y_train)
class_weights

Splitting the training set into training and validation sets

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 hold-out split, stratified on the diagnosis labels, with a fixed seed.
train_df, validation_df = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)
print(f"{len(train_df)} {len(validation_df)}")

Confirming the split is indeed a stratified one!

In [None]:
# Class counts in the training part of the split.
train_df['diagnosis'].value_counts().plot(kind='bar')

In [None]:
# Class counts in the validation part of the split.
validation_df['diagnosis'].value_counts().plot(kind='bar')

In [None]:
from keras.models import Sequential
from keras.layers import Conv2D,MaxPooling2D,Dropout,Activation,Dense,Flatten,BatchNormalization,GlobalAveragePooling2D
from keras.preprocessing.image import ImageDataGenerator


In [None]:
# Training-time augmentation: random zoom and flips, the border crop, and 1/255 scaling.
datagen = ImageDataGenerator(
    preprocessing_function=preprocess_image,
    rescale=1./255,
    zoom_range=0.4,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='constant',
)

In [None]:
# Preview the training part of the split.
train_df.head()

In [None]:
# File name column ("<id_code>.png") read by flow_from_dataframe.
# train_df comes out of train_test_split as a slice of `train`, so assigning a
# column in place triggers pandas' SettingWithCopyWarning; assign() builds a new frame.
train_df = train_df.assign(id=train_df['id_code'].astype(str) + '.png')

In [None]:
# The generator is used with class_mode="categorical", so labels are turned into strings.
# assign() returns a new frame rather than writing into the train_test_split slice,
# which avoids pandas' SettingWithCopyWarning.
train_df = train_df.assign(diagnosis=train_df['diagnosis'].astype(str))


In [None]:
# Confirm the 'id' column and the string-typed 'diagnosis' column.
train_df.head()

In [None]:
# Stream augmented 150x150 RGB batches of 32 from the training frame.
train_generator = datagen.flow_from_dataframe(
    dataframe=train_df,
    directory="../input/train_images/",
    x_col="id",
    y_col="diagnosis",
    target_size=(150, 150),
    color_mode='rgb',
    class_mode="categorical",
    batch_size=32,
    shuffle=True,
    seed=42,
)

In [None]:
# Mapping from label string to the class index used by the generator.
train_generator.class_indices

In [None]:
# Pull one batch (images, labels) from the generator for a visual sanity check.
disp_x,disp_y = next(train_generator)

In [None]:
# Show the fourth image of the sampled batch.
plt.imshow(disp_x[3])

In [None]:
# Labels of the first five samples of the sampled batch.
disp_y[:5]

In [None]:
# Preview the validation part of the split.
validation_df.head()

In [None]:
# Load the validation images into memory as 150x150 RGB arrays.
validation_x = []
for path in tqdm(validation_df['path'].values):
    img = cv2.resize(cv2.imread(path),(150,150))
    img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
    # Training batches are cropped by preprocess_image (it is the generator's
    # preprocessing_function), so validation images must get the same crop.
    # Otherwise the model is scored on inputs it was never trained on.
    img = preprocess_image(img)
    validation_x.append(img)
    
   

In [None]:
# Stack into one float32 array and scale by 1/255, matching the generator's rescale.
validation_x = np.asarray(validation_x, dtype=np.float32) / 255.0
print(validation_x.shape)
print(np.amin(validation_x), np.amax(validation_x))

In [None]:
from keras.utils import to_categorical
# One-hot encode the validation labels over the 5 diagnosis classes.
validation_y = to_categorical(validation_df['diagnosis'].values, num_classes=5)
print(validation_y.shape)
print(validation_y[:5])

In [None]:
# Four identical convolution stages with a doubling filter count, followed by
# global average pooling and a small dense classifier head with 5 softmax outputs.
model = Sequential()
for stage, n_filters in enumerate((32, 64, 128, 256)):
    # Only the very first layer declares the input shape.
    first_layer_kwargs = {'input_shape': (150, 150, 3)} if stage == 0 else {}
    model.add(Conv2D(n_filters, (3, 3), **first_layer_kwargs))
    model.add(Activation('relu'))
    model.add(MaxPooling2D((2, 2)))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))

model.add(GlobalAveragePooling2D())

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(5, activation='softmax'))
model.summary()


In [None]:
from sklearn.metrics import cohen_kappa_score
from keras.callbacks import Callback
class Metrics(Callback):
    """Keras callback that tracks quadratic weighted kappa on the validation data.

    After every epoch the model predicts the validation images, the kappa score
    is appended to `val_kappas`, and the model is saved whenever the score beats
    every previous epoch.

    Args:
        validation_data: Optional ``(X_val, y_val)`` pair with one-hot labels.
            When omitted, the callback reads its own ``validation_data``
            attribute, as before (this relies on Keras populating that
            attribute, which not every Keras version does).
        save_path: File the best model is written to.
    """

    def __init__(self, validation_data=None, save_path='model.h5'):
        super().__init__()
        # Stored under a separate name so the attribute Keras manages is not overwritten.
        self._explicit_validation_data = validation_data
        self.save_path = save_path
        self.val_kappas = []

    def on_train_begin(self, logs=None):
        # Start every training run with an empty history.
        self.val_kappas = []

    def on_epoch_end(self, epoch, logs=None):
        data = self._explicit_validation_data
        if data is None:
            data = getattr(self, 'validation_data', None)
        if data is None:
            raise ValueError(
                "Metrics callback has no validation data; pass validation_data=(X_val, y_val)."
            )
        X_val, y_val = data[0], data[1]

        # One-hot labels and softmax outputs are both reduced to class indices.
        y_val = np.argmax(y_val,axis=1)
        y_pred = np.argmax(self.model.predict(X_val),axis=1)

        _val_kappa = cohen_kappa_score(
            y_val,
            y_pred,
            weights='quadratic'
        )

        # Best score of the earlier epochs; None during the first epoch.
        previous_best = max(self.val_kappas, default=None)
        self.val_kappas.append(_val_kappa)

        print(f"val_kappa: {_val_kappa:.4f}")

        # Strict comparison: a tie with the best score is not an improvement.
        if previous_best is None or _val_kappa > previous_best:
            print("Validation Kappa has improved. Saving model.")
            self.model.save(self.save_path)

In [None]:
# Multi-class cross-entropy with Adam; accuracy is tracked as an additional metric.
model.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
from keras.callbacks import ModelCheckpoint
# Kappa-tracking callback, kept in its own variable so its val_kappas history can be read after training.
metric = Metrics()
callback = [metric]

In [None]:
# Train for 20 epochs with the augmented generator and the in-memory validation set.
# steps_per_epoch must be a whole number of batches: len(train_generator) is the
# number of batches per pass, whereas len(train_df)/32 produced a float.
history = model.fit_generator(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=20,
    validation_data=(validation_x, validation_y),
    callbacks=callback,
    class_weight=class_weights,
    verbose=1,
)

In [None]:
# Training curves: accuracy, loss, and the per-epoch validation kappa.
fig, axs = plt.subplots(1,3,figsize=(12,8))

# Keras logs accuracy as 'acc' in some versions and 'accuracy' in others;
# use whichever key this run actually produced.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

for ax, key, label in zip(axs[:2], (acc_key, 'loss'), ('Accuracy', 'Loss')):
    ax.plot(history.history[key])
    ax.plot(history.history['val_' + key])
    ax.set_xlabel('Epoch')
    ax.set_ylabel(label)
    ax.legend(['train','validation'],loc='upper left')
    ax.set_title(f'Train and validation {label.lower()}')

axs[2].plot(metric.val_kappas)
axs[2].set_ylabel('Quadratic weighted kappa')
axs[2].set_xlabel('Epochs')
axs[2].set_title('Validation data weighted cohen kappa scores')

In [None]:
from keras.models import load_model
# Reload the checkpoint saved by the kappa callback and score it on the validation set.
model = load_model('model.h5')
model.evaluate(x=validation_x, y=validation_y)

In [None]:
# Preview the test table before loading its images.
test.head()

In [None]:
# Load the test images into memory as 150x150 RGB arrays.
test_x = []
for path in tqdm(test['path'].values):
    img = cv2.resize(cv2.imread(path),(150,150))
    img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
    # The model was trained on images cropped by preprocess_image (the generator's
    # preprocessing_function), so test images need the same crop before prediction.
    img = preprocess_image(img)
    test_x.append(img)

# Stack into one float32 array and scale by 1/255, matching the generator's rescale.
test_x = np.array(test_x)
test_x = test_x.astype(np.float32)/255.0
print (test_x.shape)
print (np.amin(test_x),np.amax(test_x))

In [None]:
# Class probabilities for every test image, reduced to the most likely class index.
test_y = model.predict(test_x).argmax(axis=1)

In [None]:
# Number of test images predicted for each class.
Counter(test_y)

In [None]:
# Load the sample submission file; it serves as the template for the final output.
output = pd.read_csv('../input/sample_submission.csv')
output.head()

In [None]:
# Predictions are written positionally.
# NOTE(review): this assumes sample_submission.csv lists the images in the same
# order as test.csv — confirm before relying on the result.
output['diagnosis'] = test_y
output.head()

In [None]:
# Write the submission file without the DataFrame index column.
output.to_csv('submission.csv',index=False)