In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns


from sklearn.preprocessing import OneHotEncoder, StandardScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Conv2D, SeparableConv2D, MaxPooling2D, GlobalAveragePooling2D, \
BatchNormalization, Dropout, Input, Flatten, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.metrics import AUC
from tensorflow.keras.callbacks import LearningRateScheduler

# from keras_preprocessing.image import ImageDataGenerator

!pip install -q efficientnet

import efficientnet.tfkeras as efn

In [None]:
# Root of the competition data; every other input path below is derived from it.
WORKING_PATH = "/kaggle/input/siim-isic-melanoma-classification"
# The image directories keep their trailing slash because image paths are built
# by plain string concatenation (directory + image name + ".jpg").
TRAIN_DIR = WORKING_PATH + "/jpeg/train/"
TEST_DIR = WORKING_PATH + "/jpeg/test/"
# CSV metadata for the train and test splits.
TRAIN_DF_PATH = WORKING_PATH + "/train.csv"
TEST_DF_PATH = WORKING_PATH + "/test.csv"


# Training hyperparameters.
BATCH_SIZE = 32
EPOCHS = 3
# Target size passed to tf.image.resize; also defines the model's image input shape.
IMAGE_SIZE = (256, 256)

## Data preprocessing

We apply the following cleaning and preprocessing steps to the data:

**Imputation** 

A quick EDA showed that some fields contain missing values. I decided to fill each of them with the mode of its respective field.

**Standard Scaling**

The `age_approx` variable is standardized with a `StandardScaler` fitted on the training data.

**One-Hot Encoding**

The categorical variables `sex` and `anatom_site_general_challenge` are one-hot encoded.



In [None]:
df_train = pd.read_csv(TRAIN_DF_PATH)

# All preprocessing statistics are computed ONCE, on the raw training metadata,
# before `df_train` is overwritten with its preprocessed version below. Reading
# them lazily from `df_train` inside `preprocess_data` would pick up the already
# scaled ages when the test set is processed later.
SS = StandardScaler()
SS.fit(df_train[["age_approx"]])
AGE_APPROX_MODE = df_train["age_approx"].mode().iloc[0]

# Tabular model inputs, in the exact column order fed to the network.
FEATURE_COLUMNS = ['sex_female',
 'sex_male',
 'anatom_site_general_challenge_head/neck',
 'anatom_site_general_challenge_lower extremity',
 'anatom_site_general_challenge_oral/genital',
 'anatom_site_general_challenge_palms/soles',
 'anatom_site_general_challenge_torso',
 'anatom_site_general_challenge_upper extremity',
 'age_approx']


def preprocess_data(df, training=True):
    """Impute, scale and one-hot encode a raw metadata frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw metadata with at least the columns ``image_name``, ``sex``,
        ``age_approx`` and ``anatom_site_general_challenge``.
    training : bool, default True
        Selects the image directory used to build ``file_path``:
        ``TRAIN_DIR`` when True, ``TEST_DIR`` otherwise.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) that additionally contains
        ``file_path`` and every column listed in ``FEATURE_COLUMNS`` as float32.
    """
    # Imputation. The fill values for the two categorical fields are hard-coded
    # (presumably the training-set modes -- verify against the EDA); the age fill
    # value is the mode of the raw, unscaled training ages.
    df = df.fillna({"sex": "male",
                    "age_approx": AGE_APPROX_MODE,
                    "anatom_site_general_challenge": "torso"})

    # Full path of each image file.
    DIR = TRAIN_DIR if training else TEST_DIR
    df["file_path"] = DIR + df["image_name"] + ".jpg"

    # Scale the age variable with the scaler fitted on the training data.
    # `transform` returns an (n, 1) array; flatten it to a plain column.
    df["age_approx"] = SS.transform(df[["age_approx"]]).ravel()

    # One hot encoding for sex and anatom_site_general_challenge
    df = pd.get_dummies(df, columns=["sex", "anatom_site_general_challenge"])

    # `get_dummies` only creates columns for categories present in `df`, so a
    # split that lacks a category would be missing a model input. Add it as zeros.
    for column in FEATURE_COLUMNS:
        if column not in df.columns:
            df[column] = 0

    # Give every feature the same numeric dtype; recent pandas versions emit
    # boolean dummies, which would otherwise mix with the float age column.
    df = df.astype({column: "float32" for column in FEATURE_COLUMNS})

    return df


df_train = preprocess_data(df_train)

In [None]:
# Load the raw test metadata.
df_test = pd.read_csv(TEST_DF_PATH)

# Apply the same preprocessing as for the training data; training=False makes
# the generated file paths point at TEST_DIR instead of TRAIN_DIR.
df_test = preprocess_data(df_test, training=False)

## TF datasets

This is my first experience with these, so I'm still learning. The code below is from [this kernel](https://www.kaggle.com/reighns/groupkfold-efficientbnet-and-augmentations/#Splitting-the-dataset-according-to-GroupKFold). It's a great kernel, so make sure to check it out.

In [None]:
def decode_image(file_path, label=None):
    """Read a JPEG file and turn it into a resized float32 image tensor.

    The file is decoded with 3 channels, cast to float32, divided by 255 and
    resized to ``IMAGE_SIZE``.

    ``label`` is passed through untouched. When this function is mapped over a
    dataset of ``(file_path, features)`` pairs, the second element arrives here
    as ``label``.

    Returns the image alone when ``label`` is None, otherwise ``(image, label)``.
    """
    raw_bytes = tf.io.read_file(file_path)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    pixels = tf.image.resize(tf.cast(pixels, tf.float32) / 255.0, size = IMAGE_SIZE)

    if label is not None:
        return pixels, label
    return pixels
    
def data_augment(image, label=None):
    """Apply random flips and a random contrast change to an image.

    The image is randomly flipped left/right, then up/down, and finally its
    contrast is adjusted by a random factor drawn between 1 and 2.

    ``label`` is passed through untouched. Returns the augmented image alone
    when ``label`` is None, otherwise ``(augmented_image, label)``.
    """
    augmented = image
    # Same order as before: horizontal flip first, then vertical flip.
    for random_flip in (tf.image.random_flip_left_right, tf.image.random_flip_up_down):
        augmented = random_flip(augmented)
    augmented = tf.image.random_contrast(augmented, lower = 1, upper = 2)

    return augmented if label is None else (augmented, label)

In [None]:
# --- Training pipeline -------------------------------------------------------
# Start from (file_path, tabular_features) pairs, then decode and augment the
# image while the tabular features ride along as the second tuple element.
training_image_dataset = tf.data.Dataset.from_tensor_slices(
    (df_train["file_path"], df_train[FEATURE_COLUMNS]))
training_image_dataset = training_image_dataset.map(
    decode_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
training_image_dataset = training_image_dataset.map(
    data_augment, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Targets, in the same row order as the inputs above.
training_labels_dataset = tf.data.Dataset.from_tensor_slices(df_train["target"])

# Pair inputs with targets -> ((image, features), target), then repeat
# indefinitely, shuffle with a 512-element buffer, batch and prefetch.
training_dataset = tf.data.Dataset.zip((training_image_dataset, training_labels_dataset))
training_dataset = training_dataset.repeat()
training_dataset = training_dataset.shuffle(512)
training_dataset = training_dataset.batch(BATCH_SIZE)
training_dataset = training_dataset.prefetch(tf.data.experimental.AUTOTUNE)


# --- Test pipeline -----------------------------------------------------------
# Same decoding as for training, but no augmentation, shuffling or targets.
testing_dataset = tf.data.Dataset.from_tensor_slices(
    (df_test["file_path"], df_test[FEATURE_COLUMNS]))
testing_dataset = testing_dataset.map(
    decode_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Wrap each (image, features) pair in a 1-tuple before batching.
testing_dataset = tf.data.Dataset.zip((testing_dataset, ))
testing_dataset = testing_dataset.batch(BATCH_SIZE)

## Learning Rate

In [None]:
def build_lrfn(lr_start=0.00001, lr_max=0.00005,
               lr_min=0.00001, lr_rampup_epochs=5,
               lr_sustain_epochs=0, lr_exp_decay=.8):
    """Create an epoch -> learning-rate function with three phases.

    1. Ramp-up: linear increase from ``lr_start`` towards ``lr_max`` during the
       first ``lr_rampup_epochs`` epochs.
    2. Sustain: constant ``lr_max`` for the next ``lr_sustain_epochs`` epochs.
    3. Decay: exponential decay from ``lr_max`` towards ``lr_min``, multiplying
       the distance to ``lr_min`` by ``lr_exp_decay`` every epoch.

    Returns a function ``lrfn(epoch)`` that yields the learning rate for the
    given epoch index.
    """
    def lrfn(epoch):
        # Phase 1: linear warm-up.
        if epoch < lr_rampup_epochs:
            slope = (lr_max - lr_start) / lr_rampup_epochs
            return slope * epoch + lr_start

        # Phase 2: hold the peak learning rate.
        if epoch < lr_rampup_epochs + lr_sustain_epochs:
            return lr_max

        # Phase 3: exponential decay, counted from the end of the sustain phase.
        epochs_into_decay = epoch - lr_rampup_epochs - lr_sustain_epochs
        return (lr_max - lr_min) * lr_exp_decay**epochs_into_decay + lr_min

    return lrfn

# Build the schedule with its default settings and wrap it in a Keras callback.
# Note: the default ramp-up lasts 5 epochs while EPOCHS is 3, so with the current
# configuration training only ever runs through the linear warm-up phase.
lrfn = build_lrfn()
lr_schedule = LearningRateScheduler(lrfn, verbose=1)

## Checkpoint

In [None]:
# `save_best_only=True` compares the monitored metric between epochs. The default
# monitor is "val_loss", but this notebook trains without validation data, so that
# metric never exists and no checkpoint would ever be written. Monitor the
# training loss instead, which is always available.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint('EffNet.h5', monitor='loss', save_best_only=True)

## Model

In [None]:
def build_model():
    """Build and compile the two-branch melanoma classifier.

    Branches
    --------
    * CNN branch: a frozen, ImageNet-pretrained EfficientNetB4 (no top) followed
      by global average pooling and two 400-unit dense layers.
    * DNN branch: the tabular features (``len(FEATURE_COLUMNS)`` values) passed
      through two Dense -> BatchNormalization -> Dropout stages and a final
      400-unit dense layer.

    The two 400-dimensional outputs are concatenated and fed to a single
    sigmoid unit.

    Returns
    -------
    tensorflow.keras.Model
        Model taking ``[image, tabular_features]`` as inputs, compiled with
        binary cross-entropy, the Adam optimizer and an AUC metric.
    """
    # Build the CNN part of the model
    base_model = efn.EfficientNetB4(weights="imagenet", include_top=False, input_shape=(*IMAGE_SIZE, 3))

    # Freeze the pretrained backbone so only the new layers are trained.
    for layer in base_model.layers:
        layer.trainable = False

    # output -> GlobalPool -> Dense
    GAP1 = GlobalAveragePooling2D()(base_model.output)
    CNN_D1 = Dense(400, activation="relu", kernel_initializer="he_normal")(GAP1)
    CNN_BN1 = BatchNormalization()(CNN_D1)
    # Add dropout here if overfitting
    CNN_OUT = Dense(400, activation="relu")(CNN_BN1)

    # Build the DNN part of the model
    input_dense = Input((len(FEATURE_COLUMNS), ))
    DNN_D1 = Dense(400, activation="elu", kernel_initializer="he_normal")(input_dense)
    DNN_BN1 = BatchNormalization()(DNN_D1)
    DNN_DO1 = Dropout(0.2)(DNN_BN1)
    # The second stage must consume the output of the first Dropout. Feeding it
    # DNN_D1 directly would leave the first BatchNormalization/Dropout pair
    # disconnected from the graph.
    DNN_D2 = Dense(400, activation="elu", kernel_initializer="he_normal")(DNN_DO1)
    DNN_BN2 = BatchNormalization()(DNN_D2)
    DNN_DO2 = Dropout(0.2)(DNN_BN2)
    DNN_OUT = Dense(400, activation="relu")(DNN_DO2)

    # Combine the two branches into a single probability.
    concat = concatenate([CNN_OUT, DNN_OUT])
    output = Dense(1, activation="sigmoid")(concat)

    model = Model(inputs=[base_model.input, input_dense], outputs=[output])
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=[AUC()])
    return model


model = build_model()
    

In [None]:
# The training dataset repeats indefinitely, so the length of an epoch has to be
# given explicitly: one epoch = as many full batches as fit in the training set.
hist = model.fit(
    training_dataset,
    steps_per_epoch=len(df_train) // BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[model_checkpoint, lr_schedule],
    verbose=1,
)

In [None]:
# Run inference on the (unshuffled, unaugmented) test pipeline.
predictions = model.predict(testing_dataset, verbose=1)

In [None]:
df_sub = pd.read_csv(WORKING_PATH + "/sample_submission.csv")

# The model has a single sigmoid output unit, so `predictions` has one column
# per row (shape (n_rows, 1)); flatten it to a plain 1-D vector for the column.
flat_predictions = np.ravel(predictions)
if len(flat_predictions) != len(df_sub):
    raise ValueError(
        "Got %d predictions for %d submission rows" % (len(flat_predictions), len(df_sub)))

# NOTE(review): predictions are assigned by position, which assumes that
# sample_submission.csv lists the images in the same order as test.csv -- confirm.
df_sub["target"] = flat_predictions

df_sub.to_csv("submission.csv", index=False)

model.save("EffNet.h5")

# Last expression of the cell, so the preview is actually displayed.
df_sub.head()