In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
plt.rcParams['figure.figsize'] = (15, 8)

In [None]:
!pip install git+https://github.com/qubvel/efficientnet

In [None]:
import json
import math
import os

from keras import layers
from keras.applications import DenseNet121
from keras.callbacks import Callback, ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, accuracy_score, auc, roc_auc_score, roc_curve
import sklearn
import scipy
import tensorflow as tf
from tqdm import tqdm
from keras.preprocessing import image
from keras.models import Model
from keras.layers import BatchNormalization, Dropout, Conv2D, MaxPooling2D, GlobalAveragePooling2D, Flatten, Dense

from efficientnet.tfkeras import EfficientNetB7 as effnetb7


%matplotlib inline

## CONSTANTS

In [None]:
# Shared seed, passed as ``random_state`` by the shuffling and per-class sampling cells below.
SEED = 2020

## Functions

### Perc_Data

In [None]:
def perc_data(column, df):
    """Print the percentage share of every class found in ``column``.

    Parameters
    ----------
    column : pandas.Series
        Column whose distinct values are reported.
    df : pandas.DataFrame
        Frame whose row count is used as the denominator.

    Prints one line per distinct non-null value (in order of first
    appearance), followed by a ``NULL`` line with the share of missing values.
    """
    counts = column.value_counts()
    rows = df.shape[0]
    print("CLASS\t\t : \tPERCENTAGE")
    print("------------------------------------")
    # value_counts() drops missing values, so looking NaN up in ``counts``
    # raises KeyError. Iterate over the non-null classes only; the missing
    # share is reported on the dedicated NULL line below.
    for cls in column.dropna().unique():
        print(f"{cls}\t\t : \t{(counts.loc[cls]/rows)*100}")
    print(f"NULL\t\t : \t{(column.isna().sum()/rows)*100}")

### BGR2RGB

In [None]:
def bgr2rgb(img):
    """Return ``img`` converted with ``cv2.COLOR_BGR2RGB`` (BGR -> RGB channel order)."""
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return rgb

### Sample Images

In [None]:
def sample_images(df, train_img_path, n=7, random_state=None):
    """Show a row of randomly sampled images for target 1 and then for target 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``target`` and ``image_name``.
    train_img_path : str
        Directory holding the ``<image_name>.jpg`` files.
    n : int, default 7
        Maximum number of images drawn per target value.
    random_state : int or None, default None
        Forwarded to ``DataFrame.sample``; pass a value for a repeatable draw.
    """
    for target in (1, 0):
        subset = df.loc[df["target"] == target]
        # sample() raises when asked for more rows than exist, so cap the
        # draw at the number of rows available for this target.
        count = min(n, len(subset))
        names = subset.sample(count, random_state=random_state)["image_name"].values
        print(f"Target : {target}")
        plt.figure(figsize=(30, 15))
        for i, img_name in enumerate(names):
            img = bgr2rgb(cv2.imread(f"{train_img_path}/{img_name}.jpg"))
            plt.subplot(1, n, i + 1)
            plt.axis("off")
            plt.imshow(img)
        # Both figures are finished the same way (the second one previously
        # used plt.draw()).
        plt.show()

## 2019 Dataset

In [None]:
train_df = pd.read_csv("../input/jpeg-isic2019-512x512/train.csv")
train_df.head()

In [None]:
train_df.shape

In [None]:
perc_data(train_df.diagnosis, train_df)

In [None]:
perc_data(train_df.benign_malignant, train_df)

In [None]:
perc_data(train_df.sex, train_df)

## 2020 Dataset

In [None]:
train_df2020 = pd.read_csv("../input/jpeg-melanoma-512x512/train.csv")
train_df2020.head()

In [None]:
train_df2020.info()

In [None]:
perc_data(train_df2020.diagnosis, train_df2020)

In [None]:
perc_data(train_df2020.target, train_df2020)

In [None]:
perc_data(train_df2020.benign_malignant, train_df2020)

## Plot images

### 2019 data

In [None]:
sample_images(train_df, "../input/jpeg-isic2019-512x512/train/")

### 2020 data

In [None]:
sample_images(train_df2020, "../input/jpeg-melanoma-512x512/train/")

## Combine 2019 and 2020 datasets

### Mapping 2020 diagnosis with 2019 diagnosis

In [None]:
# Work on a copy so the raw 2020 frame loaded above is not modified in place.
temp2020 = train_df2020.copy()

# 2020 diagnosis label -> label used for the 2019 data.
# Labels that are not listed here (and missing values) are left unchanged.
diagnosis_2020_to_2019 = {
    'seborrheic keratosis': 'BKL',
    'lichenoid keratosis': 'BKL',
    'solar lentigo': 'BKL',
    'lentigo NOS': 'BKL',
    'cafe-au-lait macule': 'unknown',
    'atypical melanocytic proliferation': 'unknown',
    'nevus': 'NV',
    'melanoma': 'MEL',
}
temp2020['diagnosis'] = temp2020['diagnosis'].replace(diagnosis_2020_to_2019)

In [None]:
perc_data(temp2020['diagnosis'], temp2020)

### Adding path column

In [None]:
temp2020['path'] = "../input/jpeg-melanoma-512x512/train/"+temp2020['image_name']+".jpg"

In [None]:
img = cv2.imread(temp2020.iloc[0]['path'])
# cv2.imread returns channels in BGR order while plt.imshow interprets them
# as RGB; convert before display so the colours are not swapped.
plt.imshow(bgr2rgb(img))
print(img.shape)

In [None]:
temp2020.head()

In [None]:
temp2019 = train_df

In [None]:
temp2019['path'] = "../input/jpeg-isic2019-512x512/train/"+temp2019['image_name']+".jpg"
temp2019.head()

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the two frames with a fresh 0..n-1 index.
train_temp = pd.concat([temp2019, temp2020], ignore_index=True)

In [None]:
train_temp

In [None]:
train_temp = train_temp.sample(frac = 1, random_state = SEED).reset_index(drop=True)

In [None]:
perc_data(train_temp['diagnosis'], train_temp)

In [None]:
df = train_temp

In [None]:
df['diagnosis'].value_counts()

In [None]:
def _sample_diagnosis(label, count):
    """Draw ``count`` rows of ``df`` whose diagnosis equals ``label``, seeded with SEED."""
    return df.loc[df["diagnosis"] == label].sample(count, random_state=SEED)


# Fixed number of rows drawn for each diagnosis class.
mel = _sample_diagnosis("MEL", 1000)
nv = _sample_diagnosis("NV", 750)
unknown = _sample_diagnosis("unknown", 625)
bcc = _sample_diagnosis("BCC", 250)
bkl = _sample_diagnosis("BKL", 125)
ak = _sample_diagnosis("AK", 125)
vasc = _sample_diagnosis("VASC", 125)

In [None]:
# DataFrame.append was removed in pandas 2.0; a single pd.concat also avoids
# copying the growing frame once per class.
# NOTE: the combined frame keeps the name ``mel`` because later cells read it.
mel = pd.concat([mel, nv, unknown, bcc, bkl, ak, vasc], ignore_index=True)
mel

In [None]:
perc_data(mel['diagnosis'], mel)

In [None]:
# Shuffle the combined sample (seeded with SEED) and renumber the rows from 0.
train = mel.sample(frac=1, random_state=SEED).reset_index(drop=True)
train

## Image Processing

In [None]:
# One example path per target value. Seeded like the other sampling cells so
# a fresh run picks the same two images.
path1 = train.loc[train['target']==1, 'path'].sample(1, random_state=SEED).values[0]
path0 = train.loc[train['target']==0, 'path'].sample(1, random_state=SEED).values[0]

In [None]:
# Load both example images, converted to RGB for display.
img1 = bgr2rgb(cv2.imread(path1))
img0 = bgr2rgb(cv2.imread(path0))

# Side-by-side preview: target 1 on the left, target 0 on the right.
fig, (ax_target1, ax_target0) = plt.subplots(1, 2, figsize=(30, 15))
ax_target1.imshow(img1)
ax_target1.set_title("Target : 1")
ax_target0.imshow(img0)
ax_target0.set_title("Target : 0")
plt.show()

In [None]:
def clahe_lab(img):
    """Apply CLAHE to every channel of ``img`` in LAB space and return RGB.

    Parameters
    ----------
    img : numpy.ndarray
        Image converted with ``cv2.COLOR_BGR2LAB``, i.e. treated as BGR.

    Returns
    -------
    numpy.ndarray
        The equalised LAB image converted with ``cv2.COLOR_LAB2RGB``.
    """
    lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
    clahe = cv2.createCLAHE(clipLimit=1.0)
    # cv2.split returns a tuple in recent OpenCV releases, so assigning into
    # its result raises TypeError; build a new list of equalised planes.
    equalised_planes = [clahe.apply(plane) for plane in cv2.split(lab)]
    return cv2.cvtColor(cv2.merge(equalised_planes), cv2.COLOR_LAB2RGB)

def crop_image_from_gray(img, tol=30, size=(512, 512)):
    """Crop away dark borders and, for colour images, resize to ``size``.

    Rows and columns in which no pixel is brighter than ``tol`` are removed.

    Parameters
    ----------
    img : numpy.ndarray
        A 2-D grayscale image, or a 3-D image that is converted with
        ``cv2.COLOR_RGB2GRAY`` to build the brightness mask.
    tol : int, default 30
        Brightness threshold; a pixel must be strictly greater to count.
    size : tuple of int, default (512, 512)
        ``(width, height)`` handed to ``cv2.resize`` for non-2-D input.

    Returns
    -------
    numpy.ndarray
        For 2-D input, the cropped image (not resized). Otherwise an image
        resized to ``size``. A 3-D image with no pixel above ``tol`` is left
        uncropped but is still resized, so the output shape is the same on
        every path.
    """
    if img.ndim == 2:
        mask = img > tol
        return img[np.ix_(mask.any(1), mask.any(0))]

    if img.ndim == 3:
        mask = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) > tol
        keep_rows = mask.any(1)
        # When nothing exceeds ``tol`` the crop would be empty, so the image
        # is kept as is in that case.
        if keep_rows.any():
            # Build the row/column index once and apply it to all channels
            # together; only the first three channels are kept, as before.
            cropped = img[np.ix_(keep_rows, mask.any(0))]
            img = np.ascontiguousarray(cropped[:, :, :3])

    width, height = size
    if img.shape != (height, width, 3):
        img = cv2.resize(img, (width, height))
    return img

def preprocess(img):
    """Run ``clahe_lab`` on ``img`` and pass the result through ``crop_image_from_gray``."""
    return crop_image_from_gray(clahe_lab(img))

In [None]:
paths = train['path'].values
N = len(paths)
# One 512x512x3 uint8 slot per training image.
img_train = np.empty((N, 512, 512, 3), dtype = np.uint8)

for i, img_path in enumerate(tqdm(paths)):
    img = cv2.imread(img_path)
    # cv2.imread signals a missing or unreadable file by returning None; stop
    # with the offending path instead of failing later inside preprocess().
    if img is None:
        raise FileNotFoundError(f"Could not read image: {img_path}")
    img = preprocess(img)
    img_train[i, :, :, :] = img

np.save("x_train", img_train)

## Preparing the dataset

In [None]:
# Give every diagnosis label an integer code, in order of first appearance in ``train``.
diag_unique = train.diagnosis.unique()
diag_dict = {label: code for code, label in enumerate(diag_unique)}

diag_dict

In [None]:
train["y"] = train['diagnosis'].map(diag_dict)

In [None]:
train

In [None]:
# Pin the dtype: pandas >= 2.0 returns bool dummies by default (earlier
# versions returned uint8), which would change the dtype of the saved labels.
y_onehot = pd.get_dummies(train["y"], dtype=np.uint8).values

# Independent copy, so changes made to ``multi`` do not alter ``y_onehot``.
multi = y_onehot.copy()

In [None]:
# Turn each one-hot row into a cumulative encoding: position j becomes the
# logical OR of positions j..last, so every column up to and including the hot
# one is set. The right-to-left running OR is computed for all rows at once and
# for any number of columns (the loop it replaces assumed exactly seven).
# A new array with the original dtype is bound to ``multi``.
multi = np.logical_or.accumulate(multi[:, ::-1], axis=1)[:, ::-1].astype(multi.dtype)

In [None]:
multi[0]

In [None]:
np.save("multilabel_y", multi)

In [None]:
train.to_csv("dataset.csv", index = False)