In [None]:
! pip install efficientnet_pytorch torchtoolbox

In [None]:
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype

import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torchvision
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau , CyclicLR
from torchvision import transforms

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold

from efficientnet_pytorch import EfficientNet

import os 
import gc
from tqdm import tqdm
import cv2
import datetime
import random
import warnings
import time

from plotly.offline import iplot
import cufflinks
# Switch cufflinks to offline mode and set its sharing/theme defaults.
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

# Suppress every warning for the rest of the session.
warnings.simplefilter('ignore')
# Remove the pandas display limits on columns and rows (None = unlimited).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load the train/test metadata tables from the mounted Drive folder.
train_csv = pd.read_csv('/content/drive/My Drive/Melanoma_Images/train.csv')
test_csv = pd.read_csv('/content/drive/My Drive/Melanoma_Images/test.csv')

In [None]:
# List the columns available in the training metadata.
train_csv.columns

First, let's look for missing values.

In [None]:
def missing(df):
  """
      Summarise the missing values of a DataFrame, column by column.

      df: DataFrame to inspect

      Returns a DataFrame indexed by column name with the columns
      'Missing Values Percentage' and 'Missing Values Count'. Only columns
      holding at least one missing value are kept, ordered from the most to
      the least missing.
  """
  # Count the nulls once; both statistics are derived from this single Series,
  # so their rows cannot fall out of step with each other.
  missing_count = df.isnull().sum()

  missing_stats = pd.DataFrame({'Missing Values Percentage': missing_count * 100 / len(df),
                                'Missing Values Count': missing_count})
  missing_data = missing_stats.loc[missing_stats['Missing Values Count'] > 0]

  # A stable sort keeps columns with equal counts in their original column order.
  return missing_data.sort_values('Missing Values Count', ascending = False, kind = 'mergesort')


In [None]:
# Columns of the training metadata that contain missing values.
missing(train_csv)

In [None]:
# Columns of the test metadata that contain missing values.
missing(test_csv)

Apart from site, our training dataset also has missing values in age & sex.

Time for EDA

In [None]:
# Age distribution, train vs. test, side by side
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 5))

# countplot returns the Axes it drew on, so the title call can be chained
sns.countplot(data = train_csv, x = "age_approx", ax = ax[0]).set_title('Age Distribution in Train Data')
sns.countplot(data = test_csv, x = "age_approx", ax = ax[1]).set_title('Age Distribution in Test Data')


In [None]:
fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 5))

# Left panel: distribution of the number of images per patient in the train data
sns.distplot(train_csv['patient_id'].value_counts(), kde = True, color = 'orangered', ax = ax[0])
ax[0].set_ylabel('freq')
ax[0].set_xlabel('Counts')
ax[0].set_title('Patient ID distribution in train data')

# Right panel: the same view for the test data
sns.distplot(test_csv['patient_id'].value_counts(), kde = True, ax = ax[1])
ax[1].set_ylabel('freq')
ax[1].set_xlabel('Counts')
ax[1].set_title('Patient ID distribution in test data')




In [None]:
# Number of images per patient in the training data, most frequent first.
train_csv.patient_id.value_counts()

In [None]:
# Number of images per patient in the test data, most frequent first.
test_csv.patient_id.value_counts()

Patient ID 'IP_3579794' accounts for 240 images in the test data, approximately 2% of the test set (a large number compared to the other patient IDs).

In [None]:
# Count of benign vs. malignant samples in the training data.
sns.countplot( x = 'benign_malignant', data = train_csv )

In [None]:
fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 5))

# Anatomical site counts, train (left) and test (right)
sns.countplot(data = train_csv, x = 'anatom_site_general_challenge', ax = ax[0]).set_title('Sites in Train Data')
sns.countplot(data = test_csv, x = 'anatom_site_general_challenge', ax = ax[1]).set_title('Sites in Test Data')


In [None]:
fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (20, 5))
# Site counts in the training data, split by the benign/malignant label
sns.countplot(data = train_csv, hue = train_csv['benign_malignant'], x = 'anatom_site_general_challenge')
ax.set_title('Sites in Train Data')


In [None]:
fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (20, 10))
# Site counts in the training data, split by diagnosis
sns.countplot(data = train_csv, hue = train_csv['diagnosis'], x = 'anatom_site_general_challenge')
ax.set_title('Sites in Train Data')


In [None]:
fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (10, 10))

# Sex counts, train (left) and test (right)
sns.countplot(data = train_csv, x = 'sex', ax = ax[0]).set_title('Sex Distribution in Train Data')
sns.countplot(data = test_csv, x = 'sex', ax = ax[1]).set_title('Sex Distribution in Test Data')

All of the graphs above suggest that our data is highly imbalanced.

In [None]:
# Remove every training row that lacks sex, age or anatomical site.
# Each step filters the frame produced by the previous one, so no mask built
# from the unfiltered train_csv is applied to an already-filtered frame.
train_1 = train_csv.dropna(subset = ['sex'])
train_2 = train_1.dropna(subset = ['age_approx'])
# .copy() makes train_3 an independent frame; later cells assign columns to it.
train_3 = train_2.dropna(subset = ['anatom_site_general_challenge']).copy()

In [None]:
# One indicator column per anatomical site, named 'site_<value>'.
site = pd.get_dummies(train_3['anatom_site_general_challenge'], prefix = 'site')

In [None]:
# Encode sex as 1 (male) / 0 (female).
# Values other than 'male'/'female' (NaN, or codes left by an earlier run of
# this cell) pass through unchanged, so re-running the cell does not turn the
# column into NaN the way a plain dict-based .map would.
sex_codes = {'male': 1, 'female': 0}
train_3['sex'] = train_3['sex'].map(lambda value: sex_codes.get(value, value))
test_csv['sex'] = test_csv['sex'].map(lambda value: sex_codes.get(value, value))

In [None]:
# Append the site indicator columns to the training frame (column-wise concat).
# NOTE(review): running this cell twice appends the same columns again.
train_3  = pd.concat([train_3 , site] , axis=1)

In [None]:
# Metadata inputs: sex, age and every site indicator column
meta_features = ['sex', 'age_approx'] + list(site.columns)

In [None]:
# Build the same 'site_<value>' indicator columns for the test frame and append them.
# NOTE(review): the indicator columns come from the values present in test_csv;
# presumably they match the training columns - confirm before relying on it.
site_test = pd.get_dummies(test_csv['anatom_site_general_challenge'], prefix= 'site')
test_csv = pd.concat([test_csv, site_test], axis=1)

In [None]:
# The raw site column is superseded by its indicator columns; drop it from both frames in place
test_csv.drop(columns = ['anatom_site_general_challenge'], inplace = True)
train_3.drop(columns = ['anatom_site_general_challenge'], inplace = True)

In [None]:
# Preview the first rows of the processed test frame.
test_csv.head()

In [None]:
# Number of distinct patients left in the cleaned training frame.
train_3.patient_id.nunique()

In [None]:
# Re-assign the sex column from train_csv (pandas aligns the rows on index).
# NOTE(review): this overwrites whatever encoding train_3['sex'] held before;
# the next cell maps the values to 1/0 again.
train_3['sex'] = train_csv['sex']

In [None]:
# Encode sex as 1 (male) / 0 (female); any other value becomes NaN.
train_3['sex'] = train_3['sex'].map({'male':1, 'female': 0})

In [None]:
# Preview the first rows of the processed training frame.
train_3.head()

In most of the images you will notice hairs over the lesion area. We'll introduce a data augmentation technique to remove them.



In [None]:
# Metadata inputs: sex, age and every training column whose name contains 'site'
meta_features = ['sex', 'age_approx', *(column for column in train_3.columns if 'site' in column)]

In [None]:
# Show the final list of metadata feature names.
meta_features

In [None]:
# Read one training image from disk with OpenCV as a quick sanity check.
c= cv2.imread('/content/drive/My Drive/Melanoma_Images/300x300/train/ISIC_0068279.jpg')

In [None]:
# Print the value returned by cv2.imread for the sample image.
print(c)

In [None]:
class RemoveHair:
  """
      Remove hairs from images.

      A black-hat morphological filter on the grayscale image is thresholded
      into a binary mask, and the masked pixels are filled in with
      cv2.inpaint (Telea method).

      kernel_size: side length of the cross-shaped structuring element
      threshold: black-hat response above which a pixel goes into the mask
      inpaint_radius: neighbourhood radius handed to cv2.inpaint
  """
  def __init__(self, kernel_size = 17, threshold = 10, inpaint_radius = 1):
    self.kernel_size = kernel_size
    self.threshold = threshold
    self.inpaint_radius = inpaint_radius

  def __call__(self, image):
    # NOTE(review): the conversion assumes RGB channel order, whereas
    # cv2.imread yields BGR - confirm what callers pass in.
    grayscale = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

    # cv2.MORPH_CROSS is the named constant for the shape code 1
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (self.kernel_size, self.kernel_size))

    blackhat = cv2.morphologyEx(grayscale, cv2.MORPH_BLACKHAT, kernel)

    # Binary mask: 255 wherever the black-hat response exceeds the threshold
    _, hair_mask = cv2.threshold(blackhat, self.threshold, 255, cv2.THRESH_BINARY)

    return cv2.inpaint(image, hair_mask, self.inpaint_radius, cv2.INPAINT_TELEA)



In [None]:
class Microscope:
  """
      With probability p, keep only a centred disc of the image and set every
      pixel outside the disc to zero.

      p: probability of applying the effect
  """
  def __init__(self, p):

    self.p = p

  def __call__(self, img):

    if random.random() < self.p:
      height, width = img.shape[:2]
      # Order the bounds so randint never receives an empty range when the
      # image is taller than it is wide.
      low, high = sorted((height // 2, width // 2))
      radius = random.randint(low, high)

      # OpenCV points are (x, y), i.e. (column, row).
      circle = cv2.circle((np.ones(img.shape) * 255).astype(np.uint8),
                          (width // 2, height // 2),
                          radius,
                          (0, 0, 0),
                          -1)
      # uint8 arithmetic wraps around: 0 - 255 -> 1 inside the disc,
      # 255 - 255 -> 0 outside it.
      mask = circle - 255
      img = np.multiply(img, mask)

    return img



In [None]:
class Net(nn.Module):
  """
      Image backbone combined with a small network over the metadata.

      model: image backbone; when its class name contains 'EfficientNet' its
             `_fc` head is replaced by a 500-unit linear layer
      n_meta_features: number of metadata values per sample

      forward takes an (image_batch, meta_batch) pair and returns one logit
      per sample, shape (batch, 1).
  """

  def __init__(self, model, n_meta_features):
    super().__init__()

    self.model = model
    if 'EfficientNet' in str(model.__class__):
      # Take the input width from the head being replaced instead of
      # hard-coding 1280, so backbones with a different width also work.
      self.model._fc = nn.Linear(in_features=self.model._fc.in_features, out_features=500, bias=True)

    self.meta = nn.Sequential(nn.Linear(n_meta_features, 500),
                              nn.BatchNorm1d(500),
                              nn.ReLU(),
                              nn.Dropout(p=0.2),
                              nn.Linear(500,250),
                              nn.BatchNorm1d(250),
                              nn.ReLU(),
                              nn.Dropout(p=0.2))

    # 500 image features + 250 metadata features -> a single logit
    self.output = nn.Linear(500 + 250, 1)

  def forward(self, inputs):

    x, meta = inputs
    cnn_features = self.model(x)
    meta_out = self.meta(meta)
    features  = torch.cat((cnn_features, meta_out), dim = 1)

    return self.output(features)

In [None]:
class MelanomaDataset(Dataset):
  """
    Our Dataset for Melanoma Classification
    img_folder: Path to images directory
    df: DataFrame with one row per image; needs an 'image_name' column, the
        meta_features columns and, when train is True, a 'target' column
    meta_features: Names of the df columns used as additional features
    train: When True each item is ((image, meta), target), otherwise (image, meta)
    transforms: Data Augmentation Techniques to be applied
  """
  def __init__(self, img_folder, df, meta_features = None, train = True, transforms = None ):

    super().__init__()

    self.img_folder = img_folder
    self.df = df
    self.meta_features = meta_features
    self.transforms = transforms
    self.train = train


  def __len__(self):
    return len(self.df)


  def _load_meta(self, row):
    """Return the metadata values of one df row as a float32 array."""
    if self.meta_features is None:
      return np.zeros(0, dtype = np.float32)
    # Use the feature list given to this dataset, not a module-level variable.
    return np.array(row[self.meta_features].values, dtype = np.float32)


  def __getitem__(self, index):

    row = self.df.iloc[index]
    img_path = os.path.join(self.img_folder, row['image_name'] + '.jpg')
    meta = self._load_meta(row)

    x = cv2.imread(img_path)
    if x is None:
      # cv2.imread signals an unreadable or missing file by returning None.
      raise FileNotFoundError('Could not read image: ' + img_path)
    # OpenCV decodes to BGR; convert so the transforms and the model see RGB.
    x = cv2.cvtColor(x, cv2.COLOR_BGR2RGB)

    if self.transforms:
      x = self.transforms(x)

    if self.train:
      y = row['target']
      return (x,meta) , y
    else:
      return (x,meta)





In [None]:
# Training pipeline: hair removal and the random microscope effect run on the
# array, which is then converted to a PIL image before the random flips.
train_aug = transforms.Compose([
  RemoveHair(),
  Microscope(p = 0.5),
  transforms.ToPILImage(),
  transforms.RandomHorizontalFlip(),
  transforms.RandomVerticalFlip(),
  transforms.ToTensor(),
  transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225]),
])

# Evaluation pipeline: hair removal only, no random augmentation.
test_aug = transforms.Compose([
  RemoveHair(),
  transforms.ToTensor(),
  transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225]),
])


                                
                               



In [None]:
# 10-fold splitter; despite the variable name this is a GroupKFold.
skf = GroupKFold(n_splits=10)
# Pretrained EfficientNet-B1 backbone.
# NOTE(review): the fold loop loads its own backbone for every fold, so this
# instance looks unused by it - confirm before removing.
model = EfficientNet.from_pretrained('efficientnet-b1')

In [None]:
# Ask PyTorch to release its cached GPU memory before training.
torch.cuda.empty_cache()

In [None]:
epochs = 10                # epochs trained per fold
model_path = 'model.pth'   # file used to store the best model of the current fold

# Out-of-fold predictions, one row per training sample.
oof = np.zeros((len(train_3), 1))
# Test predictions; the fold loop adds to this tensor and it is divided by the
# number of folds afterwards.
preds = torch.zeros((len(test_csv), 1), dtype = torch.float32, device=device)


In [None]:
# Test dataset: no targets (train = False) and the evaluation transform pipeline.
test = MelanomaDataset(img_folder = '/content/drive/My Drive/Melanoma_Images/300x300/test',
                      df = test_csv,
                      meta_features = meta_features,
                      train = False,
                      transforms = test_aug)

In [None]:
def _to_device(inputs):
  """Return the (image, meta) pair of a batch as float32 tensors on `device`."""
  images, meta = inputs
  return (images.to(device = device, dtype = torch.float32),
          meta.to(device = device, dtype = torch.float32))


def _predict_proba(model, loader, has_target):
  """
      Run `model` over `loader` in eval mode without gradients and return the
      sigmoid probabilities as one (n_samples, 1) tensor, in loader order.

      has_target: True when the loader yields ((image, meta), target) batches,
                  False when it yields (image, meta) batches
  """
  model.eval()
  chunks = []
  with torch.no_grad():
    for batch in loader:
      inputs = batch[0] if has_target else batch
      chunks.append(torch.sigmoid(model(_to_device(inputs))))
  # Concatenating the batches avoids manual slice offsets, which go wrong as
  # soon as the last batch is smaller than the others.
  return torch.cat(chunks, dim = 0)


# The test loader does not depend on the fold, so it is built once.
test_loader = DataLoader(dataset = test,
                         batch_size = 16,
                         shuffle = False,
                         num_workers = 0)

# Start from clean accumulators so re-running this cell does not add to the
# results of an earlier run.
preds.zero_()
oof[:] = 0

for fold, (train_idx, val_idx) in enumerate(skf.split(X = np.zeros(len(train_3)),
                                                      y = train_3['target'],
                                                      groups= train_3['patient_id'].tolist()),
                                            1):

  print('='*20, 'Fold', fold, '='*20 )

  # Fresh pretrained backbone for every fold
  model = Net(model = EfficientNet.from_pretrained('efficientnet-b1'),
              n_meta_features = len(meta_features))
  model = model.to(device)

  # -inf instead of None, so the first epoch's score always compares as better
  best_val = float('-inf')

  optim = torch.optim.SGD(model.parameters(), lr=0.01)
  scheduler = CyclicLR(optimizer = optim, base_lr = 0.001, max_lr=0.01)
  criterion = nn.BCEWithLogitsLoss()

  train = MelanomaDataset(df = train_3.iloc[train_idx].reset_index(drop=True),
                          img_folder = '/content/drive/My Drive/Melanoma_Images/300x300/train',
                          meta_features = meta_features,
                          train = True,
                          transforms = train_aug)

  val = MelanomaDataset(df = train_3.iloc[val_idx].reset_index(drop=True),
                        img_folder = '/content/drive/My Drive/Melanoma_Images/300x300/train',
                        meta_features = meta_features,
                        train = True,
                        transforms = test_aug)

  train_loader = DataLoader(dataset = train,
                            batch_size = 32,
                            shuffle = True,
                            num_workers = 0)

  val_loader = DataLoader(dataset = val,
                          batch_size = 16,
                          shuffle = False,
                          num_workers = 0)

  val_targets = train_3.iloc[val_idx]['target'].values

  for epoch in range(epochs):

    start_time = time.time()
    correct = 0
    epoch_loss = 0
    model.train()

    for x, y in train_loader:

      x = _to_device(x)
      y = y.to(device = device, dtype = torch.float32).unsqueeze(1)

      optim.zero_grad()
      z = model(x)

      loss = criterion(z, y)
      loss.backward()

      optim.step()
      # CyclicLR is a per-batch schedule: step it after every optimizer update
      # and do not pass a metric to it.
      scheduler.step()

      pred = torch.round(torch.sigmoid(z))
      correct += (pred == y).sum().item()
      epoch_loss += loss.item()

    train_acc = correct / len(train_idx)

    val_preds = _predict_proba(model, val_loader, has_target = True).cpu().numpy().ravel()
    val_acc = accuracy_score(val_targets, np.round(val_preds))
    val_roc = roc_auc_score(val_targets, val_preds)

    print('Epoch {:03}: | Loss: {:.3f} | Train acc: {:.3f} | Val acc: {:.3f} | Val roc_auc: {:.3f} | Training time: {}'.format(
          epoch + 1,
          epoch_loss,
          train_acc,
          val_acc,
          val_roc,
          str(datetime.timedelta(seconds=time.time() - start_time))[:7]))

    if val_roc >= best_val:
      best_val = val_roc
      # Store the weights only; this file is reloaded with load_state_dict below.
      torch.save(model.state_dict(), model_path)

  # Restore the best epoch of this fold before producing its predictions
  model.load_state_dict(torch.load(model_path, map_location = device))

  oof[val_idx] = _predict_proba(model, val_loader, has_target = True).cpu().numpy()
  # Sigmoid is applied once, inside _predict_proba
  preds += _predict_proba(model, test_loader, has_target = False)

  del train, val, train_loader, val_loader, x, y
  gc.collect()

# Average the accumulated test predictions over the folds
preds /= skf.n_splits