In [None]:
!pwd

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
train_path = '../input/siim-isic-melanoma-classification/jpeg/train'

Exploratory Data Analysis (EDA)

In [None]:
df= pd.read_csv('../input/siim-isic-melanoma-classification/train.csv')
df.head(2)

In [None]:
# Drop duplicates
# Every training image whose name appears in the `ISIC_id_paired` column of the
# duplicates file is removed from the training frame.

dup = pd.read_csv("../input/melanoma-duplicate/2020_Challenge_duplicates.csv")

# Vectorised membership test instead of a nested Python loop over both frames.
# Filtering with a boolean mask (rather than dropping enumerate() positions in
# place) keeps the surviving index labels untouched and makes the cell safe to
# re-run: a second run simply finds nothing left to drop.
is_duplicate = df.image_name.isin(dup.ISIC_id_paired)
drop_idx_list = df.index[is_duplicate].tolist()

print("no. of duplicates in training dataset:",len(drop_idx_list))

# .copy() so that later column assignments on `df` do not target a view.
df = df[~is_duplicate].copy()

print("updated dimensions of the training dataset:",df.shape)

In [None]:
df.value_counts(['target'])

We will reduce the dataset by removing images labeled 0 that belong to patients who have at least one image labeled 1, i.e. patients who had the disease at some point, even if the positive image is an old one.

In [None]:
pos_pat_id=df[df.target==1].patient_id.unique()
pos_pat_id.size

In [None]:
# Flag every image whose patient has at least one positive (target == 1) image.
# Series.isin performs a vectorised lookup; the previous per-row `x in pos_pat_id`
# scanned the whole array of positive patient ids once for every row.
df['ever_pos']=df['patient_id'].isin(pos_pat_id).astype(int)
df.head(2)

In [None]:
df[df.ever_pos==0].patient_id.value_counts()

In [None]:
df_processed=pd.concat([df[df.target==1],df[df.ever_pos==0]]).sort_index()
df_processed

In [None]:
df_processed.value_counts(['target'])

In [None]:
## There were 32120 images with label 0; now there are only 25813.

In [None]:
def split(df,train_size,stratify_label,randomstate=42):
    """Stratified train/validation split with an under-sampled, balanced training part.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; must contain the column ``stratify_label``.
    train_size : float or int
        Forwarded unchanged to ``sklearn.model_selection.train_test_split``.
    stratify_label : str
        Name of the binary label column (values 0 and 1). It is used both to
        stratify the split and to balance the training part.
    randomstate : int, default 42
        Seed for the split and for sampling the negative rows.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(balanced_train, valid)``. ``balanced_train`` contains every positive
        row of the training part followed by a random sample of its negative
        rows of the same size (or all negatives if there are fewer negatives
        than positives). ``valid`` is the untouched validation part.
    """
    from sklearn.model_selection import train_test_split

    train,valid=train_test_split(df,train_size=train_size,stratify=df[stratify_label],random_state=randomstate)

    positives=train[train[stratify_label]==1]
    negatives=train[train[stratify_label]==0]

    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so cap the request at what is available.
    n_negatives=min(len(positives),len(negatives))
    sampled_negatives=negatives.sample(n_negatives,random_state=randomstate)

    # Row-wise concat yields the same rows in the same order as
    # train.loc[positive_labels + negative_labels], but does not multiply rows
    # when the index happens to contain repeated labels.
    balanced_train=pd.concat([positives,sampled_negatives])

    return balanced_train,valid

In [None]:
## I can change the train_size in compared to valid size if i am going to use validation
splitting=split(df_processed,train_size=0.99,stratify_label='target')
train,valid=splitting
for dataframe in splitting:
    print(dataframe['target'].value_counts(),end='\n\n')

In [None]:
import shutil
import os
train_path = '../input/siim-isic-melanoma-classification/jpeg/train'
model_path = 'model_images'

In [None]:
def prepare_dirs(directory,dataframe,target_column):
    """Create a fresh ``<directory>/{train,valid}/<class>`` folder tree.

    An existing ``directory`` is deleted first, so the function can be called
    repeatedly. One sub-folder per distinct value of
    ``dataframe[target_column]`` is created under both ``train`` and ``valid``.

    Parameters
    ----------
    directory : str
        Root folder to (re)create. May be a bare name, a nested relative path
        or an absolute path; missing parent folders are created.
    dataframe : pandas.DataFrame
        Frame holding the label column.
    target_column : str
        Column whose unique values name the class sub-folders.
    """
    target_classes=dataframe[target_column].unique().tolist()
    # os.path.isdir works for nested and absolute paths too; the former
    # `directory in os.listdir()` test only matched a bare name located in the
    # current working directory, so any other path crashed on the second call.
    if os.path.isdir(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)
    for use in 'train','valid':
        phase_dir=os.path.join(directory,use)
        os.mkdir(phase_dir)
        for x in target_classes:
            os.mkdir(os.path.join(phase_dir,str(x)))

In [None]:
prepare_dirs(model_path,df_processed,'target')

In [None]:
df_processed.head(2)

In [None]:
# Copy each image into <model_path>/<phase>/<target>/ so that the label can be
# read back from the folder name (this is the layout ImageFolder loads later).
for phase,ddf in zip(('train','valid'),splitting):
    # Columns are addressed by name. The former positional x[0] / x[-1] depended on
    # column order (the last column is `ever_pos`, not `target`; it only worked
    # because both are equal in df_processed) and on the integer-position fallback
    # of Series.__getitem__, which pandas has deprecated.
    for image_name,label in zip(ddf['image_name'],ddf['target']):
        shutil.copy2(f'{train_path}/{image_name}.jpg',f'{model_path}/{phase}/{label}')

In [None]:
#### Test CSV File
data_dir ='../input/siim-isic-melanoma-classification'
df_test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
df_test.head(1)

In [None]:
# The JPEG test images are read from <data_dir>/jpeg/test elsewhere in this notebook
# (same layout as `train_path`, and the path predict() opens), so the 'jpeg' level
# must be part of the stored file path as well.
df_test['filepath'] = df_test['image_name'].apply(lambda x: os.path.join(data_dir, 'jpeg', 'test', f'{x}.jpg'))
df_test.head(3)

In [None]:
## sample submission
df_subm = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
df_subm.head(3)

In [None]:
#IMPORT LIBRARIES
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models # add models to the list
from torchvision.utils import make_grid
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# ignore harmless warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
import random
random.seed(42)
import numpy as np
np.random.seed(42)
import torch
torch.manual_seed(42)

In [None]:
# Define Transformers
# Per-channel mean / std handed to Normalize in both pipelines.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Training pipeline: light random augmentation followed by the deterministic steps.
train_transform = transforms.Compose([
    transforms.RandomRotation(10),      # rotate by a random angle within +/- 10 degrees
    transforms.RandomHorizontalFlip(),  # mirror 50% of the images
    transforms.Resize(224),             # resize shortest side to 224 pixels
    transforms.CenterCrop(224),         # keep the central 224 x 224 region
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

# Validation / test pipeline: same steps without the random augmentation.
test_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

In [None]:
# Build the datasets from the folder tree written under `model_path`; ImageFolder
# takes each image's label from the name of its class sub-folder.
train_data = datasets.ImageFolder(os.path.join(model_path, 'train'), transform=train_transform)
val_data = datasets.ImageFolder(os.path.join(model_path, 'valid'), transform=test_transform)

torch.manual_seed(42)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True , pin_memory=True)
# Validation does not need a random order; keeping it fixed makes the validation
# batches (and therefore the reported loss) identical from run to run.
val_loader = DataLoader(val_data, batch_size=64, shuffle=False)

class_names = train_data.classes

print(class_names)
print(f'Training images available: {len(train_data)}')
print(f'Validation images available: {len(val_data)}')

In [None]:
# Pull a single batch (64 images) from the training loader
images, labels = next(iter(train_loader))

# Tile the first 15 images, five per row (make_grid's default would be 8 per row)
im = make_grid(images[:15], nrow=5)

# Undo the Normalize step before plotting:
#   x_norm = (x - mean) / std   =>   x = (x_norm - (-mean / std)) / (1 / std)
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]
inv_normalize = transforms.Normalize(
    mean=[-m / s for m, s in zip(norm_mean, norm_std)],
    std=[1 / s for s in norm_std],
)
im_inv = inv_normalize(im)

# Show the grid; imshow expects channels last, so move C from axis 0 to axis 2
plt.figure(figsize=(20,10))
plt.imshow(im_inv.permute(1, 2, 0).numpy());
#plt.imshow(im.permute(1,2,0))

For training we use mixed precision (automatic mixed precision, AMP), which can speed up training considerably. The speed-up relies on Tensor Cores: training becomes faster only when the GPU provides them.

In [None]:
####Training Function#######

try:
    # tqdm.auto picks the notebook widget inside Jupyter and a text bar elsewhere.
    from tqdm.auto import tqdm,trange
except ImportError:
    # Progress bars are cosmetic: without tqdm fall back to plain iteration.
    trange = range

    def tqdm(iterable=None, *args, **kwargs):
        """Stand-in for tqdm that returns the iterable unchanged."""
        return iterable


def train(n_epochs, train_loader, val_loader, model, optimizer, criterion, use_cuda, save_path, scaler=None):
    """Train ``model`` with mixed precision and checkpoint the best validation epoch.

    Parameters
    ----------
    n_epochs : int
        Number of passes over ``train_loader``.
    train_loader, val_loader : iterable of (data, target) batches
        Must support ``len()`` (used to average the per-batch losses).
        ``target`` is reshaped to ``(batch, 1)`` and cast to float for the loss.
    model : torch.nn.Module
        Network to optimise; the caller is responsible for moving it to the GPU.
    optimizer : torch.optim.Optimizer
    criterion : callable
        Loss called as ``criterion(output, target)``.
    use_cuda : bool
        When true, batches are moved to CUDA and autocast / loss scaling are enabled.
    save_path : str
        Where the ``state_dict`` is written whenever the validation loss does
        not exceed the best value seen so far.
    scaler : torch.cuda.amp.GradScaler, optional
        Gradient scaler to use. A new one is created when omitted, so the
        function no longer depends on a module-level ``scaler`` variable.

    Returns
    -------
    (model, losses)
        ``model`` carries the weights of the LAST epoch (the best ones are in
        ``save_path``). ``losses`` is an ``(n_epochs, 2)`` array of mean
        training / validation loss per epoch.
    """
    if scaler is None:
        # A disabled scaler is a transparent pass-through, which is what CPU runs need.
        scaler = torch.cuda.amp.GradScaler(enabled=use_cuda)

    # Best validation loss so far. np.inf, because the np.Inf alias was removed in NumPy 2.0.
    valid_loss_min = np.inf
    losses=np.zeros((n_epochs,2))

    for epoch in trange(n_epochs):
        # running sums of the per-batch losses for this epoch
        train_loss = 0.0
        valid_loss = 0.0

        ###################
        # train the model #
        ###################
        model.train()
        for data, target in train_loader:
            if use_cuda:
                data, target = data.cuda(), target.cuda()

            optimizer.zero_grad()
            # The forward pass and loss run under autocast; it is switched off on CPU.
            with torch.cuda.amp.autocast(enabled=use_cuda):
                out = model(data)
                loss = criterion(out, target.unsqueeze(1).float())
            train_loss += loss.item()

            # Scale the loss before backward, then step and update through the scaler.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        train_loss /= len(train_loader)
        print(f'Epoch: {epoch+1} \tTraining Loss: {train_loss:.6f} \t',end='')

        ######################
        # validate the model #
        ######################
        model.eval()
        with torch.no_grad():
            for data, target in val_loader:
                if use_cuda:
                    data, target = data.cuda(), target.cuda()

                with torch.cuda.amp.autocast(enabled=use_cuda):
                    val_out = model(data)
                    loss = criterion(val_out, target.unsqueeze(1).float())
                valid_loss += loss.item()

        valid_loss /= len(val_loader)
        losses[epoch]=(train_loss,valid_loss)

        # print training/validation statistics
        print(f'Validation Loss: {valid_loss:.6f}')

        # Checkpoint whenever this epoch is at least as good as the best so far.
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ... Model saved ...')
            valid_loss_min = valid_loss
            torch.save(model.state_dict(), save_path)

    return model, losses

In [None]:
use_cuda = torch.cuda.is_available()
use_cuda

In [None]:
## Modify the classifier
# Load torchvision's pretrained GoogLeNet and replace its final fully-connected
# layer with a small head that ends in a single output unit.
model = models.googlenet(pretrained=True)
classifier_head = [
    nn.Linear(1024, 128, bias=False),
    nn.BatchNorm1d(128),
    nn.ReLU(),
    nn.Dropout(0.25),
    nn.Linear(128, 1),
]
model.fc = nn.Sequential(*classifier_head)

In [None]:
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scaler=torch.cuda.amp.GradScaler()

In [None]:
####TRAINING####
import time
start_time = time.time()
if use_cuda:
    model = model.cuda()

epochs = 1
model, losses = train(epochs,train_loader, val_loader, model, optimizer, criterion , use_cuda, 'model_melanoma.pt')
model.load_state_dict(torch.load('model_melanoma.pt'))

print(f'\nDuration: {time.time() - start_time:.0f} seconds') # print the time elapsed

In [None]:
### Train function without Validation

####Training Function#######

try:
    # tqdm.auto picks the notebook widget inside Jupyter and a text bar elsewhere.
    from tqdm.auto import tqdm,trange
except ImportError:
    # Progress bars are cosmetic: without tqdm fall back to plain iteration.
    trange = range

    def tqdm(iterable=None, *args, **kwargs):
        """Stand-in for tqdm that returns the iterable unchanged."""
        return iterable


def train2(n_epochs, train_loader, model, optimizer, criterion, use_cuda, save_path, scaler=None):
    """Train ``model`` with mixed precision, without a validation pass.

    Parameters
    ----------
    n_epochs : int
        Number of passes over ``train_loader``.
    train_loader : iterable of (data, target) batches
        Must support ``len()`` (used to average the per-batch losses).
        ``target`` is reshaped to ``(batch, 1)`` and cast to float for the loss.
    model : torch.nn.Module
        Network to optimise; the caller is responsible for moving it to the GPU.
    optimizer : torch.optim.Optimizer
    criterion : callable
        Loss called as ``criterion(output, target)``.
    use_cuda : bool
        When true, batches are moved to CUDA and autocast / loss scaling are enabled.
    save_path : str
        The ``state_dict`` is written here after every epoch, so the file always
        holds the most recent weights.
    scaler : torch.cuda.amp.GradScaler, optional
        Gradient scaler to use. A new one is created when omitted, so the
        function no longer depends on a module-level ``scaler`` variable.

    Returns
    -------
    (model, losses)
        The trained model and an ``(n_epochs, 1)`` array with the mean training
        loss of each epoch.
    """
    if scaler is None:
        # A disabled scaler is a transparent pass-through, which is what CPU runs need.
        scaler = torch.cuda.amp.GradScaler(enabled=use_cuda)

    # The unused validation trackers were removed; one of them referenced np.Inf,
    # an alias that no longer exists in NumPy 2.0.
    losses=np.zeros((n_epochs,1))

    for epoch in trange(n_epochs):
        # running sum of the per-batch losses for this epoch
        train_loss = 0.0

        ###################
        # train the model #
        ###################
        model.train()
        for data, target in train_loader:
            if use_cuda:
                data, target = data.cuda(), target.cuda()

            optimizer.zero_grad()
            # The forward pass and loss run under autocast; it is switched off on CPU.
            with torch.cuda.amp.autocast(enabled=use_cuda):
                out = model(data)
                loss = criterion(out, target.unsqueeze(1).float())
            train_loss += loss.item()

            # Scale the loss before backward, then step and update through the scaler.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        train_loss /= len(train_loader)
        # One line per epoch. The end='' copied from train() only made sense there,
        # where a validation print completes the line.
        print(f'Epoch: {epoch+1} \tTraining Loss: {train_loss:.6f} \t')

        losses[epoch]=(train_loss)

        torch.save(model.state_dict(), save_path)

    return model, losses

In [None]:
#### TRAINING WITHOUT VALIDATION ####
import time
start_time = time.time()
if use_cuda:
    model = model.cuda()

epochs = 30
model, losses = train2(epochs,train_loader, model, optimizer, criterion , use_cuda, 'model_melanoma1.pt')
model.load_state_dict(torch.load('model_melanoma1.pt'))

print(f'\nDuration: {time.time() - start_time:.0f} seconds') # print the time elapsed

In [None]:
from PIL import Image

model.load_state_dict(torch.load('./model_melanoma1.pt'))
model.eval()

def predict(image_name):
    """Score one test image with the module-level ``model``.

    Parameters
    ----------
    image_name : str
        Image id without extension; the file is read from the competition's
        ``jpeg/test`` folder.

    Returns
    -------
    float
        Sigmoid of the model's single output, i.e. a value in [0, 1].

    Notes
    -----
    Uses the module-level ``model`` and ``test_transform``. ``model`` is
    expected to be in eval mode already (``model.eval()`` is called before this
    function is defined).
    """
    path= f'../input/siim-isic-melanoma-classification/jpeg/test/{image_name}.jpg'
    # Run on whichever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    with torch.no_grad():
        # The context manager closes the file handle. convert('RGB') guarantees
        # the three channels that test_transform's Normalize is configured for.
        with Image.open(path) as image:
            img=test_transform(image.convert('RGB')).unsqueeze(0).to(device)
        return model(img).sigmoid().item()
    
predict('ISIC_0073313')

In [None]:
#load the model
#model.load_state_dict(torch.load('./model_melanoma1.pt'))

#predict and add it to the dataframe to subbmit
import time
# Restart the timer here: `start_time` otherwise still holds the value taken before
# training, so the duration printed below would include the whole training run.
start_time = time.time()
df_test['target'] = df_test['image_name'].apply(predict)

print(f'\nDuration: {time.time() - start_time:.0f} seconds') # print the time elapsed

In [None]:
df_test.head()

In [None]:
# take just the 2 columns i need from the test ( Result)
subm =pd.DataFrame(df_test,columns=['image_name', 'target'])
subm.head(2)

In [None]:
# save the o/p dataframe to csv file so i could download it to my computer or use it to submit
subm.to_csv('./sub.csv' , index = False)

In [None]:
# When I ran the model on the test data and submitted the result to Kaggle, I got a private leaderboard score of 0.8370 (the competition metric is ROC AUC, not accuracy).