In [57]:
# %matplotlib inline
# python libraries
import os, cv2,itertools
import matplotlib.pyplot as plt
import matplotlib.image as mpimg 
import numpy as np
import pandas as pd
from tqdm import tqdm
from glob import glob
from PIL import Image
# pytorch libraries
import torch
from torch import optim,nn
from torch.autograd import Variable
from torch.utils.data import DataLoader,Dataset
from torchvision import models,transforms

# sklearn libraries
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# make the results reproducible
np.random.seed(10)
torch.manual_seed(10)
torch.cuda.manual_seed(10)

print(os.listdir("../input/skin-cancer-mnist-ham10000/"))

['hmnist_28_28_RGB.csv', 'ham10000_images_part_1', 'HAM10000_images_part_2', 'hmnist_28_28_L.csv', 'HAM10000_images_part_1', 'HAM10000_metadata.csv', 'hmnist_8_8_RGB.csv', 'hmnist_8_8_L.csv', 'ham10000_images_part_2']


# Feature engineering and data preprocessing

In [58]:
data_dir = '../input/skin-cancer-mnist-ham10000'
# The images are spread over several sub-folders, hence the wildcard directory level.
all_image_path = glob(os.path.join(data_dir, '*', '*.jpg'))
# Map image id (file name without extension) -> file path, to join with the .csv label file.
imageid_path_dict = {os.path.splitext(os.path.basename(x))[0] : x for x in all_image_path}
# Short diagnosis code (the `dx` column) -> human-readable lesion name.
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    # 'mel' is melanoma (it used to be mislabelled as a second 'dermatofibroma').
    # Kept lower-case on purpose: the integer class codes are taken from the sorted
    # order of these names (upper-case sorts before lower-case), and the lists later
    # in the notebook that are indexed by class code expect 'mel' to sort last.
    'mel': 'melanoma',
    'bkl': 'Benign keratosis-like lesions ',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}
# Side length in pixels that every image is resized to before entering the network.
input_size = 224

In [59]:
# print(os.path.join(data_dir,'*','*.jpg'))
# print(glob(os.path.join(data_dir,'*','*.jpg')))

In [60]:
# {os.path.splitext(os.path.basename(x))[0] : x for x in all_image_path}

In [61]:
# Use this when training from scratch, i.e. when no pretrained model dictates the normalisation.
def compute_img_mean_std(image_paths):
    '''
    Compute the per-channel mean and standard deviation of a set of images.

    Every image is read with OpenCV, resized to input_size x input_size and
    rescaled from 0-255 to 0-1 before the statistics are taken.

    Parameters
    ----------
    image_paths : iterable of str
        Paths of the image files to include.

    Returns
    -------
    (means, stds) : tuple of two lists
        Three values each, in R, G, B order.

    Raises
    ------
    FileNotFoundError
        If an image cannot be read.
    ValueError
        If image_paths yields no images.
    '''
    img_h, img_w = input_size, input_size
    imgs = []
    means, stds = [], []

    for path in tqdm(image_paths):
        img = cv2.imread(path)
        if img is None:
            # cv2.imread reports failure by returning None rather than raising.
            raise FileNotFoundError(f'could not read image: {path}')
        # cv2.resize interpolates to the target size. ndarray.resize would only
        # reshape/truncate the raw buffer in place and drop the channel axis.
        img = cv2.resize(img, (img_w, img_h))
        imgs.append(img)

    if not imgs:
        raise ValueError('image_paths is empty; cannot compute mean and std')

    # list of (h, w, 3) arrays -> one (h, w, 3, n_images) array.
    # NOTE: this holds every image in memory as float32 at once.
    imgs = np.stack(imgs, axis=3)
    # rescale 0-255 -> 0-1
    imgs = imgs.astype(np.float32) / 255.

    print(imgs.shape)
    # mean and std of each channel over all pixels of all images
    for i in range(3):
        pixels = imgs[:, :, i, :].ravel()
        means.append(pixels.mean())
        stds.append(pixels.std())

    # OpenCV loads channels as BGR; flip both lists to RGB order.
    means.reverse()
    stds.reverse()

    print(f'mean {means}')
    print(f'std {stds}')
    return means, stds

In [62]:
# mean, std = compute_img_mean_std(all_image_path)

In [63]:
# pd.read_csv(os.path.join(data_dir,'HAM10000_metadata.csv'))

In [64]:
# Load the metadata CSV and add three derived columns:
#   path          - image file path, looked up by image_id
#   cell_type     - full lesion name, looked up by the short `dx` code
#   cell_type_idx - integer code of cell_type (used later as the class label)
df = pd.read_csv(os.path.join(data_dir,'HAM10000_metadata.csv'))
# Series.map calls the given function on every value; dict.get returns the
# value stored for that key, or None when the key is missing.
df['path'] = df['image_id'].map(imageid_path_dict.get) 
df['cell_type'] = df['dx'].map(lesion_type_dict.get)
# Categorical codes follow the sorted order of the distinct cell_type names.
df['cell_type_idx'] = pd.Categorical(df['cell_type']).codes
df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type,cell_type_idx
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,../input/skin-cancer-mnist-ham10000/HAM10000_i...,Benign keratosis-like lesions,2
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,../input/skin-cancer-mnist-ham10000/HAM10000_i...,Benign keratosis-like lesions,2
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,../input/skin-cancer-mnist-ham10000/HAM10000_i...,Benign keratosis-like lesions,2
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,../input/skin-cancer-mnist-ham10000/HAM10000_i...,Benign keratosis-like lesions,2
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,../input/skin-cancer-mnist-ham10000/ham10000_i...,Benign keratosis-like lesions,2


In [65]:
# df.groupby('lesion_id').count()

Duplicate images share the same lesion id (the HAM number),
so a lesion with duplicates has a count greater than 1 after the groupby.

In [66]:
# Count the rows per lesion, then keep only the lesions that have exactly one image.
ndf = (
    df.groupby('lesion_id')
      .count()
      .loc[lambda counts: counts['image_id'] == 1]
)
ndf.head()

Unnamed: 0_level_0,image_id,dx,dx_type,age,sex,localization,path,cell_type,cell_type_idx
lesion_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
HAM_0000001,1,1,1,1,1,1,1,1,1
HAM_0000003,1,1,1,1,1,1,1,1,1
HAM_0000004,1,1,1,1,1,1,1,1,1
HAM_0000007,1,1,1,1,1,1,1,1,1
HAM_0000008,1,1,1,1,1,1,1,1,1


In [67]:
# Turn the lesion_id index back into an ordinary column.
ndf = ndf.reset_index()
ndf.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type,cell_type_idx
0,HAM_0000001,1,1,1,1,1,1,1,1,1
1,HAM_0000003,1,1,1,1,1,1,1,1,1
2,HAM_0000004,1,1,1,1,1,1,1,1,1
3,HAM_0000007,1,1,1,1,1,1,1,1,1
4,HAM_0000008,1,1,1,1,1,1,1,1,1


In [None]:
# Tag every row of df by whether its lesion has several images or only one.

# Lesion ids that have exactly one image. Built once as a set so each lookup is
# constant-time, instead of rebuilding and scanning a list for every row.
_single_image_lesion_ids = set(ndf['lesion_id'])

def get_duplicate(x):
    """Return 'unduplicate' if lesion id `x` has a single image, else 'duplicated'."""
    if x in _single_image_lesion_ids:
        return 'unduplicate'
    return 'duplicated'

# New column derived from the lesion id of each row.
df['duplicates'] = df['lesion_id'].apply(get_duplicate)
df.head()

In [None]:
df['duplicates'].value_counts()

In [None]:
#filtering images which are not duplicated
df_undup = df[df['duplicates']=='unduplicate']
df_undup.shape

In [None]:
# Create the validation set: a stratified 20% sample of df_undup, i.e. only of
# lesions that have a single image. The other 80% returned by the split is
# discarded here (bound to `_`).
y = df_undup['cell_type_idx']
_, df_val = train_test_split(df_undup, test_size=0.20,random_state=101, stratify=y)
df_val.shape

In [None]:
df_val['cell_type_idx'].value_counts()

In [None]:
# Build the training set from df (duplicates included): every image that was
# not drawn into the validation set.

# Image ids of the validation rows. Built once as a set so each lookup is
# constant-time, instead of rebuilding and scanning a list for every row.
_val_image_ids = set(df_val['image_id'])

def get_val_rows(x):
    """Return 'val' if image id `x` is part of the validation set, else 'train'."""
    if str(x) in _val_image_ids:
        return 'val'
    return 'train'

# Tag each row, then keep the training rows.
df['train_or_val'] = df['image_id'].apply(get_val_rows)
df_train = df[df['train_or_val']=='train']
len(df_train), len(df_val)

In [None]:
df_train['cell_type_idx'].value_counts(), df_val['cell_type'].value_counts()

As seen above, there is a serious imbalance in the number of images per class.

In [None]:
# Oversample the under-represented classes by duplicating their rows.
# data_aug_rate[i] is the total number of copies wanted for class code i
# (cell_type_idx); 0 leaves that class untouched.
# NOTE: this cell rebinds df_train, so running it twice oversamples twice.
data_aug_rate = [15,10,5,50,0,40,5]
# DataFrame.append was removed in pandas 2.0: collect the extra copies and
# concatenate once. Row order matches appending class by class.
extra_copies = [
    df_train.loc[df_train['cell_type_idx'] == class_idx, :]
    for class_idx, rate in enumerate(data_aug_rate)
    if rate > 0
    for _ in range(rate - 1)
]
if any(rate > 0 for rate in data_aug_rate):
    df_train = pd.concat([df_train] + extra_copies, ignore_index=True)

In [None]:
df_train['cell_type'].value_counts()

In [None]:
df_train.head()

In [None]:
df_train.reset_index().head()

In [None]:
# Give both frames a fresh contiguous 0..n-1 index so integer row lookups line
# up with row positions. The previous index is kept as a new 'index' column.
df_train=df_train.reset_index()
df_val=df_val.reset_index()



# Model Building

In [None]:
# Transfer learning: either keep updating the pretrained weights, or freeze
# them so that only a layer attached afterwards is trained.
def set_para_req_grad(model, grad):
    """Freeze every current parameter of `model` unless `grad` is truthy.

    Parameters
    ----------
    model : torch.nn.Module
        Model whose parameters are (possibly) frozen in place.
    grad : bool
        True  -> the existing weights stay trainable (nothing is changed).
        False -> requires_grad is set to False on every existing parameter.
        Layers added to the model after this call are unaffected.
    """
    if not grad:
        for param in model.parameters():
            param.requires_grad = False

In [None]:
models.resnet50(),models.densenet121()

In [None]:
def initialise_model(model_name, num_classes, grad, use_pretrained=True):
    '''
    Build a torchvision backbone with a new classification head.

    Parameters
    ----------
    model_name : str
        'resnet' (ResNet-50) or 'densenet' (DenseNet-121).
    num_classes : int
        Output size of the replacement final linear layer.
    grad : bool
        Forwarded to set_para_req_grad; says whether the pretrained weights
        should be updated during training.
    use_pretrained : bool
        Use pretrained weights (True) or start from scratch (False).

    Returns
    -------
    torch.nn.Module
        The model. Its final layer is created after set_para_req_grad runs,
        so that layer is always trainable.

    Raises
    ------
    ValueError
        If model_name is not one of the supported names.
    '''
    if model_name == 'resnet':
        model = models.resnet50(pretrained=use_pretrained)
        set_para_req_grad(model, grad)
        num_ftrs = model.fc.in_features
        model.fc = nn.Linear(num_ftrs, num_classes)
    elif model_name == 'densenet':
        model = models.densenet121(pretrained=use_pretrained)
        set_para_req_grad(model, grad)
        num_ftrs = model.classifier.in_features
        model.classifier = nn.Linear(num_ftrs, num_classes)
    else:
        # Fail here with a clear message instead of returning None and
        # crashing later on the first use of the model.
        raise ValueError(
            f"unknown model_name {model_name!r}; expected 'resnet' or 'densenet'"
        )
    return model

In [None]:
# One output per diagnosis code in lesion_type_dict.
num_classes = 7
# Backbone choice understood by initialise_model: 'resnet' or 'densenet'.
model_name = 'densenet'
# Passed through initialise_model to set_para_req_grad.
grad = False
# Initialise the model to run, starting from pretrained weights.
model = initialise_model(model_name, num_classes, grad, use_pretrained=True)

In [None]:
# Train on the first GPU when one is available; otherwise fall back to the CPU
# so the notebook still runs on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [None]:
# Normalisation for the pretrained case: ImageNet channel statistics (RGB),
# which torchvision's pretrained models expect.
# When training from scratch, use the output of compute_img_mean_std instead.
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
# Training pipeline: resize, random augmentation, then tensor + normalisation.
train_transform = transforms.Compose([transforms.Resize((input_size,input_size)),
                                     transforms.RandomHorizontalFlip(),
                                     transforms.RandomVerticalFlip(),
                                     transforms.RandomRotation(20),
                                     transforms.ColorJitter(brightness=0.1, contrast=0.1, hue=0.1),
                                     transforms.ToTensor(),
                                     transforms.Normalize(mean, std)])
# Validation pipeline: same resize and normalisation, no random augmentation.
val_transform = transforms.Compose([transforms.Resize((input_size,input_size)),
                                   transforms.ToTensor(),
                                   transforms.Normalize(mean,std)])

In [None]:
# Dataset wrapper around a metadata DataFrame
class HAM1000(Dataset):
    """Serve (image, label) pairs from a DataFrame.

    The frame must have a 'path' column (image file path) and a
    'cell_type_idx' column (integer class code).
    """

    def __init__(self, df, transform=None):
        """Store the frame and the optional callable applied to each image."""
        self.df = df
        self.transform = transform

    def __len__(self):
        """Number of rows (samples) in the frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return (image, label) for the row at position `index`.

        The image is a PIL image in RGB mode, or whatever `transform`
        returns for it. The label is a 0-d integer tensor.
        """
        # Positional lookup: works whatever index labels the frame carries.
        path = self.df['path'].iloc[index]
        label = self.df['cell_type_idx'].iloc[index]

        # convert() loads the pixel data, so the file can be closed right away.
        # Forcing RGB guarantees three channels for the normalisation step.
        with Image.open(path) as img:
            X = img.convert('RGB')
        y = torch.tensor(int(label))

        if self.transform:
            X = self.transform(X)
        return X, y

In [None]:
# Datasets: the training frame gets the augmenting transform, the validation
# frame the plain one.
train_ds = HAM1000(df_train, transform=train_transform)
val_ds = HAM1000(df_val, transform=val_transform)

# Training hyper-parameters: batch size, number of epochs, learning rate.
batch_size = 32
epochs = 10
lr = 1e-3

# Data loaders: training batches are shuffled, validation batches are not.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True,num_workers=4)
val_dl = DataLoader(val_ds, batch_size=batch_size,shuffle=False, num_workers=3)

In [None]:
# Optimiser and criterion: Adam over all parameters of the model at learning
# rate `lr`, with cross-entropy as the classification loss.
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss().to(device)

# Training


In [None]:
#Loss and accuracy calculator
class AverageMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
        
    def update(self, val, n=1):
        self.val = val
        self.sum += val*n
        self.count += n
        self.avg = self.sum/self.count

In [None]:
# Running training loss / accuracy, sampled every 100 iterations by train().
tloss_train , tacc_train =[],[]
def train(train_loader, model, criterion, optimizer, epoch):
    """Run one training epoch over `train_loader`.

    Parameters
    ----------
    train_loader : iterable of (images, labels) batches supporting len()
    model : torch.nn.Module, moved to `device` by the caller
    criterion : loss taking (outputs, labels)
    optimizer : optimiser over the model's parameters
    epoch : value used only in the progress message

    Returns
    -------
    (loss, accuracy) : sample-weighted averages over the epoch.

    Side effect: every 100 iterations the running averages are printed and
    appended to the module-level lists tloss_train and tacc_train.
    """
    model.train()
    train_loss = AverageMeter()
    train_acc = AverageMeter()
    # Use the loader that was passed in, not a global one.
    n_batches = len(train_loader)

    for i, (images, labels) in enumerate(train_loader):
        n = images.size(0)

        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        pred = torch.argmax(outputs, dim=1)
        # Weight each batch by its size so a smaller final batch does not
        # count as much as a full one.
        train_acc.update(pred.eq(labels).sum().item()/n, n)
        train_loss.update(loss.item(), n)
        if (i+1)%100 == 0:
            print(f'epoch {epoch} [iter {i+1}/{n_batches}] [train loss {train_loss.avg:.5f}] [train acc {train_acc.avg:.5f}]')
            tloss_train.append(train_loss.avg)
            tacc_train.append(train_acc.avg)

    return train_loss.avg, train_acc.avg

In [None]:
def validate(val_loader, model, criterion, optimizer, epoch):
    """Evaluate `model` on `val_loader` without updating any weights.

    Parameters
    ----------
    val_loader : iterable of (images, labels) batches supporting len()
    model : torch.nn.Module, moved to `device` by the caller
    criterion : loss taking (outputs, labels)
    optimizer : unused; kept so the signature mirrors train()
    epoch : value used only in the summary message

    Returns
    -------
    (loss, accuracy) : sample-weighted averages over the loader
    (both 0 when the loader is empty).
    """
    model.eval()
    val_loss = AverageMeter()
    val_acc = AverageMeter()
    # Use the loader that was passed in, not a global one.
    n_batches = len(val_loader)

    with torch.no_grad():
        for images, labels in val_loader:
            n = images.size(0)
            images = images.to(device)
            labels = labels.to(device)

            output = model(images)
            pred = torch.argmax(output, dim=1)

            # Weight each batch by its size so a smaller final batch does not
            # count as much as a full one.
            val_acc.update(pred.eq(labels).sum().item()/n, n)
            val_loss.update(criterion(output, labels).item(), n)

    print('-----------------------------------------')
    print(f'[epoch {epoch}] [iter {n_batches}/{n_batches}] [val loss {val_loss.avg:.5f}] [val_acc {val_acc.avg:.5f}]')
    print('-----------------------------------------')

    return val_loss.avg, val_acc.avg

In [None]:
# Best validation accuracy seen so far, and the per-epoch validation history.
best_val_acc = 0
tloss_val, tacc_val = [],[]

# One training pass followed by one validation pass per epoch.
# `epoch` counts from 0 here and is what both functions print.
for epoch in range(epochs):
    loss_train, acc_train = train(train_dl, model, criterion, optimizer, epoch)
    loss_val, acc_val = validate(val_dl, model, criterion, optimizer, epoch)
    tloss_val.append(loss_val)
    tacc_val.append(acc_val)
    
    # Report whenever validation accuracy improves (the weights are not saved).
    if acc_val > best_val_acc:
        best_val_acc = acc_val
        print('******************')
        print(f'best [epoch{epoch}] [loss {loss_val:.5f}] [acc {acc_val:.5f}]')
        print('******************')

In [None]:
len(train_dl)

# Model evaluation

In [None]:
# Training curves (top) and validation curves (bottom).
fig = plt.figure(num=2)
fig1 = fig.add_subplot(2,1,1)
fig2 = fig.add_subplot(2,1,2)

# tloss_train / tacc_train hold one point per 100 training iterations.
fig1.plot(tloss_train, label='training loss')
fig1.plot(tacc_train, label='training acc')
fig1.set_title('Training')
fig1.set_xlabel('logging step (every 100 iterations)')

# tloss_val / tacc_val hold one point per epoch.
fig2.plot(tloss_val, label='val loss')
fig2.plot(tacc_val, label='val acc')
fig2.set_title('Validation')
fig2.set_xlabel('epoch')

# plt.legend() would only reach the last axes, so add a legend to each.
fig1.legend()
fig2.legend()
fig.tight_layout()
plt.show()

In [None]:
# Collect the true and predicted class codes over the whole validation set.
model.eval()
y_label = []
y_predict = []

with torch.no_grad():
    for images, labels in val_dl:
        images = images.to(device)
        output = model(images)
        pred = torch.argmax(output, dim=1)
        y_label.extend(labels.cpu().numpy())
        y_predict.extend(pred.cpu().numpy())
# Short diagnosis codes, listed in class-code (cell_type_idx) order.
plot_labels = ['akiec', 'bcc', 'bkl', 'df', 'nv', 'vasc','mel']

In [None]:
# Classification report: per-class precision, recall and F1 on the validation set.
# plot_labels supplies the display names and is expected to be ordered by class
# code - TODO confirm that every class occurs in y_label / y_predict.
report = classification_report(y_label, y_predict, target_names=plot_labels)
print(report)