### EDA

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import torchvision
from torchvision import transforms
from torchvision import models

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, precision_score

In [2]:
!pip install GPUtil

[0m

In [3]:
from GPUtil import showUtilization as gpu_usage

gpu_usage()

| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |


In [4]:
path_1 = '../input/skin-cancer-mnist-ham10000/HAM10000_images_part_1'
path_2 = '../input/skin-cancer-mnist-ham10000/HAM10000_images_part_2'

files_path_1 = []
for dirname, _, filenames in os.walk(path_1):
    for filename in filenames:
        files_path_1.append(filename[:-4])

In [5]:
data = pd.read_csv('../input/skin-cancer-mnist-ham10000/HAM10000_metadata.csv')

In [6]:
path_list = [(path_1 + '/' + i + '.jpg') if i in files_path_1 else (path_2 + '/' + i + '.jpg') for i in list(data['image_id'])]

In [7]:
data['path'] = path_list

In [8]:
label_types = list(set(data['dx']))

In [9]:
sorted([(l, len(data[data['dx'] == l])/len(data)) for l in label_types], key=lambda x: x[1], reverse=True)

[('nv', 0.6694957563654518),
 ('mel', 0.11113330004992511),
 ('bkl', 0.10973539690464304),
 ('bcc', 0.05132301547678482),
 ('akiec', 0.032651023464802795),
 ('vasc', 0.014178731902146779),
 ('df', 0.011482775836245632)]

In [10]:
labels_dict = {
    'nv': 0,
    'mel': 1,
    'bkl': 2,
    'bcc': 3,
    'akiec': 4,
    'vasc': 5,
    'df': 6
}

In [11]:
data['label'] = [labels_dict[i] for i in data['dx']]

In [12]:
data = data.drop(columns=['lesion_id', 'dx', 'dx_type', 'age', 'sex', 'localization'])

In [13]:
train, test = train_test_split(data, stratify=list(data['label']), test_size = 0.2)

### Dataset

In [14]:
class CustomDataset(Dataset):
    
    def __init__(self, images, labels):
        self.images = images
        self.labels = labels
        
    def __getitem__(self, index):
        image_path = self.images[index]
        label = self.labels[index]
        image = Image.open(image_path)
        image = image.resize((224, 224))
        image = transforms.ToTensor()(image)
        image = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))(image)
        return image, label
        
    def __len__ (self):
        return len(self.images)

In [15]:
train_dataset = CustomDataset(list(train['path']), list(train['label']))
test_dataset = CustomDataset(list(test['path']), list(test['label']))

In [16]:
BATCH_SIZE = 100

In [17]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

### Helpers

In [18]:
def train_nn(model, data_loader, device):
        loss_fn = torch.nn.CrossEntropyLoss()
        model.train()
        for data in tqdm(data_loader):
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

In [19]:
def eval_nn(model, data_loader, device):
    predicted = []
    labels = []
    model.eval()
    with torch.no_grad():
        for data in tqdm(data_loader):
            x, y = data
            x = x.to(device)

            outputs = model(x)
            _, predict = torch.max(outputs.data, 1)
            predict = predict.cpu().detach().numpy().tolist()
            predicted += predict
            labels += y
        print(f1_score(labels, predicted, average=None))
    return labels, predicted

### Finetuning

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
device

device(type='cuda')

In [22]:
model = models.resnet18(pretrained=True)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 7)
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

In [23]:
for epoch in range(3):
    train_nn(model, train_loader, device)
    eval_nn(model, test_loader, device)

100%|██████████| 81/81 [03:01<00:00,  2.24s/it]
100%|██████████| 21/21 [00:43<00:00,  2.09s/it]


[0.86732819 0.18796992 0.46944444 0.52702703 0.05405405 0.77777778
 0.        ]


100%|██████████| 81/81 [02:02<00:00,  1.52s/it]
100%|██████████| 21/21 [00:31<00:00,  1.49s/it]


[0.87055667 0.07758621 0.55364807 0.43478261 0.3655914  0.65714286
 0.        ]


100%|██████████| 81/81 [02:04<00:00,  1.53s/it]
100%|██████████| 21/21 [00:30<00:00,  1.48s/it]

[0.88513514 0.39664804 0.43887147 0.69642857 0.03030303 0.81632653
 0.33333333]





In [24]:
gpu_usage()

| ID | GPU | MEM |
------------------
|  0 |  3% | 26% |


In [25]:
torch.cuda.empty_cache()

In [26]:
gpu_usage()

| ID | GPU | MEM |
------------------
|  0 | 48% | 10% |


### Dataset balancing

In [27]:
from torch.utils.data.sampler import WeightedRandomSampler
y_train = np.array(list(train['label']))
counts = np.bincount(y_train.astype(int))
labels_weights = 1. / counts
labels_weights = 1. / counts
weights = labels_weights[y_train.astype(int)]
sampler = WeightedRandomSampler(weights, len(weights), replacement=True)

In [28]:
train_loader = DataLoader(dataset=train_dataset, sampler=sampler, batch_size=BATCH_SIZE)

In [29]:
model = models.resnet18(pretrained=True)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 7)
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

In [30]:
for epoch in range(3):
    train_nn(model, train_loader, device)
    eval_nn(model, test_loader, device)

100%|██████████| 81/81 [02:04<00:00,  1.54s/it]
100%|██████████| 21/21 [00:31<00:00,  1.51s/it]


[0.71132075 0.40343348 0.39058824 0.64285714 0.32571429 0.65882353
 0.23300971]


100%|██████████| 81/81 [02:05<00:00,  1.55s/it]
100%|██████████| 21/21 [00:32<00:00,  1.53s/it]


[0.82150185 0.28037383 0.42829077 0.42105263 0.29577465 0.72727273
 0.20689655]


100%|██████████| 81/81 [02:04<00:00,  1.53s/it]
100%|██████████| 21/21 [00:31<00:00,  1.51s/it]

[0.87651515 0.47136564 0.58503401 0.63043478 0.32941176 0.81818182
 0.54545455]





In [31]:
torch.cuda.empty_cache()

### Augmentation

In [32]:
class CustomDataset2(Dataset):
    
    def __init__(self, images, labels):
        self.images = images
        self.labels = labels
        
    def __getitem__(self, index):
        image_path = self.images[index]
        label = self.labels[index]
        image = Image.open(image_path)
        image = image.resize((300, 300))
        image = transforms.RandomCrop(224)(image)
        image = transforms.RandomHorizontalFlip()(image)
        image = transforms.RandomPerspective()(image)
        image = transforms.RandomRotation(180)(image)
        image = transforms.RandomVerticalFlip()(image)
        image = transforms.ToTensor()(image)
        image = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))(image)
        return image, label
        
    def __len__ (self):
        return len(self.images)

In [33]:
train_dataset = CustomDataset2(list(train['path']), list(train['label']))
test_dataset = CustomDataset2(list(test['path']), list(test['label']))

In [34]:
train_loader = DataLoader(dataset=train_dataset, sampler=sampler, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [35]:
model = models.resnet18(pretrained=True)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 7)
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

In [36]:
for epoch in range(3):
    train_nn(model, train_loader, device)
    eval_nn(model, test_loader, device)

100%|██████████| 81/81 [02:21<00:00,  1.74s/it]
100%|██████████| 21/21 [00:35<00:00,  1.71s/it]


[0.6610338  0.38135593 0.5087108  0.50793651 0.32967033 0.52631579
 0.25225225]


100%|██████████| 81/81 [02:21<00:00,  1.74s/it]
100%|██████████| 21/21 [00:36<00:00,  1.73s/it]


[0.65602322 0.47040971 0.35761589 0.31081081 0.23611111 0.12903226
 0.0610687 ]


100%|██████████| 81/81 [02:20<00:00,  1.74s/it]
100%|██████████| 21/21 [00:36<00:00,  1.73s/it]

[0.78736123 0.38736842 0.50903614 0.647343   0.33684211 0.83018868
 0.21176471]





In [37]:
torch.cuda.empty_cache()

### Classificstion: selfmade network

In [38]:
class SimpleConvNet(torch.nn.Module):

    
    def __init__(self):
        super(SimpleConvNet, self).__init__()
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(p=0.1)
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.conv3 = nn.Conv2d(16, 32, 3)
        self.fc1 = nn.Linear(32 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 7)
        self.batchnorm1 = nn.BatchNorm1d(120)
        self.batchnorm2 = nn.BatchNorm1d(84)

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = self.pool(self.relu(self.conv3(x)))
        x = x.reshape(-1, 32 * 5 * 5)
        x = self.relu(self.fc1(x))
        x = self.batchnorm1(x)
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [39]:
class CustomDataset3(Dataset):
    
    def __init__(self, images, labels):
        self.images = images
        self.labels = labels
        
    def __getitem__(self, index):
        image_path = self.images[index]
        label = self.labels[index]
        image = Image.open(image_path)
        image = image.resize((80, 80))
        image = transforms.RandomCrop(64)(image)
        image = transforms.RandomHorizontalFlip()(image)
        image = transforms.RandomRotation(180)(image)
        image = transforms.RandomVerticalFlip()(image)
        image = transforms.ToTensor()(image)
        image = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))(image)
        return image, label
        
    def __len__ (self):
        return len(self.images)

In [40]:
train_dataset = CustomDataset3(list(train['path']), list(train['label']))
test_dataset = CustomDataset3(list(test['path']), list(test['label']))

In [41]:
BATCH_SIZE = 100

train_loader = DataLoader(dataset=train_dataset, sampler=sampler, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [42]:
# Xavier init

def init_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)

In [43]:
model = SimpleConvNet()

model.apply(init_weights)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

In [44]:
torch.cuda.empty_cache()
gpu_usage()

| ID | GPU | MEM |
------------------
|  0 | 39% |  5% |


In [45]:
for epoch in range(30):
    train_nn(model, train_loader, device)
    eval_nn(model, test_loader, device)

100%|██████████| 81/81 [01:49<00:00,  1.35s/it]
100%|██████████| 21/21 [00:27<00:00,  1.30s/it]


[0.62345374 0.28205128 0.06949807 0.03571429 0.14893617 0.1938326
 0.05479452]


100%|██████████| 81/81 [01:48<00:00,  1.35s/it]
100%|██████████| 21/21 [00:27<00:00,  1.29s/it]


[0.68245529 0.34910277 0.39895013 0.33333333 0.17073171 0.22564103
 0.15686275]


100%|██████████| 81/81 [01:48<00:00,  1.34s/it]
100%|██████████| 21/21 [00:27<00:00,  1.31s/it]


[0.49944383 0.32890365 0.34226804 0.29816514 0.2283737  0.36
 0.08108108]


100%|██████████| 81/81 [01:49<00:00,  1.35s/it]
100%|██████████| 21/21 [00:26<00:00,  1.27s/it]


[0.82307093 0.30681818 0.32380952 0.36578171 0.34615385 0.31506849
 0.1969697 ]


100%|██████████| 81/81 [01:49<00:00,  1.35s/it]
100%|██████████| 21/21 [00:27<00:00,  1.29s/it]


[0.72508113 0.40236686 0.38888889 0.38431373 0.40236686 0.44444444
 0.15028902]


100%|██████████| 81/81 [01:49<00:00,  1.35s/it]
100%|██████████| 21/21 [00:27<00:00,  1.30s/it]


[0.49327354 0.29226361 0.359401   0.2745098  0.21072797 0.42222222
 0.10714286]


100%|██████████| 81/81 [01:49<00:00,  1.36s/it]
100%|██████████| 21/21 [00:27<00:00,  1.32s/it]


[0.72582905 0.36697248 0.38095238 0.34161491 0.27568922 0.54545455
 0.13043478]


100%|██████████| 81/81 [01:48<00:00,  1.34s/it]
100%|██████████| 21/21 [00:27<00:00,  1.31s/it]


[0.75280395 0.3973064  0.41       0.32369942 0.27678571 0.58536585
 0.125     ]


100%|██████████| 81/81 [01:48<00:00,  1.34s/it]
100%|██████████| 21/21 [00:27<00:00,  1.29s/it]


[0.75325843 0.39352428 0.43512974 0.38787879 0.44878049 0.63888889
 0.22857143]


100%|██████████| 81/81 [01:48<00:00,  1.34s/it]
100%|██████████| 21/21 [00:27<00:00,  1.32s/it]


[0.61151079 0.32711306 0.4371134  0.29530201 0.31012658 0.58974359
 0.16528926]


100%|██████████| 81/81 [01:48<00:00,  1.34s/it]
100%|██████████| 21/21 [00:27<00:00,  1.29s/it]


[0.82811245 0.43571429 0.41269841 0.46540881 0.31404959 0.54945055
 0.29166667]


100%|██████████| 81/81 [01:49<00:00,  1.35s/it]
100%|██████████| 21/21 [00:27<00:00,  1.30s/it]


[0.70599339 0.36662107 0.37735849 0.17218543 0.30604982 0.525
 0.13186813]


100%|██████████| 81/81 [01:50<00:00,  1.36s/it]
100%|██████████| 21/21 [00:27<00:00,  1.29s/it]


[0.7675145  0.3928036  0.44239631 0.43621399 0.33670034 0.60759494
 0.35555556]


100%|██████████| 81/81 [01:50<00:00,  1.37s/it]
100%|██████████| 21/21 [00:27<00:00,  1.32s/it]


[0.83286119 0.42780749 0.43979058 0.51937984 0.34285714 0.52873563
 0.29906542]


100%|██████████| 81/81 [01:50<00:00,  1.36s/it]
100%|██████████| 21/21 [00:27<00:00,  1.30s/it]


[0.78565179 0.43585781 0.46608696 0.24460432 0.31884058 0.63888889
 0.22818792]


100%|██████████| 81/81 [01:50<00:00,  1.36s/it]
100%|██████████| 21/21 [00:27<00:00,  1.33s/it]


[0.6150692  0.35218509 0.44250871 0.44609665 0.33333333 0.62337662
 0.21621622]


100%|██████████| 81/81 [01:50<00:00,  1.36s/it]
100%|██████████| 21/21 [00:26<00:00,  1.28s/it]


[0.83272283 0.46428571 0.38418079 0.53744493 0.4137931  0.52747253
 0.25862069]


100%|██████████| 81/81 [01:50<00:00,  1.36s/it]
100%|██████████| 21/21 [00:27<00:00,  1.31s/it]


[0.75971093 0.41547278 0.4488189  0.45756458 0.41545894 0.77777778
 0.40740741]


100%|██████████| 81/81 [01:49<00:00,  1.35s/it]
100%|██████████| 21/21 [00:27<00:00,  1.30s/it]


[0.70375654 0.37851038 0.44806517 0.26760563 0.37037037 0.64615385
 0.2       ]


100%|██████████| 81/81 [01:49<00:00,  1.36s/it]
100%|██████████| 21/21 [00:27<00:00,  1.33s/it]


[0.79759347 0.43052838 0.42290749 0.43859649 0.32394366 0.43243243
 0.20168067]


100%|██████████| 81/81 [01:51<00:00,  1.37s/it]
100%|██████████| 21/21 [00:27<00:00,  1.29s/it]


[0.64538653 0.38591549 0.41011236 0.33333333 0.36842105 0.64102564
 0.19161677]


100%|██████████| 81/81 [01:49<00:00,  1.36s/it]
100%|██████████| 21/21 [00:26<00:00,  1.28s/it]


[0.82460515 0.44657097 0.45112782 0.47457627 0.44444444 0.64102564
 0.25242718]


100%|██████████| 81/81 [01:50<00:00,  1.37s/it]
100%|██████████| 21/21 [00:27<00:00,  1.31s/it]


[0.75421797 0.39586411 0.41025641 0.54464286 0.38222222 0.65671642
 0.30088496]


100%|██████████| 81/81 [01:51<00:00,  1.38s/it]
100%|██████████| 21/21 [00:28<00:00,  1.34s/it]


[0.72515166 0.38443936 0.47761194 0.57416268 0.43902439 0.55813953
 0.32786885]


100%|██████████| 81/81 [01:50<00:00,  1.36s/it]
100%|██████████| 21/21 [00:27<00:00,  1.31s/it]


[0.73713491 0.39402985 0.44333333 0.21621622 0.32713755 0.6984127
 0.24242424]


100%|██████████| 81/81 [01:50<00:00,  1.36s/it]
100%|██████████| 21/21 [00:27<00:00,  1.33s/it]


[0.73293079 0.40526316 0.44404973 0.45918367 0.38709677 0.71186441
 0.23333333]


100%|██████████| 81/81 [01:51<00:00,  1.37s/it]
100%|██████████| 21/21 [00:27<00:00,  1.33s/it]


[0.69115515 0.40836013 0.40803709 0.41361257 0.28813559 0.74576271
 0.20183486]


100%|██████████| 81/81 [01:50<00:00,  1.37s/it]
100%|██████████| 21/21 [00:27<00:00,  1.32s/it]


[0.82392304 0.42910448 0.39361702 0.4610951  0.375      0.46601942
 0.39344262]


100%|██████████| 81/81 [01:51<00:00,  1.38s/it]
100%|██████████| 21/21 [00:27<00:00,  1.32s/it]


[0.76777678 0.42364532 0.43554007 0.4784689  0.43137255 0.47787611
 0.4       ]


100%|██████████| 81/81 [01:50<00:00,  1.37s/it]
100%|██████████| 21/21 [00:28<00:00,  1.35s/it]

[0.82945092 0.42519685 0.50093458 0.48797251 0.3539823  0.66666667
 0.3       ]



