### EDA

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import torchvision
from torchvision import transforms
from torchvision import models

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, precision_score

In [2]:
# Install GPUtil, which the next cell imports to print GPU utilisation.
# NOTE(review): `%pip install` would target the running kernel's environment; the version is not pinned.
!pip install GPUtil

In [3]:
# Alias GPUtil's utilisation printer so it can be called between experiments.
from GPUtil import showUtilization as gpu_usage

# Baseline reading, taken before any model has been created in this notebook.
gpu_usage()

In [4]:
# The dataset's images are split across two directories.
path_1 = '../input/skin-cancer-mnist-ham10000/HAM10000_images_part_1'
path_2 = '../input/skin-cancer-mnist-ham10000/HAM10000_images_part_2'

# Names of all files found under path_1, with the last four characters
# (the extension, e.g. ".jpg") stripped off.
files_path_1 = [
    filename[:-4]
    for _dirname, _subdirs, filenames in os.walk(path_1)
    for filename in filenames
]

In [5]:
# Load the HAM10000 metadata table; later cells read its 'image_id' and 'dx' columns.
data = pd.read_csv('../input/skin-cancer-mnist-ham10000/HAM10000_metadata.csv')

In [6]:
# Resolve every image_id to its file path: ids found under path_1 live there,
# all others are assumed to be under path_2.
# The ids are put in a set first so each membership test is O(1); testing
# against the list directly rescans it for every row of the metadata table.
part_1_ids = set(files_path_1)
path_list = [
    (path_1 if image_id in part_1_ids else path_2) + '/' + image_id + '.jpg'
    for image_id in data['image_id']
]

In [7]:
# Attach the resolved file path to each metadata row (path_list is in the same row order as data).
data['path'] = path_list

In [8]:
# Distinct values of the 'dx' column. Built through a set, so the list order carries no meaning.
label_types = list(set(data['dx']))

In [9]:
# Fraction of rows belonging to each 'dx' value, largest first.
n_rows = len(data)
shares = [(dx, len(data[data['dx'] == dx]) / n_rows) for dx in label_types]
sorted(shares, key=lambda pair: pair[1], reverse=True)

In [10]:
# Integer class index for every 'dx' code; the position in this list is the index.
labels_dict = {
    code: index
    for index, code in enumerate(['nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df'])
}

In [11]:
# Encode each row's 'dx' value as its integer class index (a code missing from labels_dict raises KeyError).
data['label'] = [labels_dict[i] for i in data['dx']]

In [14]:
# Drop the metadata columns that are not used from here on.
# Note: this removes 'dx' and rebinds `data`, so the label-encoding cell (which reads
# data['dx']) and this cell cannot be re-run without reloading the CSV first.
data = data.drop(columns=['lesion_id', 'dx', 'dx_type', 'age', 'sex', 'localization'])

In [15]:
# Stratified 80/20 split on the class label, so both parts keep the class proportions.
# random_state is fixed (42 is an arbitrary seed) so the split, and therefore every
# score reported below, is reproducible across kernel restarts.
train, test = train_test_split(
    data,
    stratify=data['label'],
    test_size=0.2,
    random_state=42,
)

### Dataset

In [16]:
class CustomDataset(Dataset):
    """Map-style dataset that loads an image from disk and pairs it with its label.

    Every item is opened with PIL, converted to RGB, resized to 224x224,
    turned into a tensor and normalised with mean 0.5 / std 0.5 per channel.
    No random augmentation is applied.

    Args:
        images: sequence of image file paths.
        labels: sequence of labels, aligned index-by-index with ``images``.
    """

    def __init__(self, images, labels):
        self.images = images
        self.labels = labels

    def __getitem__(self, index):
        """Return ``(image_tensor, label)`` for the sample at ``index``."""
        # Open inside a context manager so the file handle is released right away.
        # convert('RGB') forces the pixel data to load and guarantees three channels,
        # which the three-value Normalize below requires.
        with Image.open(self.images[index]) as source:
            image = source.convert('RGB')
        image = image.resize((224, 224))
        image = transforms.ToTensor()(image)
        image = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))(image)
        return image, self.labels[index]

    def __len__(self):
        """Number of samples, i.e. the number of image paths."""
        return len(self.images)

In [17]:
# Wrap both splits as datasets. Paths and labels are passed as plain lists, so the
# positional indexing in __getitem__ does not depend on the DataFrame index.
train_dataset = CustomDataset(list(train['path']), list(train['label']))
test_dataset = CustomDataset(list(test['path']), list(test['label']))

In [18]:
# Number of samples per batch for the data loaders below.
BATCH_SIZE = 100

In [19]:
# Training batches are reshuffled on every pass. The test loader iterates in a fixed
# order: shuffling adds nothing to evaluation, because labels and predictions are
# collected pairwise.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

### Helpers

In [20]:
def train_nn(model, data_loader, device, optimizer=None):
    """Train ``model`` for one pass over ``data_loader`` using cross-entropy loss.

    Args:
        model: network to train; it is switched to training mode.
        data_loader: iterable yielding ``(inputs, labels)`` batches.
        device: device each batch is moved to before the forward pass.
        optimizer: optimizer to step. When omitted, the notebook-level variable
            named ``optimizer`` is used, which is what this function previously
            relied on implicitly.

    Raises:
        NameError: if ``optimizer`` is omitted and no global ``optimizer`` exists.
    """
    if optimizer is None:
        # Backward compatibility for calls of the form train_nn(model, loader, device).
        try:
            optimizer = globals()['optimizer']
        except KeyError:
            raise NameError("name 'optimizer' is not defined") from None

    loss_fn = torch.nn.CrossEntropyLoss()
    model.train()
    for inputs, labels in tqdm(data_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Gradients accumulate by default, so clear them before each batch.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

In [21]:
def eval_nn(model, data_loader, device):
    """Run ``model`` over ``data_loader`` and print the F1 score of every class.

    Args:
        model: network to evaluate; it is switched to evaluation mode.
        data_loader: iterable yielding ``(inputs, labels)`` batches.
        device: device the inputs are moved to before the forward pass.

    Returns:
        ``(labels, predicted)``: two equally long lists of plain ints holding the
        true class and the arg-max predicted class for every sample seen.
    """
    predicted = []
    labels = []
    model.eval()
    with torch.no_grad():
        for x, y in tqdm(data_loader):
            outputs = model(x.to(device))
            # Index of the largest logit along the class dimension.
            _, predict = torch.max(outputs, 1)
            predicted += predict.tolist()
            # Store plain ints: extending a list with a tensor would append
            # one 0-d tensor per label instead.
            labels += y.tolist() if torch.is_tensor(y) else list(y)
        print(f1_score(labels, predicted, average=None))
    return labels, predicted

### Finetuning

In [22]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [23]:
# Show which device was selected.
device

In [24]:
# Fine-tuning setup: pretrained ResNet-18 whose final fully connected layer is
# replaced by a fresh 7-way classifier (one output per entry of labels_dict).
model = models.resnet18(pretrained=True)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(in_features=num_ftrs, out_features=7)
model = model.to(device)
# All parameters are optimised, not only the new head.
optimizer = optim.AdamW(params=model.parameters(), lr=0.001)

In [25]:
# Three epochs; each one is a full training pass followed by an evaluation on the
# test loader (eval_nn prints the per-class F1 scores).
for epoch in range(3):
    train_nn(model, train_loader, device)
    eval_nn(model, test_loader, device)

In [26]:
# GPU utilisation after the fine-tuning run.
gpu_usage()

In [27]:
# Ask PyTorch to release its unused cached GPU memory.
torch.cuda.empty_cache()

In [28]:
# GPU utilisation again, to compare with the reading taken before the cache was emptied.
gpu_usage()

### Dataset balancing

In [29]:
from torch.utils.data.sampler import WeightedRandomSampler

# Class-balanced sampling: each training sample is weighted by the inverse of its
# class frequency, so rare classes are drawn as often as common ones.
y_train = np.array(list(train['label']))
counts = np.bincount(y_train.astype(int))   # number of training samples per class index
labels_weights = 1. / counts                # one weight per class
weights = labels_weights[y_train.astype(int)]   # one weight per training sample
# Draw len(weights) samples per epoch, with replacement, so an epoch keeps its size.
sampler = WeightedRandomSampler(weights, len(weights), replacement=True)

In [30]:
# Rebuild the training loader so that batches are drawn through the weighted sampler
# instead of plain shuffling (a DataLoader accepts either shuffle=True or a sampler, not both).
train_loader = DataLoader(dataset=train_dataset, sampler=sampler, batch_size=BATCH_SIZE)

In [31]:
# Start again from a fresh pretrained ResNet-18 with a new 7-way head, so this
# experiment does not continue from the weights trained in the previous run.
model = models.resnet18(pretrained=True)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(in_features=num_ftrs, out_features=7)
model = model.to(device)
# A new optimizer is required because it must track the new model's parameters.
optimizer = optim.AdamW(params=model.parameters(), lr=0.001)

In [32]:
# Same three-epoch schedule as before, now training on class-balanced batches.
for epoch in range(3):
    train_nn(model, train_loader, device)
    eval_nn(model, test_loader, device)

In [33]:
# Ask PyTorch to release its unused cached GPU memory before the next experiment.
torch.cuda.empty_cache()

### Augmentation

In [34]:
class CustomDataset2(Dataset):
    """Image dataset producing 224x224 crops, with optional random augmentation.

    Every image is converted to RGB and resized to 300x300. With ``augment=True``
    a random 224x224 crop is taken, followed by a random horizontal flip, random
    perspective distortion, random rotation of up to 180 degrees and a random
    vertical flip. With ``augment=False`` a deterministic centre crop of the same
    size is taken instead, so repeated reads of one sample give the same tensor
    (useful for evaluation). The result is converted to a tensor and normalised
    with mean 0.5 / std 0.5 per channel.

    Args:
        images: sequence of image file paths.
        labels: sequence of labels, aligned index-by-index with ``images``.
        augment: apply the random augmentation chain (default ``True``, the
            behaviour this class always had).
    """

    def __init__(self, images, labels, augment=True):
        self.images = images
        self.labels = labels
        self.augment = augment

    def __getitem__(self, index):
        """Return ``(image_tensor, label)`` for the sample at ``index``."""
        # Open inside a context manager so the file handle is released right away.
        # convert('RGB') loads the pixels and guarantees the three channels that
        # the three-value Normalize below requires.
        with Image.open(self.images[index]) as source:
            image = source.convert('RGB')
        image = image.resize((300, 300))
        if self.augment:
            image = transforms.RandomCrop(224)(image)
            image = transforms.RandomHorizontalFlip()(image)
            image = transforms.RandomPerspective()(image)
            image = transforms.RandomRotation(180)(image)
            image = transforms.RandomVerticalFlip()(image)
        else:
            image = transforms.CenterCrop(224)(image)
        image = transforms.ToTensor()(image)
        image = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))(image)
        return image, self.labels[index]

    def __len__(self):
        """Number of samples, i.e. the number of image paths."""
        return len(self.images)

In [35]:
# Switch both splits to the augmenting dataset class.
# NOTE(review): the test split is wrapped in CustomDataset2 as well, so evaluation
# images also go through the random crop/flip/perspective/rotation chain and the
# reported scores vary from run to run. Consider a deterministic pipeline for the test set.
train_dataset = CustomDataset2(list(train['path']), list(train['label']))
test_dataset = CustomDataset2(list(test['path']), list(test['label']))

In [36]:
# Loaders for the new datasets: training still draws through the class-balancing
# sampler, the test loader iterates in dataset order.
train_loader = DataLoader(dataset=train_dataset, sampler=sampler, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [37]:
# Fresh pretrained ResNet-18 with a new 7-way head for the augmentation experiment.
model = models.resnet18(pretrained=True)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(in_features=num_ftrs, out_features=7)
model = model.to(device)
# The optimizer is recreated so that it tracks this model's parameters.
optimizer = optim.AdamW(params=model.parameters(), lr=0.001)

In [38]:
# Same three-epoch schedule, now with augmented and class-balanced training batches.
for epoch in range(3):
    train_nn(model, train_loader, device)
    eval_nn(model, test_loader, device)

In [39]:
# Ask PyTorch to release its unused cached GPU memory before the next experiment.
torch.cuda.empty_cache()

### Classification: self-made network

In [48]:
class SimpleConvNet(torch.nn.Module):
    """Small CNN: three conv+ReLU+max-pool stages followed by a three-layer MLP head.

    The first fully connected layer expects a flattened 32x5x5 feature map, which
    is what a 3x64x64 input produces after the three conv/pool stages. The
    network outputs 7 raw class scores (logits) per sample.
    """

    def __init__(self):
        super().__init__()
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(p=0.1)
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.conv3 = nn.Conv2d(16, 32, 3)
        self.fc1 = nn.Linear(32 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 7)
        self.batchnorm1 = nn.BatchNorm1d(120)
        self.batchnorm2 = nn.BatchNorm1d(84)

    def forward(self, x):
        """Map a batch of images ``(N, 3, 64, 64)`` to logits of shape ``(N, 7)``."""
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = self.pool(self.relu(self.conv3(x)))
        # Flatten per sample and keep the batch dimension fixed. With
        # reshape(-1, 32 * 5 * 5) an input of the wrong spatial size would be
        # silently folded into extra "samples"; this way fc1 raises a shape error.
        x = x.reshape(x.shape[0], -1)
        x = self.relu(self.fc1(x))
        x = self.batchnorm1(x)
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [49]:
class CustomDataset3(Dataset):
    """Image dataset producing 64x64 crops, with optional random augmentation.

    Every image is converted to RGB and resized to 80x80. With ``augment=True``
    a random 64x64 crop is taken, followed by a random horizontal flip, a random
    rotation of up to 180 degrees and a random vertical flip. With
    ``augment=False`` a deterministic centre crop of the same size is taken
    instead, so repeated reads of one sample give the same tensor (useful for
    evaluation). The result is converted to a tensor and normalised with
    mean 0.5 / std 0.5 per channel.

    Args:
        images: sequence of image file paths.
        labels: sequence of labels, aligned index-by-index with ``images``.
        augment: apply the random augmentation chain (default ``True``, the
            behaviour this class always had).
    """

    def __init__(self, images, labels, augment=True):
        self.images = images
        self.labels = labels
        self.augment = augment

    def __getitem__(self, index):
        """Return ``(image_tensor, label)`` for the sample at ``index``."""
        # Open inside a context manager so the file handle is released right away.
        # convert('RGB') loads the pixels and guarantees the three channels that
        # the three-value Normalize below requires.
        with Image.open(self.images[index]) as source:
            image = source.convert('RGB')
        image = image.resize((80, 80))
        if self.augment:
            image = transforms.RandomCrop(64)(image)
            image = transforms.RandomHorizontalFlip()(image)
            image = transforms.RandomRotation(180)(image)
            image = transforms.RandomVerticalFlip()(image)
        else:
            image = transforms.CenterCrop(64)(image)
        image = transforms.ToTensor()(image)
        image = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))(image)
        return image, self.labels[index]

    def __len__(self):
        """Number of samples, i.e. the number of image paths."""
        return len(self.images)

In [50]:
# Switch both splits to the 64x64 dataset class used by the self-made network.
# NOTE(review): the test split is wrapped in CustomDataset3 as well, so evaluation
# images also get the random crop/flip/rotation and the reported scores vary from
# run to run. Consider a deterministic pipeline for the test set.
train_dataset = CustomDataset3(list(train['path']), list(train['label']))
test_dataset = CustomDataset3(list(test['path']), list(test['label']))

In [51]:
# Same batch size as in the earlier experiments (the constant is re-assigned here).
BATCH_SIZE = 100

# Training draws through the class-balancing sampler; the test loader iterates in dataset order.
train_loader = DataLoader(dataset=train_dataset, sampler=sampler, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [52]:
# Xavier init

def init_weights(m):
    """Initialise one module; meant to be passed to ``Module.apply``.

    Only ``nn.Linear`` modules are touched: the weight gets Xavier-uniform
    initialisation and the bias, when the layer has one, is filled with 0.01.
    Modules of any other type (e.g. convolutions) are left unchanged.

    Args:
        m: the module visited by ``apply``.
    """
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        # A layer created with bias=False has m.bias set to None.
        if m.bias is not None:
            m.bias.data.fill_(0.01)

In [53]:
# Build the self-made network from scratch (no pretrained weights).
model = SimpleConvNet()

# apply() visits every submodule; init_weights only changes the nn.Linear layers.
model.apply(init_weights)
# Module.to() moves the parameters in place, so no re-assignment is needed here.
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

In [54]:
# Release PyTorch's unused cached GPU memory, then print the current GPU utilisation.
torch.cuda.empty_cache()
gpu_usage()

In [55]:
# Thirty epochs for the network trained from scratch (the fine-tuning runs above used three);
# each epoch is one training pass followed by an evaluation on the test loader.
for epoch in range(30):
    train_nn(model, train_loader, device)
    eval_nn(model, test_loader, device)