In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt 


In [None]:
# Download the PyTorch/XLA environment setup script from the pytorch/xla
# GitHub repository (master branch) and save it as pytorch-xla-env-setup.py.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# Run the downloaded script, requesting the "nightly" version and the
# libomp5 / libopenblas-dev apt packages.
# NOTE(review): neither the script (master) nor the version (nightly) is
# pinned, so a re-run may install something different.
!python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev

In [None]:
import warnings
import torch_xla
import torch_xla.debug.metrics as met
import torch_xla.distributed.data_parallel as dp
import torch_xla.distributed.parallel_loader as pl
import torch_xla.utils.utils as xu
import torch_xla.core.xla_model as xm 
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.test.test_utils as test_utils
import warnings
import gc

# Suppress every warning category for the remainder of the kernel session.
warnings.filterwarnings("ignore")

In [None]:
pip install efficientnet_pytorch

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from skimage import io, transform
from torchvision import models, transforms
import torch.optim as optim
from torch.optim import lr_scheduler
import time
import copy 
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split as ttp
from skimage.filters import threshold_otsu
from skimage.color import rgb2gray
import cv2 as cv 
import pickle
import random 
import albumentations

In [None]:
# Image directories: DIR0 / DIR1 are the two training image sources,
# TEST_FOLDER holds the test images.
DIR0 = '../input/jpeg-melanoma-384x384/train'
DIR1 = '../input/jpeg-isic2019-384x384/train'
TEST_FOLDER = '../input/jpeg-melanoma-384x384/test'

# One training table per fold index (0..4), plus the test table.
FOLD_CSVS = {
    fold_index: '../input/combined-train/train' + str(fold_index) + '.csv'
    for fold_index in range(5)
}
TEST_CSV = '../input/combined-train/test.csv'

# Directory the inference branch loads the saved per-fold models from.
MODELS_PATH = '../input/tpu-models/'

In [None]:
# Per-channel normalisation statistics shared by all three pipelines.
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

# Training: normalise, then random shift/scale/rotate and a random flip.
transform_train = albumentations.Compose(
    [
        albumentations.Normalize(mean, std, always_apply=True),
        albumentations.ShiftScaleRotate(),
        albumentations.Flip(p=0.5),
    ]
)

# Validation: normalisation only, no augmentation.
transform_valid = albumentations.Compose(
    [albumentations.Normalize(mean, std, always_apply=True)]
)

# Test: same normalisation-only pipeline as validation.
transform_test = albumentations.Compose(
    [albumentations.Normalize(mean, std, always_apply=True)]
)


In [None]:
class melanoma_dataset(Dataset):
    """Image dataset backed by a pandas table.

    Table columns are addressed by position:
      * column 0 - image file name without the ``.jpg`` extension,
      * column 1 - in training mode, the value ``'-1'`` selects ``root_dir[1]``,
        anything else selects ``root_dir[0]``,
      * column 3 - the target, read only in training mode.

    Parameters
    ----------
    root_dir : str or sequence of str
        Either a single image directory, or a pair ``[dir0, dir1]`` chosen per
        row through column 1 (training mode).
    transform : callable or None
        Called as ``transform(image=img)``; its ``'image'`` entry is used.
        When falsy the raw image is used.
    df : pandas.DataFrame, optional
        Table to read rows from. Takes precedence over ``csv_file`` when it is
        not empty.
    csv_file : str or False, optional
        Path of a CSV file to read the table from.
    train : bool
        ``True`` -> items are ``(image, target)``; ``False`` -> items are
        ``image`` only.

    Raises
    ------
    ValueError
        If neither ``df`` nor ``csv_file`` is supplied.
    """

    def __init__(self, root_dir, transform, df=None, csv_file=False, train=True):
        if df is None and not csv_file:
            raise ValueError('melanoma_dataset needs either `df` or `csv_file`')

        # A fresh frame per instance instead of one default object shared by
        # every instance.
        self.df = pd.DataFrame() if df is None else df
        self.csv = pd.read_csv(csv_file) if csv_file else None
        self.directory = root_dir
        self.transform = transform
        self.train = train

    def _table(self):
        """Return the active table: ``df`` when non-empty, otherwise the CSV."""
        if not self.df.empty or self.csv is None:
            return self.df
        return self.csv

    def _image_directory(self, tab, idx):
        """Pick the directory that holds the image of row ``idx``."""
        # A single directory was given: use it as is rather than indexing
        # into the string character by character.
        if isinstance(self.directory, (str, os.PathLike)):
            return self.directory
        if not self.train:
            return TEST_FOLDER
        # str() so that a numeric -1 selects the second directory as well.
        if str(tab.iloc[idx, 1]) == '-1':
            return self.directory[1]
        return self.directory[0]

    def __getitem__(self, idx):
        tab = self._table()

        if torch.is_tensor(idx):
            idx = idx.tolist()

        directory = self._image_directory(tab, idx)
        img_name = os.path.join(directory, tab.iloc[idx, 0]) + '.jpg'

        # cv.imread signals failure by returning None instead of raising.
        # NOTE(review): cv.imread yields BGR channel order while the
        # normalisation statistics used in this notebook look like RGB-ordered
        # values - left unchanged because already-saved models were
        # presumably trained this way; confirm before converting.
        img = cv.imread(img_name)
        if img is None:
            raise FileNotFoundError('Could not read image: {}'.format(img_name))

        target = tab.iloc[idx, 3] if self.train else 0

        if self.transform:
            img = self.transform(image=img)['image']
        # HWC -> CHW, float32; also done when no transform is configured so
        # that `image` is always defined.
        image = np.transpose(img, (2, 0, 1)).astype(np.float32)

        if self.train:
            return image, target
        return image

    def __len__(self):
        return len(self._table())
        
    
    

In [None]:
def seed_everything(seed):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Parameters
    ----------
    seed : int
        Value used for every random number generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick possibly different
    # algorithms between runs, which defeats the deterministic setting above.
    torch.backends.cudnn.benchmark = False

seed_everything(47)

In [None]:
class Data_Loader():
    """Builds a DataLoader whose sampler shards the dataset across XLA replicas."""

    def __init__(self, dataset):
        self.dataset = dataset

    def get(self, batch_size, shuffle, num_workers):
        """Return a DataLoader over ``self.dataset``.

        Shuffling is delegated to the DistributedSampler (the loader itself is
        created with ``shuffle=False``); the replica count and rank come from
        ``xm.xrt_world_size()`` and ``xm.get_ordinal()``.
        """
        data_utils = torch.utils.data
        replica_sampler = data_utils.distributed.DistributedSampler(
            self.dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=shuffle,
        )
        return data_utils.DataLoader(
            self.dataset,
            batch_size=batch_size,
            shuffle=False,
            sampler=replica_sampler,
            num_workers=num_workers,
        )

In [None]:
# Module-level dataset / loader built from the fold-1 table.
# kfold_train() builds its own datasets and loaders per fold.
CSV = pd.read_csv(FOLD_CSVS[1])
# The zeros array is only a placeholder second argument for the splitter;
# the two label outputs are discarded.
train_df, valid_df, _, _ = ttp(CSV, np.zeros(len(CSV)))
dataset = melanoma_dataset(root_dir=[DIR0, DIR1], transform=transform_train, df=train_df)
dataloader = Data_Loader(dataset).get(
    batch_size=64,
    shuffle=True,
    num_workers=4,
)

In [None]:
from efficientnet_pytorch import EfficientNet 
# Earlier experiment, kept for reference: a pretrained B1 model and CUDA/CPU device selection.
#model0 = EfficientNet.from_pretrained('efficientnet-b1')
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Device handle obtained from torch_xla; models and batches are moved onto it.
device = xm.xla_device() 
# NOTE(review): not referenced anywhere else in the visible notebook.
feature_extract = True



In [None]:
def _run_phase(model, dataloader, dataset_size, criterion, optimizer, is_train):
    """Run one pass over ``dataloader`` in train or eval mode.

    Returns ``(mean_loss, probabilities)`` where ``mean_loss`` is the summed
    per-sample loss divided by ``dataset_size`` and ``probabilities`` is a list
    of per-batch sigmoid outputs as NumPy arrays (collected in eval mode only).
    """
    if is_train:
        model.train()
    else:
        model.eval()

    running_loss = 0.0
    probabilities = []

    for inputs, labels in dataloader:
        # .to() moves/casts without the extra copy (and warning) that
        # torch.tensor(existing_tensor) produces.
        inputs = inputs.to(device, dtype=torch.float32)
        labels = labels.to(device, dtype=torch.float32)

        with torch.set_grad_enabled(is_train):
            outputs = model(inputs)
            loss = criterion(outputs, labels.unsqueeze(1))

            if is_train:
                optimizer.zero_grad()
                loss.backward()
                xm.optimizer_step(optimizer, barrier=True)
            else:
                # Only validation outputs are needed for the ROC-AUC; keeping
                # training outputs would also keep their autograd graphs alive.
                probabilities.append(outputs.detach().sigmoid().cpu().numpy())

        running_loss += loss.item() * inputs.size(0)

    return running_loss / dataset_size, probabilities


def _train_fold(fold, folds, num_epochs):
    """Train one fold and return ``(best validation ROC-AUC, best epoch)``.

    The model with the best validation ROC-AUC is saved to ``model_{fold}.pth``.
    """
    best_roc_auc = 0.0
    best_epoch = None  # stays None if no epoch ever beats the initial 0.0

    model = EfficientNet.from_pretrained('efficientnet-b6')
    model_path = f'model_{fold}.pth'
    # Replace the classifier head with a single-logit layer sized from the
    # existing head instead of a hard-coded feature count.
    model._fc = nn.Linear(model._fc.in_features, 1)
    model.to(device)

    print('-'*10)
    CSV = pd.read_csv(FOLD_CSVS[fold])
    # The zeros array is a placeholder second argument; its outputs are discarded.
    train_df, valid_df, _, _ = ttp(CSV, np.zeros(len(CSV)))

    print('Fold {}/{}'.format(fold, folds-1))

    optimizer = optim.SGD(model.parameters(), lr = 0.001, momentum = 0.9)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
    criterion = nn.BCEWithLogitsLoss()

    # Datasets and loaders do not change between epochs, so build them once.
    train_dataset = melanoma_dataset([DIR0,DIR1], transform_train, train_df)
    valid_dataset = melanoma_dataset([DIR0,DIR1], transform_valid, valid_df)
    train_loader = Data_Loader(train_dataset).get(batch_size = 64, shuffle = True, num_workers = 8)
    valid_loader = Data_Loader(valid_dataset).get(batch_size = 64, shuffle = False, num_workers = 8)

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs-1))

        # A DistributedSampler derives its shuffle from the epoch number;
        # without set_epoch every epoch iterates in the same order.
        if hasattr(train_loader.sampler, 'set_epoch'):
            train_loader.sampler.set_epoch(epoch)

        train_loss, _ = _run_phase(model, train_loader, len(train_dataset),
                                   criterion, optimizer, is_train=True)
        scheduler.step()
        print('{} Epoch: {} Loss: {:.4f} ROC-AUC: {:.4f}'.format('train', epoch, train_loss, 0))

        valid_loss, valid_probs = _run_phase(model, valid_loader, len(valid_dataset),
                                             criterion, optimizer, is_train=False)
        epoch_roc_auc = roc_auc_score(valid_df['target'], np.concatenate(valid_probs).ravel())

        if epoch_roc_auc > best_roc_auc:
            best_roc_auc = epoch_roc_auc
            best_epoch = epoch
            # The whole module is pickled (not just the state dict) because
            # the inference branch restores it with a plain torch.load().
            torch.save(model,model_path)

        print('{} Epoch: {} Loss: {:.4f} ROC-AUC: {:.4f}'.format('val', epoch, valid_loss, epoch_roc_auc))

    return best_roc_auc, best_epoch


def _predict_test(folds):
    """Average the per-fold test logits and write ``melanoma_preds.csv``."""
    dataset = melanoma_dataset(TEST_FOLDER, transform_test, csv_file = TEST_CSV, train = False)
    dataloader = DataLoader(dataset, batch_size = 64, shuffle = False, num_workers = 2)

    fold_logits = {}
    for fold in range(folds):
        # NOTE(review): torch.load unpickles arbitrary objects - only load
        # model files from a trusted source.
        model = torch.load(f'{MODELS_PATH}model_{fold}.pth')
        model.eval()
        model.to(device)

        batch_logits = []
        # Scoped no_grad instead of globally disabling autograd, which would
        # stay disabled after this function returns.
        with torch.no_grad():
            for inputs in dataloader:
                inputs = inputs.to(device, dtype=torch.float32)
                batch_logits.append(model(inputs).cpu().numpy())
        fold_logits[fold] = np.concatenate(batch_logits).ravel()

    # Mean logit per image across folds, then a single sigmoid.
    outputs_df = pd.DataFrame(fold_logits)
    mean_logits = outputs_df.mean(axis=1).to_numpy()
    preds = torch.from_numpy(mean_logits).sigmoid().numpy()

    sub = pd.DataFrame()
    sub['image_name'] = pd.read_csv(TEST_CSV)['image_name']
    sub['target'] = preds
    sub.to_csv('melanoma_preds.csv', index = False)


def kfold_train(folds=3, num_epochs = 3,train = True):
    """Train one EfficientNet-B6 per fold, or predict the test set with saved models.

    Parameters
    ----------
    folds : int
        Number of folds to train (``FOLD_CSVS[0..folds-1]``) or, for
        inference, number of saved ``model_{fold}.pth`` files to ensemble.
    num_epochs : int
        Epochs per fold (training only).
    train : bool
        ``True``  -> train every fold, saving the best model of each fold to
        ``model_{fold}.pth`` and printing per-fold and mean best ROC-AUC.
        ``False`` -> load the models from ``MODELS_PATH``, average their
        logits over the test set and write ``melanoma_preds.csv``.

    Returns
    -------
    None
    """
    if train:
        roc = 0
        for fold in range(folds):
            best_roc_auc, best_epoch = _train_fold(fold, folds, num_epochs)
            print('Fold {} Best Val ROC-AUC: {:4f}, ({})'.format(fold, best_roc_auc, best_epoch))
            roc += best_roc_auc

        print('FINAL ROC-AUC: {}'.format(roc/folds))
    else:
        _predict_test(folds)
    
                

In [None]:
#kfold_train(folds = 5, num_epochs = 5)

In [None]:
# Inference run (train=False): loads model_0..model_2 from MODELS_PATH and
# writes melanoma_preds.csv; num_epochs is not used on this path.
kfold_train(folds=3, num_epochs = 3,train = False)