In [None]:
import sys
sys.path.append('../input/hongnanrwightmangenefficientnetpytorchaug192020/gen-efficientnet-pytorch-master')
import geffnet

In [None]:
import os
import time
import cv2
import random
import argparse
import numpy as np
import pandas as pd
from typing import Optional
from tqdm import tqdm
import sklearn
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data.sampler import RandomSampler, SequentialSampler
from torch.optim.lr_scheduler import CosineAnnealingLR
from datetime import datetime
import albumentations
from albumentations.pytorch.transforms import ToTensorV2

# Config

In [None]:
class GlobalConfig:
    """Static settings for the melanoma training run.

    The notebook binds this class itself to ``config`` and reads every value
    as a class attribute.
    """

    # ---- reproducibility / task definition ----
    seed = 1958
    num_classes = 2
    num_folds = 5
    class_col_name = "target"

    # ---- optimisation ----
    batch_size = 16
    n_epochs = 5
    lr = 3e-5
    min_lr = 1e-6
    weight_decay = 1e-6
    scheduler = "CosineAnnealingWarmRestarts"
    T_0 = 10  # CosineAnnealingWarmRestarts
    train_step_scheduler = False  # call scheduler.step right after optimizer.step
    val_step_scheduler = True  # call scheduler.step(val_loss) once per epoch

    # ---- image geometry ----
    image_size = 512
    resize = 256
    crop_size = {128: 110, 256: 200, 512: 400}

    # ---- logging ----
    verbose = 1
    verbose_step = 1
    log_path = "./log.txt"

    # ---- data and output locations ----
    train_path = "../input/melanoma-merged-external-data-512x512-jpeg/512x512-dataset-melanoma/512x512-dataset-melanoma"
    # NOTE(review): this path ends with "/" although it looks like a CSV file - confirm before use.
    csv_path = "../input/melanoma-external-malignant-256/train_concat.csv/"
    save_path = "./"
    # No test_path is defined here; the Melanoma dataset reads config.test_path when test=True.

    # ---- model ----
    effnet = "tf_efficientnet_b2_ns"
    model_weight_path = "../input/efficientnet-weights/tf_efficientnet_b2_ns-00306e48.pth"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Bind the configuration class itself (no instance is created); settings are read as class attributes.
config = GlobalConfig

# Utils

In [None]:
from collections import Counter
from tqdm import tqdm
from typing import Optional, List

# possible reference: https://www.programiz.com/python-programming/methods/string/join

def get_file_type(image_folder_path: str, allowed_extensions: Optional[List]=None):
    """Return the single, lower-cased file extension shared by every entry of a folder.

    Args:
        image_folder_path: directory whose entries (as returned by ``os.listdir``) are inspected.
        allowed_extensions: extensions (with leading dot, lower case) that are acceptable.
            Defaults to ``['.jpg', '.png', '.jpeg']`` when ``None``.

    Returns:
        The common extension, e.g. ``'.jpg'``.

    Raises:
        AssertionError: if the folder does not contain exactly one distinct extension,
            or if that extension is not in ``allowed_extensions``.
    """
    if allowed_extensions is None:
        allowed_extensions = ['.jpg', '.png', '.jpeg']

    file_list = os.listdir(image_folder_path)
    # Extensions are compared case-insensitively ('.JPG' and '.jpg' count as the same type).
    extension_dict = Counter(os.path.splitext(file)[-1].lower() for file in tqdm(file_list))

    found_extensions = sorted(extension_dict)
    # The message previously formatted the bound method `extension_dict.keys` (no call),
    # which printed a method repr instead of the offending extensions.
    assert len(found_extensions) == 1, "The extension in the folder should all be the same, but found {} extensions".format(found_extensions)

    extension_type = found_extensions[0]
    assert extension_type in allowed_extensions, "Extension {!r} is not one of the allowed extensions {}".format(
        extension_type, allowed_extensions)
    return extension_type

# Augmentations

In [None]:
def get_transforms(config):
    """Build the albumentations pipelines for training and validation.

    Both pipelines finish with a square resize to ``config.image_size`` followed by
    ``ToTensorV2``; the training pipeline applies random flips, brightness/contrast,
    hue/saturation and shift-scale-rotate augmentations before that.

    Args:
        config: object exposing ``image_size``.

    Returns:
        Tuple ``(transforms_train, transforms_val)`` of ``albumentations.Compose`` objects.
    """
    side = config.image_size

    def _resize_then_tensor():
        # A fresh pair per pipeline so the two Compose objects share no transform instance.
        return [
            albumentations.Resize(height=side, width=side, p=1.0),
            ToTensorV2(p=1.0),
        ]

    # Random augmentations, applied in this order and only during training.
    random_augmentations = [
        albumentations.VerticalFlip(p=0.5),
        albumentations.HorizontalFlip(p=0.5),
        albumentations.RandomBrightness(limit=0.2, p=0.75),
        albumentations.RandomContrast(limit=0.2, p=0.75),
        albumentations.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=20, val_shift_limit=10, p=0.5),
        albumentations.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=15, border_mode=0, p=0.85),
    ]

    transforms_train = albumentations.Compose(random_augmentations + _resize_then_tensor(), p=1.0)
    transforms_val = albumentations.Compose(_resize_then_tensor(), p=1.0)
    return transforms_train, transforms_val


# Dataset

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import albumentations
import torch
# from torch.utils.data import Dataset

from typing import Optional
from tqdm import tqdm



class Melanoma(torch.utils.data.Dataset):
    """Dataset yielding ``(image_id, image, label)`` for each row of a dataframe.

    Images are read with OpenCV from ``config.train_path`` (or ``config.test_path``
    when ``test=True``) using the row's ``image_name`` plus the extension detected
    in the training folder.
    """

    def __init__(
        self,
        dataframe: pd.DataFrame,
        config: type,
        transforms: Optional[albumentations.core.composition.Compose] = None,
        test: bool = False,
        albu_norm: bool = False
    ):
        """
        Args:
            dataframe: one row per image; must contain ``image_name`` and the label column.
            config: settings object exposing ``train_path`` (and ``test_path`` when ``test=True``).
                If it defines ``class_col_name`` that column is used for labels, otherwise ``"target"``.
            transforms: optional albumentations pipeline applied to every image.
            test: read images from ``config.test_path`` instead of ``config.train_path``.
            albu_norm: when ``False`` the image is converted to float32 and divided by 255
                before the transforms run; when ``True`` that scaling is skipped.

        Raises:
            AssertionError: if ``albu_norm`` is set while ``transforms`` is ``None``.
        """
        self.df = dataframe
        self.config = config
        self.transforms = transforms
        self.test = test
        self.albu_norm = albu_norm
        # Label column name; falls back to the previously hard-coded "target".
        self.label_col = getattr(config, "class_col_name", "target")

        # albu_norm only makes sense together with a transform pipeline, so reject
        # the combination "no transforms + albu_norm" up front.
        if self.transforms is None:
            assert self.albu_norm is False
            print('Transforms is None and Albumentation Normalization is not initialized!')

        # NOTE(review): the extension is always detected from the training folder,
        # also when test=True - confirm the test folder uses the same extension.
        self.image_extension = get_file_type(image_folder_path=config.train_path, allowed_extensions=None)

    def __len__(self):
        """Number of rows in the underlying dataframe."""
        return len(self.df)

    def __getitem__(self, idx: int):
        """Load one sample.

        Returns:
            Tuple ``(image_id, image, label)``. ``label`` is an int64 tensor. ``image`` is
            whatever the transform pipeline returns under ``"image"``, or - without
            transforms - a float32 tensor built directly from the decoded array
            (presumably height x width x channel, as OpenCV decodes it).

        Raises:
            FileNotFoundError: if OpenCV cannot read the image file.
        """
        # NOTE(review): the label is read even when test=True, so the frame must carry the label column.
        label = self.df[self.label_col].values[idx]
        label = torch.as_tensor(data=label, dtype=torch.int64, device=None)
        image_id = self.df.image_name.values[idx]

        image_folder = self.config.test_path if self.test else self.config.train_path
        image_path = os.path.join(image_folder, "{}{}".format(image_id, self.image_extension))

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None; fail here with the path
            # rather than with an opaque error inside cvtColor.
            raise FileNotFoundError("Could not read image at '{}'".format(image_path))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.albu_norm is False:
            image = image.astype(np.float32) / 255.0

        if self.transforms is not None:
            image = self.transforms(image=image)["image"]
        else:
            image = torch.as_tensor(data=image, dtype=torch.float32, device=None)

        return image_id, image, label


# Model

In [None]:
class CustomEfficientNet(nn.Module):
    """geffnet backbone whose ``classifier`` layer is replaced by a fresh linear head.

    The new head maps the backbone's classifier input features to
    ``config.num_classes`` outputs.
    """

    def __init__(self, config: type, pretrained: bool = True):
        """
        Args:
            config: settings object exposing ``model_weight_path``, ``effnet`` and ``num_classes``.
            pretrained: forwarded unchanged to ``geffnet.create_model``.
        """
        super().__init__()
        self.config = config

        backbone = geffnet.create_model(
            model_name=config.effnet,
            pretrained=pretrained,
            model_weight_path=config.model_weight_path,
        )
        # Swap the stock classifier for one sized to this task.
        head_in_features = backbone.classifier.in_features
        backbone.classifier = nn.Linear(head_in_features, config.num_classes)
        self.model = backbone

    def forward(self, x):
        """Return the backbone's output (logits from the replaced head) for ``x``."""
        # TODO: add dropout layers, or the likes.
        return self.model(x)

# Meters

In [None]:
class AverageLossMeter:
    """
    Computes and stores the average and current loss

    Attributes:
        curr_batch_avg_loss: mean loss of the most recent batch passed to ``update``.
        running_total_loss: sum over batches of (batch mean loss * batch size).
        count: total number of samples seen.
        avg: ``running_total_loss / count``; stays 0 until at least one sample is counted.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all accumulated statistics."""
        self.curr_batch_avg_loss = 0
        self.avg = 0
        self.running_total_loss = 0
        self.count = 0

    def update(self, curr_batch_avg_loss: float, batch_size: int):
        """Fold one batch into the running, sample-weighted average.

        Args:
            curr_batch_avg_loss: mean loss over the batch.
            batch_size: number of samples in the batch (used as the weight).
        """
        self.curr_batch_avg_loss = curr_batch_avg_loss
        self.running_total_loss += curr_batch_avg_loss * batch_size
        self.count += batch_size
        # Guard against an empty first batch: keep the previous average instead of dividing by zero.
        if self.count > 0:
            self.avg = self.running_total_loss / self.count


# Maybe compare with utils.py from source
class AccuracyMeter:
    """Running, sample-weighted accuracy over a sequence of batches.

    Attributes:
        score: accuracy of the most recent batch.
        count: total number of samples seen.
        sum: sum over batches of (batch accuracy * batch size).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all accumulated statistics."""
        self.score = 0
        self.count = 0
        self.sum = 0

    def update(self, y_true, y_pred, batch_size=1):
        """Fold one batch into the running accuracy.

        Args:
            y_true: ground-truth labels of the batch.
            y_pred: predicted labels of the batch.
            batch_size: number of samples in the batch (used as the weight).
        """
        self.batch_size = batch_size
        self.count += self.batch_size
        # accuracy_score is already a per-batch mean, so re-weight it by the batch
        # size before accumulating.
        self.score = sklearn.metrics.accuracy_score(y_true, y_pred)
        self.sum += self.score * self.batch_size

    @property
    def avg(self):
        """Accuracy over every sample seen so far; 0 before any sample was counted."""
        # Reading the meter before the first update (e.g. an empty loader) used to
        # raise ZeroDivisionError.
        self.avg_score = self.sum / self.count if self.count else 0
        return self.avg_score

# Train

In [None]:
def seed_all(seed: int = 1992):
    """Seed Python, NumPy and PyTorch RNGs and configure cuDNN.

    Sets ``PYTHONHASHSEED``, seeds ``random``, ``numpy.random`` and the torch
    CPU/CUDA generators, then sets ``cudnn.deterministic = True``,
    ``cudnn.benchmark = False`` and ``cudnn.enabled = False``.

    Args:
        seed: value used for every generator.
    """
    print("Using Seed Number {}".format(seed))

    # Interpreter-level and NumPy randomness.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU first, then every CUDA device, then the current one.
    for seeder in (torch.manual_seed, torch.cuda.manual_seed_all, torch.cuda.manual_seed):
        seeder(seed)

    # cuDNN switches, applied in the same order as before.
    for flag, value in (("deterministic", True), ("benchmark", False), ("enabled", False)):
        setattr(torch.backends.cudnn, flag, value)


def seed_worker(worker_id):
    """Seed NumPy and ``random`` from torch's current initial seed.

    Intended for use as a DataLoader ``worker_init_fn``. The torch seed is reduced
    modulo 2**32 before being handed to the other generators.

    Args:
        worker_id: accepted for the ``worker_init_fn`` signature; not used.
    """
    derived_seed = torch.initial_seed() % (2 ** 32)
    for seeder in (np.random.seed, random.seed):
        seeder(derived_seed)


class Trainer:
    """Train and validate a model for one fold, checkpointing on the best validation loss.

    The optimizer (AdamW), the loss (CrossEntropyLoss) and the LR scheduler
    (ReduceLROnPlateau) are created in ``__init__``. All settings are read from
    ``self.config``; no module-level ``config`` is consulted.
    """

    def __init__(self, model, config: type):
        """
        Args:
            model: network to optimise.
            config: settings object exposing ``lr``, ``device``, ``n_epochs``, ``effnet``,
                ``verbose``, ``verbose_step``, ``log_path``, ``train_step_scheduler`` and
                ``val_step_scheduler``.
        """
        self.model = model
        self.config = config
        self.epoch = 0
        self.best_acc = 0
        self.best_loss = 10**5
        # Validation predictions of the latest epoch; fit() fills it and save() stores it.
        self.val_predictions = None

        # TODO consider moving these to config class
        # NOTE(review): weight decay is fixed at 0 and the scheduler is always
        # ReduceLROnPlateau; weight-decay/scheduler entries in the config are not read here.
        self.optimizer = torch.optim.AdamW(model.parameters(),
                                           lr=config.lr,
                                           weight_decay=0)
        self.criterion = nn.CrossEntropyLoss()
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            mode="min",
            factor=0.8,
            patience=1,
            verbose=True,
            min_lr=1e-8)

        self.log("Trainer prepared. We are using {} device.".format(
            self.config.device))

    def fit(self, train_loader, val_loader, fold: int):
        """Run ``config.n_epochs`` epochs of training and validation for one fold.

        After every epoch the model is checkpointed if the average validation loss
        improved, and the scheduler is stepped on that loss when
        ``config.val_step_scheduler`` is set.

        Args:
            train_loader: iterable of ``(image_ids, images, labels)`` training batches.
            val_loader: iterable of ``(image_ids, images, labels)`` validation batches.
            fold: fold index; used in log messages and in the checkpoint file name.

        Returns:
            The checkpoint dict loaded from this fold's best-loss file.
        """
        self.log("Training on Fold {}".format(fold + 1))

        # Single file per fold; it is overwritten whenever the validation loss improves.
        checkpoint_path = "{}_best_loss_fold_{}.pt".format(self.config.effnet, fold)

        for _ in range(self.config.n_epochs):
            # Log the learning rate in effect for this epoch together with a timestamp.
            lr = self.optimizer.param_groups[0]["lr"]
            timestamp = datetime.fromtimestamp(time.time())
            self.log("\n{}\nLR: {}".format(timestamp, lr))

            # ---- training pass ----
            train_start_time = time.time()
            avg_train_loss, avg_train_acc_score = self.train_one_epoch(
                train_loader)
            train_end_time = time.time()

            train_elapsed_time = time.strftime(
                "%H:%M:%S", time.gmtime(train_end_time - train_start_time))
            self.log(
                "[RESULT]: Train. Epoch {} | Avg Train Summary Loss: {:.6f} | Train Accuracy: {:6f} | Time Elapsed: {}"
                .format(self.epoch + 1, avg_train_loss, avg_train_acc_score,
                        train_elapsed_time))

            # ---- validation pass ----
            val_start_time = time.time()
            avg_val_loss, avg_val_acc_score, val_predictions = self.valid_one_epoch(
                val_loader)
            # Kept on the instance so save() can store it next to the weights.
            self.val_predictions = val_predictions
            val_end_time = time.time()
            val_elapsed_time = time.strftime(
                "%H:%M:%S", time.gmtime(val_end_time - val_start_time))

            self.log(
                "[RESULT]: Validation. Epoch: {} | Avg Validation Summary Loss: {:.6f} | Validation Accuracy: {:.6f} | Time Elapsed: {}"
                .format(self.epoch + 1, avg_val_loss, avg_val_acc_score,
                        val_elapsed_time))

            # Model selection is driven by validation loss, not training loss.
            if avg_val_loss < self.best_loss:
                self.best_loss = avg_val_loss
                # TODO consider including epoch number inside, and call this epoch number as well
                # through self.load to load the weights in curr_fold_best_checkpoint
                self.save(checkpoint_path)

            if self.best_acc < avg_val_acc_score:
                self.best_acc = avg_val_acc_score
                # TODO consider saving these weights as well.

            # ReduceLROnPlateau needs the monitored metric, hence step(avg_val_loss).
            if self.config.val_step_scheduler:
                self.scheduler.step(avg_val_loss)

            self.epoch += 1

        # All epochs done: hand back the best checkpoint written above.
        curr_fold_best_checkpoint = self.load(checkpoint_path)
        return curr_fold_best_checkpoint

    def train_one_epoch(self, train_loader):
        """Run one optimisation pass over ``train_loader``.

        Args:
            train_loader: iterable of ``(image_ids, images, labels)`` batches.

        Returns:
            Tuple ``(average loss, average accuracy)`` over the epoch.
        """
        self.model.train()

        summary_loss = AverageLossMeter()
        accuracy_scores = AccuracyMeter()

        start_time = time.time()

        for step, (image_ids, images, labels) in enumerate(train_loader):

            images = images.to(self.config.device)
            labels = labels.to(self.config.device)
            batch_size = images.shape[0]

            logits = self.model(images)
            loss = self.criterion(input=logits, target=labels)
            summary_loss.update(loss.item(), batch_size)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # Hard predictions for the running accuracy (detached: no gradients needed).
            y_true = labels.cpu().numpy()
            softmax_preds = torch.nn.Softmax(dim=1)(input=logits).to("cpu").detach().numpy()
            y_preds = softmax_preds.argmax(1)
            accuracy_scores.update(y_true, y_preds, batch_size=batch_size)

            # NOTE(review): ReduceLROnPlateau.step() requires a metric argument, so enabling
            # train_step_scheduler with the scheduler built in __init__ raises TypeError.
            if self.config.train_step_scheduler:
                self.scheduler.step()

            end_time = time.time()

            # Read verbosity from this trainer's config, not from a module-level `config`.
            if self.config.verbose:
                if (step % self.config.verbose_step) == 0:
                    print(
                        f"Train Steps {step}/{len(train_loader)}, " +
                        f"summary_loss: {summary_loss.avg:.3f}, acc: {accuracy_scores.avg:.3f} "
                        + f"time: {(end_time - start_time):.3f}",
                        end="\r",
                    )

        return summary_loss.avg, accuracy_scores.avg

    def valid_one_epoch(self, val_loader):
        """Evaluate the model on ``val_loader`` without gradient tracking.

        Args:
            val_loader: iterable of ``(image_ids, images, labels)`` batches.

        Returns:
            Tuple ``(average loss, average accuracy, predictions)`` where ``predictions``
            is the concatenation of the per-batch softmax outputs.
        """
        self.model.eval()

        summary_loss = AverageLossMeter()
        accuracy_scores = AccuracyMeter()

        start_time = time.time()

        # Per-batch softmax outputs, concatenated at the end for the OOF predictions.
        val_preds_list = []

        with torch.no_grad():
            for step, (image_ids, images, labels) in enumerate(val_loader):

                images = images.to(self.config.device)
                labels = labels.to(self.config.device)
                batch_size = images.shape[0]

                logits = self.model(images)
                loss = self.criterion(input=logits, target=labels)
                summary_loss.update(loss.item(), batch_size)

                y_true = labels.cpu().numpy()
                # No detach needed here: under no_grad the output carries no graph.
                softmax_preds = torch.nn.Softmax(dim=1)(
                    input=logits).to("cpu").numpy()
                y_preds = softmax_preds.argmax(1)
                accuracy_scores.update(y_true, y_preds, batch_size=batch_size)

                val_preds_list.append(softmax_preds)

                end_time = time.time()

                # Read verbosity from this trainer's config, not from a module-level `config`.
                if self.config.verbose:
                    if (step % self.config.verbose_step) == 0:
                        print(
                            f"Validation Steps {step}/{len(val_loader)}, " +
                            f"summary_loss: {summary_loss.avg:.3f}, val_acc: {accuracy_scores.avg:.6f} "
                            + f"time: {(end_time - start_time):.3f}",
                            end="\r",
                        )

            val_predictions = np.concatenate(val_preds_list)

        return summary_loss.avg, accuracy_scores.avg, val_predictions

    def save_model(self, path):
        """Write only the model's ``state_dict`` to ``path`` (the model is switched to eval mode)."""
        self.model.eval()
        torch.save(self.model.state_dict(), path)

    def save(self, path):
        """Write a full checkpoint to ``path``.

        The checkpoint holds the model, optimizer and scheduler state dicts, the best
        accuracy and loss so far, the epoch counter and the latest validation
        predictions (key ``"oof_preds"``). The model is switched to eval mode first.
        """
        self.model.eval()
        torch.save(
            {
                "model_state_dict": self.model.state_dict(),
                "optimizer_state_dict": self.optimizer.state_dict(),
                "scheduler_state_dict": self.scheduler.state_dict(),
                "best_acc": self.best_acc,
                "best_loss": self.best_loss,
                "epoch": self.epoch,
                "oof_preds": self.val_predictions,
            },
            path,
        )

    def load(self, path):
        """Return the checkpoint stored at ``path``; the trainer's own state is left untouched."""
        checkpoint = torch.load(path)
        return checkpoint

    def log(self, message):
        """Print ``message`` when ``config.verbose`` is set and always append it to ``config.log_path``."""
        if self.config.verbose:
            print(message)
        with open(self.config.log_path, "a+") as logger:
            logger.write(f"{message}\n")


def train_on_fold(config, fold: int, df_folds: Optional[pd.DataFrame] = None):
    """Train one fold and return its validation frame with out-of-fold predictions.

    Rows whose ``fold`` column equals ``fold`` form the validation set; all other
    rows form the training set.

    Args:
        config: settings object (batch size, device, paths, ...), passed on to the
            model, datasets and trainer.
        fold: value of the ``fold`` column to hold out for validation.
        df_folds: frame with a ``fold`` column. When omitted, the module-level
            ``df_folds`` is used, which is what earlier callers relied on.

    Returns:
        The validation rows with one probability column per class (named ``"0"``,
        ``"1"``, ...) and a ``preds`` column holding the arg-max class, both taken from
        the best-loss checkpoint of this fold.

    Raises:
        NameError: if ``df_folds`` is neither passed nor defined at module level.
    """
    if df_folds is None:
        # Backward-compatible fallback to the notebook-level frame.
        df_folds = globals().get("df_folds")
        if df_folds is None:
            raise NameError("name 'df_folds' is not defined")

    model = CustomEfficientNet(config=config, pretrained=True)
    # Put the model on the same device the Trainer moves every batch to.
    model.to(config.device)

    transforms_train, transforms_val = get_transforms(config)

    train_df = df_folds[df_folds["fold"] != fold].reset_index(drop=True)
    val_df = df_folds[df_folds["fold"] == fold].reset_index(drop=True)

    train_set = Melanoma(train_df, config, transforms=transforms_train, test=False, albu_norm=False)
    train_loader = DataLoader(train_set,
                              batch_size=config.batch_size,
                              shuffle=True,
                              num_workers=4,
                              worker_init_fn=seed_worker)

    val_set = Melanoma(val_df, config, transforms=transforms_val, test=False, albu_norm=False)
    val_loader = DataLoader(val_set,
                            batch_size=config.batch_size,
                            shuffle=False,
                            num_workers=4,
                            worker_init_fn=seed_worker)

    trainer = Trainer(model=model, config=config)
    curr_fold_best_checkpoint = trainer.fit(train_loader, val_loader, fold)

    # Attach the predictions stored with the best-loss checkpoint of this fold.
    oof_preds = curr_fold_best_checkpoint["oof_preds"]
    val_df[[str(c) for c in range(config.num_classes)]] = oof_preds
    val_df["preds"] = oof_preds.argmax(1)

    return val_df


def get_acc_score(y_true, y_pred):
    """Return the accuracy of ``y_pred`` against ``y_true`` via ``sklearn.metrics.accuracy_score``."""
    accuracy = sklearn.metrics.accuracy_score(y_true, y_pred)
    return accuracy


def get_result(result_df, class_col_name: Optional[str] = None):
    """Accuracy of the ``preds`` column against the label column of ``result_df``.

    Args:
        result_df: frame holding a ``preds`` column and the label column.
        class_col_name: name of the label column. When omitted, the module-level
            ``config.class_col_name`` is used, as before.

    Returns:
        The score computed by ``get_acc_score(labels, preds)``.
    """
    if class_col_name is None:
        # Backward-compatible fallback to the notebook-level config.
        class_col_name = config.class_col_name
    preds = result_df["preds"].values
    labels = result_df[class_col_name].values
    return get_acc_score(labels, preds)


def train_loop(df_folds, config, fold_num: int = None, train_one_fold=False):
    """Train one fold or all folds, report the scores and write ``oof.csv``.

    Args:
        df_folds: frame with the fold assignment.
            NOTE(review): it is accepted but not forwarded; ``train_on_fold`` is called
            with ``(config, fold)`` only.
        config: settings object; ``config.num_folds`` gives the number of folds to run.
        fold_num: fold to train when ``train_one_fold`` is set.
        train_one_fold: train only ``fold_num`` instead of ``range(config.num_folds)``.

    Returns:
        The concatenated out-of-fold frame (also written to ``oof.csv``).

    Raises:
        ValueError: if ``train_one_fold`` is set without a ``fold_num``.
    """
    if train_one_fold:
        if fold_num is None:
            raise ValueError("fold_num must be given when train_one_fold=True")
        folds_to_run = [fold_num]
    else:
        folds_to_run = list(range(config.num_folds))

    # The CV score is the average of the per-fold validation metric.
    cv_score_list = []
    fold_frames = []
    for fold in folds_to_run:
        # The single-fold path used to call train_on_fold(fold_num) without `config`
        # and never recorded its result; both paths now share this loop body.
        _oof_df = train_on_fold(config, fold)
        fold_frames.append(_oof_df)
        curr_fold_best_score = get_result(_oof_df)
        cv_score_list.append(curr_fold_best_score)
        if train_one_fold:
            print("Fold {} OOF Score is {}".format(fold + 1,
                                                   curr_fold_best_score))
        else:
            print("\n\n\nOOF Score for Fold {}: {}\n\n\n".format(
                fold + 1, curr_fold_best_score))

    # Concatenate once instead of growing the frame inside the loop.
    oof_df = pd.concat(fold_frames) if fold_frames else pd.DataFrame()

    print("CV score", np.mean(cv_score_list))
    print("Variance", np.var(cv_score_list))
    print("Five Folds OOF", get_result(oof_df))
    oof_df.to_csv("oof.csv")
    return oof_df





In [None]:
#     skf = StratifiedKFold(n_splits=config.num_folds, shuffle=True, random_state=config.seed)
#     for fold, (train_index, val_index) in enumerate(skf.split(df_folds, df_folds[config.class_col_name])):
#         df_folds.loc[val_index, 'fold'] = int(fold)
#     df_folds['fold'] = df_folds['fold'].astype(int)

In [None]:
# Entry point: seed everything, load the fold assignment and train a single fold.
if __name__ == "__main__":
    config = GlobalConfig
    seed_all(seed=config.seed)

    train_csv = pd.read_csv('../input/melanoma-merged-external-data-512x512-jpeg/folds.csv')
    # Downstream code looks images up by `image_name`.
    df_folds = train_csv.copy()
    df_folds = df_folds.rename(columns={'image_id': 'image_name'})
    # The `fold` column is used as it comes from the CSV; the KFold re-split below is disabled.
#     skf = KFold(n_splits=config.num_folds, shuffle=True, random_state=config.seed)
#     for fold, (train_index, val_index) in enumerate(skf.split(df_folds)):
#         df_folds.loc[val_index, 'fold'] = int(fold)
#     df_folds['fold'] = df_folds['fold'].astype(int)

    # Class balance per fold, then overall.
    print(df_folds.groupby(['fold', config.class_col_name]).size())
    # A bare expression inside the `if` body is not displayed, so print it explicitly.
    print(train_csv.target.value_counts())

    train_single_fold = train_on_fold(config, fold=1)
    #train_all_folds = train_loop(df_folds,config)