# GPU

In [None]:
# Run `nvidia-smi` through the IPython shell escape (the result is a list of
# output lines), join it back into a single string and print it.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
print(gpu_info)

# CFG

In [None]:
# File name of the experiment config; it is resolved against
# ./kaggle-cassava/config/ when the config is loaded further down.
CONFIG_NAME = 'stacking16.yml'
# Copied into CFG['debug'] below. When True, train.csv is subsampled and
# mlflow logging is skipped.
debug = False

In [None]:
# Test-time-augmentation variants fed to the stacking model. Each name is used
# as a key into the per-model dict of OOF prediction tables (keyed by CSV file
# stem) and becomes one input channel of X, so the number of active entries
# must match the in_channels of the first convolution in CNNStacking.
TTA_LIST = [
    'CenterCrop-Normalize-ToTensorV2',
    'CenterCrop-Transpose-Normalize-ToTensorV2',
#     'CenterCrop-HorizontalFlip-Normalize-ToTensorV2',
#     'CenterCrop-VerticalFlip-Normalize-ToTensorV2',
    'Resize-Normalize-ToTensorV2',
    'Resize-Transpose-Normalize-ToTensorV2',
#     'Resize-VerticalFlip-Normalize-ToTensorV2',
#     'Resize-HorizontalFlip-Normalize-ToTensorV2'
]

In [None]:
"""
Before running this cell, you will set hidden vars by "add-ons" menu. 
"""

from kaggle_secrets import UserSecretsClient

def clone_repository(
     ssh_keyval: "str: BEGIN RSA PRIVATE KEY to END RSA PRIVATE KEY"
   , ssh_keyname: "str: id_rsa..."
   , gitrepo: "str: your private repository to clone"
   , uname_git: "str: username of your Git") -> None:
   """
   Receive hidden vars, then clone a private GitHub repository over SSH.

   The private key is written to a file, used for a single `git clone`
   and deleted again afterwards.

   Args:
       ssh_keyval: Content of the SSH private key.
       ssh_keyname: File name under which the key is stored temporarily.
       gitrepo: Name of the repository to clone.
       uname_git: GitHub user name; used both as the SSH login name and as
           the owner in the clone URL.
   """

   # StrictHostKeyChecking=no skips host-key verification for this connection;
   # -F /dev/null makes ssh ignore any existing ssh config file.
   ssh_command = f"ssh -l {uname_git} -i /kaggle/working/{ssh_keyname} -o StrictHostKeyChecking=no -F /dev/null"

   # Remove a previous clone so that `git clone` does not fail on an existing directory.
   !rm -rf /kaggle/working/$gitrepo
   # NOTE(review): the key is written relative to the current directory while
   # chmod/rm below use /kaggle/working/ -- presumably the notebook always runs
   # with /kaggle/working as cwd; confirm.
   !echo $ssh_keyval> $ssh_keyname
   !chmod 600 /kaggle/working/$ssh_keyname
   !git -c core.sshCommand="$ssh_command" clone git@github.com:$uname_git/$gitrepo
   # Delete the private key as soon as the clone has been attempted.
   !rm /kaggle/working/$ssh_keyname

# Read the four values from Kaggle user secrets and clone the repository.
us = UserSecretsClient()
clone_repository(ssh_keyval=us.get_secret("ssh_keyval"), ssh_keyname=us.get_secret("ssh_keyname")
   , gitrepo=us.get_secret("gitrepo"), uname_git=us.get_secret("uname_git"))
# Drop the secrets client so it does not linger in the notebook namespace.
del us

# Make the `src` package of the cloned repository importable.
import sys
sys.path.append('./kaggle-cassava')

In [None]:
# create_env comes from the cloned repository. The returned dict is read below
# for the keys "data_path", "env", "notebook_dir" and "output_dir".
from src.utils.envs.main import create_env
env_dict = create_env()
# Bare expression: displays the dict as the cell output.
env_dict

In [None]:
# ====================================================
# CFG
# ====================================================
import os

import yaml

# Load the experiment config that was cloned together with the repository.
CONFIG_PATH = f'./kaggle-cassava/config/{CONFIG_NAME}'
with open(CONFIG_PATH) as f:
    # safe_load instead of yaml.load(f): calling yaml.load without an explicit
    # Loader raises TypeError on PyYAML >= 6 and can construct arbitrary Python
    # objects on older versions.
    config = yaml.safe_load(f)

# The config has three top-level sections.
INFO = config['info']
TAG = config['tag']
CFG = config['cfg']

# Paths and environment name detected by create_env().
DATA_PATH = env_dict["data_path"]
env = env_dict["env"]
NOTEBOOK_PATH = env_dict["notebook_dir"]
OUTPUT_DIR = env_dict["output_dir"]
# TITLE = env_dict["title"]

# This notebook only trains; the inference branch of main() is a stub.
CFG['train'] = True
CFG['inference'] = False

CFG['debug'] = debug

# if CFG['debug']:
#     CFG['epochs'] = 1

# Environment variables
os.environ["GCLOUD_PROJECT"] = INFO['PROJECT_ID']

# Check that we are not running the wrong version
# assert INFO['TITLE'] == TITLE, f'{TITLE}, {INFO["TITLE"]}'
TITLE = INFO["TITLE"]

In [None]:
import os
import glob

# For every first-stage experiment number listed in CFG['stage1_models'],
# locate exactly one matching "<NN>t*" directory (collected as the model
# directory) and exactly one "<NN>i*" directory (collected as the OOF
# directory) under ../input.
model_dirs = []
oof_dirs = []
for stage1 in CFG['stage1_models']:
    # Experiment numbers are left-padded with zeros to two characters.
    num = str(stage1).rjust(2, '0')
    # model
    model_dir_ = glob.glob(f'../input/{num}t*/')
    # Zero or several matches would make the choice ambiguous; stop right away.
    assert len(model_dir_) == 1, model_dir_
    model_dirs.append(model_dir_[0])
    # oof
    oof_dir_ = glob.glob(f'../input/{num}i*/')
    assert len(oof_dir_) == 1, oof_dir_
    oof_dirs.append(oof_dir_[0])
# Bare expression: displays both lists as the cell output.
model_dirs, oof_dirs

# Library

In [None]:
# ====================================================
# Library
# ====================================================
import sys
sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')

import datetime
import os
import math
import time
import random
import glob
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter
import yaml

import scipy as sp
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

from tqdm.auto import tqdm
from functools import partial

import cv2
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau

from albumentations import (
    Compose, OneOf, Normalize, Resize, RandomResizedCrop, RandomCrop, HorizontalFlip, VerticalFlip, 
    RandomBrightness, RandomContrast, RandomBrightnessContrast, Rotate, ShiftScaleRotate, Cutout, 
    IAAAdditiveGaussianNoise, Transpose, CenterCrop
    )
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

import timm
import mlflow

import warnings 
warnings.filterwarnings('ignore')

# In debug mode fall back to the CPU when CUDA is unavailable. Outside debug
# mode 'cuda' is selected without checking availability.
if CFG['debug']:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
    device = torch.device('cuda')

from src.utils.logger import init_logger
from src.utils.utils import seed_torch, EarlyStopping
from src.utils.loss.bi_tempered_logistic_loss import bi_tempered_logistic_loss
from src.utils.augments.randaugment import RandAugment
from src.utils.augments.augmix import RandomAugMix

# Start timestamp (month, day, hour, minute) used as a prefix for the log file
# and the notebook copy written to the output directory.
start_time = datetime.datetime.now()
start_time_str = start_time.strftime('%m%d%H%M')

# Directory settings

In [None]:
# ====================================================
# Directory settings
# ====================================================
# Always start from an empty output directory: remove whatever is already
# there, then create the directory afresh.
if os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)
os.makedirs(OUTPUT_DIR)

# save basic files

In [None]:
# with open(f'{OUTPUT_DIR}/{start_time_str}_TAG.json', 'w') as f:
#     json.dump(TAG, f, indent=4)
    
# with open(f'{OUTPUT_DIR}/{start_time_str}_CFG.json', 'w') as f:
#     json.dump(CFG, f, indent=4)

# Fall back to '__notebook_source__.ipynb' when the path reported by
# create_env() is not an existing file.
if not os.path.isfile(NOTEBOOK_PATH):
    NOTEBOOK_PATH = '__notebook_source__.ipynb'

# Keep a timestamped copy of this notebook next to the other run outputs;
# the copy is later logged to mlflow as an artifact.
import shutil
notebook_path = f'{OUTPUT_DIR}/{start_time_str}_{TITLE}.ipynb'
shutil.copy2(NOTEBOOK_PATH, notebook_path)

# Data Loading

In [None]:
# Competition tables: training labels, the sample submission (used as the test
# frame) and the label-id -> disease-name mapping.
train = pd.read_csv(f'{DATA_PATH}/train.csv')
test = pd.read_csv(f'{DATA_PATH}/sample_submission.csv')
label_map = pd.read_json(f'{DATA_PATH}/label_num_to_disease_map.json', 
                         orient='index')

# Debug runs work on a fixed random subset of 1000 training rows.
if CFG['debug']:
    train = train.sample(n=1000, random_state=CFG['seed']).reset_index(drop=True)

In [None]:
# Collect the single *.yml file found in each first-stage model directory;
# the assert stops the run when a directory holds none or more than one.
model_config_paths = []
for model_dir in model_dirs:
    assert len(glob.glob(f'{model_dir}/*.yml'))==1
    model_config_paths.append(glob.glob(f'{model_dir}/*.yml')[0])

# Utils

In [None]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    """Return the competition metric for the given labels and predictions.

    Thin wrapper around sklearn's ``accuracy_score``.
    """
    return accuracy_score(y_true, y_pred)

# The log path is built by plain string concatenation.
# NOTE(review): this presumably relies on OUTPUT_DIR ending with a path
# separator (other cells join with '/'); confirm against create_env().
logger_path = OUTPUT_DIR+f'{start_time_str}_train.log'
LOGGER = init_logger(logger_path)
# On a top-to-bottom run this resolves to the seed_torch imported from
# src.utils.utils, because the notebook-local definition only appears in a
# later cell.
seed_torch(seed=CFG['seed'])


def remove_glob(pathname, recursive=True):
    """Delete every regular file matched by the glob pattern ``pathname``.

    Matches that are not regular files (for example directories) are left
    untouched.

    Args:
        pathname: Glob pattern passed to ``glob.glob``.
        recursive: Forwarded to ``glob.glob``; enables ``**`` patterns.
    """
    matched_files = filter(os.path.isfile, glob.glob(pathname, recursive=recursive))
    for file_path in matched_files:
        os.remove(file_path)

In [None]:
# ====================================================
# Utils
# ====================================================
# NOTE(review): this repeats the get_score definition from the previous cell
# verbatim; the later definition simply rebinds the same name.
def get_score(y_true, y_pred):
    """Return the accuracy of ``y_pred`` against ``y_true`` (sklearn ``accuracy_score``)."""
    return accuracy_score(y_true, y_pred)


@contextmanager
def timer(name):
    """Context manager that logs the start and the wall-clock duration of a block.

    Args:
        name: Label placed in square brackets in both log lines.

    The "done" line is only written when the wrapped block finishes without
    raising.
    """
    started_at = time.time()
    LOGGER.info(f'[{name}] start')
    yield
    elapsed = time.time() - started_at
    LOGGER.info(f'[{name}] done in {elapsed:.0f} s.')


# def init_logger(log_file=OUTPUT_DIR+'inference.log'):
#     from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
#     logger = getLogger(__name__)
#     logger.setLevel(INFO)
#     handler1 = StreamHandler()
#     handler1.setFormatter(Formatter("%(message)s"))
#     handler2 = FileHandler(filename=log_file)
#     handler2.setFormatter(Formatter("%(message)s"))
#     logger.addHandler(handler1)
#     logger.addHandler(handler2)
#     return logger

#LOGGER = init_logger()


def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN kernels.

    Defining this function rebinds the name ``seed_torch`` that an earlier cell
    imported from ``src.utils.utils``.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

# seed_torch(seed=CFG['seed'])

# train

## Data loading

In [None]:
# Columns that must be identical in every OOF table so that rows line up.
check_cols = ["image_id", "label", "fold"]

# Load one dict per first-stage model, mapping CSV file stem -> prediction table.
df_ = None
oof_list = []
for oof_dir in oof_dirs:
    # NOTE(review): "[!submission]" is a negated character class, not a negated
    # word: it skips every file whose last character before ".csv" is one of
    # s/u/b/m/i/o/n, not just submission.csv.
    tta_preds = glob.glob(os.path.join(oof_dir, "*[!submission].csv"))
    tta_dict = {}
    for tta_pred in tta_preds:
        # File name up to the first dot is used as the key.
        tta_name = tta_pred.split('/')[-1].split('.')[0]
        df = pd.read_csv(tta_pred)
        # Check that the folds are aligned across all OOF files
        # (each table is compared with the one loaded just before it).
        if df_ is not None:
            assert (df[check_cols] == df_[check_cols]).all().all(), tta_name
        df_ = df.copy()
        tta_dict[tta_name] = df
    oof_list.append(tta_dict)
    

# Reference table: first model, first TTA variant.
first_df = oof_list[0][TTA_LIST[0]]

## Dataset

In [None]:
# X.shape = (N, Models, Labels, channel)
# np.float64 replaces the alias np.float, which was deprecated in NumPy 1.20
# and removed in 1.24; it is the dtype the alias resolved to.
X = np.zeros((len(first_df), len(model_dirs), CFG['target_size'], len(TTA_LIST)), dtype=np.float64)
y = first_df['label'].values
folds = first_df[check_cols].values
# Fill one (N, Labels) slab per (model, TTA variant) pair from the class
# probability columns of the corresponding OOF table.
for model_idx, oof in enumerate(oof_list):
    for channel_idx, tta_name in enumerate(TTA_LIST):
        X[:, model_idx, :, channel_idx] = oof[tta_name][['0','1','2','3','4']].values

# add Channel dim
# X = X.reshape(len(oof_list[0]), len(model_dirs), CFG['target_size'], 1)
# [N, Models, Labels, Channel] -> [N, Channel, Models, Labels]
X = X.transpose(0, 3, 1, 2)

In [None]:
# Sanity check: X is (N, Channel, Models, Labels) after the transpose above, y is (N,).
X.shape, y.shape

In [None]:
class StackingDataset(Dataset):
    """Dataset that serves stacked first-stage predictions and their labels.

    Args:
        X: Feature array; the first axis indexes samples.
        y: Label array aligned with the first axis of ``X``.
    """

    def __init__(self, X: np.ndarray, y: np.ndarray):
        self.X, self.y = X, y

    def __len__(self):
        # Number of samples equals the size of the leading axis of X.
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx`` as float / long tensors."""
        features = torch.tensor(self.X[idx], dtype=torch.float)
        target = torch.tensor(self.y[idx], dtype=torch.long)
        return (features, target)

## model

In [None]:
# class CNNStacking_(nn.Module):
#     def __init__(self, n_features, n_labels):
#         super(CNNStacking_, self).__init__()
#         self.conv1 = nn.Conv2d(in_channels=1, out_channels=8, kernel_size=(3, 1), bias=False)
#         self.conv2 = nn.Conv2d(in_channels=8, out_channels=16, kernel_size=(3, 1), bias=False)
#         self.dense1 = nn.Linear(in_features=16* n_labels, out_features=4 * n_labels)
#         self.dense2 = nn.Linear(in_features=4 * n_labels, out_features=n_labels)
#         self.relu = nn.ReLU()
        

#     def forward(self, x):
#         print(x.size())
#         x = self.relu(self.conv1(x))
#         print(x.size())
#         x = self.relu(self.conv2(x))
#         print(x.size())
#         x = torch.flatten(x)
#         print(x.size())
#         x = self.relu(self.dense1(x))
#         out = self.dense2(x)
#         return out

In [None]:
class CNNStacking(nn.Module):
    """Small CNN that blends first-stage class probabilities into final logits.

    Expected input shape is ``(batch, in_channels, n_models, n_labels)``: one
    channel per TTA variant, one row per first-stage model, one column per
    class. Two ``(3, 1)`` convolutions mix information across models for each
    class separately, then two linear layers produce ``n_labels`` logits.

    Args:
        n_labels: Number of target classes (width of the input and of the output).
        in_channels: Number of input channels, i.e. TTA variants. Defaults to 4,
            the value that used to be hard-coded.
        n_models: Number of stacked first-stage models (height of the input).
            Defaults to 5, the only height the original layer sizes accepted.

    Raises:
        ValueError: If ``n_models`` is too small for the two unpadded
            convolutions (fewer than 5).
    """

    def __init__(self, n_labels, in_channels=4, n_models=5):
        super(CNNStacking, self).__init__()

        # Each unpadded (3, 1) convolution shortens the model axis by 2, so two
        # of them leave n_models - 4 rows for the flatten step.
        reduced_models = n_models - 4
        if reduced_models < 1:
            raise ValueError(
                f'n_models must be at least 5 for two (3, 1) convolutions, got {n_models}')

        # Layer positions inside the Sequential are kept unchanged so that
        # state_dict keys of previously saved checkpoints still match.
        self.sq = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=8, kernel_size=(3, 1), bias=False),
            nn.ReLU(),
            nn.Conv2d(in_channels=8, out_channels=16, kernel_size=(3, 1), bias=False),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(in_features=16 * reduced_models * n_labels, out_features=4 * n_labels),
            nn.ReLU(),
            nn.Linear(in_features=4 * n_labels, out_features=n_labels),
        )

    def forward(self, x):
        """Return raw logits of shape ``(batch, n_labels)``."""
        return self.sq(x)

In [None]:
# Smoke test: push a single mini-batch through a freshly initialised model and
# print the raw outputs to confirm that the tensor shapes fit together.
model = CNNStacking(5)
train_dataset = StackingDataset(X, y)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True,
                          num_workers=4, pin_memory=True, drop_last=True)

for image, label in train_loader:
    output = model(image)
    print(output)
    # One batch is enough for the check.
    break

## helper function

In [None]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Running tracker for the latest value and the weighted mean of a metric.

    Attributes are ``val`` (last value), ``sum`` (weighted sum), ``count``
    (total weight) and ``avg`` (``sum / count``).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed with weight ``n`` (e.g. the batch size)."""
        self.val = val
        self.sum, self.count = self.sum + val * n, self.count + n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``.

    Fractional seconds are truncated in the output.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return f'{int(whole_minutes)}m {int(leftover_seconds)}s'


def timeSince(since, percent):
    """Describe elapsed and estimated remaining time of a partially finished job.

    Args:
        since: Start time as returned by ``time.time()``.
        percent: Fraction of the work completed so far; must be non-zero
            because the elapsed time is divided by it.

    Returns:
        A string of the form ``'<elapsed> (remain <remaining>)'`` with both
        durations formatted by ``asMinutes``.
    """
    elapsed = time.time() - since
    # Linear extrapolation: projected total minus what has already passed.
    remaining = elapsed / (percent) - elapsed
    return f'{asMinutes(elapsed)} (remain {asMinutes(remaining)})'




In [None]:
# ====================================================
# loss
# ====================================================
def get_loss(criterion, y_preds, labels):
    """Evaluate ``criterion`` with the call signature required by ``TAG['criterion']``.

    Args:
        criterion: Loss callable, as returned by ``get_criterion``.
        y_preds: Model outputs passed to the loss as first argument.
        labels: Targets passed to the loss as second argument.

    Returns:
        The loss value returned by ``criterion``.

    Raises:
        ValueError: If ``TAG['criterion']`` names an unsupported loss.
            Previously this case ended in an UnboundLocalError.
    """
    criterion_name = TAG['criterion']
    if criterion_name == 'CrossEntropyLoss':
        return criterion(y_preds, labels)
    if criterion_name == 'bi_tempered_logistic_loss':
        # The bi-tempered loss additionally needs its two temperatures.
        return criterion(y_preds, labels, t1=CFG['bi_tempered_loss_t1'], t2=CFG['bi_tempered_loss_t2'])
    raise ValueError(f"Unsupported TAG['criterion']: {criterion_name!r}")

In [None]:
# ====================================================
# Helper functions
# ====================================================
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        train_loader: Iterable yielding ``(features, labels)`` batches.
        model: Network to optimise; switched to train mode here.
        criterion: Loss callable forwarded to ``get_loss``.
        optimizer: Optimizer stepped every ``CFG['gradient_accumulation_steps']`` batches.
        epoch: Zero-based epoch index, only used in the progress output.
        scheduler: Not used by the active code of this function (only
            referenced from a commented-out line).
        device: Device the batches are moved to.

    Returns:
        The sample-weighted mean of the per-batch loss values.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # NOTE(review): `scores` is created but never updated or read in this function.
    scores = AverageMeter()
    # switch to train mode
    model.train()
    start = end = time.time()
    global_step = 0
    for step, (features, labels) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        features = features.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        y_preds = model(features)
        loss = get_loss(criterion, y_preds, labels)
        # record loss
        # (the unscaled value is recorded, before the accumulation division below)
        losses.update(loss.item(), batch_size)
        if CFG['gradient_accumulation_steps'] > 1:
            loss = loss / CFG['gradient_accumulation_steps']
        if CFG['apex']:
            # NOTE(review): `amp` is not imported anywhere in the visible part of
            # the notebook -- presumably apex.amp; this branch would fail with a
            # NameError unless it is defined elsewhere.
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        # clear memory
        del loss, y_preds
        torch.cuda.empty_cache()
        # Gradient clipping runs on every batch, including the ones where the
        # optimizer does not step yet.
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG['max_grad_norm'])
        if (step + 1) % CFG['gradient_accumulation_steps'] == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        # Report progress every CFG['print_freq'] batches and on the last batch.
        if step % CFG['print_freq'] == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  #'LR: {lr:.6f}  '
                  .format(
                   epoch+1, step, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(train_loader)),
                   grad_norm=grad_norm,
                   #lr=scheduler.get_lr()[0],
                   ))
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Evaluate ``model`` on ``valid_loader`` without updating its weights.

    Args:
        valid_loader: Iterable yielding ``(features, labels)`` batches.
        model: Network to evaluate; switched to eval mode here.
        criterion: Loss callable forwarded to ``get_loss``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(avg_loss, predictions)``: the sample-weighted mean loss and
        the per-batch softmax outputs concatenated into one NumPy array, in
        loader order.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # NOTE(review): `scores` is created but never updated or read in this function.
    scores = AverageMeter()
    # switch to evaluation mode
    model.eval()
    preds = []
    start = end = time.time()
    for step, (features, labels) in enumerate(valid_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        features = features.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        # compute loss
        with torch.no_grad():
            y_preds = model(features)
        loss = get_loss(criterion, y_preds, labels)
        losses.update(loss.item(), batch_size)
        # record accuracy
        # (softmax over dim 1 turns the outputs into per-class probabilities)
        preds.append(y_preds.softmax(1).to('cpu').numpy())
        # NOTE(review): the scaled loss is not used afterwards, so this division
        # has no effect on the returned values.
        if CFG['gradient_accumulation_steps'] > 1:
            loss = loss / CFG['gradient_accumulation_steps']
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        # Report progress every CFG['print_freq'] batches and on the last batch.
        if step % CFG['print_freq'] == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(
                   step, len(valid_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(valid_loader)),
                   ))
    predictions = np.concatenate(preds)
    return losses.avg, predictions


def inference(model, states, test_loader, device):
    """Predict class probabilities, averaging over several sets of weights.

    Args:
        model: Network whose weights are swapped for every entry of ``states``.
        states: Iterable of state dicts accepted by ``model.load_state_dict``.
        test_loader: Iterable yielding feature batches (no labels).
        device: Device used for the model and the batches.

    Returns:
        NumPy array with the state-averaged softmax outputs of all batches,
        concatenated in loader order.
    """
    model.to(device)

    def _predict_with(state, batch):
        # Softmax output of the model under one particular set of weights.
        # model.load_state_dict(state['model'])
        model.load_state_dict(state)
        model.eval()
        with torch.no_grad():
            logits = model(batch)
        return logits.softmax(1).to('cpu').numpy()

    progress = tqdm(enumerate(test_loader), total=len(test_loader))
    batch_probs = []
    for _, batch in progress:
        batch = batch.to(device)
        per_state = [_predict_with(state, batch) for state in states]
        batch_probs.append(np.mean(per_state, axis=0))
    return np.concatenate(batch_probs)

In [None]:
# ====================================================
# scheduler 
# ====================================================
def get_scheduler(optimizer):
    """Build the learning-rate scheduler selected by ``TAG['scheduler']``.

    Args:
        optimizer: Optimizer whose learning rate the scheduler controls.

    Returns:
        A ``ReduceLROnPlateau``, ``CosineAnnealingLR`` or
        ``CosineAnnealingWarmRestarts`` instance configured from ``CFG``.

    Raises:
        ValueError: If ``TAG['scheduler']`` names an unsupported scheduler.
            Previously this case ended in an UnboundLocalError.
    """
    scheduler_name = TAG['scheduler']
    if scheduler_name == 'ReduceLROnPlateau':
        return ReduceLROnPlateau(optimizer, mode='min', factor=CFG['factor'], patience=CFG['patience'], verbose=True, eps=CFG['eps'])
    if scheduler_name == 'CosineAnnealingLR':
        return CosineAnnealingLR(optimizer, T_max=CFG['T_max'], eta_min=CFG['min_lr'], last_epoch=-1)
    if scheduler_name == 'CosineAnnealingWarmRestarts':
        return CosineAnnealingWarmRestarts(optimizer, T_0=CFG['T_0'], T_mult=1, eta_min=CFG['min_lr'], last_epoch=-1)
    raise ValueError(f"Unsupported TAG['scheduler']: {scheduler_name!r}")

# ====================================================
# criterion
# ====================================================
def get_criterion():
    """Return the loss callable selected by ``TAG['criterion']``.

    Returns:
        An ``nn.CrossEntropyLoss`` instance or the ``bi_tempered_logistic_loss``
        function.

    Raises:
        ValueError: If ``TAG['criterion']`` names an unsupported loss.
            Previously this case ended in an UnboundLocalError.
    """
    criterion_name = TAG['criterion']
    if criterion_name == 'CrossEntropyLoss':
        return nn.CrossEntropyLoss()
    if criterion_name == 'bi_tempered_logistic_loss':
        return bi_tempered_logistic_loss
    raise ValueError(f"Unsupported TAG['criterion']: {criterion_name!r}")

## training

In [None]:
# ====================================================
# Train loop
# ====================================================
def train_loop(X, y, folds, fold):
    """Train (or resume training of) the stacking model for one CV fold.

    Args:
        X: Feature array indexed by sample along the first axis.
        y: Label array aligned with ``X``.
        folds: DataFrame with a ``'fold'`` column (and the target column named
            by ``CFG['target_col']``); its index is used to slice ``X`` and
            ``y``, so it is expected to be positional.
        fold: Fold number held out for validation.

    Returns:
        The validation rows of ``folds`` extended with the class-probability
        columns ``'0'``..``'4'`` and the arg-max column ``'preds'``, both taken
        from the ``'preds'`` entry of the best checkpoint file.
    """

    LOGGER.info(f"========== fold: {fold} training ==========")
    # Tag the mlflow run with the fold in progress so that an interrupted run
    # can be resumed from it (read back in _load_save_point).
    if not CFG['debug']:
        mlflow.set_tag('running.fold', str(fold))
    # ====================================================
    # loader
    # ====================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index

    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)

    X_train = X[trn_idx]
    y_train = y[trn_idx]
    X_valid = X[val_idx]
    y_valid = y[val_idx]

    train_dataset = StackingDataset(X_train, y_train)
    valid_dataset = StackingDataset(X_valid, y_valid)

    # Training batches are shuffled and the incomplete last batch is dropped;
    # validation keeps the original order and every sample.
    train_loader = DataLoader(train_dataset, 
                              batch_size=CFG['batch_size'], 
                              shuffle=True, 
                              num_workers=CFG['num_workers'], pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, 
                              batch_size=CFG['batch_size'], 
                              shuffle=False, 
                              num_workers=CFG['num_workers'], pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer & criterion
    # ====================================================
    # NOTE(review): paths are built by plain concatenation -- presumably
    # OUTPUT_DIR ends with a path separator; confirm.
    best_model_path = OUTPUT_DIR+f'fold{fold}_best.pth'
    latest_model_path = OUTPUT_DIR+f'fold{fold}_latest.pth'

    model = CNNStacking(CFG['target_size'])
    model.to(device)
    # Load the in-progress (latest) weights if such a checkpoint exists
    if os.path.isfile(latest_model_path):
        state_latest = torch.load(latest_model_path)
        state_best = torch.load(best_model_path)
        model.load_state_dict(state_latest['model'])
        epoch_start = state_latest['epoch']+1
        # er_best_score = state_latest['score']
        er_counter = state_latest['counter']
        er_best_score = state_best['best_score']
        val_loss_history = state_latest['val_loss_history']

        LOGGER.info(f'Load training model in epoch:{epoch_start}, best_score:{er_best_score:.3f}, counter:{er_counter}')

    # When re-training an already trained model
    elif os.path.isfile(best_model_path):
        state_best = torch.load(best_model_path)
        model.load_state_dict(state_best['model'])
        epoch_start = 0 # restart the epoch count from 0
        er_counter = 0
        er_best_score = state_best['best_score']
        val_loss_history = []   # past val_loss values are not reused either

        LOGGER.info(f'Retrain model, best_score:{er_best_score:.3f}')
    else:
        # Fresh start: no checkpoint of either kind exists.
        epoch_start = 0
        er_best_score = None
        er_counter = 0
        val_loss_history = []

    optimizer = Adam(model.parameters(), lr=CFG['lr'], weight_decay=CFG['weight_decay'], amsgrad=False)
    scheduler = get_scheduler(optimizer)
    criterion = get_criterion()

    # Advance the scheduler to the epoch at which training resumes
    assert len(range(epoch_start)) == len(val_loss_history)
    for _, val_loss in zip(range(epoch_start), val_loss_history):
        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

    # ====================================================
    # apex
    # ====================================================
    # NOTE(review): `amp` is not imported in the visible part of the notebook --
    # presumably apex.amp; confirm before enabling CFG['apex'].
    if CFG['apex']:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

    # ====================================================
    # loop
    # ====================================================
    # best_score = 0.
    # best_loss = np.inf
    # EarlyStopping comes from src.utils.utils; it is seeded with the counter
    # and best score restored above and given both checkpoint paths.
    # NOTE(review): presumably it is what writes best_model_path /
    # latest_model_path (including the 'preds' entry read at the end); confirm.
    early_stopping = EarlyStopping(
                            patience=CFG['early_stopping_round'], 
                            verbose=True,
                            save_path=best_model_path,
                            counter=er_counter, best_score=er_best_score, 
                            save_latest_path=latest_model_path)

    for epoch in range(epoch_start, CFG['epochs']):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)
        valid_labels = valid_folds[CFG['target_col']].values

        # scoring
        score = get_score(valid_labels, preds.argmax(1))

        # get learning rate
        if hasattr(scheduler, 'get_last_lr'):
            last_lr = scheduler.get_last_lr()[0]
        else:
            # ReduceLROnPlateau has no get_last_lr method
            last_lr = optimizer.param_groups[0]['lr']

        # log mlflow
        if not CFG['debug']:
            mlflow.log_metric(f"fold{fold} avg_train_loss", avg_loss, step=epoch)
            mlflow.log_metric(f"fold{fold} avg_valid_loss", avg_val_loss, step=epoch)
            mlflow.log_metric(f"fold{fold} score", score, step=epoch)
            mlflow.log_metric(f"fold{fold} lr", last_lr, step=epoch)

        # early stopping
        # (on a stop, the scheduler step, the LOGGER lines and the artifact
        # upload below are skipped for this epoch)
        early_stopping(avg_val_loss, model, preds, epoch)
        if early_stopping.early_stop:
            print(f'Epoch {epoch+1} - early stopping')
            break

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Accuracy: {score}')

        # log mlflow
        if not CFG['debug']:
            mlflow.log_artifact(best_model_path)
            if os.path.isfile(latest_model_path):
                mlflow.log_artifact(latest_model_path)

    # Attach the validation predictions stored in the best checkpoint.
    check_point = torch.load(best_model_path)
    valid_folds[[str(c) for c in range(5)]] = check_point['preds']
    valid_folds['preds'] = check_point['preds'].argmax(1)

    return valid_folds

In [None]:
def get_trained_fold_preds(folds, fold, best_model_path):
    """Rebuild the OOF frame of an already trained fold from its best checkpoint.

    Args:
        folds: DataFrame with a ``'fold'`` column, one row per sample.
        fold: Fold number whose validation rows are wanted.
        best_model_path: Checkpoint file whose ``'preds'`` entry holds the
            validation class probabilities for that fold, one row per
            validation sample.

    Returns:
        The validation rows of ``folds`` (index reset) extended with one
        probability column per class (``'0'``, ``'1'``, ...) and the arg-max
        column ``'preds'``.
    """
    # Select the validation rows of this fold. The original body used
    # `val_idx` without ever defining it, which raised a NameError.
    val_idx = folds[folds['fold'] == fold].index
    valid_folds = folds.loc[val_idx].reset_index(drop=True)
    check_point = torch.load(best_model_path)
    preds = check_point['preds']
    # One column per class, named after the class index.
    valid_folds[[str(c) for c in range(preds.shape[1])]] = preds
    valid_folds['preds'] = preds.argmax(1)

    return valid_folds


def save_confusion_matrix(oof):
    """Write the confusion matrix of the OOF predictions to a CSV file.

    Args:
        oof: Table with the true classes in ``'label'`` and the predicted
            classes in ``'preds'``.

    The matrix covers the classes 0-4, is labelled with the disease names on
    both axes and is saved as ``oof_confusion_matrix.csv`` under ``OUTPUT_DIR``.
    """
    from sklearn.metrics import confusion_matrix

    class_names = ['0 (CBB)', '1 (CBSD)', '2 (CGM)', '3 (CMD)', '4 (Healthy)']
    counts = confusion_matrix(oof['label'], oof['preds'], labels=[0,1,2,3,4])
    labelled = pd.DataFrame(counts, index=class_names, columns=class_names)
    labelled.to_csv(OUTPUT_DIR+'oof_confusion_matrix.csv', index=True)

In [None]:
# ====================================================
# main
# ====================================================
def get_result(result_df):
    """Score the predictions stored in ``result_df`` and log the result.

    Args:
        result_df: Table holding predicted classes in ``'preds'`` and the true
            classes in the column named by ``CFG['target_col']``.

    Returns:
        The score computed by ``get_score``.
    """
    predicted = result_df['preds'].values
    expected = result_df[CFG['target_col']].values
    score = get_score(expected, predicted)
    LOGGER.info(f'Score: {score:<.5f}')
    return score
        
        
def get_oof_list(oof_dirs):
    """Load the per-TTA OOF prediction tables of every first-stage model.

    Args:
        oof_dirs: Directories, one per first-stage model, that contain one CSV
            file per TTA variant.

    Returns:
        A list with one dict per directory, mapping the CSV file stem (text
        before the first dot) to the loaded DataFrame. Files whose stem ends
        with ``submission`` are skipped.

    Raises:
        AssertionError: If the ``image_id`` / ``label`` / ``fold`` columns of a
            table differ from those of the first table loaded (the message is
            the offending file stem).
    """
    check_cols = ["image_id", "label", "fold"]

    reference_keys = None
    oof_list = []
    for oof_dir in oof_dirs:
        tta_dict = {}
        # The previous pattern "*[!submission].csv" is a negated character
        # class: it dropped every file whose stem ends in one of s/u/b/m/i/o/n.
        # Filter on the stem explicitly instead.
        for csv_path in sorted(glob.glob(os.path.join(oof_dir, "*.csv"))):
            tta_name = os.path.basename(csv_path).split('.')[0]
            if tta_name.endswith('submission'):
                continue
            df = pd.read_csv(csv_path)
            keys = df[check_cols]
            # Check that the folds are aligned across all OOF files. Comparing
            # shapes first turns a length mismatch into an assertion failure
            # instead of a pandas ValueError.
            if reference_keys is not None:
                aligned = (keys.shape == reference_keys.shape
                           and bool((keys.values == reference_keys.values).all()))
                assert aligned, tta_name
            else:
                reference_keys = keys.copy()
            tta_dict[tta_name] = df
        oof_list.append(tta_dict)

    return oof_list

    

def main():
    """Build the stacking features, train the requested folds and report the CV score.

    Reads the OOF tables of all first-stage models, arranges them as an array
    of shape ``[N, Channel, Models, Labels]``, then for every fold either
    trains it (if listed in ``CFG['trn_fold']``) or re-uses the predictions of
    an existing best checkpoint. The combined OOF frame, its confusion matrix
    and the TTA list are written to ``OUTPUT_DIR`` and, outside debug mode,
    logged to mlflow.
    """

    oof_list = get_oof_list(oof_dirs)
    first_df = oof_list[0][TTA_LIST[0]]

    data_num = len(first_df)
    model_num = len(oof_dirs)
    target_num = CFG['target_size']
    channel_num = len(TTA_LIST)
    # Probability columns are named after the class index.
    label_cols = [str(c) for c in range(target_num)]

    # [N, Models, Labels, Channel]
    # np.float64 replaces the alias np.float (removed in NumPy 1.24).
    X = np.zeros((data_num, model_num, target_num, channel_num), dtype=np.float64)
    y = first_df['label'].values
    folds = first_df[['image_id', 'label', 'fold']]
    for model_idx, oof in enumerate(oof_list):
        for channel_idx, tta_name in enumerate(TTA_LIST):
            X[:, model_idx, :, channel_idx] = oof[tta_name][label_cols].values

    # [N, Models, Labels, Channel] -> [N, Channel, Models, Labels]
    X = X.transpose(0, 3, 1, 2)

    if CFG['train']:
        # train
        # Per-fold frames are collected and concatenated once, instead of
        # growing a DataFrame from an empty one inside the loop.
        fold_frames = []
        for fold in range(CFG['n_fold']):
            best_model_path = OUTPUT_DIR+f'fold{fold}_best.pth'
            if fold in CFG['trn_fold']:
                _oof_df = train_loop(X, y, folds, fold)
            elif os.path.exists(best_model_path):
                # Fold was trained in an earlier run: re-use its stored predictions.
                _oof_df = get_trained_fold_preds(folds, fold, best_model_path)
            else:
                _oof_df = None
            if _oof_df is not None:
                fold_frames.append(_oof_df)
                LOGGER.info(f"========== fold: {fold} result ==========")
                _ = get_result(_oof_df)
        oof_df = pd.concat(fold_frames) if fold_frames else pd.DataFrame()
        # CV result
        LOGGER.info(f"========== CV ==========")
        score = get_result(oof_df)
        # save result
        oof_df.to_csv(OUTPUT_DIR+'oof_df.csv', index=False)
        save_confusion_matrix(oof_df)
        # save tta
        pd.Series(TTA_LIST, name="tta_list").to_csv(OUTPUT_DIR+'tta_list.csv', index=False)
        # log mlflow
        if not CFG['debug']:
            mlflow.log_metric('oof score', score)
            # Training finished, so the resume marker is no longer needed.
            mlflow.delete_tag('running.fold')
            mlflow.log_artifact(OUTPUT_DIR+'oof_df.csv')
            mlflow.log_artifact(OUTPUT_DIR+'tta_list.csv')

    if CFG['inference']:
        # Inference is not implemented for the stacking model yet.
        pass
#         # inference
#         model = CustomModel(TAG['model_name'], pretrained=False)
#         states = [torch.load(OUTPUT_DIR+f'{TAG["model_name"]}_fold{fold}_best.pth') for fold in CFG['trn_fold']]
#         test_dataset = TestDataset(test, transform=get_transforms(data='valid'))
#         test_loader = DataLoader(test_dataset, batch_size=CFG['batch_size'], shuffle=False, 
#                                  num_workers=CFG['num_workers'], pin_memory=True)
#         predictions = inference(model, states, test_loader, device)
#         # submission
#         test['label'] = predictions.argmax(1)
#         test[['image_id', 'label']].to_csv(OUTPUT_DIR+'submission.csv', index=False)

In [None]:
def _load_save_point(run_id):
    """Prepare the notebook state for resuming the mlflow run ``run_id``.

    Restricts ``CFG['trn_fold']`` to the folds from the interrupted one
    onwards and downloads every ``.pth`` artifact of the run into
    ``OUTPUT_DIR``.
    """
    # Find out where the run was interrupted
    run_tags = mlflow.get_run(run_id=run_id).to_dictionary()['data']['tags']
    stop_fold = int(run_tags['running.fold'])
    # Change the folds to train: keep only those not finished yet
    CFG['trn_fold'] = list(filter(lambda fold: fold >= stop_fold, CFG['trn_fold']))
    # If trained models exist, fetch the .pth files (including in-progress ones)
    client = mlflow.tracking.MlflowClient()
    checkpoint_paths = [item.path for item in client.list_artifacts(run_id) if ".pth" in item.path]
    for checkpoint_path in checkpoint_paths:
        client.download_artifacts(run_id, checkpoint_path, OUTPUT_DIR)


def check_have_run():
    """Look up an existing mlflow run named ``TITLE`` and prepare to resume it.

    Returns:
        The id of the existing run (after restoring its save point via
        ``_load_save_point``), or ``None`` when no such run exists.
    """
    results = mlflow.search_runs(INFO['EXPERIMENT_ID'])
    if 'tags.mlflow.runName' not in results.columns:
        print(f'No results in experiment_id=={INFO["EXPERIMENT_ID"]}')
        matching_ids = []
    else:
        same_title = results['tags.mlflow.runName']==TITLE
        matching_ids = results[same_title]['run_id'].tolist()

    # First run: nothing to resume
    if not matching_ids:
        return None

    # Already executed: exactly one run with this title is expected
    assert len(matching_ids)==1
    run_id = matching_ids[0]
    _load_save_point(run_id)
    return run_id


def push_github():
    """Copy this notebook into the cloned repository and push it to GitHub.

    The notebook is stored as ``notebook/<TITLE>.ipynb`` and committed with
    ``TITLE`` as the commit message.
    """
    ! cp {NOTEBOOK_PATH} kaggle-cassava/notebook/{TITLE}.ipynb
    # Git identity used for the commit.
    !git config --global user.email "raijin.1059@gmail.com"
    ! git config --global user.name "Raijin Shibata"
    # NOTE(review): `user_name` and `password` are not defined anywhere in the
    # visible part of the notebook -- confirm where they come from. Embedding
    # credentials in the remote URL also stores them in the repository's git config.
    !cd kaggle-cassava ;git add .; git commit -m {TITLE}; git remote set-url origin https://{user_name}:{password}@github.com/raijin0704/kaggle-cassava.git; git push origin master

In [None]:
# Entry point of the notebook. Debug runs only call main(); regular runs wrap
# it in an mlflow run, upload the outputs and push the notebook to GitHub.
if __name__ == '__main__':
    if CFG['debug']:
        main()
    else:
        mlflow.set_tracking_uri(INFO['TRACKING_URI'])
        mlflow.set_experiment('stacking')
        # If a run with this title already exists, resume from where it stopped
        run_id = check_have_run()
        with mlflow.start_run(run_id=run_id, run_name=TITLE):
            # Static information is only logged when the run is new.
            if run_id is None:
                mlflow.log_artifact(CONFIG_PATH)
                mlflow.log_param('device', device)
                mlflow.set_tag('env', env)
                mlflow.set_tags(TAG)
                mlflow.log_params(CFG)
            mlflow.log_artifact(notebook_path)
            main()
            mlflow.log_artifacts(OUTPUT_DIR)
            # The "latest" checkpoints are only needed for resuming; remove them
            # once everything has been uploaded.
            remove_glob(f'{OUTPUT_DIR}/*latest.pth')
            push_github()
            if env=="kaggle":
                # Keep the config next to the outputs, then delete the cloned repository.
                shutil.copy2(CONFIG_PATH, f'{OUTPUT_DIR}/{CONFIG_NAME}')
                ! rm -r kaggle-cassava
            elif env=="colab":
                # Copy outputs and config to the shared drive folder of this experiment.
                shutil.copytree(OUTPUT_DIR, f'{INFO["SHARE_DRIVE_PATH"]}/{TITLE}')
                shutil.copy2(CONFIG_PATH, f'{INFO["SHARE_DRIVE_PATH"]}/{TITLE}/{CONFIG_NAME}')

In [None]:
# NOTE(review): leftover post-mortem debugger magic; it opens an interactive
# pdb session on the last traceback, which does not fit a non-interactive
# "Run All" execution.
%debug