# Introduction

If this notebook is helpful to you, please upvote!

もし役に立ったらupvoteしてくださいね。

Based on [this great notebook](https://www.kaggle.com/yasufuminakama/cassava-resnext50-32x4d-starter-inference).

元にしたもの： [notebook](https://www.kaggle.com/yasufuminakama/cassava-resnext50-32x4d-starter-inference).

This is a notebook for studying parameter tuning using Optuna in the Cassava classification competition.

これはCassava クラス分けコンペでの、Optunaを使ったパラメータチューニング勉強用notebookです。

- resnext50_32x4d_fold0.pth
- resnext50_32x4d_fold1.pth  
- resnext50_32x4d_fold2.pth
- resnext50_32x4d_fold3.pth
- resnext50_32x4d_fold4.pth

Optimize the blend ratio of the five ResNeXt fold models with Optuna.

Resnext 5 modelのブレンド比率をoptunaで最適化します。

The probability tables for the five ResNeXt models and five EfficientNet models are in this [notebook](https://www.kaggle.com/marutama/cassava-rnxt-effn-prob-list). You can also feed them into Optuna or LightGBM to get feature importances.

Resnext 5 model, EfficientNet 5 model の確率表はこちらの[notebook](https://www.kaggle.com/marutama/cassava-rnxt-effn-prob-list)です。OptunaやLightGBMに入力してfeature importance出してもいいでしょう。

# Library import

In [None]:
# ====================================================
# Library
# ====================================================
import sys
import os
import math
import time
import random
import shutil
from pathlib import Path
from contextlib import contextmanager

from collections import defaultdict, Counter

import scipy as sp
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

from tqdm.auto import tqdm
from functools import partial

import cv2
from PIL import Image

import optuna

# Optuna example

In [None]:
%%time
# Turn off Optuna's default log handler so per-trial messages are not printed.
optuna.logging.disable_default_handler() # not display log
# Uncomment the next line to turn the default handler back on.
#optuna.logging.enable_default_handler() # display log

def f(x):
    """Toy objective: a parabola with a large sinusoidal ripple added.

    Computes ``x**2 + 2000*sin(x/5) + 2000``. A simpler alternative to try
    by hand is ``(x - 2) ** 2``.
    """
    parabola = x ** 2
    ripple = np.sin(x / 5) * 2000
    return parabola + ripple + 2000

def objective(trial):
    """Optuna objective for the toy example.

    Samples ``x`` from the continuous range [-50, 50] and returns ``f(x)``,
    which the study minimizes (``create_study`` is called without a
    ``direction`` argument).
    """
    # suggest_float is the supported replacement for the deprecated
    # suggest_uniform; it samples from the same continuous range.
    x = trial.suggest_float('x', -50, 50)
    score = f(x)
    #print('x: %1.3f, score: %1.3f' % (x, score))
    return score

# TPESampler is Optuna's default sampler; it is created explicitly here only
# to pin the seed (plain `optuna.create_study()` would leave it unseeded).
sampler = optuna.samplers.TPESampler(seed=5678)
study = optuna.create_study(sampler=sampler)
study.optimize(objective, n_trials=100)

In [None]:
# Best objective value found by the study.
study.best_value

In [None]:
# Parameters sampled in the trial stored at index 10, shown as an example.
study.trials[10].params

In [None]:
# Parameter values of the best trial.
study.best_params

In [None]:
# Objective value of every trial, in the order the trials are stored.
values = [each.value for each in study.trials]
# Running minimum ("best so far") computed in one pass instead of re-scanning
# the prefix for every trial.
best_values = np.minimum.accumulate(values).tolist()
# Trial indices for the x axis of the history plots.
t = list(range(len(values)))
# Sampled x of every trial.
x = [each.params['x'] for each in study.trials]
# Integer grid over the search range and f evaluated on it, for the reference curve.
x_seq = list(range(-50, 50))
fx    = [f(grid_x) for grid_x in x_seq]

In [None]:
plt.figure(figsize=(12, 4.5))

# Left panel: reference curve of f (red) with the sampled points on top.
ax = plt.subplot(1, 3, 1)
ax.set_xlim(np.min(x) - 0.5, np.max(x) + 0.5)
ax.set_ylim(np.min(values) - 500, np.max(values) + 500)
ax.set_title('graph plot')
ax.plot(x_seq, fx, alpha=0.3, color='red')
ax.scatter(x, values, alpha=0.3)

# Middle and right panels: best-so-far curve plus every trial's value,
# first on a linear y axis and then on a log y axis.
panels = [('best score', False), ('best score:log scale', True)]
for position, (title, log_scale) in enumerate(panels, start=2):
    ax = plt.subplot(1, 3, position)
    ax.set_title(title)
    ax.plot(best_values)
    ax.scatter(t, values, alpha=0.3)
    if log_scale:
        ax.set_yscale('log')

plt.show()

# Resnext50_32x4d Inference

# Settings

In [None]:
# ====================================================
# Directory settings
# ====================================================
OUTPUT_DIR = './'
MODEL_DIR = '../input/cassava-resnext50-32x4d-starter-training/'
# exist_ok avoids the separate exists-then-create check.
os.makedirs(OUTPUT_DIR, exist_ok=True)

TRAIN_PATH = '../input/cassava-leaf-disease-classification/train_images'
MERGED_PATH = '../input/cassava-leaf-disease-merged/train'

# The weight search runs on the merged image set; TEST_PATH is rebound to
# FINAL_TEST_PATH before the final inference.
TEST_PATH = MERGED_PATH
MERGED_CSV = '../input/cassava-leaf-disease-merged/merged.csv'

FINAL_TEST_PATH = '../input/cassava-leaf-disease-classification/test_images'
FINAL_TEST_CSV= '../input/cassava-leaf-disease-classification/sample_submission.csv'

In [None]:
# Measurements noted by the author for 26337 files (the MERGED set), listed as
# "batch size ... value"; the "(min)" label presumably means minutes — confirm.
#BATCH_SIZE(min) : in case of 26337(MERGED num)
#01 ... 67
#02 ... 35
#04 ... 23
#08 ... 20
#16 ... 16
#32 ... 15
#64 ... 15
BATCH_SIZE = 16
total_file_nums = len(os.listdir(TEST_PATH))
# Number of full batches.
d = total_file_nums // BATCH_SIZE
# Largest file count that is an exact multiple of BATCH_SIZE.
max_file_nums = total_file_nums - total_file_nums % BATCH_SIZE
print(max_file_nums)

In [None]:
# Collect every file name under TEST_PATH, sort, and only then cut the list
# down to a whole number of batches. Truncating after sorting keeps the
# selection independent of the order in which the filesystem lists files,
# and applies the limit to the total rather than per directory.
inp_imgs = []
for dirname, _, filenames in os.walk(TEST_PATH):
    inp_imgs.extend(filenames)
inp_imgs.sort()
inp_imgs = inp_imgs[:max_file_nums]
#print(len(inp_imgs))

# CFG

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Static settings read by the model, transforms and data loaders."""

    # Model and input image size.
    model_name = 'resnext50_32x4d'
    size = 256
    target_size = 5
    target_col = 'label'

    # Data loading.
    batch_size = BATCH_SIZE  # original is 32
    num_workers = 0  # original is 4

    # Folds whose checkpoints are loaded.
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]

    # Seed passed to seed_torch.
    seed = 42

    # Flags not read by the code visible in this notebook.
    debug = False
    train = False
    inference = True

# Library for Pytorch and GPU

In [None]:
# ====================================================
# Library for Pytorch and GPU
# ====================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau

from albumentations import (
    Compose, OneOf, Normalize, Resize, RandomResizedCrop, RandomCrop, HorizontalFlip, VerticalFlip, 
    RandomBrightness, RandomContrast, RandomBrightnessContrast, Rotate, ShiftScaleRotate, Cutout, 
    IAAAdditiveGaussianNoise, Transpose
    )
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')
import timm

import warnings 
warnings.filterwarnings('ignore')


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Utils

In [None]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    """Return the accuracy of ``y_pred`` against ``y_true`` via ``accuracy_score``."""
    score = accuracy_score(y_true, y_pred)
    return score


@contextmanager
def timer(name):
    """Context manager that logs when a named step starts and how long it took.

    Messages go to the module logger ``getLogger(__name__)``, the same logger
    object that ``init_logger`` configures and returns. Looking it up here
    means the function no longer depends on a global ``LOGGER`` that the
    notebook never defines (its assignment is commented out).

    Args:
        name: Label inserted into both log messages.
    """
    from logging import getLogger
    logger = getLogger(__name__)
    t0 = time.time()
    logger.info(f'[{name}] start')
    yield
    logger.info(f'[{name}] done in {time.time() - t0:.0f} s.')


def init_logger(log_file=OUTPUT_DIR+'inference.log'):
    """Configure and return the module logger (console + file, INFO level).

    Handlers attached by an earlier call are removed and closed first, so
    re-running the cell does not make every message appear multiple times
    and a changed ``log_file`` takes effect.

    Args:
        log_file: Path of the log file to write to.

    Returns:
        The logger obtained from ``getLogger(__name__)``.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # Drop handlers left over from a previous call to avoid duplicated output.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger

#LOGGER = init_logger()


def seed_torch(seed=1234):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Also stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for seeder in seeders:
        seeder(seed)
    torch.backends.cudnn.deterministic = True

# Seed all RNGs with the configured seed.
seed_torch(seed=CFG.seed)

# Dataset

In [None]:
# ====================================================
# Dataset
# ====================================================
class TestDataset(Dataset):
    """Label-free image dataset driven by the ``image_id`` column of a DataFrame.

    Images are read from the module-level ``TEST_PATH`` directory. The path is
    resolved on every item access, so rebinding ``TEST_PATH`` changes where
    existing instances read from.

    Args:
        df: DataFrame with an ``image_id`` column holding file names.
        transform: Optional callable invoked as ``transform(image=...)`` whose
            result provides the transformed image under the ``'image'`` key.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.file_names = df['image_id'].values
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB and apply the transform, if any.

        Raises:
            FileNotFoundError: If the file is missing or cannot be decoded.
        """
        file_name = self.file_names[idx]
        file_path = f'{TEST_PATH}/{file_name}'
        image = cv2.imread(file_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {file_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
        return image

# Transforms

In [None]:
# ====================================================
# Transforms
# ====================================================
def get_transforms(*, data):
    """Build the albumentations pipeline for the given split.

    Args:
        data: ``'train'`` for the augmenting pipeline, ``'valid'`` for the
            resize-only pipeline.

    Returns:
        A ``Compose`` ending in ImageNet-style normalization and
        ``ToTensorV2``, or ``None`` for any other ``data`` value.
    """
    if data == 'train':
        steps = [
            #Resize(CFG.size, CFG.size),
            RandomResizedCrop(CFG.size, CFG.size),
            Transpose(p=0.5),
            HorizontalFlip(p=0.5),
            VerticalFlip(p=0.5),
            ShiftScaleRotate(p=0.5),
        ]
    elif data == 'valid':
        steps = [Resize(CFG.size, CFG.size)]
    else:
        return None

    # Both pipelines share the same normalization and tensor conversion tail.
    steps.append(Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ))
    steps.append(ToTensorV2())
    return Compose(steps)

# MODEL

In [None]:
# ====================================================
# MODEL
# ====================================================
class CustomResNext(nn.Module):
    """timm backbone whose final ``fc`` layer is replaced by a new linear head.

    Args:
        model_name: Name passed to ``timm.create_model``.
        pretrained: Forwarded to ``timm.create_model``.
    """

    def __init__(self, model_name='resnext50_32x4d', pretrained=False):
        super().__init__()
        backbone = timm.create_model(model_name, pretrained=pretrained)
        # New head with CFG.target_size outputs on top of the backbone features.
        backbone.fc = nn.Linear(backbone.fc.in_features, CFG.target_size)
        self.model = backbone

    def forward(self, x):
        """Return the wrapped model's output for ``x``."""
        return self.model(x)

# Helper functions

In [None]:
# ====================================================
# Helper functions
# ====================================================
def inference(model, states, test_loader, device):
    """Run every checkpoint on every batch and return the un-blended softmax outputs.

    For each batch, each entry of ``states`` is loaded into ``model`` in turn
    and the softmax of its output is collected. The per-batch lists are then
    joined with ``np.concatenate``, so the first axis of the result iterates
    batch by batch and, within a batch, state by state.

    NOTE(review): the concatenation presumably requires every batch to have
    the same size — confirm against the loader.

    Args:
        model: Module that receives each state dict.
        states: Sequence of checkpoints; each is indexed with ``['model']``.
        test_loader: Iterable yielding image batches.
        device: Device the model and batches are moved to.
    """
    model.to(device)
    progress = tqdm(enumerate(test_loader), total=len(test_loader))
    per_batch = []
    for _, images in progress:
        images = images.to(device)
        fold_probs = []
        for state in states:
            model.load_state_dict(state['model'])
            model.eval()
            with torch.no_grad():
                logits = model(images)
            fold_probs.append(logits.softmax(1).to('cpu').numpy())
        # Averaging is intentionally not done here; the blend is chosen later.
        per_batch.append(fold_probs)
    return np.concatenate(per_batch)

# Data Loading

In [None]:
# Load the merged label table and index it by image file name.
merged = pd.read_csv(MERGED_CSV, index_col='image_id')
#merged

In [None]:
# Number of images selected for inference; counts noted by the author:
len(inp_imgs)
# 21397 TRAIN num
# 26337 MERGED num

In [None]:
# Display the merged label table.
merged

In [None]:
# Rows of the label table for the selected images, in inp_imgs order.
tmp = merged.loc[inp_imgs]
# Keep the ground truth under 'correct', drop 'source', and turn image_id
# back into a regular column.
test = (
    tmp.drop(columns='source')
       .rename(columns={'label': 'correct'})
       .reset_index()
)

In [None]:
# Display the frame that will be fed to the dataset.
test

# inference(all images)

In [None]:
# ====================================================
# inference
# ====================================================
model = CustomResNext(CFG.model_name, pretrained=False)
# map_location lets the checkpoints load on whichever device was selected,
# including CPU-only hosts.
states = [torch.load(MODEL_DIR+f'{CFG.model_name}_fold{fold}_best.pth', map_location=device) for fold in CFG.trn_fold]
test_dataset = TestDataset(test, transform=get_transforms(data='valid'))
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False, 
                         num_workers=CFG.num_workers, pin_memory=True)
# Un-blended per-state softmax outputs for every batch.
p = inference(model, states, test_loader, device)

# Make DataFrame for 5 models inference results

In [None]:
MODELS = 5
# `p` holds MODELS consecutive entries per batch (one per model), so model k
# owns entries k, k + MODELS, k + 2 * MODELS, ... Each model's batches are
# joined in a single concatenate call instead of growing the array batch by batch.
p0, p1, p2, p3, p4 = (np.concatenate(p[k::MODELS]) for k in range(MODELS))

In [None]:
# One float32 DataFrame per model, with columns named p<model>_<class>.
frames = []
for model_idx, model_probs in enumerate((p0, p1, p2, p3, p4)):
    col = [f'p{model_idx}_{class_idx}' for class_idx in range(5)]
    frames.append(pd.DataFrame(data=model_probs, columns=col, dtype='float32'))
df0, df1, df2, df3, df4 = frames

In [None]:
# For reference, also build the original equal-weight (mean) blend of the five models.
#avg_p = np.mean(p, axis=0)
avg_p = p0*0.2 + p1*0.2 + p2*0.2 + p3*0.2 + p4*0.2

In [None]:
# Display the equal-weight blended probabilities.
avg_p

In [None]:
#test=test.drop('label', axis=1)
# Placeholder predictions: 9999 stands in for "missing" so the column keeps an integer dtype.
test['label']=9999 # 9999 is NaN, to keep integer
#test

In [None]:
# Predicted class per image = argmax of the equal-weight blend. Row i of
# avg_p belongs to row i of `test`, so the whole column is assigned at once
# instead of building a boolean mask over the frame for every image.
test['label'] = avg_p.argmax(axis=1).astype('int')
test

In [None]:
# Sanity check: accuracy of the equal-weight blend against the ground truth.
acc_score = get_score(test['correct'], test['label'])
print(acc_score)
# 0.909753703790251 ... TRAIN score
# 0.8880282492311197 ... MERGED score

In [None]:
# Attach each model's probability columns to `test` by row index, then save.
all_df = test
for model_df in (df0, df1, df2, df3, df4):
    all_df = pd.merge(all_df, model_df, left_index=True, right_index=True)
all_df.to_csv("all.csv", index=False)

In [None]:
# You can also use this csv file created in advance.
#all_df = pd.read_csv('../input/cassava-resnext50-5-inference-results/all.csv')

In [None]:
# Display the combined table of labels and per-model probabilities.
all_df

# Define calculation function

In [None]:
def calc_p(df, a0, a1, a2, a3, a4):
    """Blend five models' class probabilities row by row and return the argmax labels.

    This is the plain-loop reference implementation. Values are read with
    ``df[column][n]`` for ``n`` in ``range(len(df))``, i.e. by index label.

    Args:
        df: DataFrame with columns ``p<m>_<k>`` for model ``m`` and class
            ``k``, both in 0..4.
        a0, a1, a2, a3, a4: Blend weights of models 0..4.

    Returns:
        List with one predicted class index per row.
    """
    weights = (a0, a1, a2, a3, a4)
    labels = []
    for row in range(len(df)):
        # One 5-class probability vector per model for this row.
        per_model = [
            np.array([df[f'p{m}_{k}'][row] for k in range(5)])
            for m in range(5)
        ]
        blended = per_model[0] * weights[0]
        for vec, weight in zip(per_model[1:], weights[1:]):
            blended = blended + vec * weight
        labels.append(blended.argmax())
    return labels

In [None]:
def calc_p_pd(df, a0, a1, a2, a3, a4):
    """Blend five models' class probabilities column-wise and return the argmax labels.

    Side effect: the blended probabilities are written to ``df`` as columns
    ``pf_0`` .. ``pf_4`` (overwritten on every call).

    The argmax is taken positionally over the whole ``pf_*`` block at once,
    so the function is fast and works for any DataFrame index, not only
    0..n-1.

    Args:
        df: DataFrame with columns ``p<m>_<k>`` for model ``m`` and class
            ``k``, both in 0..4.
        a0, a1, a2, a3, a4: Blend weights of models 0..4.

    Returns:
        List with one predicted class index per row, in row order.
    """
    df['pf_0'] = df['p0_0']*a0 + df['p1_0']*a1 + df['p2_0']*a2 + df['p3_0']*a3 + df['p4_0']*a4
    df['pf_1'] = df['p0_1']*a0 + df['p1_1']*a1 + df['p2_1']*a2 + df['p3_1']*a3 + df['p4_1']*a4
    df['pf_2'] = df['p0_2']*a0 + df['p1_2']*a1 + df['p2_2']*a2 + df['p3_2']*a3 + df['p4_2']*a4
    df['pf_3'] = df['p0_3']*a0 + df['p1_3']*a1 + df['p2_3']*a2 + df['p3_3']*a3 + df['p4_3']*a4
    df['pf_4'] = df['p0_4']*a0 + df['p1_4']*a1 + df['p2_4']*a2 + df['p3_4']*a3 + df['p4_4']*a4
    blended = df[['pf_0', 'pf_1', 'pf_2', 'pf_3', 'pf_4']].to_numpy()
    if blended.shape[0] == 0:
        return []
    return blended.argmax(axis=1).tolist()

In [None]:
# `df` is another name for all_df (no copy), so columns added to df also appear on all_df.
df = all_df
# Ground-truth labels and the equal-weight predictions computed earlier.
correct = df['correct']
label = df['label']

## calc_p : normal version

In [None]:
%%time
# Equal-weight blend through the loop implementation.
equal_weights = [0.2] * 5
pred = calc_p(df, *equal_weights)
acc_score = accuracy_score(correct, pred)
print(acc_score)

## calc_p_pd : pandas version

In [None]:
%%time
# Equal-weight blend through the pandas implementation.
equal_weights = [0.2] * 5
pred = calc_p_pd(df, *equal_weights)
acc_score = accuracy_score(correct, pred)
print(acc_score)

calc_p_pd() is faster than calc_p().

# Optuna optimization

In [None]:
# Search range shared by all five blend weights.
r_min, r_max = 0, 1
# Number of trials per study. You can increase it.
iteration = 50

# Keep Optuna's default log output hidden.
optuna.logging.disable_default_handler() # not display log
#optuna.logging.enable_default_handler() # display log

## For Merged(2019 and 2020) data

In [None]:
%%time
correct = df['correct']

def objective(trial):
    """Optuna objective: accuracy of the weighted blend on the merged data.

    Samples one weight per model ('a'..'e') from [r_min, r_max], blends the
    model probabilities in ``df`` with ``calc_p_pd`` and scores the result
    against the module-level ``correct``.
    """
    # suggest_float is the supported replacement for the deprecated
    # suggest_uniform; the parameters are sampled in the same order.
    a = trial.suggest_float('a', r_min, r_max)
    b = trial.suggest_float('b', r_min, r_max)
    c = trial.suggest_float('c', r_min, r_max)
    d = trial.suggest_float('d', r_min, r_max)
    e = trial.suggest_float('e', r_min, r_max)

    pred = calc_p_pd(df, a, b, c, d, e)
    score = accuracy_score(correct, pred)
    #print('a:%1.3f,b:%1.3f,c:%1.3f,d:%1.3f,e:%1.3f,score %1.3f' % (a,b,c,d,e,score))
    return score

SEED=1234
#study = optuna.create_study(direction='maximize')
# Seeded TPE sampler; accuracy is maximized.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED))
study.optimize(objective, n_trials=iteration)

In [None]:
# Best accuracy found on the merged data.
study.best_value

In [None]:
# Blend weights of the best trial on the merged data.
study.best_params

In [None]:
# Objective value of each trial, in trial order.
trial_scores = [trial.value for trial in study.trials]
plt.plot(trial_scores)
plt.grid()
plt.show()

In [None]:
# Sampled value of each blend weight over the trials, one line per weight.
for weight_name in 'abcde':
    plt.plot([trial.params[weight_name] for trial in study.trials], label=weight_name)
plt.legend()
plt.grid()
plt.show()

In [None]:
# Optuna's built-in optimization-history plot for the merged-data study.
from optuna.visualization import plot_optimization_history
plot_optimization_history(study)

In [None]:
# Optuna's built-in parameter-importance plot for the merged-data study.
from optuna.visualization import plot_param_importances
plot_param_importances(study)

In [None]:
# Optuna's built-in contour plot of the parameters for the merged-data study.
from optuna.visualization import plot_contour
plot_contour(study)

## For 2020 train data

In [None]:
# Independent copy of the leading rows of df, used as the "2020 train data" subset.
# NOTE(review): an earlier cell comment gives 21397 as the TRAIN image count, so 21396
# may leave one image out; the slice also presumes the 2020 files come first — confirm.
df_train = df[:21396].copy()

In [None]:
%%time
# Equal-weight baseline accuracy on the 2020 subset.
# `correct` is rebound here and is read by the objective defined afterwards.
correct = df_train['correct']
label = df_train['label']
equal_weights = [0.2] * 5
pred = calc_p_pd(df_train, *equal_weights)
acc_score = accuracy_score(correct, pred)
print(acc_score)

In [None]:
%%time
# `correct` is read from module scope; the assignment below is left commented
# out, so it must already refer to df_train['correct'].
#correct = df_train['correct']

def objective(trial):
    """Optuna objective: accuracy of the weighted blend on ``df_train``.

    Samples one weight per model ('a'..'e') from [r_min, r_max], blends the
    model probabilities with ``calc_p_pd``, scores against the module-level
    ``correct`` and prints the weights and score of every trial.
    """
    # suggest_float is the supported replacement for the deprecated
    # suggest_uniform; the parameters are sampled in the same order.
    a = trial.suggest_float('a', r_min, r_max)
    b = trial.suggest_float('b', r_min, r_max)
    c = trial.suggest_float('c', r_min, r_max)
    d = trial.suggest_float('d', r_min, r_max)
    e = trial.suggest_float('e', r_min, r_max)
    pred = calc_p_pd(df_train, a, b, c, d, e)
    score = accuracy_score(correct, pred)
    print('a:%1.3f,b:%1.3f,c:%1.3f,d:%1.3f,e:%1.3f,score %1.3f' % (a,b,c,d,e,score))
    return score

SEED=1234
# Seeded TPE sampler; accuracy is maximized.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED))
study.optimize(objective, n_trials=iteration)

In [None]:
# Best accuracy found on the 2020 subset.
study.best_value

In [None]:
# Blend weights of the best trial on the 2020 subset.
study.best_params

In [None]:
# Objective value of each trial, in trial order.
trial_scores = [trial.value for trial in study.trials]
plt.plot(trial_scores)
plt.grid()
plt.show()

In [None]:
# Sampled value of each blend weight over the trials, one line per weight.
for weight_name in 'abcde':
    plt.plot([trial.params[weight_name] for trial in study.trials], label=weight_name)
plt.legend()
plt.grid()
plt.show()

In [None]:
# Optuna's built-in optimization-history plot for the 2020-subset study.
from optuna.visualization import plot_optimization_history
plot_optimization_history(study)

In [None]:
# Optuna's built-in parameter-importance plot for the 2020-subset study.
from optuna.visualization import plot_param_importances
plot_param_importances(study)

In [None]:
# Optuna's built-in contour plot of the parameters for the 2020-subset study.
from optuna.visualization import plot_contour
plot_contour(study)

# Final inference

In [None]:
# Use last study : 2020 train data
a = study.best_params['a']
b = study.best_params['b']
c = study.best_params['c']
d = study.best_params['d']
e = study.best_params['e']

In [None]:
# ====================================================
# Helper functions
# ====================================================
def final_inference(model, states, test_loader, device, weights=None):
    """Run every checkpoint on every batch and return the weighted blend.

    For each batch, each entry of ``states`` is loaded into ``model`` in turn;
    the softmax of its output is multiplied by the matching weight and the
    products are summed.

    Args:
        model: Module that receives each state dict.
        states: Sequence of checkpoints; each is indexed with ``['model']``.
        test_loader: Iterable yielding image batches.
        device: Device the model and batches are moved to.
        weights: One blend weight per entry of ``states``. When omitted, the
            module-level ``a, b, c, d, e`` are used, as before.

    Returns:
        Array of blended probabilities, batches concatenated along the first axis.

    Raises:
        ValueError: If the number of weights differs from the number of states.
    """
    if weights is None:
        weights = (a, b, c, d, e)
    if len(weights) != len(states):
        raise ValueError(
            f'expected one weight per state, got {len(weights)} weights '
            f'for {len(states)} states'
        )
    model.to(device)
    tk0 = tqdm(enumerate(test_loader), total=len(test_loader))
    probs = []
    for _, images in tk0:
        images = images.to(device)
        blended = None
        for weight, state in zip(weights, states):
            model.load_state_dict(state['model'])
            model.eval()
            with torch.no_grad():
                y_preds = model(images)
            weighted = weight * y_preds.softmax(1).to('cpu').numpy()
            # Accumulate in state order so the sum matches a*p0 + b*p1 + ...
            blended = weighted if blended is None else blended + weighted
        probs.append(blended)
    probs = np.concatenate(probs)
    return probs

In [None]:
# Point image loading at the competition test images: TestDataset builds its
# file paths from the module-level TEST_PATH when an item is read, so this
# rebinding must happen before the final inference runs.
TEST_PATH=FINAL_TEST_PATH 
# The sample submission provides the image_id list to predict.
test = pd.read_csv(FINAL_TEST_CSV)
test.head()

In [None]:
# ====================================================
# inference
# ====================================================

model = CustomResNext(CFG.model_name, pretrained=False)
# map_location lets the checkpoints load on whichever device was selected,
# including CPU-only hosts.
states = [torch.load(MODEL_DIR+f'{CFG.model_name}_fold{fold}_best.pth', map_location=device) for fold in CFG.trn_fold]
test_dataset = TestDataset(test, transform=get_transforms(data='valid'))
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False, 
                         num_workers=CFG.num_workers, pin_memory=True)
# Blended probabilities using the tuned weights.
p = final_inference(model, states, test_loader, device)

# submission
test['label'] = p.argmax(1)
test[['image_id', 'label']].to_csv(OUTPUT_DIR+'submission.csv', index=False)
test.head()