In [None]:
# Alternative package path, kept for reference (currently disabled):
#'../input/efficientnet-pytorch-07/efficientnet_pytorch-0.7.0'
# Put the pytorch-image-models source tree on sys.path
# (presumably so that the later `import timm` resolves to this copy — confirm).
package_path = '../input/pytorch-image-models/pytorch-image-models-master' 
import sys
sys.path.append(package_path)

In [None]:
from glob import glob
from sklearn.model_selection import GroupKFold, StratifiedKFold
from skimage import io
import torch
import os
from datetime import datetime
import time
import random
import torchvision
from torchvision import transforms
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from torch.utils.data import Dataset,DataLoader
from torch.utils.data.sampler import SequentialSampler, RandomSampler
from  torch.cuda.amp import autocast, GradScaler

import sklearn
import warnings
import joblib
from sklearn.metrics import roc_auc_score, log_loss
from sklearn import metrics
import warnings
import pydicom
import timm #from efficientnet_pytorch import EfficientNet
from scipy.ndimage.interpolation import zoom
from sklearn.metrics import log_loss

In [None]:
# Notebook-wide configuration
CFG = {
    # Number of splits for K-fold cross-validation
    'fold_num': 5,
    # Seed passed to seed_everything()
    'seed': 719,
    
    # Pretrained model architecture name (passed to timm.create_model)  https://dajiro.com/entry/2020/07/24/161040
    'model_arch': 'tf_efficientnet_b4_ns',
    'img_size': 512,
    'epochs': 10,
    'train_bs': 32,
    'valid_bs': 32,
    'lr': 1e-4,
    'num_workers': 4,
    'accum_iter': 1, # support batch accumulation for backprop with an effectively larger batch size
    'verbose_step': 1,
    'device': 'cuda:0',
    # Number of augmented inference passes run per checkpoint
    'tta': 3,
    # Epoch suffixes of the checkpoints that are loaded and ensembled
    'used_epochs': [6,7,8,9],
    # Ensemble weight of each checkpoint, aligned by position with 'used_epochs'
    'weights': [1,1,1,1]
}

In [None]:
# Load the training metadata CSV
train = pd.read_csv('../input/cassava-leaf-disease-classification/train.csv')
train.head()

In [None]:
# Number of rows per label
train.label.value_counts()

> We can use a stratified split so that, in every fold, both the training set and the validation set have the same target distribution as the whole training set.

> k-分割交差検証により、各foldのと訓練データ、検証データを分布の比率を維持して分割することで、対象の訓練、検証データセットが全体のデータセットのように見せることができる。

In [None]:
# Load the sample submission file
submission = pd.read_csv('../input/cassava-leaf-disease-classification/sample_submission.csv')
submission.head()

# Helper Functions

In [None]:
# Seed-fixing helper
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch
    (CPU and current CUDA device), and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Python standard library RNG
    random.seed(seed)
    # Hash seed; Python reads this at interpreter start-up, so it only
    # affects processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # NumPy global RNG
    np.random.seed(seed)
    # PyTorch RNGs
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark mode lets cuDNN auto-tune and pick kernels per run, which can
    # change results between runs; it must be off for the seed to be meaningful.
    torch.backends.cudnn.benchmark = False
    
# Image loading helper
import cv2
def get_img(path):
    """Read an image file with OpenCV and return it in RGB channel order.

    Args:
        path: Path of the image file, passed to ``cv2.imread``.

    Returns:
        The decoded image with its last (channel) axis reversed, i.e.
        OpenCV's BGR order turned into RGB for matplotlib and the models.

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None`` (missing or
            undecodable file). Without this check the failure would surface
            as an opaque ``TypeError`` on the slicing below.
    """
    im_bgr = cv2.imread(path)
    if im_bgr is None:
        raise FileNotFoundError('Could not read image: {}'.format(path))
    # OpenCV decodes to BGR; a reversed slice over the channel axis gives RGB
    im_rgb = im_bgr[:, :, ::-1]
    return im_rgb

# Load one training image and display it
img = get_img('../input/cassava-leaf-disease-classification/train_images/1000015157.jpg')
plt.imshow(img)
plt.show()

# Dataset

In [None]:
# Dataset definition
class CassavaDataset(Dataset):
    """Dataset that loads images listed in a DataFrame via ``get_img``.

    Each row of ``df`` must provide an ``image_id`` column (file name under
    ``data_root``) and, when ``output_label`` is enabled, a ``label`` column.
    """

    def __init__(
        self, df, data_root, transforms=None, output_label=True
    ):
        """Store a re-indexed copy of ``df`` together with the loading options.

        Args:
            df: DataFrame describing the samples.
            data_root: Directory prefix joined with ``image_id`` using ``/``.
            transforms: Optional callable invoked as ``transforms(image=img)``
                whose result is indexed with ``'image'``.
            output_label: When true, items are ``(image, label)`` pairs;
                otherwise only the image is returned.
        """
        super().__init__()
        self.data_root = data_root
        self.transforms = transforms
        self.output_label = output_label
        # Positional access in __getitem__ relies on a clean 0..n-1 index
        self.df = df.reset_index(drop=True).copy()

    def __len__(self):
        """Return the number of rows in the stored DataFrame."""
        return self.df.shape[0]

    def __getitem__(self, index: int):
        """Load the sample at position ``index``, applying transforms if set."""
        row = self.df.iloc[index]

        # The label column is only touched when labels were requested
        target = row['label'] if self.output_label else None

        image = get_img("{}/{}".format(self.data_root, row['image_id']))

        if self.transforms:
            image = self.transforms(image=image)['image']

        if self.output_label == True:
            return image, target
        return image

# Define Train/Validation Image Augmentations
# 訓練および検証画像の拡張定義

In [None]:
# 画像データ拡張ライブラリ
# https://github.com/albumentations-team/albumentations
# https://albumentations.ai/docs/api_reference/pytorch/transforms/#albumentations.pytorch.transforms.ToTensorV2
from albumentations import (
    HorizontalFlip, VerticalFlip, IAAPerspective, ShiftScaleRotate, CLAHE, RandomRotate90,
    Transpose, ShiftScaleRotate, Blur, OpticalDistortion, GridDistortion, HueSaturationValue,
    IAAAdditiveGaussianNoise, GaussNoise, MotionBlur, MedianBlur, IAAPiecewiseAffine, RandomResizedCrop,
    IAASharpen, IAAEmboss, RandomBrightnessContrast, Flip, OneOf, Compose, Normalize, Cutout, CoarseDropout, ShiftScaleRotate, CenterCrop, Resize
)


from albumentations.pytorch import ToTensorV2

# Training-time augmentation pipeline
def get_train_transforms():
    """Build the albumentations pipeline used for training images.

    Returns:
        A ``Compose`` that applies, in order: random resized crop to
        ``CFG['img_size']``, flips/transposition, an affine jitter, colour
        jitter, normalisation, two dropout-style augmentations, and finally
        conversion to a torch tensor.
    """
    side = CFG['img_size']

    # Geometric augmentations: crop to the target size, then random
    # transpose / flips / shift-scale-rotate, each applied with p=0.5
    geometric = [
        RandomResizedCrop(side, side),
        Transpose(p=0.5),
        HorizontalFlip(p=0.5),
        VerticalFlip(p=0.5),
        ShiftScaleRotate(p=0.5),
    ]

    # Colour augmentations: hue/saturation/value and brightness/contrast
    photometric = [
        HueSaturationValue(hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2, p=0.5),
        RandomBrightnessContrast(brightness_limit=(-0.1,0.1), contrast_limit=(-0.1, 0.1), p=0.5),
    ]

    # Normalisation with the given per-channel mean/std, followed by the
    # region-dropout augmentations and the tensor conversion
    finishing = [
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_pixel_value=255.0, p=1.0),
        CoarseDropout(p=0.5),
        Cutout(p=0.5),
        ToTensorV2(p=1.0),
    ]

    return Compose(geometric + photometric + finishing, p=1.)
  

# Validation-time preprocessing pipeline
def get_valid_transforms():
    """Build the deterministic albumentations pipeline for validation images.

    Returns:
        A ``Compose`` that centre-crops and resizes to ``CFG['img_size']``,
        normalises with the given per-channel mean/std and converts the
        result to a torch tensor.
    """
    side = CFG['img_size']

    pipeline = []
    # Crop the central side x side region, then resize to the same size
    pipeline.append(CenterCrop(side, side, p=1.))
    pipeline.append(Resize(side, side))
    # Normalise and convert to a tensor
    pipeline.append(
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_pixel_value=255.0, p=1.0)
    )
    pipeline.append(ToTensorV2(p=1.0))

    return Compose(pipeline, p=1.)

# Inference-time (test-time augmentation) pipeline
def get_inference_transforms():
    """Build the randomised albumentations pipeline used at inference time.

    The pipeline is random, so repeated passes over the same image yield
    different augmented views.

    Returns:
        A ``Compose`` of random resized crop, flips/transposition, colour
        jitter, normalisation and tensor conversion.
    """
    side = CFG['img_size']
    half = 0.5  # application probability shared by the random augmentations

    augmentations = [
        RandomResizedCrop(side, side),
        Transpose(p=half),
        HorizontalFlip(p=half),
        VerticalFlip(p=half),
        HueSaturationValue(hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2, p=half),
        RandomBrightnessContrast(brightness_limit=(-0.1,0.1), contrast_limit=(-0.1, 0.1), p=half),
    ]
    to_model_input = [
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_pixel_value=255.0, p=1.0),
        ToTensorV2(p=1.0),
    ]
    return Compose(augmentations + to_model_input, p=1.)

# Model

In [None]:
from torch import nn
# https://pytorch.org/docs/stable/nn.html

# Model definition
class CassvaImgClassifier(nn.Module):
    """A timm backbone whose ``classifier`` layer is replaced by a new linear head."""

    def __init__(self, model_arch, n_class, pretrained=False):
        """Create the backbone and attach an ``n_class``-way linear head.

        Args:
            model_arch: Architecture name handed to ``timm.create_model``.
            n_class: Number of output units of the new classification head.
            pretrained: Forwarded to ``timm.create_model``.
        """
        super().__init__()
        backbone = timm.create_model(model_arch, pretrained=pretrained)

        # Keep the input width of the existing head and swap in a fresh
        # linear layer that maps it to n_class outputs
        head_inputs = backbone.classifier.in_features
        backbone.classifier = nn.Linear(head_inputs, n_class)

        self.model = backbone

    def forward(self, x):
        """Return the wrapped model's output for the batch ``x``."""
        return self.model(x)

# Main Loop

In [None]:
# プログレスバーの表示
# https://qiita.com/pontyo4/items/76145cb10e030ad8186a
from tqdm import tqdm

# Run one inference pass over a data loader
def inference_one_epoch(model, data_loader, device):
    """Predict class probabilities for every batch yielded by ``data_loader``.

    The model is switched to eval mode (and left in it). Gradient tracking is
    disabled inside this function, so it does not build autograd graphs even
    when the caller forgets to wrap the call in ``torch.no_grad()``.

    Args:
        model: Module called as ``model(imgs)``; its output is soft-maxed
            over dimension 1.
        data_loader: Sized iterable yielding image batches (tensors only,
            no labels).
        device: Device the batches are moved to before the forward pass.

    Returns:
        A NumPy array with the per-batch softmax outputs concatenated along
        axis 0.
    """
    model.eval()

    image_preds_all = []

    # Gradients are never needed for inference; disabling them here keeps
    # memory use low regardless of the calling context.
    with torch.no_grad():
        # tqdm only adds a progress bar around the loader
        for imgs in tqdm(data_loader, total=len(data_loader)):
            imgs = imgs.to(device).float()

            image_preds = model(imgs)

            image_preds_all += [torch.softmax(image_preds, 1).cpu().numpy()]

    image_preds_all = np.concatenate(image_preds_all, axis=0)
    return image_preds_all

In [None]:
# List the contents of the dataset directory that the checkpoints are loaded from
!ls ../input/pytorch-efficientnet-baseline-train-amp-aug

In [None]:
if __name__ == '__main__':
    # Inference only: previously trained checkpoints are loaded and ensembled
    # over several epochs and several test-time-augmentation (TTA) passes.

    # Fix all random seeds (the TTA transforms are random)
    seed_everything(CFG['seed'])

    # Split the training data into stratified train/validation folds.
    # NOTE(review): no shuffle/random_state is passed here. The split must
    # match the one used when the checkpoints were trained, otherwise the
    # validation metrics below are computed on images the model has seen —
    # confirm against the training notebook.
    folds = StratifiedKFold(n_splits=CFG['fold_num']).split(np.arange(train.shape[0]), train.label.values)

    for fold, (trn_idx, val_idx) in enumerate(folds):
        # Only fold 0 is processed; remove this guard to run every fold.
        # Note that `test` / `tst_preds` are overwritten per fold, so later
        # cells would only see the last fold's predictions.
        # https://www.kaggle.com/khyeh0719/pytorch-efficientnet-baseline-train-amp-aug
        if fold > 0:
            break

        print('Inference fold:{0} trn_idx:{1} val_idx:{2} started'.format(fold,trn_idx,val_idx))

        # Validation rows of this fold, re-indexed from 0
        valid_ = train.loc[val_idx,:].reset_index(drop=True)

        # Labels are not emitted by the datasets; validation labels are read
        # from `valid_` when scoring.
        valid_ds = CassavaDataset(valid_, '../input/cassava-leaf-disease-classification/train_images/',
                                  transforms=get_inference_transforms(), output_label=False)

        # One row per file found in the test image directory
        test = pd.DataFrame()
        test['image_id'] = list(os.listdir('../input/cassava-leaf-disease-classification/test_images/'))
        test_ds = CassavaDataset(test, '../input/cassava-leaf-disease-classification/test_images/',
                                 transforms=get_inference_transforms(), output_label=False)

        # shuffle=False keeps predictions aligned with the DataFrame rows
        val_loader = torch.utils.data.DataLoader(
            valid_ds,
            batch_size=CFG['valid_bs'],
            num_workers=CFG['num_workers'],
            shuffle=False,
            pin_memory=False,
        )

        tst_loader = torch.utils.data.DataLoader(
            test_ds,
            batch_size=CFG['valid_bs'],
            num_workers=CFG['num_workers'],
            shuffle=False,
            pin_memory=False,
        )

        device = torch.device(CFG['device'])

        # Instantiate the model; weights come from the checkpoints below
        model = CassvaImgClassifier(CFG['model_arch'], train.label.nunique()).to(device)

        # Weighted per-pass predictions for the validation and test sets
        val_preds = []
        tst_preds = []

        # Loop-invariant normaliser for the ensemble weights
        weight_total = sum(CFG['weights'])

        for i, epoch in enumerate(CFG['used_epochs']):
            checkpoint_path = '../input/pytorch-efficientnet-baseline-train-amp-aug/{}_fold_{}_{}'.format(CFG['model_arch'], fold, epoch)
            # map_location makes loading independent of the device the
            # checkpoint was saved from
            model.load_state_dict(torch.load(checkpoint_path, map_location=device))

            # Share of the final prediction contributed by one TTA pass of
            # this checkpoint; the shares of all passes add up to 1
            pass_weight = CFG['weights'][i] / weight_total / CFG['tta']

            with torch.no_grad():
                for _ in range(CFG['tta']):
                    val_preds += [pass_weight * inference_one_epoch(model, val_loader, device)]
                    tst_preds += [pass_weight * inference_one_epoch(model, tst_loader, device)]

        # The passes are already weighted to sum to 1, so they must be summed.
        # Averaging here would divide a second time and leave rows that no
        # longer sum to 1, which is not a valid input for log_loss.
        val_preds = np.sum(val_preds, axis=0)
        tst_preds = np.sum(tst_preds, axis=0)

        # Validation log loss
        print('fold {} validation loss = {:.5f}'.format(fold, log_loss(valid_.label.values, val_preds)))
        # Validation accuracy
        print('fold {} validation accuracy = {:.5f}'.format(fold, (valid_.label.values==np.argmax(val_preds, axis=1)).mean()))

        # Release the model and any cached GPU memory
        del model
        torch.cuda.empty_cache()

In [None]:
# Predicted label = index of the largest value in each row of tst_preds
test['label'] = np.argmax(tst_preds, axis=1)
test.head()

In [None]:
# Write the predictions to submission.csv without the DataFrame index
test.to_csv('submission.csv', index=False)

# Train part is here: https://www.kaggle.com/khyeh0719/pytorch-efficientnet-baseline-train-amp-aug