# run_mydataset.py のcross-validationをコマンドラインで実行する例

In [1]:
# parameters
# NAME: run name; the output directory ./output/{NAME} is derived from it below.
NAME = "test"
# SEED: random seed, passed to seed_everything() and to run_mydataset.py via -s.
SEED = 0
# N_FOLD: number of cross-validation folds.
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
N_FOLD = 2

In [2]:
#!git clone https://github.com/rvorias/ind_knn_ad.git
#!rm -r ./output/{NAME}

In [3]:
import os
import sys
import glob
import random
import numpy as np
import pandas as pd
from pathlib import Path
import torch

# Every artifact of this run is written under ./output/{NAME}.
OUTPUT_DIR = f"./output/{NAME}"
print(OUTPUT_DIR)
# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

./output/test


In [4]:
from sklearn.model_selection import StratifiedKFold
def cv_split(train, seed: int, n_splits: int, cv_col: str = "label"):
    """
    Assign a stratified cross-validation fold number to every row.

    Classes of ``cv_col`` that have fewer than ``n_splits`` rows cannot be
    stratified; their rows are set aside and receive ``fold = -1``. All other
    rows are split with a shuffled ``StratifiedKFold`` and receive the index
    of the fold in which they are validation rows.

    Args:
        train: DataFrame containing the column ``cv_col``.
        seed: ``random_state`` handed to ``StratifiedKFold``.
        n_splits: number of folds.
        cv_col: column to stratify on.

    Returns:
        A new DataFrame (fresh 0..n-1 index) with an integer ``fold`` column;
        the rows of under-represented classes are placed at the end.
    """
    class_sizes = train[cv_col].value_counts()
    rare_classes = class_sizes.loc[class_sizes < n_splits].index.to_list()

    # Partition the rows once into "too rare to stratify" and "splittable".
    is_rare = train[cv_col].isin(rare_classes)
    rare_rows = train[is_rare].reset_index(drop=True)
    splittable = train[~is_rare].reset_index(drop=True)

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    fold_iter = splitter.split(X=splittable, y=splittable[cv_col])
    for fold_no, (_, valid_idx) in enumerate(fold_iter):
        # valid_idx is positional, which matches the labels after reset_index.
        splittable.loc[valid_idx, "fold"] = fold_no
        print(f"fold{fold_no}:", splittable.loc[valid_idx, "fold"].shape)

    # Sentinel fold for rows that were excluded from the split.
    rare_rows["fold"] = -1
    result = pd.concat([splittable, rare_rows], ignore_index=True)

    # The column is created through .loc assignment, so cast it to int explicitly.
    result["fold"] = result["fold"].astype(int)
    print(result.groupby(["fold", cv_col]).size())
    return result

In [5]:
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also exports ``PYTHONHASHSEED`` to the environment and turns off the
    cuDNN benchmark mode.

    Args:
        seed: integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for every library-level generator, applied in a fixed order.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Full cuDNN determinism is intentionally left disabled here.
    #torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Apply the notebook-wide SEED to every generator handled by seed_everything().
seed_everything(SEED)

In [6]:
# sample_data
def _make_sample_csv(output_dir="."):
    """ind_knn_adのサンプル画像から入力用csv作成"""
    good_paths = sorted(glob.glob(f"./ind_knn_ad/datasets/transistor_reduced/train/good/*"))
    defect_paths = []
    for d in ["misplaced", "damaged_case"]:
        defect_paths += sorted(glob.glob(f"./ind_knn_ad/datasets/transistor_reduced/test/{d}/*"))
    print("len(train_good):", len(good_paths))
    print("len(train_defect):", len(defect_paths))
    train_good = pd.DataFrame({"file_path": good_paths, "label": 0})
    train_defect = pd.DataFrame({"file_path": defect_paths, "label": 1})
    train = pd.concat([train_good, train_defect], ignore_index=True)
    
    good_paths = sorted(glob.glob(f"./ind_knn_ad/datasets/transistor_reduced/test/good/*"))
    defect_paths = []
    for d in ["cut_lead", "bent_lead"]:
        defect_paths += sorted(glob.glob(f"./ind_knn_ad/datasets/transistor_reduced/test/{d}/*"))
    print("len(valid_good):", len(good_paths))
    print("len(valid_defect):", len(defect_paths))
    valid_good = pd.DataFrame({"file_path": good_paths, "label": 0})
    valid_defect = pd.DataFrame({"file_path": defect_paths, "label": 1})
    valid = pd.concat([valid_good, valid_defect], ignore_index=True)
    
    train["fold"] = 0
    valid["fold"] = 1
    folds = pd.concat([train, valid])
    folds.to_csv(f"{output_dir}/folds.csv", index=False)
    return folds.reset_index(drop=True)

# Build the sample folds table, save it as {OUTPUT_DIR}/folds.csv, and preview the first rows.
folds = _make_sample_csv(output_dir=OUTPUT_DIR)
display(folds.head())

len(train_good): 50
len(train_defect): 4
len(valid_good): 2
len(valid_defect): 4


Unnamed: 0,file_path,label,fold
0,./ind_knn_ad/datasets/transistor_reduced/train...,0,0
1,./ind_knn_ad/datasets/transistor_reduced/train...,0,0
2,./ind_knn_ad/datasets/transistor_reduced/train...,0,0
3,./ind_knn_ad/datasets/transistor_reduced/train...,0,0
4,./ind_knn_ad/datasets/transistor_reduced/train...,0,0


In [7]:
#=====

In [8]:
# cross-validation
# Runs run_mydataset.py as a shell command on {OUTPUT_DIR}/folds.csv with a PatchCore
# model on the wide_resnet50_2 backbone; the output directory is {OUTPUT_DIR}/tmp.
# NOTE(review): flag meanings (-tr train csv, -o output dir, -m model spec, -s seed)
# are inferred from the values passed here — confirm against run_mydataset.py.
!python run_mydataset.py \
-tr {OUTPUT_DIR}/folds.csv \
-o {OUTPUT_DIR}/tmp \
-m "PatchCore(f_coreset=1., backbone_name='wide_resnet50_2')" \
-s {SEED}
print("="*75)

=> PatchCore(f_coreset=1., backbone_name='wide_resnet50_2')
   100%|██████████| 2/2 [00:01<00:00,  1.59it/s]                                
model load_flg: <All keys matched successfully> ./output/test/tmp/fold0/model.pth
100%|███████████████████████████████████████████| 54/54 [00:03<00:00, 14.30it/s]
fold0 auc: 0.9750000000000001
   100%|██████████| 50/50 [00:02<00:00, 24.86it/s]                              
model load_flg: <All keys matched successfully> ./output/test/tmp/fold1/model.pth
100%|█████████████████████████████████████████████| 6/6 [00:04<00:00,  1.40it/s]
fold1 auc: 1.0
oof auc: 0.8317307692307692
=> OUTPUT: ./output/test/tmp/folds_predict.csv


In [9]:
#=====

In [10]:
# cross-validation
# Backbone names substituted into the PatchCore model spec, one run per entry.
models = [
    'tf_efficientnetv2_b0.in1k',
    'tf_efficientnetv2_b1.in1k',
    'tf_efficientnetv2_b2.in1k',
    'seresnext50_32x4d.racm_in1k', 
    'resnet50.a1_in1k',
    'resnet152.a3_in1k',
]
# Same command as the single run, repeated per backbone. Each run gets its own
# output sub-directory named after the backbone with "." replaced by "_".
for x in models:
    # Model spec string handed to -m; only the backbone name varies.
    m = f"PatchCore(f_coreset=1., backbone_name='{x}')"
    !python run_mydataset.py \
    -tr {OUTPUT_DIR}/folds.csv \
    -o {OUTPUT_DIR}/{x.replace(".", "_")} \
    -m "{m}"\
    -s {SEED}
    print("="*75)
    print()

=> PatchCore(f_coreset=1., backbone_name='tf_efficientnetv2_b0.in1k')
   100%|██████████| 2/2 [00:01<00:00,  1.56it/s]                                
model load_flg: <All keys matched successfully> ./output/test/tf_efficientnetv2_b0_in1k/fold0/model.pth
100%|███████████████████████████████████████████| 54/54 [00:02<00:00, 19.11it/s]
fold0 auc: 0.965
   100%|██████████| 50/50 [00:01<00:00, 25.76it/s]                              
model load_flg: <All keys matched successfully> ./output/test/tf_efficientnetv2_b0_in1k/fold1/model.pth
100%|█████████████████████████████████████████████| 6/6 [00:01<00:00,  5.46it/s]
fold1 auc: 0.875
oof auc: 0.7283653846153846
=> OUTPUT: ./output/test/tf_efficientnetv2_b0_in1k/folds_predict.csv

=> PatchCore(f_coreset=1., backbone_name='tf_efficientnetv2_b1.in1k')
   100%|██████████| 2/2 [00:01<00:00,  1.58it/s]                                
model load_flg: <All keys matched successfully> ./output/test/tf_efficientnetv2_b1_in1k/fold0/model.pth
100%|██████

In [11]:
# ens
from sklearn.metrics import roc_auc_score

# Sub-directories of OUTPUT_DIR that hold each backbone's folds_predict.csv
# (the backbone names used for cross-validation, with "." replaced by "_").
ens_model_dirs = [
    "tf_efficientnetv2_b0_in1k",
    "tf_efficientnetv2_b1_in1k",
    "tf_efficientnetv2_b2_in1k",
    "seresnext50_32x4d_racm_in1k",
    "resnet50_a1_in1k",
    "resnet152_a3_in1k",
]

# The ensemble score is the plain sum of the per-model "pred" columns.
# The sum is accumulated in a fresh array rather than in place on `.values`:
# under pandas Copy-on-Write that array is a read-only view and `+=` raises.
preds = None
reference_paths = None
for model_dir in ens_model_dirs:
    folds1 = pd.read_csv(f"{OUTPUT_DIR}/{model_dir}/folds_predict.csv")
    model_scores = folds1["pred"].to_numpy()
    if preds is None:
        reference_paths = folds1["file_path"]
        preds = model_scores.copy()
    else:
        # Adding scores row by row is only meaningful if every file lists the same rows.
        assert folds1["file_path"].equals(reference_paths), (
            f"row order of {model_dir}/folds_predict.csv differs from {ens_model_dirs[0]}"
        )
        preds = preds + model_scores

# folds1 is the last frame read; its "pred" column is replaced by the ensemble score.
folds1["pred"] = preds
display(folds1)

roc_auc_score(folds1["label"].values, folds1["pred"].values)

Unnamed: 0,file_path,label,fold,pred
0,./ind_knn_ad/datasets/transistor_reduced/train...,0,0,115.372269
1,./ind_knn_ad/datasets/transistor_reduced/train...,0,0,117.020937
2,./ind_knn_ad/datasets/transistor_reduced/train...,0,0,115.97726
3,./ind_knn_ad/datasets/transistor_reduced/train...,0,0,118.25786
4,./ind_knn_ad/datasets/transistor_reduced/train...,0,0,113.18412
5,./ind_knn_ad/datasets/transistor_reduced/train...,0,0,114.971328
6,./ind_knn_ad/datasets/transistor_reduced/train...,0,0,111.410293
7,./ind_knn_ad/datasets/transistor_reduced/train...,0,0,100.725113
8,./ind_knn_ad/datasets/transistor_reduced/train...,0,0,126.278847
9,./ind_knn_ad/datasets/transistor_reduced/train...,0,0,109.34894


0.7692307692307693

In [12]:
#=====

In [13]:
# test pred
# Points -o at the tf_efficientnetv2_b0.in1k directory written by the cross-validation
# loop and passes the folds CSV via -te together with --is_test_folds.
# NOTE(review): presumably this scores the CSV with each saved fold model — the recorded
# log loads fold0/fold1 model.pth and writes folds_predict_folds.csv; confirm in run_mydataset.py.
x = 'tf_efficientnetv2_b0.in1k'
m = f"PatchCore(f_coreset=1., backbone_name='{x}')"
!python run_mydataset.py \
-te {OUTPUT_DIR}/folds.csv \
-o {OUTPUT_DIR}/{x.replace(".", "_")} \
-m "{m}"\
-s {SEED} \
--is_test_folds

model load_flg: <All keys matched successfully> ./output/test/tf_efficientnetv2_b0_in1k/fold0/model.pth
100%|███████████████████████████████████████████| 60/60 [00:05<00:00, 10.91it/s]
model load_flg: <All keys matched successfully> ./output/test/tf_efficientnetv2_b0_in1k/fold1/model.pth
100%|███████████████████████████████████████████| 60/60 [00:19<00:00,  3.03it/s]
=> OUTPUT: ./output/test/tf_efficientnetv2_b0_in1k/folds_predict_folds.csv
