## 1단계
- /data/train/aug 폴더 만들기


## 2단계
- /data/train/aug 폴더 아래에 class + state 조합 별로 aug할 데이터 폴더 만들기
- aug에 사용할 이미지 리스트 만들기


## 3단계
- 만들어진 경로에 aug 진행


## 4단계
- 생성된 이미지들을 train_df 에 추가하기
- 이미지 읽는 것을 쉽게 하기 위해서 경로까지 file_name에 포함


## 기준
- 'good'을 제외한 모든 class에 대해서 aug 진행
- 각 class에 대하여 최대 5배까지 aug 진행
- flip, rotation, mixup 쓰면 좋을 듯
    - mixup은 cv2에서 제공하는 cv2.addWeighted를 쓰면 됨
    - mixup의 alpha 값과 beta 값은 label마다 다른 값을 할당
    - mixup은 'good' + @ 로 진행
- https://pyy0715.github.io/Albumentation/ 참고하여 custom aug 만들어보기


## 데이터 버전 관리
- v1
    - aug + mix
- v2
    - aug + mix 데이터 개수 증가
    - 'good' state 추가
- v3 
    - mixup만 적용한 aug => 성능 안좋음
- v4
    - v2에서 개수를 더 증가시킴
    - aug_times = 20, good_img_aug_times=2, mixup_times=10
- v5
    - v4에서 aug 개수만 더 증가시킴
    - aug_times = 35, good_img_aug_times=2, mixup_times=10
- v6
    - v5에서 다른 aug도 적용함
        - Cutout, CoarseDropout 추가
    - mixup 빼고 적용해보기 => training때 적용함 => mixup 학습 경과보고 결정하기
        - 학습 중 mixup 성능이 매우 좋았음, 그래도 일단 모르니 mixup_times=10으로 진행
    - A.OneOf([A.Affine(mode=1), A.GridDistortion()], p=1) => A.OneOf로 묶지 않고 각각 따로 적용
    - aug_times = 45, good_img_aug_times=2, mixup_times=10
    - => 성능 안좋음, 새로 추가한 aug 때문인듯

In [2]:
import cv2
import os
import albumentations as A
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from tqdm import tqdm
from easydict import EasyDict
from glob import glob

### 1단계

In [3]:
# ---- Input locations --------------------------------------------------------
# Directory holding the original training images, and the CSV describing them.
root_path = '../data/train'
train_df_path = '../data/train_df.csv'

# ---- Output locations (dataset version v6) ----------------------------------
# Augmented images are written below root_path; the merged CSV sits next to
# the original one.
aug_img_path = os.path.join(root_path, 'aug_v6')
aug_df_path = '../data/aug_v6_train_df.csv'

# ---- Switches ---------------------------------------------------------------
# use_good   : when True, images whose state is "good" also get an output
#              folder and are augmented.
# only_mixup : when True, the spatial-augmentation pass is skipped for
#              non-"good" images and only mixup images are produced for them.
use_good = True
only_mixup = False

# ---- Number of generated images per source image ----------------------------
aug_times, good_img_aug_times, mixup_times = 45, 2, 10

# ---- Side effects: create the output root and load the training table -------
os.makedirs(aug_img_path, exist_ok=True)
train_df = pd.read_csv(train_df_path)

### 2단계

In [4]:
# Distinct classes, and for each class (same order) its distinct states.
label_list = list(train_df['class'].unique())
state_list = [
    list(train_df.loc[train_df['class'] == label, 'state'].unique())
    for label in label_list
]

# good_imgs[label]             -> file names of that class's "good" images
#                                 (used later as the mixup partner pool).
# label_comb_img[label][state] -> file names to augment for that combination.
good_imgs = {}
label_comb_img = {}
for label, states in zip(label_list, state_list):
    label_comb_img[label] = {}
    class_mask = train_df['class'] == label

    for state in states:
        # One combined mask instead of chained `series[mask1][mask2]`
        # indexing, which depends on pandas re-aligning the second mask
        # against the already-filtered series.
        combo_mask = class_mask & (train_df['state'] == state)
        file_names = list(train_df.loc[combo_mask, 'file_name'])

        if state == "good":
            # Always remember the good images of the class.
            good_imgs[label] = list(file_names)
            if not use_good:
                # "good" itself is not augmented in this mode: no folder,
                # no entry in label_comb_img.
                continue

        # Create the per-combination output folder and register its images.
        os.makedirs(os.path.join(aug_img_path, label + "-" + state), exist_ok=True)
        label_comb_img[label][state] = file_names

### 3단계

In [4]:
# Augmentation pipeline applied to every generated "aug_*" image.
# The steps run in the listed order.
_aug_steps = [
    # Orientation change: one of rotate / horizontal flip / vertical flip.
    A.OneOf(
        [A.Rotate(), A.HorizontalFlip(), A.VerticalFlip()],
        p=1,
    ),

    # Geometric distortions. These two are deliberately NOT wrapped in an
    # A.OneOf: each one is applied independently with probability 0.7.
    A.Affine(mode=1, p=0.7),
    A.GridDistortion(p=0.7),

    # Occlusion: one of Cutout / CoarseDropout.
    A.OneOf(
        [
            A.Cutout(num_holes=150, max_h_size=20, max_w_size=20),
            A.CoarseDropout(
                max_holes=160, min_holes=140,
                max_height=30, max_width=30,
                min_height=15, min_width=15,
            ),
        ],
        p=1,
    ),
]

aug_transforms = A.Compose(_aug_steps)

# Per-class blending weights for mixup, as (class name, alpha, beta).
# In mixup() alpha is the weight passed for the first image (the randomly
# chosen "good" image) and beta the weight for the second one (the image
# being augmented).
_MIXUP_WEIGHTS = (
    ("bottle", 0.6, 0.4),
    ("cable", 0.2, 0.8),
    ("capsule", 0.4, 0.6),
    ("carpet", 0.3, 0.7),
    ("grid", 0.3, 0.7),
    ("hazelnut", 0.7, 0.3),
    ("leather", 0.6, 0.4),
    ("metal_nut", 0.2, 0.8),
    ("pill", 0.2, 0.8),
    ("screw", 0.4, 0.6),
    ("tile", 0.5, 0.5),
    ("toothbrush", 0.3, 0.7),
    ("wood", 0.6, 0.4),
    ("transistor", 0.3, 0.7),
    ("zipper", 0.3, 0.7),
)

# Expand the table into the {class: {"alpha": ..., "beta": ...}} mapping.
mixup_opt = {
    class_name: {"alpha": alpha, "beta": beta}
    for class_name, alpha, beta in _MIXUP_WEIGHTS
}
    

def mixup(ori_img, img, opt):
    """Blend two images with cv2.addWeighted.

    Args:
        ori_img: first image; weighted by alpha.
        img: second image; weighted by beta.
        opt: mapping with the base weights under the keys "alpha" and "beta".

    Returns:
        The result of ``cv2.addWeighted(ori_img, alpha, img, beta, 1)``.

    When the module-level ``only_mixup`` flag is set, the larger of the two
    base weights is re-drawn uniformly from ``[w - 0.05, w + 0.15)`` and the
    other weight is set to its complement, so every call yields a slightly
    different blend. Previously these jittered values were computed but the
    fixed ``opt`` values were still passed to ``addWeighted``, so the jitter
    had no effect.
    """
    alpha, beta = opt["alpha"], opt["beta"]

    if only_mixup:
        if alpha > beta:
            alpha = float(np.random.uniform(low=alpha - 0.05, high=alpha + 0.15))
            beta = 1 - alpha
        else:
            beta = float(np.random.uniform(low=beta - 0.05, high=beta + 0.15))
            alpha = 1 - beta

    # The last argument (gamma) is a scalar added to the weighted sum; it is
    # kept at 1 so the default (non-jittered) output is unchanged.
    return cv2.addWeighted(ori_img, alpha, img, beta, 1)


# Generate the augmented / mixup images for every class-state combination.
#
# Per source image:
#   * "good" images (only present when use_good is set) get
#     good_img_aug_times augmented copies and no mixup.
#   * all other images get aug_times augmented copies (skipped when
#     only_mixup is set) followed by mixup_times blends with a randomly
#     chosen "good" image of the same class.
for label, state_img in tqdm(label_comb_img.items()):
    for state, img_list in state_img.items():
        print("Processing : " + label + "-" + state)
        out_dir = os.path.join(aug_img_path, label + "-" + state)

        for img_name in img_list:
            img_path = os.path.join(root_path, img_name)
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals failure by returning None; fail here with
                # the offending path instead of deep inside a transform.
                raise FileNotFoundError("cv2.imread could not read image: " + img_path)

            is_good = use_good and state == "good"
            if is_good:
                n_aug = good_img_aug_times
            elif only_mixup:
                n_aug = 0
            else:
                n_aug = aug_times

            # Spatial augmentation
            for aug_idx in range(n_aug):
                img_aug = aug_transforms(image=img)['image']
                cv2.imwrite(os.path.join(out_dir, "aug_" + str(aug_idx) + "_" + img_name), img_aug)

            if is_good:
                # Good images are never mixed with other good images.
                continue

            # Mixup with a random good image of the same class
            for mix_idx in range(mixup_times):
                good_path = os.path.join(root_path, random.choice(good_imgs[label]))
                ori_img = cv2.imread(good_path)
                if ori_img is None:
                    raise FileNotFoundError("cv2.imread could not read image: " + good_path)

                img_mixup = mixup(ori_img, img, mixup_opt[label])
                cv2.imwrite(os.path.join(out_dir, "mixup_" + str(mix_idx) + "_" + img_name), img_mixup)

  0%|                                                                                           | 0/15 [00:00<?, ?it/s]

Processing : transistor-good
Processing : transistor-bent_lead
Processing : transistor-damaged_case
Processing : transistor-misplaced
Processing : transistor-cut_lead


  7%|█████▌                                                                             | 1/15 [01:11<16:34, 71.06s/it]

Processing : capsule-good
Processing : capsule-squeeze
Processing : capsule-crack
Processing : capsule-poke
Processing : capsule-faulty_imprint
Processing : capsule-scratch


 13%|██████████▉                                                                       | 2/15 [03:36<24:52, 114.78s/it]

Processing : wood-good
Processing : wood-combined
Processing : wood-scratch
Processing : wood-color
Processing : wood-hole
Processing : wood-liquid


 20%|████████████████▍                                                                 | 3/15 [05:15<21:29, 107.42s/it]

Processing : bottle-good
Processing : bottle-contamination
Processing : bottle-broken_large
Processing : bottle-broken_small


 27%|██████████████████████▏                                                            | 4/15 [06:18<16:28, 89.87s/it]

Processing : screw-good
Processing : screw-thread_side
Processing : screw-manipulated_front
Processing : screw-thread_top
Processing : screw-scratch_neck
Processing : screw-scratch_head


 33%|███████████████████████████▎                                                      | 5/15 [08:35<17:51, 107.16s/it]

Processing : cable-bent_wire
Processing : cable-good
Processing : cable-missing_cable
Processing : cable-cut_outer_insulation
Processing : cable-cut_inner_insulation
Processing : cable-missing_wire
Processing : cable-combined
Processing : cable-cable_swap
Processing : cable-poke_insulation


 40%|████████████████████████████████▊                                                 | 6/15 [10:53<17:38, 117.66s/it]

Processing : carpet-hole
Processing : carpet-good
Processing : carpet-metal_contamination
Processing : carpet-thread
Processing : carpet-color
Processing : carpet-cut


 47%|██████████████████████████████████████▎                                           | 7/15 [13:19<16:54, 126.87s/it]

Processing : hazelnut-good
Processing : hazelnut-crack
Processing : hazelnut-cut
Processing : hazelnut-hole
Processing : hazelnut-print


 53%|███████████████████████████████████████████▋                                      | 8/15 [15:22<14:39, 125.60s/it]

Processing : pill-pill_type
Processing : pill-good
Processing : pill-scratch
Processing : pill-crack
Processing : pill-color
Processing : pill-contamination
Processing : pill-faulty_imprint
Processing : pill-combined


 60%|█████████████████████████████████████████████████▏                                | 9/15 [17:30<12:38, 126.34s/it]

Processing : metal_nut-scratch
Processing : metal_nut-good
Processing : metal_nut-flip
Processing : metal_nut-color
Processing : metal_nut-bent


 67%|██████████████████████████████████████████████████████                           | 10/15 [18:42<09:07, 109.44s/it]

Processing : zipper-fabric_border
Processing : zipper-good
Processing : zipper-split_teeth
Processing : zipper-rough
Processing : zipper-fabric_interior
Processing : zipper-squeezed_teeth
Processing : zipper-combined
Processing : zipper-broken_teeth


 73%|███████████████████████████████████████████████████████████▍                     | 11/15 [20:51<07:41, 115.39s/it]

Processing : leather-good
Processing : leather-fold
Processing : leather-cut
Processing : leather-glue
Processing : leather-poke
Processing : leather-color


 80%|████████████████████████████████████████████████████████████████▊                | 12/15 [23:11<06:09, 123.07s/it]

Processing : toothbrush-good
Processing : toothbrush-defective


 87%|███████████████████████████████████████████████████████████████████████           | 13/15 [23:52<03:16, 98.26s/it]

Processing : tile-good
Processing : tile-glue_strip
Processing : tile-gray_stroke
Processing : tile-oil
Processing : tile-crack
Processing : tile-rough


 93%|████████████████████████████████████████████████████████████████████████████▌     | 14/15 [25:22<01:35, 95.67s/it]

Processing : grid-good
Processing : grid-broken
Processing : grid-glue
Processing : grid-bent
Processing : grid-thread
Processing : grid-metal_contamination


100%|█████████████████████████████████████████████████████████████████████████████████| 15/15 [26:43<00:00, 106.88s/it]


### 4단계

In [5]:
# Build the final training table: the original rows plus one row per
# generated image, then write it to aug_df_path.
train_df = pd.read_csv(train_df_path)

# Store the path (root_path + file name) in file_name so that original and
# generated images can be read the same way.
train_df['file_name'] = [os.path.join(root_path, name) for name in train_df['file_name']]

frames = [train_df]
next_index = len(train_df)  # 'index' values continue after the original rows

# sorted(): glob returns entries in arbitrary order; sorting makes the
# resulting CSV reproducible across runs and machines.
for aug_path in sorted(glob(os.path.join(aug_img_path, "*"))):
    if not os.path.isdir(aug_path):
        continue  # ignore stray files next to the "<class>-<state>" folders

    # Folder name is "<class>-<state>". os.path.basename works with both
    # "/" and "\\" separators, unlike splitting on "\\" only.
    label = os.path.basename(os.path.normpath(aug_path))
    class_name, _, state = label.partition("-")

    file_list = sorted(glob(os.path.join(aug_path, "*")))
    if not file_list:
        continue
    n_files = len(file_list)

    frames.append(pd.DataFrame({
        'index': list(range(next_index, next_index + n_files)),
        'file_name': file_list,
        'class': [class_name] * n_files,
        'state': [state] * n_files,
        'label': [label] * n_files,
    }))
    next_index += n_files

# Concatenate once instead of re-copying the growing frame on every folder.
train_df = pd.concat(frames).reset_index(drop=True)
train_df.to_csv(aug_df_path, mode='w')