# Uncertainty Based Cost-Effective Labeling Methodology

In [1]:
import torch
import torchvision.datasets as dset
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split, Dataset
import torch.nn as nn
import time
from torch import optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torchvision
import os
import natsort
import pandas as pd
from PIL import Image
from tqdm import tqdm
import shutil
import pandas as pd
import numpy as np

# Device Setting

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
class gpu_setting:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model

In [4]:
class BasicBlock(nn.Module):
    expansion = 1
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        self.residual_function = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels * BasicBlock.expansion, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(out_channels * BasicBlock.expansion),
        )

        # identity mapping, input과 output의 feature map size, filter 수가 동일한 경우 사용.
        self.shortcut = nn.Sequential()

        self.relu = nn.ReLU()

        # projection mapping using 1x1conv
        if stride != 1 or in_channels != BasicBlock.expansion * out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels * BasicBlock.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * BasicBlock.expansion)
            )

    def forward(self, x):
        x = self.residual_function(x) + self.shortcut(x)
        x = self.relu(x)
        return x


class BottleNeck(nn.Module):
    expansion = 4
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        self.residual_function = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels * BottleNeck.expansion, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(out_channels * BottleNeck.expansion),
        )

        self.shortcut = nn.Sequential()

        self.relu = nn.ReLU()

        if stride != 1 or in_channels != out_channels * BottleNeck.expansion:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels*BottleNeck.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels*BottleNeck.expansion)
            )
            
    def forward(self, x):
        x = self.residual_function(x) + self.shortcut(x)
        x = self.relu(x)
        return x

class ResNet(nn.Module):
    def __init__(self, block, num_block, num_classes=9, init_weights=True):
        super().__init__()

        self.in_channels=64

        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )

        self.conv2_x = self._make_layer(block, 64, num_block[0], 1)
        self.conv3_x = self._make_layer(block, 128, num_block[1], 2)
        self.conv4_x = self._make_layer(block, 256, num_block[2], 2)
        self.conv5_x = self._make_layer(block, 512, num_block[3], 2)

        self.avg_pool = nn.AdaptiveAvgPool2d((1,1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)
        
        # weights inittialization
        if init_weights:
            self._initialize_weights()

    def _make_layer(self, block, out_channels, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, out_channels, stride))
            self.in_channels = out_channels * block.expansion

        return nn.Sequential(*layers)

    def forward(self,x):
        output = self.conv1(x)
        output = self.conv2_x(output)
        x = self.conv3_x(output)
        x = self.conv4_x(x)
        x = self.conv5_x(x)
        x = self.avg_pool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

    # define weight initialization function
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

def resnet18():
    return ResNet(BasicBlock, [2,2,2,2])

def resnet34():
    return ResNet(BasicBlock, [3, 4, 6, 3])

def resnet50():
    return ResNet(BottleNeck, [3,4,6,3])

def resnet101():
    return ResNet(BottleNeck, [3, 4, 23, 3])

def resnet152():
    return ResNet(BottleNeck, [3, 8, 36, 3])

In [5]:
def ResNetParameters(model, train_dl, valid_dl):
    device = gpu_setting.device
    model = model.lower()
    if model == 'resnet34':
        model = resnet34().to(device)
    if model == 'resnet50':
        model = resnet50().to(device)
    if model == 'resnet101':
        model = resnet101().to(device)
        
    loss_func = nn.CrossEntropyLoss(reduction='sum')
    opt = optim.Adam(model.parameters(), lr=0.001)
    lr_scheduler = ReduceLROnPlateau(opt, mode='min', factor=0.1, patience=10)
    
    # definc the training parameters
    params_train = {
        'num_epochs':5,
        'optimizer':opt,
        'loss_func':loss_func,
        'train_dl':train_dl, 
        'val_dl':valid_dl,
        'sanity_check':False,
        'lr_scheduler':lr_scheduler,
        'path2weights':'./trained_model/weights_original_res.pt'
    }
    return model, params_train

# Function

In [6]:
def get_lr(opt):
    for param_group in opt.param_groups:
        return param_group['lr']

def metric_batch(output, target):
    pred = output.argmax(1, keepdim=True)
    corrects = pred.eq(target.view_as(pred)).sum().item()
    return corrects

def loss_batch(loss_func, output, target, opt=None):
    loss = loss_func(output, target)
    metric_b = metric_batch(output, target)

    if opt is not None:
        opt.zero_grad()
        loss.backward()
        opt.step()

    return loss.item(), metric_b

def loss_epoch(model, loss_func, dataset_dl, sanity_check=False, opt=None):
    running_loss = 0.0
    running_metric = 0.0
    len_data = len(dataset_dl.dataset)

    for xb, yb in dataset_dl:
        xb = xb.to(device)
        yb = yb.to(device)
        output = model(xb)

        loss_b, metric_b = loss_batch(loss_func, output, yb, opt)

        running_loss += loss_b
        
        if metric_b is not None:
            running_metric += metric_b
        
        if sanity_check is True:
            break

    loss = running_loss / len_data
    metric = running_metric / len_data

    return loss, metric

def train_val(model, params, epoch):
    num_epochs=epoch
    loss_func=params["loss_func"]
    opt=params["optimizer"]
    train_dl=params["train_dl"]
    val_dl=params["val_dl"]
    sanity_check=params["sanity_check"]
    lr_scheduler=params["lr_scheduler"]
    path2weights=params["path2weights"]

    loss_history = {'train': [], 'val': []}
    metric_history = {'train': [], 'val': []}

    # # GPU out of memoty error
    # best_model_wts = copy.deepcopy(model.state_dict())

    best_loss = float('inf')

    start_time = time.time()

    for epoch in range(num_epochs):
        current_lr = get_lr(opt)
        print('Epoch {}/{}, current lr={}'.format(epoch, num_epochs-1, current_lr))

        model.train()
        train_loss, train_metric = loss_epoch(model, loss_func, train_dl, sanity_check, opt)
        loss_history['train'].append(train_loss)
        metric_history['train'].append(train_metric)

        model.eval()
        with torch.no_grad():
            val_loss, val_metric = loss_epoch(model, loss_func, val_dl, sanity_check)
        loss_history['val'].append(val_loss)
        metric_history['val'].append(val_metric)

        if val_loss < best_loss:
            best_loss = val_loss
            # best_model_wts = copy.deepcopy(model.state_dict())

            torch.save(model.state_dict(), path2weights)
            print('Copied best model weights!')
            print('Get best val_loss')

        lr_scheduler.step(val_loss)

        print('train loss: %.6f, val loss: %.6f, accuracy: %.2f, time: %.4f min' %(train_loss, val_loss, 100*val_metric, (time.time()-start_time)/60))
        print('-'*10)

    # model.load_state_dict(best_model_wts)

    return model, loss_history, metric_history

In [7]:
class CustomDataSet():
    def __init__(self, main_dir, transform, num):
        self.main_dir = main_dir
        self.transform = transform
        self.num = num
        all_imgs = os.listdir(main_dir)
        self.total_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

    def __len__(self):
        return len(self.total_imgs)

    def __getitem__(self, idx):
        img_loc = os.path.join(self.main_dir, self.total_imgs[idx])
        image = Image.open(img_loc).convert("RGB")
        tensor_image = self.transform(image)
        return tensor_image

In [8]:
class CustomSubset(Dataset):
    def __init__(self,Subset,transform=None):
        super(CustomSubset,self).__init__()
        self.Subset=Subset
        self.indices=Subset.indices
        self.transform=transform

    def __len__(self):
        return len(self.Subset)

    def __getitem__(self,idx):
        img,label=self.Subset[idx]
        if self.transform is not None:
            img=self.transform(img)
        return img,label

# csv

In [9]:
df_155655 = pd.read_csv('../csv/data_155655.csv')
label = pd.read_csv('../csv/data_172950.csv', index_col=0)

# Phase 1

## Training

In [11]:
# Initial train data(17,295)
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)

# train & valid split
train_data_len = int(len(train_folder_dataset)*0.8)
valid_data_len = len(train_folder_dataset) - train_data_len
train_data, valid_data = random_split(train_folder_dataset, [train_data_len, valid_data_len])

In [12]:
print(len(train_data), len(valid_data))

13836 3459


In [13]:
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])

train_data = CustomSubset(train_data, train_transformation)
valid_data = CustomSubset(valid_data, train_transformation)

train_dl = DataLoader(train_data, batch_size=32, shuffle=True)
valid_dl = DataLoader(valid_data, batch_size=32, shuffle=False)

In [14]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [16]:
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.331893, val loss: 0.324012, accuracy: 90.78, time: 1.5423 min
----------
Epoch 1/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.228293, val loss: 0.287123, accuracy: 92.74, time: 2.4251 min
----------
Epoch 2/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.206911, val loss: 0.194008, accuracy: 94.68, time: 3.2917 min
----------
Epoch 3/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.183145, val loss: 0.187804, accuracy: 94.28, time: 4.1291 min
----------
Epoch 4/4, current lr=0.001
train loss: 0.175342, val loss: 0.223950, accuracy: 94.02, time: 4.9915 min
----------


## Predict

### Set 1 Data Load

In [17]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
test_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])
my_dataset = CustomDataSet(test_dir, transform=test_transformation, num=0)
test_loader = DataLoader(my_dataset, shuffle=False)

all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

### Set 1 data predict & Softmax

In [18]:
pred_list =[]
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [19]:
len(pred_list)

17295

In [20]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

### Uncertainty

In [21]:
cnt = 0
label_list = []
img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    if bool(pred_list[i][0].sort().values[-1] - pred_list[i][0].sort().values[-2] <= 0.98):
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

### Mislabelings & Engineer Cost

In [22]:
engineer_label_count = []
wrong_cnt_list = []
real_label_list = list(df_155655.loc[:17294]['failureNum'].values)
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 3517285.09it/s]


4

In [23]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

## Result

In [24]:
print(engineer_label_count)
print(wrong_cnt_list)

[9509]
[4]


## Append Set 1 Data to Train Data

In [25]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

# Phase 2

## Training

In [26]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)

# train & valid split
train_data_len = int(len(train_folder_dataset)*0.8)
valid_data_len = len(train_folder_dataset) - train_data_len
train_data, valid_data = random_split(train_folder_dataset, [train_data_len, valid_data_len])

In [27]:
print(len(train_data), len(valid_data))

27672 6918


In [28]:
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])

train_data = CustomSubset(train_data, train_transformation)
valid_data = CustomSubset(valid_data, train_transformation)

train_dl = DataLoader(train_data, batch_size=32, shuffle=True)
valid_dl = DataLoader(valid_data, batch_size=32, shuffle=False)

In [29]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [30]:
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.345949, val loss: 0.447560, accuracy: 87.14, time: 1.6770 min
----------
Epoch 1/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.205498, val loss: 0.185279, accuracy: 94.81, time: 3.4603 min
----------
Epoch 2/4, current lr=0.001
train loss: 0.168548, val loss: 0.189313, accuracy: 95.09, time: 5.1698 min
----------
Epoch 3/4, current lr=0.001
train loss: 0.145948, val loss: 0.264448, accuracy: 92.09, time: 6.8366 min
----------
Epoch 4/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.128392, val loss: 0.115933, accuracy: 96.14, time: 8.5057 min
----------


## Predict

### Set 2 Data Load

In [31]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
test_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])
my_dataset = CustomDataSet(test_dir, transform=test_transformation, num=0)
test_loader = DataLoader(my_dataset, shuffle=False)

all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [32]:
len(test_loader.dataset)

17295

### Set 2 Data Predict & Softmax

In [33]:
pred_list =[]
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [34]:
len(pred_list)

17295

In [35]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

### Uncertainty

In [36]:
cnt = 0
label_list = []
img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    if bool(pred_list[i][0].sort().values[-1] - pred_list[i][0].sort().values[-2] <= 0.98):
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

### Mislabelings & Engineer Cost

In [37]:
real_label_list = list(df_155655.loc[17295*1:17295*2-1]['failureNum'].values)
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 4338286.45it/s]


91

In [38]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

## Result

In [39]:
print(engineer_label_count)
print(wrong_cnt_list)

[9509, 5673]
[4, 91]


## Append Set 2 Data to Train data

In [40]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

# Phase 3

## Training

In [41]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)

# train & valid split
train_data_len = int(len(train_folder_dataset)*0.8)
valid_data_len = len(train_folder_dataset) - train_data_len
train_data, valid_data = random_split(train_folder_dataset, [train_data_len, valid_data_len])

In [42]:
print(len(train_data), len(valid_data))

41508 10377


In [43]:
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])

train_data = CustomSubset(train_data, train_transformation)
valid_data = CustomSubset(valid_data, train_transformation)

train_dl = DataLoader(train_data, batch_size=32, shuffle=True)
valid_dl = DataLoader(valid_data, batch_size=32, shuffle=False)

In [44]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [45]:
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.347857, val loss: 1.300647, accuracy: 84.41, time: 2.7268 min
----------
Epoch 1/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.207475, val loss: 0.245724, accuracy: 92.56, time: 5.3616 min
----------
Epoch 2/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.169871, val loss: 0.158580, accuracy: 95.01, time: 8.0267 min
----------
Epoch 3/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.143709, val loss: 0.134284, accuracy: 95.62, time: 10.6906 min
----------
Epoch 4/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.128882, val loss: 0.130549, accuracy: 95.78, time: 13.3967 min
----------


## Predict

### Set 3 Data Load

In [46]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
test_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])
my_dataset = CustomDataSet(test_dir, transform=test_transformation, num=0)
test_loader = DataLoader(my_dataset, shuffle=False)

all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [47]:
len(test_loader.dataset)

17295

### Set 3 Data Predict & Softmax

In [48]:
pred_list =[]
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [49]:
len(pred_list)

17295

In [50]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

### Uncertainty

In [53]:
cnt = 0
label_list = []
img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    if bool(pred_list[i][0].sort().values[-1] - pred_list[i][0].sort().values[-2] <= 0.98):
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

### Mislabelings & Engineer Cost

In [54]:
real_label_list = list(df_155655.loc[17295*2:17295*3-1]['failureNum'].values)
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 3470670.67it/s]


16

In [55]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

## Result

In [56]:
print(engineer_label_count)
print(wrong_cnt_list)

[9509, 5673, 3969]
[4, 91, 16]


## Append Set 3 Data to Train Data

In [57]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

# Phase 4

## Training

In [58]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)

# train & valid split
train_data_len = int(len(train_folder_dataset)*0.8)
valid_data_len = len(train_folder_dataset) - train_data_len
train_data, valid_data = random_split(train_folder_dataset, [train_data_len, valid_data_len])

In [59]:
print(len(train_data), len(valid_data))

55344 13836


In [60]:
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])

train_data = CustomSubset(train_data, train_transformation)
valid_data = CustomSubset(valid_data, train_transformation)

train_dl = DataLoader(train_data, batch_size=32, shuffle=True)
valid_dl = DataLoader(valid_data, batch_size=32, shuffle=False)

In [61]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [62]:
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.298132, val loss: 0.221231, accuracy: 92.14, time: 3.4211 min
----------
Epoch 1/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.182382, val loss: 0.187300, accuracy: 94.13, time: 6.8321 min
----------
Epoch 2/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.151660, val loss: 0.166604, accuracy: 94.65, time: 10.3746 min
----------
Epoch 3/4, current lr=0.001
train loss: 0.131734, val loss: 0.173981, accuracy: 95.24, time: 13.8325 min
----------
Epoch 4/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.117286, val loss: 0.109345, accuracy: 96.31, time: 17.2106 min
----------


## Predict

### Set 4 Data Load

In [63]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
test_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])
my_dataset = CustomDataSet(test_dir, transform=test_transformation, num=0)
test_loader = DataLoader(my_dataset, shuffle=False)

all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [64]:
len(test_loader.dataset)

17295

### Set 4  Data Predict & Softmax

In [65]:
pred_list =[]
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [66]:
len(pred_list)

17295

In [67]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

### Uncertainty

In [68]:
cnt = 0
label_list = []
img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    if bool(pred_list[i][0].sort().values[-1] - pred_list[i][0].sort().values[-2] <= 0.98):
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

### Mislabelings & Engineer Cost

In [69]:
real_label_list = list(df_155655.loc[17295*3:17295*4-1]['failureNum'].values)
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 4338286.45it/s]


13

In [70]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

## Result

In [71]:
print(engineer_label_count)
print(wrong_cnt_list)

[9509, 5673, 3969, 4457]
[4, 91, 16, 13]


## Append Set 4 Data to Train Data

In [72]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

# Phase 5

## Training

In [73]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)

# train & valid split
train_data_len = int(len(train_folder_dataset)*0.8)
valid_data_len = len(train_folder_dataset) - train_data_len
train_data, valid_data = random_split(train_folder_dataset, [train_data_len, valid_data_len])

In [74]:
print(len(train_data), len(valid_data))

69180 17295


In [75]:
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])

train_data = CustomSubset(train_data, train_transformation)
valid_data = CustomSubset(valid_data, train_transformation)

train_dl = DataLoader(train_data, batch_size=32, shuffle=True)
valid_dl = DataLoader(valid_data, batch_size=32, shuffle=False)

In [76]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [77]:
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.257689, val loss: 0.671637, accuracy: 80.55, time: 4.2218 min
----------
Epoch 1/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.162191, val loss: 0.155870, accuracy: 94.81, time: 8.7474 min
----------
Epoch 2/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.133412, val loss: 0.136975, accuracy: 95.77, time: 12.8994 min
----------
Epoch 3/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.116397, val loss: 0.106244, accuracy: 96.82, time: 17.1054 min
----------
Epoch 4/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.101641, val loss: 0.100716, accuracy: 96.87, time: 21.3612 min
----------


## Predict

### Set 5 Data Load

In [78]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
test_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])
my_dataset = CustomDataSet(test_dir, transform=test_transformation, num=0)
test_loader = DataLoader(my_dataset, shuffle=False)

all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [79]:
len(test_loader.dataset)

17295

### Set 5 Data Predict & Softmax

In [80]:
pred_list =[]
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [81]:
len(pred_list)

17295

In [82]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

### Uncertainty

In [83]:
cnt = 0
label_list = []
img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    if bool(pred_list[i][0].sort().values[-1] - pred_list[i][0].sort().values[-2] <= 0.98):
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

### Mislabelings & Engineer Cost

In [84]:
real_label_list = list(df_155655.loc[17295*4:17295*5-1]['failureNum'].values)
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 4267087.51it/s]


76

In [85]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

## Result

In [86]:
print(engineer_label_count)
print(wrong_cnt_list)

[9509, 5673, 3969, 4457, 1292]
[4, 91, 16, 13, 76]


## Append Set 5 to Train Data

In [87]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

# Phase 6

## Training

In [88]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)

# train & valid split
train_data_len = int(len(train_folder_dataset)*0.8)
valid_data_len = len(train_folder_dataset) - train_data_len
train_data, valid_data = random_split(train_folder_dataset, [train_data_len, valid_data_len])

In [89]:
print(len(train_data), len(valid_data))

83016 20754


In [90]:
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])

train_data = CustomSubset(train_data, train_transformation)
valid_data = CustomSubset(valid_data, train_transformation)

train_dl = DataLoader(train_data, batch_size=32, shuffle=True)
valid_dl = DataLoader(valid_data, batch_size=32, shuffle=False)

In [91]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [92]:
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.230613, val loss: 0.290637, accuracy: 93.92, time: 5.0159 min
----------
Epoch 1/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.146696, val loss: 0.124405, accuracy: 96.01, time: 10.2698 min
----------
Epoch 2/4, current lr=0.001
train loss: 0.120556, val loss: 0.126586, accuracy: 95.96, time: 15.3039 min
----------
Epoch 3/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.102748, val loss: 0.091183, accuracy: 96.85, time: 20.3808 min
----------
Epoch 4/4, current lr=0.001
train loss: 0.088343, val loss: 0.102171, accuracy: 96.64, time: 25.4856 min
----------


## Predict

### Set 6 Data Load

In [93]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
test_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])
my_dataset = CustomDataSet(test_dir, transform=test_transformation, num=0)
test_loader = DataLoader(my_dataset, shuffle=False)

all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [94]:
len(test_loader.dataset)

17295

### Set 6 Data Predict & Softmax

In [95]:
pred_list =[]
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [96]:
len(pred_list)

17295

In [97]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

### Uncertainty

In [98]:
cnt = 0
label_list = []
img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    if bool(pred_list[i][0].sort().values[-1] - pred_list[i][0].sort().values[-2] <= 0.98):
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

### Mislabelings & Engineer Cost

In [99]:
real_label_list = list(df_155655.loc[17295*5:17295*6-1]['failureNum'].values)
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 3471168.90it/s]


6

In [100]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

## Result

In [101]:
print(engineer_label_count)
print(wrong_cnt_list)

[9509, 5673, 3969, 4457, 1292, 3624]
[4, 91, 16, 13, 76, 6]


## Appned Set 6 Data to Train data

In [102]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

# Phase 7

## Training

In [103]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)

# train & valid split
train_data_len = int(len(train_folder_dataset)*0.8)
valid_data_len = len(train_folder_dataset) - train_data_len
train_data, valid_data = random_split(train_folder_dataset, [train_data_len, valid_data_len])

In [104]:
print(len(train_data), len(valid_data))

96852 24213


In [105]:
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])

train_data = CustomSubset(train_data, train_transformation)
valid_data = CustomSubset(valid_data, train_transformation)

train_dl = DataLoader(train_data, batch_size=32, shuffle=True)
valid_dl = DataLoader(valid_data, batch_size=32, shuffle=False)

In [106]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [107]:
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.220039, val loss: 0.143973, accuracy: 95.25, time: 5.9654 min
----------
Epoch 1/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.138641, val loss: 0.142830, accuracy: 95.94, time: 11.9636 min
----------
Epoch 2/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.113925, val loss: 0.098353, accuracy: 96.77, time: 17.6082 min
----------
Epoch 3/4, current lr=0.001
train loss: 0.098396, val loss: 0.102648, accuracy: 96.69, time: 23.2945 min
----------
Epoch 4/4, current lr=0.001
train loss: 0.085633, val loss: 0.105934, accuracy: 96.95, time: 29.2065 min
----------


## Predict

### Set 7 Data Load

In [108]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
test_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])
my_dataset = CustomDataSet(test_dir, transform=test_transformation, num=0)
test_loader = DataLoader(my_dataset, shuffle=False)

all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

### Set 7 Data Predict & Softmax

In [109]:
pred_list =[]
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [110]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

### Uncertainty

In [111]:
cnt = 0
label_list = []
img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    if bool(pred_list[i][0].sort().values[-1] - pred_list[i][0].sort().values[-2] <= 0.98):
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

In [112]:
real_label_list = list(df_155655.loc[17295*6:17295*7-1]['failureNum'].values)
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 4339064.94it/s]


89

In [113]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

## Result

In [114]:
print(engineer_label_count)
print(wrong_cnt_list)

[9509, 5673, 3969, 4457, 1292, 3624, 2707]
[4, 91, 16, 13, 76, 6, 89]


## Append Set 7 Data to Train Data

In [115]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

# Phase 8

## Training

In [116]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)

# train & valid split
train_data_len = int(len(train_folder_dataset)*0.8)
valid_data_len = len(train_folder_dataset) - train_data_len
train_data, valid_data = random_split(train_folder_dataset, [train_data_len, valid_data_len])

In [117]:
print(len(train_data), len(valid_data))

110688 27672


In [118]:
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])

train_data = CustomSubset(train_data, train_transformation)
valid_data = CustomSubset(valid_data, train_transformation)

train_dl = DataLoader(train_data, batch_size=32, shuffle=True)
valid_dl = DataLoader(valid_data, batch_size=32, shuffle=False)

In [119]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [120]:
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.218783, val loss: 0.120962, accuracy: 96.02, time: 6.8736 min
----------
Epoch 1/4, current lr=0.001
train loss: 0.137047, val loss: 0.180607, accuracy: 94.15, time: 13.5853 min
----------
Epoch 2/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.114426, val loss: 0.101990, accuracy: 96.78, time: 20.2777 min
----------
Epoch 3/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.097166, val loss: 0.085713, accuracy: 97.13, time: 27.1191 min
----------
Epoch 4/4, current lr=0.001
train loss: 0.085197, val loss: 0.086822, accuracy: 97.22, time: 33.6976 min
----------


## Predict

### Set 8 Data Load

In [121]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
test_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])
my_dataset = CustomDataSet(test_dir, transform=test_transformation, num=0)
test_loader = DataLoader(my_dataset, shuffle=False)

all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

### Set 8 Data Predict & Softmax

In [122]:
pred_list =[]
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [123]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

### Uncertainty

In [124]:
cnt = 0
label_list = []
img_folder = natsort.natsorted(all_imgs)

for i in range(0,17295):
    if bool(pred_list[i][0].sort().values[-1] - pred_list[i][0].sort().values[-2] <= 0.98):
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

### Mislabelings & Engineer Cost

In [125]:
real_label_list = list(df_155655.loc[17295*7:17295*8-1]['failureNum'].values)
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 2479169.09it/s]


42

In [126]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

## Result

In [127]:
print(engineer_label_count)
print(wrong_cnt_list)

[9509, 5673, 3969, 4457, 1292, 3624, 2707, 2407]
[4, 91, 16, 13, 76, 6, 89, 42]


## Add Set 8 Data to Train Data

In [128]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

# Phase 9

## Training

In [129]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)

# train & valid split
train_data_len = int(len(train_folder_dataset)*0.8)
valid_data_len = len(train_folder_dataset) - train_data_len
train_data, valid_data = random_split(train_folder_dataset, [train_data_len, valid_data_len])

In [130]:
print(len(train_data), len(valid_data))

124524 31131


In [131]:
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])

train_data = CustomSubset(train_data, train_transformation)
valid_data = CustomSubset(valid_data, train_transformation)

train_dl = DataLoader(train_data, batch_size=32, shuffle=True)
valid_dl = DataLoader(valid_data, batch_size=32, shuffle=False)

In [134]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [135]:
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.195456, val loss: 0.173200, accuracy: 94.39, time: 7.9019 min
----------
Epoch 1/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.129623, val loss: 0.123411, accuracy: 95.87, time: 15.5351 min
----------
Epoch 2/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.105067, val loss: 0.111319, accuracy: 96.59, time: 22.8452 min
----------
Epoch 3/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.089976, val loss: 0.086758, accuracy: 97.23, time: 30.0729 min
----------
Epoch 4/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.077460, val loss: 0.083575, accuracy: 97.18, time: 37.3162 min
----------


## Predict

### Set 9 Data Load

In [136]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
test_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])
my_dataset = CustomDataSet(test_dir, transform=test_transformation, num=0)
test_loader = DataLoader(my_dataset, shuffle=False)

all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

### Set 9 Data Predict & Softmax

In [137]:
pred_list =[]
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [138]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

### Uncertainty

In [139]:
cnt = 0
label_list = []
img_folder = natsort.natsorted(all_imgs)

for i in range(0,17295):
    if bool(pred_list[i][0].sort().values[-1] - pred_list[i][0].sort().values[-2] <= 0.98):
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

### Mislabelings & Engineer Cost

In [140]:
real_label_list = list(df_155655.loc[17295*8:17295*9-1]['failureNum'].values)
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 3431595.05it/s]


9

In [141]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

## Result

In [142]:
print(engineer_label_count)
print(wrong_cnt_list)

[9509, 5673, 3969, 4457, 1292, 3624, 2707, 2407, 5218]
[4, 91, 16, 13, 76, 6, 89, 42, 9]


## Add Set 9 Data to Train Data

In [143]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

# Final Result
- Mislabelings : 346 / 155,655(0.22%)
    - per phase : 4 / 91 / 16 / 13 / 76 / 6 / 89 / 42 / 9
- Engineer Cost : 38,856 / 155,655(25.00%)
    - per phase : 9,509 / 5,673 / 3,969 / 4,457 / 1,292 / 3,624 / 2,707 / 2,407 / 5,218