# Uncertainty Based Cost-Effective Labeling Methodology

In [1]:
import torch
import torchvision.datasets as dset
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split, Dataset
import torch.nn as nn
import time
from torch import optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torchvision
import os
import natsort
import pandas as pd
from PIL import Image
from tqdm import tqdm
import shutil
import pandas as pd
import numpy as np

# Device Setting

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
class gpu_setting:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model

In [4]:
class BasicBlock(nn.Module):
    expansion = 1
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        self.residual_function = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels * BasicBlock.expansion, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(out_channels * BasicBlock.expansion),
        )

        # identity mapping, input과 output의 feature map size, filter 수가 동일한 경우 사용.
        self.shortcut = nn.Sequential()

        self.relu = nn.ReLU()

        # projection mapping using 1x1conv
        if stride != 1 or in_channels != BasicBlock.expansion * out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels * BasicBlock.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * BasicBlock.expansion)
            )

    def forward(self, x):
        x = self.residual_function(x) + self.shortcut(x)
        x = self.relu(x)
        return x


class BottleNeck(nn.Module):
    expansion = 4
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        self.residual_function = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels * BottleNeck.expansion, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(out_channels * BottleNeck.expansion),
        )

        self.shortcut = nn.Sequential()

        self.relu = nn.ReLU()

        if stride != 1 or in_channels != out_channels * BottleNeck.expansion:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels*BottleNeck.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels*BottleNeck.expansion)
            )
            
    def forward(self, x):
        x = self.residual_function(x) + self.shortcut(x)
        x = self.relu(x)
        return x

class ResNet(nn.Module):
    def __init__(self, block, num_block, num_classes=9, init_weights=True):
        super().__init__()

        self.in_channels=64

        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )

        self.conv2_x = self._make_layer(block, 64, num_block[0], 1)
        self.conv3_x = self._make_layer(block, 128, num_block[1], 2)
        self.conv4_x = self._make_layer(block, 256, num_block[2], 2)
        self.conv5_x = self._make_layer(block, 512, num_block[3], 2)

        self.avg_pool = nn.AdaptiveAvgPool2d((1,1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)
        
        # weights inittialization
        if init_weights:
            self._initialize_weights()

    def _make_layer(self, block, out_channels, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, out_channels, stride))
            self.in_channels = out_channels * block.expansion

        return nn.Sequential(*layers)

    def forward(self,x):
        output = self.conv1(x)
        output = self.conv2_x(output)
        x = self.conv3_x(output)
        x = self.conv4_x(x)
        x = self.conv5_x(x)
        x = self.avg_pool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

    # define weight initialization function
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

def resnet18():
    return ResNet(BasicBlock, [2,2,2,2])

def resnet34():
    return ResNet(BasicBlock, [3, 4, 6, 3])

def resnet50():
    return ResNet(BottleNeck, [3,4,6,3])

def resnet101():
    return ResNet(BottleNeck, [3, 4, 23, 3])

def resnet152():
    return ResNet(BottleNeck, [3, 8, 36, 3])

In [5]:
def ResNetParameters(model, train_dl, valid_dl):
    device = gpu_setting.device
    model = model.lower()
    if model == 'resnet34':
        model = resnet34().to(device)
    if model == 'resnet50':
        model = resnet50().to(device)
    if model == 'resnet101':
        model = resnet101().to(device)
        
    loss_func = nn.CrossEntropyLoss(reduction='sum')
    opt = optim.Adam(model.parameters(), lr=0.001)
    lr_scheduler = ReduceLROnPlateau(opt, mode='min', factor=0.1, patience=10)
    
    # definc the training parameters
    params_train = {
        'num_epochs':5,
        'optimizer':opt,
        'loss_func':loss_func,
        'train_dl':train_dl, 
        'val_dl':valid_dl,
        'sanity_check':False,
        'lr_scheduler':lr_scheduler,
        'path2weights':'./trained_model/weights_original_res.pt'
    }
    return model, params_train

# Function

In [6]:
def get_lr(opt):
    for param_group in opt.param_groups:
        return param_group['lr']

def metric_batch(output, target):
    pred = output.argmax(1, keepdim=True)
    corrects = pred.eq(target.view_as(pred)).sum().item()
    return corrects

def loss_batch(loss_func, output, target, opt=None):
    loss = loss_func(output, target)
    metric_b = metric_batch(output, target)

    if opt is not None:
        opt.zero_grad()
        loss.backward()
        opt.step()

    return loss.item(), metric_b

def loss_epoch(model, loss_func, dataset_dl, sanity_check=False, opt=None):
    running_loss = 0.0
    running_metric = 0.0
    len_data = len(dataset_dl.dataset)

    for xb, yb in dataset_dl:
        xb = xb.to(device)
        yb = yb.to(device)
        output = model(xb)

        loss_b, metric_b = loss_batch(loss_func, output, yb, opt)

        running_loss += loss_b
        
        if metric_b is not None:
            running_metric += metric_b
        
        if sanity_check is True:
            break

    loss = running_loss / len_data
    metric = running_metric / len_data

    return loss, metric

def train_val(model, params, epoch):
    num_epochs=epoch
    loss_func=params["loss_func"]
    opt=params["optimizer"]
    train_dl=params["train_dl"]
    val_dl=params["val_dl"]
    sanity_check=params["sanity_check"]
    lr_scheduler=params["lr_scheduler"]
    path2weights=params["path2weights"]

    loss_history = {'train': [], 'val': []}
    metric_history = {'train': [], 'val': []}

    # # GPU out of memoty error
    # best_model_wts = copy.deepcopy(model.state_dict())

    best_loss = float('inf')

    start_time = time.time()

    for epoch in range(num_epochs):
        current_lr = get_lr(opt)
        print('Epoch {}/{}, current lr={}'.format(epoch, num_epochs-1, current_lr))

        model.train()
        train_loss, train_metric = loss_epoch(model, loss_func, train_dl, sanity_check, opt)
        loss_history['train'].append(train_loss)
        metric_history['train'].append(train_metric)

        model.eval()
        with torch.no_grad():
            val_loss, val_metric = loss_epoch(model, loss_func, val_dl, sanity_check)
        loss_history['val'].append(val_loss)
        metric_history['val'].append(val_metric)

        if val_loss < best_loss:
            best_loss = val_loss
            # best_model_wts = copy.deepcopy(model.state_dict())

            torch.save(model.state_dict(), path2weights)
            print('Copied best model weights!')
            print('Get best val_loss')

        lr_scheduler.step(val_loss)

        print('train loss: %.6f, val loss: %.6f, accuracy: %.2f, time: %.4f min' %(train_loss, val_loss, 100*val_metric, (time.time()-start_time)/60))
        print('-'*10)

    # model.load_state_dict(best_model_wts)

    return model, loss_history, metric_history

In [7]:
class CustomDataSet():
    def __init__(self, main_dir, transform, num):
        self.main_dir = main_dir
        self.transform = transform
        self.num = num
        all_imgs = os.listdir(main_dir)
        self.total_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

    def __len__(self):
        return len(self.total_imgs)

    def __getitem__(self, idx):
        img_loc = os.path.join(self.main_dir, self.total_imgs[idx])
        image = Image.open(img_loc).convert("RGB")
        tensor_image = self.transform(image)
        return tensor_image

In [8]:
class CustomSubset(Dataset):
    def __init__(self,Subset,transform=None):
        super(CustomSubset,self).__init__()
        self.Subset=Subset
        self.indices=Subset.indices
        self.transform=transform

    def __len__(self):
        return len(self.Subset)

    def __getitem__(self,idx):
        img,label=self.Subset[idx]
        if self.transform is not None:
            img=self.transform(img)
        return img,label

# csv

In [9]:
df_155655 = pd.read_csv('../csv/data_155655.csv')
label = pd.read_csv('../csv/data_172950.csv', index_col=0)

# Phase 1

## Training

In [10]:
# Initial train data(17,295)
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)

# train & valid split
train_data_len = int(len(train_folder_dataset)*0.8)
valid_data_len = len(train_folder_dataset) - train_data_len
train_data, valid_data = random_split(train_folder_dataset, [train_data_len, valid_data_len])

In [11]:
print(len(train_data), len(valid_data))

13836 3459


In [12]:
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])

train_data = CustomSubset(train_data, train_transformation)
valid_data = CustomSubset(valid_data, train_transformation)

train_dl = DataLoader(train_data, batch_size=32, shuffle=True)
valid_dl = DataLoader(valid_data, batch_size=32, shuffle=False)

In [13]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [14]:
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.364624, val loss: 0.206405, accuracy: 93.90, time: 1.2404 min
----------
Epoch 1/4, current lr=0.001
train loss: 0.252514, val loss: 0.215781, accuracy: 93.55, time: 1.7828 min
----------
Epoch 2/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.217148, val loss: 0.185580, accuracy: 94.48, time: 2.3242 min
----------
Epoch 3/4, current lr=0.001
train loss: 0.199046, val loss: 0.356178, accuracy: 89.30, time: 2.8689 min
----------
Epoch 4/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.182759, val loss: 0.137343, accuracy: 95.43, time: 3.4153 min
----------


## Predict

### Set 1 Data Load

In [15]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
test_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])
my_dataset = CustomDataSet(test_dir, transform=test_transformation, num=0)
test_loader = DataLoader(my_dataset, shuffle=False)

all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

### Set 1 data predict & Softmax

In [16]:
pred_list =[]
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [17]:
len(pred_list)

17295

In [18]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

### Uncertainty

In [19]:
cnt = 0
label_list = []
img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    log_outputs = torch.log(pred_list[i][0])
    entropy = - torch.sum(pred_list[i][0] * log_outputs)
    if entropy> 0.05:
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

### Mislabelings & Engineer Cost

In [20]:
engineer_label_count = []
wrong_cnt_list = []
real_label_list = list(df_155655.loc[:17294]['failureNum'].values)
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 5784266.62it/s]


18

In [21]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

## Result

In [22]:
print(engineer_label_count)
print(wrong_cnt_list)

[5674]
[18]


## Append Set 1 Data to Train Data

In [23]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

# Phase 2

## Training

In [24]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)

# train & valid split
train_data_len = int(len(train_folder_dataset)*0.8)
valid_data_len = len(train_folder_dataset) - train_data_len
train_data, valid_data = random_split(train_folder_dataset, [train_data_len, valid_data_len])

In [25]:
print(len(train_data), len(valid_data))

27672 6918


In [26]:
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])

train_data = CustomSubset(train_data, train_transformation)
valid_data = CustomSubset(valid_data, train_transformation)

train_dl = DataLoader(train_data, batch_size=32, shuffle=True)
valid_dl = DataLoader(valid_data, batch_size=32, shuffle=False)

In [27]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [28]:
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.347037, val loss: 0.309095, accuracy: 91.57, time: 1.1145 min
----------
Epoch 1/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.205975, val loss: 0.217898, accuracy: 93.12, time: 2.2288 min
----------
Epoch 2/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.165990, val loss: 0.165539, accuracy: 94.69, time: 3.3325 min
----------
Epoch 3/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.147610, val loss: 0.141141, accuracy: 95.40, time: 4.4417 min
----------
Epoch 4/4, current lr=0.001
train loss: 0.124136, val loss: 0.167317, accuracy: 94.78, time: 5.5494 min
----------


## Predict

### Set 2 Data Load

In [29]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
test_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])
my_dataset = CustomDataSet(test_dir, transform=test_transformation, num=0)
test_loader = DataLoader(my_dataset, shuffle=False)

all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [30]:
len(test_loader.dataset)

17295

### Set 2 Data Predict & Softmax

In [31]:
pred_list =[]
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [32]:
len(pred_list)

17295

In [33]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

### Uncertainty

In [34]:
cnt = 0
label_list = []
img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    log_outputs = torch.log(pred_list[i][0])
    entropy = - torch.sum(pred_list[i][0] * log_outputs)
    if entropy> 0.05:
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

### Mislabelings & Engineer Cost

In [35]:
real_label_list = list(df_155655.loc[17295*1:17295*2-1]['failureNum'].values)
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 5670326.56it/s]


89

In [36]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

## Result

In [37]:
print(engineer_label_count)
print(wrong_cnt_list)

[5674, 5627]
[18, 89]


## Append Set 2 Data to Train data

In [38]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

# Phase 3

## Training

In [39]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)

# train & valid split
train_data_len = int(len(train_folder_dataset)*0.8)
valid_data_len = len(train_folder_dataset) - train_data_len
train_data, valid_data = random_split(train_folder_dataset, [train_data_len, valid_data_len])

In [40]:
print(len(train_data), len(valid_data))

41508 10377


In [41]:
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])

train_data = CustomSubset(train_data, train_transformation)
valid_data = CustomSubset(valid_data, train_transformation)

train_dl = DataLoader(train_data, batch_size=32, shuffle=True)
valid_dl = DataLoader(valid_data, batch_size=32, shuffle=False)

In [42]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [43]:
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.322736, val loss: 0.434729, accuracy: 88.20, time: 2.2436 min
----------
Epoch 1/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.201229, val loss: 0.341111, accuracy: 91.24, time: 4.7005 min
----------
Epoch 2/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.162605, val loss: 0.161322, accuracy: 94.84, time: 6.3543 min
----------
Epoch 3/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.142982, val loss: 0.148667, accuracy: 95.32, time: 8.0012 min
----------
Epoch 4/4, current lr=0.001
train loss: 0.127005, val loss: 0.175216, accuracy: 94.93, time: 9.6609 min
----------


## Predict

### Set 3 Data Load

In [44]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
test_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])
my_dataset = CustomDataSet(test_dir, transform=test_transformation, num=0)
test_loader = DataLoader(my_dataset, shuffle=False)

all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [45]:
len(test_loader.dataset)

17295

### Set 3 Data Predict & Softmax

In [46]:
pred_list =[]
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [47]:
len(pred_list)

17295

In [48]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

### Uncertainty

In [49]:
cnt = 0
label_list = []
img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    log_outputs = torch.log(pred_list[i][0])
    entropy = - torch.sum(pred_list[i][0] * log_outputs)
    if entropy> 0.05:
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

### Mislabelings & Engineer Cost

In [50]:
real_label_list = list(df_155655.loc[17295*2:17295*3-1]['failureNum'].values)
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 5784266.62it/s]


55

In [51]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

## Result

In [52]:
print(engineer_label_count)
print(wrong_cnt_list)

[5674, 5627, 3769]
[18, 89, 55]


## Append Set 3 Data to Train Data

In [53]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

# Phase 4

## Training

In [54]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)

# train & valid split
train_data_len = int(len(train_folder_dataset)*0.8)
valid_data_len = len(train_folder_dataset) - train_data_len
train_data, valid_data = random_split(train_folder_dataset, [train_data_len, valid_data_len])

In [55]:
print(len(train_data), len(valid_data))

55344 13836


In [56]:
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])

train_data = CustomSubset(train_data, train_transformation)
valid_data = CustomSubset(valid_data, train_transformation)

train_dl = DataLoader(train_data, batch_size=32, shuffle=True)
valid_dl = DataLoader(valid_data, batch_size=32, shuffle=False)

In [57]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [58]:
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.305754, val loss: 0.189639, accuracy: 94.17, time: 2.1847 min
----------
Epoch 1/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.188695, val loss: 0.181044, accuracy: 94.30, time: 4.3801 min
----------
Epoch 2/4, current lr=0.001
train loss: 0.155480, val loss: 0.208550, accuracy: 94.23, time: 6.5778 min
----------
Epoch 3/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.131606, val loss: 0.109637, accuracy: 96.49, time: 8.7727 min
----------
Epoch 4/4, current lr=0.001
train loss: 0.117461, val loss: 0.118587, accuracy: 96.38, time: 10.9660 min
----------


## Predict

### Set 4 Data Load

In [59]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
test_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])
my_dataset = CustomDataSet(test_dir, transform=test_transformation, num=0)
test_loader = DataLoader(my_dataset, shuffle=False)

all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [60]:
len(test_loader.dataset)

17295

### Set 4  Data Predict & Softmax

In [61]:
pred_list =[]
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [62]:
len(pred_list)

17295

In [63]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

### Uncertainty

In [64]:
cnt = 0
label_list = []
img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    log_outputs = torch.log(pred_list[i][0])
    entropy = - torch.sum(pred_list[i][0] * log_outputs)
    if entropy> 0.05:
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

### Mislabelings & Engineer Cost

In [65]:
real_label_list = list(df_155655.loc[17295*3:17295*4-1]['failureNum'].values)
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 5658384.37it/s]


37

In [66]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

## Result

In [67]:
print(engineer_label_count)
print(wrong_cnt_list)

[5674, 5627, 3769, 2030]
[18, 89, 55, 37]


## Append Set 4 Data to Train Data

In [68]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

# Phase 5

## Training

In [69]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)

# train & valid split
train_data_len = int(len(train_folder_dataset)*0.8)
valid_data_len = len(train_folder_dataset) - train_data_len
train_data, valid_data = random_split(train_folder_dataset, [train_data_len, valid_data_len])

In [70]:
print(len(train_data), len(valid_data))

69180 17295


In [71]:
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])

train_data = CustomSubset(train_data, train_transformation)
valid_data = CustomSubset(valid_data, train_transformation)

train_dl = DataLoader(train_data, batch_size=32, shuffle=True)
valid_dl = DataLoader(valid_data, batch_size=32, shuffle=False)

In [72]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [73]:
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.257089, val loss: 0.248515, accuracy: 93.74, time: 2.7477 min
----------
Epoch 1/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.157916, val loss: 0.123477, accuracy: 95.89, time: 5.4806 min
----------
Epoch 2/4, current lr=0.001
train loss: 0.132366, val loss: 0.151123, accuracy: 95.21, time: 8.2115 min
----------
Epoch 3/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.115446, val loss: 0.111734, accuracy: 96.46, time: 10.9552 min
----------
Epoch 4/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.098936, val loss: 0.102282, accuracy: 96.94, time: 13.6848 min
----------


## Predict

### Set 5 Data Load

In [74]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
test_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])
my_dataset = CustomDataSet(test_dir, transform=test_transformation, num=0)
test_loader = DataLoader(my_dataset, shuffle=False)

all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [75]:
len(test_loader.dataset)

17295

### Set 5 Data Predict & Softmax

In [76]:
pred_list =[]
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [77]:
len(pred_list)

17295

In [78]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

### Uncertainty

In [79]:
cnt = 0
label_list = []
img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    log_outputs = torch.log(pred_list[i][0])
    entropy = - torch.sum(pred_list[i][0] * log_outputs)
    if entropy> 0.05:
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

### Mislabelings & Engineer Cost

In [80]:
real_label_list = list(df_155655.loc[17295*4:17295*5-1]['failureNum'].values)
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 5784727.89it/s]


46

In [81]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

## Result

In [82]:
print(engineer_label_count)
print(wrong_cnt_list)

[5674, 5627, 3769, 2030, 3916]
[18, 89, 55, 37, 46]


## Append Set 5 to Train Data

In [83]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

# Phase 6

## Training

In [84]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)

# train & valid split
train_data_len = int(len(train_folder_dataset)*0.8)
valid_data_len = len(train_folder_dataset) - train_data_len
train_data, valid_data = random_split(train_folder_dataset, [train_data_len, valid_data_len])

In [85]:
print(len(train_data), len(valid_data))

83016 20754


In [86]:
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])

train_data = CustomSubset(train_data, train_transformation)
valid_data = CustomSubset(valid_data, train_transformation)

train_dl = DataLoader(train_data, batch_size=32, shuffle=True)
valid_dl = DataLoader(valid_data, batch_size=32, shuffle=False)

In [87]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [88]:
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.231518, val loss: 0.289957, accuracy: 91.12, time: 3.3200 min
----------
Epoch 1/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.145199, val loss: 0.145863, accuracy: 95.44, time: 6.6232 min
----------
Epoch 2/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.123926, val loss: 0.124194, accuracy: 95.95, time: 9.9184 min
----------
Epoch 3/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.106080, val loss: 0.104698, accuracy: 96.52, time: 13.1965 min
----------
Epoch 4/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.091373, val loss: 0.100888, accuracy: 96.82, time: 16.5572 min
----------


## Predict

### Set 6 Data Load

In [89]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
test_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])
my_dataset = CustomDataSet(test_dir, transform=test_transformation, num=0)
test_loader = DataLoader(my_dataset, shuffle=False)

all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [90]:
len(test_loader.dataset)

17295

### Set 6 Data Predict & Softmax

In [91]:
pred_list =[]
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [92]:
len(pred_list)

17295

In [93]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

### Uncertainty

In [94]:
cnt = 0
label_list = []
img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    log_outputs = torch.log(pred_list[i][0])
    entropy = - torch.sum(pred_list[i][0] * log_outputs)
    if entropy> 0.05:
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

### Mislabelings & Engineer Cost

In [95]:
real_label_list = list(df_155655.loc[17295*5:17295*6-1]['failureNum'].values)
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 5714099.07it/s]


17

In [96]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

## Result

In [97]:
print(engineer_label_count)
print(wrong_cnt_list)

[5674, 5627, 3769, 2030, 3916, 3415]
[18, 89, 55, 37, 46, 17]


## Appned Set 6 Data to Train data

In [98]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

# Phase 7

## Training

In [99]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)

# train & valid split
train_data_len = int(len(train_folder_dataset)*0.8)
valid_data_len = len(train_folder_dataset) - train_data_len
train_data, valid_data = random_split(train_folder_dataset, [train_data_len, valid_data_len])

In [100]:
print(len(train_data), len(valid_data))

96852 24213


In [101]:
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])

train_data = CustomSubset(train_data, train_transformation)
valid_data = CustomSubset(valid_data, train_transformation)

train_dl = DataLoader(train_data, batch_size=32, shuffle=True)
valid_dl = DataLoader(valid_data, batch_size=32, shuffle=False)

In [102]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [103]:
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.211070, val loss: 0.146092, accuracy: 95.67, time: 3.8424 min
----------
Epoch 1/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.135545, val loss: 0.099078, accuracy: 96.91, time: 7.6659 min
----------
Epoch 2/4, current lr=0.001
train loss: 0.111429, val loss: 0.107921, accuracy: 96.73, time: 11.4918 min
----------
Epoch 3/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.094228, val loss: 0.083151, accuracy: 97.28, time: 15.3058 min
----------
Epoch 4/4, current lr=0.001
train loss: 0.082885, val loss: 0.087866, accuracy: 97.18, time: 19.1319 min
----------


## Predict

### Set 7 Data Load

In [104]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
test_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])
my_dataset = CustomDataSet(test_dir, transform=test_transformation, num=0)
test_loader = DataLoader(my_dataset, shuffle=False)

all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

### Set 7 Data Predict & Softmax

In [105]:
pred_list =[]
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [106]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

### Uncertainty

In [107]:
cnt = 0
label_list = []
img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    log_outputs = torch.log(pred_list[i][0])
    entropy = - torch.sum(pred_list[i][0] * log_outputs)
    if entropy> 0.05:
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

In [108]:
real_label_list = list(df_155655.loc[17295*6:17295*7-1]['failureNum'].values)
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 5784266.62it/s]


58

In [109]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

## Result

In [110]:
print(engineer_label_count)
print(wrong_cnt_list)

[5674, 5627, 3769, 2030, 3916, 3415, 3031]
[18, 89, 55, 37, 46, 17, 58]


## Append Set 7 Data to Train Data

In [111]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

# Phase 8

## Training

In [112]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)

# train & valid split
train_data_len = int(len(train_folder_dataset)*0.8)
valid_data_len = len(train_folder_dataset) - train_data_len
train_data, valid_data = random_split(train_folder_dataset, [train_data_len, valid_data_len])

In [113]:
print(len(train_data), len(valid_data))

110688 27672


In [114]:
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])

train_data = CustomSubset(train_data, train_transformation)
valid_data = CustomSubset(valid_data, train_transformation)

train_dl = DataLoader(train_data, batch_size=32, shuffle=True)
valid_dl = DataLoader(valid_data, batch_size=32, shuffle=False)

In [115]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [116]:
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.213959, val loss: 0.156851, accuracy: 95.36, time: 4.3695 min
----------
Epoch 1/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.134995, val loss: 0.127940, accuracy: 95.92, time: 8.7209 min
----------
Epoch 2/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.111425, val loss: 0.095587, accuracy: 96.81, time: 13.0835 min
----------
Epoch 3/4, current lr=0.001
train loss: 0.095135, val loss: 0.109567, accuracy: 96.56, time: 17.4354 min
----------
Epoch 4/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.084142, val loss: 0.090963, accuracy: 96.90, time: 21.7973 min
----------


## Predict

### Set 8 Data Load

In [117]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
test_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])
my_dataset = CustomDataSet(test_dir, transform=test_transformation, num=0)
test_loader = DataLoader(my_dataset, shuffle=False)

all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

### Set 8 Data Predict & Softmax

In [118]:
pred_list =[]
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [119]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

### Uncertainty

In [120]:
cnt = 0
label_list = []
img_folder = natsort.natsorted(all_imgs)

for i in range(0,17295):
    log_outputs = torch.log(pred_list[i][0])
    entropy = - torch.sum(pred_list[i][0] * log_outputs)
    if entropy> 0.05:
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

### Mislabelings & Engineer Cost

In [121]:
real_label_list = list(df_155655.loc[17295*7:17295*8-1]['failureNum'].values)
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 5784727.89it/s]


4

In [122]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

## Result

In [123]:
print(engineer_label_count)
print(wrong_cnt_list)

[5674, 5627, 3769, 2030, 3916, 3415, 3031, 3893]
[18, 89, 55, 37, 46, 17, 58, 4]


## Add Set 8 Data to Train Data

In [124]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

# Phase 9

## Training

In [125]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)

# train & valid split
train_data_len = int(len(train_folder_dataset)*0.8)
valid_data_len = len(train_folder_dataset) - train_data_len
train_data, valid_data = random_split(train_folder_dataset, [train_data_len, valid_data_len])

In [126]:
print(len(train_data), len(valid_data))

124524 31131


In [127]:
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])

train_data = CustomSubset(train_data, train_transformation)
valid_data = CustomSubset(valid_data, train_transformation)

train_dl = DataLoader(train_data, batch_size=32, shuffle=True)
valid_dl = DataLoader(valid_data, batch_size=32, shuffle=False)

In [128]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [129]:
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.202447, val loss: 0.153769, accuracy: 95.55, time: 4.9056 min
----------
Epoch 1/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.127036, val loss: 0.106875, accuracy: 96.46, time: 9.8029 min
----------
Epoch 2/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.105692, val loss: 0.096837, accuracy: 96.72, time: 14.7203 min
----------
Epoch 3/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.091139, val loss: 0.084979, accuracy: 97.30, time: 19.6281 min
----------
Epoch 4/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.081026, val loss: 0.080965, accuracy: 97.36, time: 24.5450 min
----------


## Predict

### Set 9 Data Load

In [130]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
test_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                ])
my_dataset = CustomDataSet(test_dir, transform=test_transformation, num=0)
test_loader = DataLoader(my_dataset, shuffle=False)

all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

### Set 9 Data Predict & Softmax

In [131]:
pred_list =[]
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [132]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

### Uncertainty

In [133]:
cnt = 0
label_list = []
img_folder = natsort.natsorted(all_imgs)

for i in range(0,17295):
    log_outputs = torch.log(pred_list[i][0])
    entropy = - torch.sum(pred_list[i][0] * log_outputs)
    if entropy> 0.05:
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

### Mislabelings & Engineer Cost

In [134]:
real_label_list = list(df_155655.loc[17295*8:17295*9-1]['failureNum'].values)
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 5783805.43it/s]


20

In [135]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

## Result

In [136]:
print(engineer_label_count)
print(wrong_cnt_list)

[5674, 5627, 3769, 2030, 3916, 3415, 3031, 3893, 2537]
[18, 89, 55, 37, 46, 17, 58, 4, 20]


## Add Set 9 Data to Train Data

In [137]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

# Final Result
- Mislabelings : 344 / 155,655(0.22%)
    - per phase : 18 / 89 / 55 / 37 / 46 / 17 / 58 / 4 / 20
- Engineer Cost : 33,892 / 155,655(21.77%)
    - per phase : 5674 / 5627 / 3769 / 2030 / 3916 / 3415 / 3031 / 3893 / 2537