# Uncertainty Based Labeling Methodology
- Uncertainty 3 : Entropy
- Model : ResNet50

In [1]:
import torch
import torchvision.datasets as dset
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torch.nn as nn
import time
from torch import optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torchvision
import os
import natsort
import pandas as pd
from PIL import Image
from tqdm import tqdm
import shutil
import pandas as pd
import numpy as np

# Device Setting

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
class gpu_setting:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model(ResNet50)
### Model

In [4]:
class BasicBlock(nn.Module):
    expansion = 1
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        # BatchNorm에 bias가 포함되어 있으므로, conv2d는 bias=False로 설정~~ 홍용민 메롱~~
        self.residual_function = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels * BasicBlock.expansion, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(out_channels * BasicBlock.expansion),
        )

        # identity mapping, input과 output의 feature map size, filter 수가 동일한 경우 사용.
        self.shortcut = nn.Sequential()

        self.relu = nn.ReLU()

        # projection mapping using 1x1conv
        if stride != 1 or in_channels != BasicBlock.expansion * out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels * BasicBlock.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * BasicBlock.expansion)
            )

    def forward(self, x):
        x = self.residual_function(x) + self.shortcut(x)
        x = self.relu(x)
        return x


class BottleNeck(nn.Module):
    expansion = 4
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        self.residual_function = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels * BottleNeck.expansion, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(out_channels * BottleNeck.expansion),
        )

        self.shortcut = nn.Sequential()

        self.relu = nn.ReLU()

        if stride != 1 or in_channels != out_channels * BottleNeck.expansion:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels*BottleNeck.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels*BottleNeck.expansion)
            )
            
    def forward(self, x):
        x = self.residual_function(x) + self.shortcut(x)
        x = self.relu(x)
        return x

class ResNet(nn.Module):
    def __init__(self, block, num_block, num_classes=9, init_weights=True):
        super().__init__()

        self.in_channels=64

        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )

        self.conv2_x = self._make_layer(block, 64, num_block[0], 1)
        self.conv3_x = self._make_layer(block, 128, num_block[1], 2)
        self.conv4_x = self._make_layer(block, 256, num_block[2], 2)
        self.conv5_x = self._make_layer(block, 512, num_block[3], 2)

        self.avg_pool = nn.AdaptiveAvgPool2d((1,1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)
        
        # weights inittialization
        if init_weights:
            self._initialize_weights()

    def _make_layer(self, block, out_channels, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, out_channels, stride))
            self.in_channels = out_channels * block.expansion

        return nn.Sequential(*layers)

    def forward(self,x):
        output = self.conv1(x)
        output = self.conv2_x(output)
        x = self.conv3_x(output)
        x = self.conv4_x(x)
        x = self.conv5_x(x)
        x = self.avg_pool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

    # define weight initialization function
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

def resnet18():
    return ResNet(BasicBlock, [2,2,2,2])

def resnet34():
    return ResNet(BasicBlock, [3, 4, 6, 3])

def resnet50():
    return ResNet(BottleNeck, [3,4,6,3])

def resnet101():
    return ResNet(BottleNeck, [3, 4, 23, 3])

def resnet152():
    return ResNet(BottleNeck, [3, 8, 36, 3])

### Parameters

In [5]:
def ResNetParameters(model, train_dl, valid_dl):
    device = gpu_setting.device
    model = model.lower()
    if model == 'resnet34':
        model = resnet34().to(device)
    if model == 'resnet50':
        model = resnet50().to(device)
    if model == 'resnet101':
        model = resnet101().to(device)
        
    loss_func = nn.CrossEntropyLoss(reduction='sum')
    opt = optim.Adam(model.parameters(), lr=0.001)
    lr_scheduler = ReduceLROnPlateau(opt, mode='min', factor=0.1, patience=10)
    
    # definc the training parameters
    params_train = {
        'num_epochs':5,
        'optimizer':opt,
        'loss_func':loss_func,
        'train_dl':train_dl, 
        'val_dl':valid_dl,
        'sanity_check':False,
        'lr_scheduler':lr_scheduler,
        'path2weights':'./trained_model/weights_original_res.pt', #이거 변경해서 사용
    }
    return model, params_train

# Function

In [6]:
def get_lr(opt):
    for param_group in opt.param_groups:
        return param_group['lr']
def metric_batch(output, target):
    pred = output.argmax(1, keepdim=True)
    corrects = pred.eq(target.view_as(pred)).sum().item()
    return corrects
def loss_batch(loss_func, output, target, opt=None):
    loss = loss_func(output, target)
    metric_b = metric_batch(output, target)

    if opt is not None:
        opt.zero_grad()
        loss.backward()
        opt.step()

    return loss.item(), metric_b
def loss_epoch(model, loss_func, dataset_dl, sanity_check=False, opt=None):
    running_loss = 0.0
    running_metric = 0.0
    len_data = len(dataset_dl.dataset)

    for xb, yb in dataset_dl:
        xb = xb.to(device)
        yb = yb.to(device)
        output = model(xb)

        loss_b, metric_b = loss_batch(loss_func, output, yb, opt)

        running_loss += loss_b
        
        if metric_b is not None:
            running_metric += metric_b
        
        if sanity_check is True:
            break

    loss = running_loss / len_data
    metric = running_metric / len_data

    return loss, metric

def train_val(model, params, epoch):
    num_epochs=epoch
    loss_func=params["loss_func"]
    opt=params["optimizer"]
    train_dl=params["train_dl"]
    val_dl=params["val_dl"]
    sanity_check=params["sanity_check"]
    lr_scheduler=params["lr_scheduler"]
    path2weights=params["path2weights"]

    loss_history = {'train': [], 'val': []}
    metric_history = {'train': [], 'val': []}

    # # GPU out of memoty error
    # best_model_wts = copy.deepcopy(model.state_dict())

    best_loss = float('inf')

    start_time = time.time()

    for epoch in range(num_epochs):
        current_lr = get_lr(opt)
        print('Epoch {}/{}, current lr={}'.format(epoch, num_epochs-1, current_lr))

        model.train()
        train_loss, train_metric = loss_epoch(model, loss_func, train_dl, sanity_check, opt)
        loss_history['train'].append(train_loss)
        metric_history['train'].append(train_metric)

        model.eval()
        with torch.no_grad():
            val_loss, val_metric = loss_epoch(model, loss_func, val_dl, sanity_check)
        loss_history['val'].append(val_loss)
        metric_history['val'].append(val_metric)

        if val_loss < best_loss:
            best_loss = val_loss
            # best_model_wts = copy.deepcopy(model.state_dict())

            torch.save(model.state_dict(), path2weights)
            print('Copied best model weights!')
            print('Get best val_loss')

        lr_scheduler.step(val_loss)

        print('train loss: %.6f, val loss: %.6f, accuracy: %.2f, time: %.4f min' %(train_loss, val_loss, 100*val_metric, (time.time()-start_time)/60))
        print('-'*10)

    # model.load_state_dict(best_model_wts)

    return model, loss_history, metric_history

# CustomDataset

In [7]:
class CustomDataSet():
    def __init__(self, main_dir, transform, num):
        self.main_dir = main_dir
        self.transform = transform
        self.num = num
        all_imgs = os.listdir(main_dir)
        self.total_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

    def __len__(self):
        return len(self.total_imgs)

    def __getitem__(self, idx):
        img_loc = os.path.join(self.main_dir, self.total_imgs[idx])
        image = Image.open(img_loc).convert("RGB")
        tensor_image = self.transform(image)
        return tensor_image

In [8]:
engineer_label_count = []
wrong_cnt_list = []

In [9]:
df_155655 = pd.read_csv('../csv/data_155655.csv')

In [10]:
df_155655.shape

(155655, 10)

# Experiment

### Phase 1

In [11]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
train_folder_dataset.transform = train_transformation
train_dl = DataLoader(train_folder_dataset, batch_size=32, shuffle=True)

In [12]:
test_dir = '../Data/Data_17295/Unlabeled/'
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)
original_df = pd.read_csv('../csv/data_172950.csv')
tmp = all_imgs[:17295]
for i in range(len(tmp)):
    idx = int(tmp[i][:-4])
    fail_num = str(original_df[original_df['index'] == idx]['failureNum'].values[0])
    image_file = str(original_df[original_df['index'] == idx]['index'].values[0]) + '.png'
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(image_file), '../Data/Data_17295/valid/{0}/{1}'.format(fail_num,image_file))

In [13]:
valid_dir = '../Data/Data_17295/valid/'
valid_folder_dataset = dset.ImageFolder(root=valid_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
valid_folder_dataset.transform = train_transformation
valid_dl = DataLoader(valid_folder_dataset, batch_size=32, shuffle=True)

In [14]:
valid_folder_dataset

Dataset ImageFolder
    Number of datapoints: 17295
    Root location: ../Data/Data_17295/valid/

In [15]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [16]:
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.365695, val loss: 0.464802, accuracy: 84.50, time: 3.7675 min
----------
Epoch 1/4, current lr=0.001
train loss: 0.237779, val loss: 1.221917, accuracy: 70.38, time: 6.0740 min
----------
Epoch 2/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.205691, val loss: 0.268737, accuracy: 91.29, time: 8.3698 min
----------
Epoch 3/4, current lr=0.001
train loss: 0.181101, val loss: 0.299791, accuracy: 90.90, time: 10.6455 min
----------
Epoch 4/4, current lr=0.001
train loss: 0.168214, val loss: 0.374037, accuracy: 89.40, time: 12.8911 min
----------


In [21]:
for i in range(0,9):
    tmp_label = str(i)
    tmp_dir = '../Data/Data_17295/valid/{0}'.format(tmp_label)
    tmp_imgs = os.listdir(tmp_dir)
    for i in range(len(tmp_imgs)):
        shutil.move('../Data/Data_17295/valid/{0}/{1}'.format(tmp_label,tmp_imgs[i]), '../Data/Data_17295/Unlabeled/{0}'.format(tmp_imgs[i]))

In [22]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
my_dataset = CustomDataSet(test_dir, transform=train_transformation, num=0)
test_loader = DataLoader(my_dataset , batch_size=1, shuffle=False)
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [23]:
pred_list =[]
size = len(test_loader.dataset)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [61]:
cnt = 0
label_list = []
label = pd.read_csv('../csv/data_172950.csv', index_col=0)

img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    log_outputs = torch.log(pred_list[i][0])
    entropy = - torch.sum(pred_list[i][0] * log_outputs)
    if entropy> 0.05:
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

In [63]:
real_label_list = list(df_155655.loc[:17294]['failureNum'].values)

In [64]:
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 2658134.40it/s]


71

In [65]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

In [66]:
print(engineer_label_count)
print(wrong_cnt_list)

[6454]
[71]


In [67]:
for i in range(0,17295 ):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

### Phase 2

In [68]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
train_folder_dataset.transform = train_transformation
train_dl = DataLoader(train_folder_dataset, batch_size=32, shuffle=True)

In [69]:
train_folder_dataset

Dataset ImageFolder
    Number of datapoints: 34590
    Root location: ../Data/Data_17295/Labeled/

In [70]:
test_dir = '../Data/Data_17295/Unlabeled/'
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)
original_df = pd.read_csv('../csv/data_172950.csv')
tmp = all_imgs[:17295]
for i in range(len(tmp)):
    idx = int(tmp[i][:-4])
    fail_num = str(original_df[original_df['index'] == idx]['failureNum'].values[0])
    image_file = str(original_df[original_df['index'] == idx]['index'].values[0]) + '.png'
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(image_file), '../Data/Data_17295/valid/{0}/{1}'.format(fail_num,image_file))

In [71]:
valid_dir = '../Data/Data_17295/valid/'
valid_folder_dataset = dset.ImageFolder(root=valid_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
valid_folder_dataset.transform = train_transformation
valid_dl = DataLoader(valid_folder_dataset, batch_size=32, shuffle=True)

In [72]:
valid_folder_dataset

Dataset ImageFolder
    Number of datapoints: 17295
    Root location: ../Data/Data_17295/valid/

In [73]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [74]:
model, loss_hist, metric_hist = train_val(model, params_train, 7)

Epoch 0/6, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.336242, val loss: 0.472596, accuracy: 86.71, time: 5.0681 min
----------
Epoch 1/6, current lr=0.001
train loss: 0.199856, val loss: 0.519107, accuracy: 83.24, time: 8.9200 min
----------
Epoch 2/6, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.161001, val loss: 0.347463, accuracy: 88.57, time: 12.7410 min
----------
Epoch 3/6, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.141628, val loss: 0.308766, accuracy: 90.73, time: 16.5635 min
----------
Epoch 4/6, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.129363, val loss: 0.212458, accuracy: 92.67, time: 20.3954 min
----------
Epoch 5/6, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.118252, val loss: 0.194507, accuracy: 93.36, time: 24.2190 min
----------
Epoch 6/6, current lr=0.001
train loss: 0.108663, val loss: 0.230824, accuracy: 9

In [76]:
for i in range(0,9):
    tmp_label = str(i)
    tmp_dir = '../Data/Data_17295/valid/{0}'.format(tmp_label)
    tmp_imgs = os.listdir(tmp_dir)
    for i in range(len(tmp_imgs)):
        shutil.move('../Data/Data_17295/valid/{0}/{1}'.format(tmp_label,tmp_imgs[i]), '../Data/Data_17295/Unlabeled/{0}'.format(tmp_imgs[i]))

In [77]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
my_dataset = CustomDataSet(test_dir, transform=train_transformation, num=0)
test_loader = DataLoader(my_dataset , batch_size=1, shuffle=False)
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [78]:
pred_list =[]
size = len(test_loader.dataset)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [79]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

In [80]:
cnt = 0
label_list = []
label = pd.read_csv('../csv/data_172950.csv', index_col=0)

img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    log_outputs = torch.log(pred_list[i][0])
    entropy = - torch.sum(pred_list[i][0] * log_outputs)
    if entropy> 0.05:
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

In [81]:
real_label_list = list(df_155655.loc[17295*1:17295*2-1]['failureNum'].values)

In [82]:
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 3140008.99it/s]


29

In [83]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

In [84]:
print(engineer_label_count)
print(wrong_cnt_list)

[6454, 5932]
[71, 29]


In [85]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

### Phase 3

In [86]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
train_folder_dataset.transform = train_transformation
train_dl = DataLoader(train_folder_dataset, batch_size=32, shuffle=True)

In [87]:
train_folder_dataset

Dataset ImageFolder
    Number of datapoints: 51885
    Root location: ../Data/Data_17295/Labeled/

In [88]:
test_dir = '../Data/Data_17295/Unlabeled/'
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)
original_df = pd.read_csv('../csv/data_172950.csv')
tmp = all_imgs[:17295]
for i in range(len(tmp)):
    idx = int(tmp[i][:-4])
    fail_num = str(original_df[original_df['index'] == idx]['failureNum'].values[0])
    image_file = str(original_df[original_df['index'] == idx]['index'].values[0]) + '.png'
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(image_file), '../Data/Data_17295/valid/{0}/{1}'.format(fail_num,image_file))

In [89]:
valid_dir = '../Data/Data_17295/valid/'
valid_folder_dataset = dset.ImageFolder(root=valid_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
valid_folder_dataset.transform = train_transformation
valid_dl = DataLoader(valid_folder_dataset, batch_size=32, shuffle=True)

In [90]:
valid_folder_dataset

Dataset ImageFolder
    Number of datapoints: 17295
    Root location: ../Data/Data_17295/valid/

In [91]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [92]:
model, loss_hist, metric_hist = train_val(model, params_train, 9)

Epoch 0/8, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.306321, val loss: 0.164376, accuracy: 95.47, time: 6.7564 min
----------
Epoch 1/8, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.192399, val loss: 0.147278, accuracy: 95.91, time: 14.3437 min
----------
Epoch 2/8, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.155962, val loss: 0.137225, accuracy: 96.20, time: 19.7461 min
----------
Epoch 3/8, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.135220, val loss: 0.120691, accuracy: 95.89, time: 25.1177 min
----------
Epoch 4/8, current lr=0.001
train loss: 0.115513, val loss: 0.149405, accuracy: 94.64, time: 30.5073 min
----------
Epoch 5/8, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.100391, val loss: 0.103206, accuracy: 96.84, time: 35.8943 min
----------
Epoch 6/8, current lr=0.001
train loss: 0.087955, val loss: 0.107645, accuracy: 

In [93]:
for i in range(0,9):
    tmp_label = str(i)
    tmp_dir = '../Data/Data_17295/valid/{0}'.format(tmp_label)
    tmp_imgs = os.listdir(tmp_dir)
    for i in range(len(tmp_imgs)):
        shutil.move('../Data/Data_17295/valid/{0}/{1}'.format(tmp_label,tmp_imgs[i]), '../Data/Data_17295/Unlabeled/{0}'.format(tmp_imgs[i]))

In [94]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
my_dataset = CustomDataSet(test_dir, transform=train_transformation, num=0)
test_loader = DataLoader(my_dataset , batch_size=1, shuffle=False)
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [95]:
pred_list =[]
size = len(test_loader.dataset)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [96]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

In [97]:
cnt = 0
label_list = []
label = pd.read_csv('../csv/data_172950.csv', index_col=0)

img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    log_outputs = torch.log(pred_list[i][0])
    entropy = - torch.sum(pred_list[i][0] * log_outputs)
    if entropy> 0.05:
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

In [98]:
real_label_list = list(df_155655.loc[17295*2:17295*3-1]['failureNum'].values)

In [99]:
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 2881563.82it/s]


54

In [100]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

In [101]:
print(engineer_label_count)
print(wrong_cnt_list)

[6454, 5932, 2204]
[71, 29, 54]


In [102]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

### Phase 4

In [103]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
train_folder_dataset.transform = train_transformation
train_dl = DataLoader(train_folder_dataset, batch_size=32, shuffle=True)

In [104]:
train_folder_dataset

Dataset ImageFolder
    Number of datapoints: 69180
    Root location: ../Data/Data_17295/Labeled/

In [105]:
test_dir = '../Data/Data_17295/Unlabeled/'
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)
original_df = pd.read_csv('../csv/data_172950.csv')
tmp = all_imgs[:17295]
for i in range(len(tmp)):
    idx = int(tmp[i][:-4])
    fail_num = str(original_df[original_df['index'] == idx]['failureNum'].values[0])
    image_file = str(original_df[original_df['index'] == idx]['index'].values[0]) + '.png'
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(image_file), '../Data/Data_17295/valid/{0}/{1}'.format(fail_num,image_file))

In [106]:
valid_dir = '../Data/Data_17295/valid/'
valid_folder_dataset = dset.ImageFolder(root=valid_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
valid_folder_dataset.transform = train_transformation
valid_dl = DataLoader(valid_folder_dataset, batch_size=32, shuffle=True)

In [107]:
valid_folder_dataset

Dataset ImageFolder
    Number of datapoints: 17295
    Root location: ../Data/Data_17295/valid/

In [108]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [109]:
model, loss_hist, metric_hist = train_val(model, params_train, 11)

Epoch 0/10, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.274004, val loss: 0.093508, accuracy: 97.81, time: 8.3381 min
----------
Epoch 1/10, current lr=0.001
train loss: 0.167522, val loss: 0.468085, accuracy: 86.05, time: 15.3074 min
----------
Epoch 2/10, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.135751, val loss: 0.058293, accuracy: 98.28, time: 22.2529 min
----------
Epoch 3/10, current lr=0.001
train loss: 0.116412, val loss: 0.103176, accuracy: 97.27, time: 29.2373 min
----------
Epoch 4/10, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.097726, val loss: 0.048199, accuracy: 98.45, time: 36.2107 min
----------
Epoch 5/10, current lr=0.001
train loss: 0.089625, val loss: 0.050624, accuracy: 98.47, time: 43.1759 min
----------
Epoch 6/10, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.079335, val loss: 0.043360, accuracy: 98.70, time: 50.1408 min
----------
Ep

In [110]:
for i in range(0,9):
    tmp_label = str(i)
    tmp_dir = '../Data/Data_17295/valid/{0}'.format(tmp_label)
    tmp_imgs = os.listdir(tmp_dir)
    for i in range(len(tmp_imgs)):
        shutil.move('../Data/Data_17295/valid/{0}/{1}'.format(tmp_label,tmp_imgs[i]), '../Data/Data_17295/Unlabeled/{0}'.format(tmp_imgs[i]))

In [111]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
my_dataset = CustomDataSet(test_dir, transform=train_transformation, num=0)
test_loader = DataLoader(my_dataset , batch_size=1, shuffle=False)
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [112]:
pred_list =[]
size = len(test_loader.dataset)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [113]:
cnt = 0
label_list = []
label = pd.read_csv('../csv/data_172950.csv', index_col=0)

img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    log_outputs = torch.log(pred_list[i][0])
    entropy = - torch.sum(pred_list[i][0] * log_outputs)
    if entropy> 0.05:
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

In [114]:
real_label_list = list(df_155655.loc[17295*3:17295*4-1]['failureNum'].values)

In [115]:
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 2882250.78it/s]


22

In [116]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

In [117]:
print(engineer_label_count)
print(wrong_cnt_list)

[6454, 5932, 2204, 1236]
[71, 29, 54, 22]


In [118]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

### Phase 5

In [119]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
train_folder_dataset.transform = train_transformation
train_dl = DataLoader(train_folder_dataset, batch_size=32, shuffle=True)

In [120]:
train_folder_dataset

Dataset ImageFolder
    Number of datapoints: 86475
    Root location: ../Data/Data_17295/Labeled/

In [121]:
test_dir = '../Data/Data_17295/Unlabeled/'
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)
original_df = pd.read_csv('../csv/data_172950.csv')
tmp = all_imgs[:17295]
for i in range(len(tmp)):
    idx = int(tmp[i][:-4])
    fail_num = str(original_df[original_df['index'] == idx]['failureNum'].values[0])
    image_file = str(original_df[original_df['index'] == idx]['index'].values[0]) + '.png'
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(image_file), '../Data/Data_17295/valid/{0}/{1}'.format(fail_num,image_file))

In [122]:
valid_dir = '../Data/Data_17295/valid/'
valid_folder_dataset = dset.ImageFolder(root=valid_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
valid_folder_dataset.transform = train_transformation
valid_dl = DataLoader(valid_folder_dataset, batch_size=32, shuffle=True)

In [123]:
valid_folder_dataset

Dataset ImageFolder
    Number of datapoints: 17295
    Root location: ../Data/Data_17295/valid/

In [124]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [125]:
model, loss_hist, metric_hist = train_val(model, params_train, 13)

Epoch 0/12, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.247128, val loss: 0.113029, accuracy: 96.94, time: 9.6733 min
----------
Epoch 1/12, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.153352, val loss: 0.103355, accuracy: 96.93, time: 18.1372 min
----------
Epoch 2/12, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.121920, val loss: 0.095832, accuracy: 97.29, time: 26.6207 min
----------
Epoch 3/12, current lr=0.001
train loss: 0.103342, val loss: 0.101988, accuracy: 97.35, time: 35.1477 min
----------
Epoch 4/12, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.090971, val loss: 0.083641, accuracy: 97.48, time: 43.6649 min
----------
Epoch 5/12, current lr=0.001
train loss: 0.079304, val loss: 0.094295, accuracy: 97.27, time: 52.2009 min
----------
Epoch 6/12, current lr=0.001
train loss: 0.071245, val loss: 0.092398, accuracy: 97.61, time: 60.7267 min
----------
Ep

In [126]:
for i in range(0,9):
    tmp_label = str(i)
    tmp_dir = '../Data/Data_17295/valid/{0}'.format(tmp_label)
    tmp_imgs = os.listdir(tmp_dir)
    for i in range(len(tmp_imgs)):
        shutil.move('../Data/Data_17295/valid/{0}/{1}'.format(tmp_label,tmp_imgs[i]), '../Data/Data_17295/Unlabeled/{0}'.format(tmp_imgs[i]))

In [127]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
my_dataset = CustomDataSet(test_dir, transform=train_transformation, num=0)
test_loader = DataLoader(my_dataset , batch_size=1, shuffle=False)
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [128]:
pred_list =[]
size = len(test_loader.dataset)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [129]:
cnt = 0
label_list = []
label = pd.read_csv('../csv/data_172950.csv', index_col=0)

img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    log_outputs = torch.log(pred_list[i][0])
    entropy = - torch.sum(pred_list[i][0] * log_outputs)
    if entropy> 0.05:
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

In [130]:
real_label_list = list(df_155655.loc[17295*4:17295*5-1]['failureNum'].values)

In [131]:
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 2658329.22it/s]


87

In [132]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

In [133]:
print(engineer_label_count)
print(wrong_cnt_list)

[6454, 5932, 2204, 1236, 1007]
[71, 29, 54, 22, 87]


In [134]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

### Phase 6

In [135]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
train_folder_dataset.transform = train_transformation
train_dl = DataLoader(train_folder_dataset, batch_size=32, shuffle=True)

In [136]:
train_folder_dataset

Dataset ImageFolder
    Number of datapoints: 103770
    Root location: ../Data/Data_17295/Labeled/

In [137]:
test_dir = '../Data/Data_17295/Unlabeled/'
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)
original_df = pd.read_csv('../csv/data_172950.csv')
tmp = all_imgs[:17295]
for i in range(len(tmp)):
    idx = int(tmp[i][:-4])
    fail_num = str(original_df[original_df['index'] == idx]['failureNum'].values[0])
    image_file = str(original_df[original_df['index'] == idx]['index'].values[0]) + '.png'
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(image_file), '../Data/Data_17295/valid/{0}/{1}'.format(fail_num,image_file))

In [138]:
valid_dir = '../Data/Data_17295/valid/'
valid_folder_dataset = dset.ImageFolder(root=valid_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
valid_folder_dataset.transform = train_transformation
valid_dl = DataLoader(valid_folder_dataset, batch_size=32, shuffle=True)

In [139]:
valid_folder_dataset

Dataset ImageFolder
    Number of datapoints: 17295
    Root location: ../Data/Data_17295/valid/

In [140]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [141]:
model, loss_hist, metric_hist = train_val(model, params_train, 15)

Epoch 0/14, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.216744, val loss: 0.143736, accuracy: 96.12, time: 11.2195 min
----------
Epoch 1/14, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.135173, val loss: 0.111585, accuracy: 96.79, time: 21.1979 min
----------
Epoch 2/14, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.107634, val loss: 0.089172, accuracy: 97.36, time: 31.2359 min
----------
Epoch 3/14, current lr=0.001
train loss: 0.089437, val loss: 0.099070, accuracy: 97.05, time: 41.2689 min
----------
Epoch 4/14, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.077979, val loss: 0.059651, accuracy: 97.88, time: 51.3123 min
----------
Epoch 5/14, current lr=0.001
train loss: 0.069275, val loss: 0.069212, accuracy: 97.68, time: 61.4210 min
----------
Epoch 6/14, current lr=0.001
train loss: 0.061727, val loss: 0.071138, accuracy: 97.62, time: 71.5034 min
----------
E

In [142]:
for i in range(0,9):
    tmp_label = str(i)
    tmp_dir = '../Data/Data_17295/valid/{0}'.format(tmp_label)
    tmp_imgs = os.listdir(tmp_dir)
    for i in range(len(tmp_imgs)):
        shutil.move('../Data/Data_17295/valid/{0}/{1}'.format(tmp_label,tmp_imgs[i]), '../Data/Data_17295/Unlabeled/{0}'.format(tmp_imgs[i]))

In [143]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
my_dataset = CustomDataSet(test_dir, transform=train_transformation, num=0)
test_loader = DataLoader(my_dataset , batch_size=1, shuffle=False)
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [144]:
pred_list =[]
size = len(test_loader.dataset)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [145]:
cnt = 0
label_list = []
label = pd.read_csv('../csv/data_172950.csv', index_col=0)

img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    log_outputs = torch.log(pred_list[i][0])
    entropy = - torch.sum(pred_list[i][0] * log_outputs)
    if entropy> 0.05:
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

In [146]:
real_label_list = list(df_155655.loc[17295*5:17295*6-1]['failureNum'].values)

In [147]:
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 2720846.47it/s]


37

In [148]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

In [149]:
print(engineer_label_count)
print(wrong_cnt_list)

[6454, 5932, 2204, 1236, 1007, 1152]
[71, 29, 54, 22, 87, 37]


In [150]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

### Phase 7

In [151]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
train_folder_dataset.transform = train_transformation
train_dl = DataLoader(train_folder_dataset, batch_size=32, shuffle=True)

In [152]:
train_folder_dataset

Dataset ImageFolder
    Number of datapoints: 121065
    Root location: ../Data/Data_17295/Labeled/

In [153]:
test_dir = '../Data/Data_17295/Unlabeled/'
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)
original_df = pd.read_csv('../csv/data_172950.csv')
tmp = all_imgs[:17295]
for i in range(len(tmp)):
    idx = int(tmp[i][:-4])
    fail_num = str(original_df[original_df['index'] == idx]['failureNum'].values[0])
    image_file = str(original_df[original_df['index'] == idx]['index'].values[0]) + '.png'
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(image_file), '../Data/Data_17295/valid/{0}/{1}'.format(fail_num,image_file))

In [154]:
valid_dir = '../Data/Data_17295/valid/'
valid_folder_dataset = dset.ImageFolder(root=valid_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
valid_folder_dataset.transform = train_transformation
valid_dl = DataLoader(valid_folder_dataset, batch_size=32, shuffle=True)

In [155]:
valid_folder_dataset

Dataset ImageFolder
    Number of datapoints: 17295
    Root location: ../Data/Data_17295/valid/

In [156]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [157]:
model, loss_hist, metric_hist = train_val(model, params_train, 17)

Epoch 0/16, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.206134, val loss: 0.323298, accuracy: 91.27, time: 12.7954 min
----------
Epoch 1/16, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.128309, val loss: 0.168751, accuracy: 94.82, time: 24.2973 min
----------
Epoch 2/16, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.104690, val loss: 0.137902, accuracy: 95.44, time: 35.9319 min
----------
Epoch 3/16, current lr=0.001
train loss: 0.089232, val loss: 0.139714, accuracy: 95.18, time: 47.5904 min
----------
Epoch 4/16, current lr=0.001
train loss: 0.075160, val loss: 0.154688, accuracy: 95.26, time: 59.2202 min
----------
Epoch 5/16, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.065473, val loss: 0.135465, accuracy: 95.42, time: 70.8596 min
----------
Epoch 6/16, current lr=0.001
train loss: 0.058044, val loss: 0.137603, accuracy: 95.52, time: 82.4733 min
----------
E

In [158]:
for i in range(0,9):
    tmp_label = str(i)
    tmp_dir = '../Data/Data_17295/valid/{0}'.format(tmp_label)
    tmp_imgs = os.listdir(tmp_dir)
    for i in range(len(tmp_imgs)):
        shutil.move('../Data/Data_17295/valid/{0}/{1}'.format(tmp_label,tmp_imgs[i]), '../Data/Data_17295/Unlabeled/{0}'.format(tmp_imgs[i]))

In [159]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
my_dataset = CustomDataSet(test_dir, transform=train_transformation, num=0)
test_loader = DataLoader(my_dataset , batch_size=1, shuffle=False)
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [160]:
pred_list =[]
size = len(test_loader.dataset)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [161]:
cnt = 0
label_list = []
label = pd.read_csv('../csv/data_172950.csv', index_col=0)

img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    log_outputs = torch.log(pred_list[i][0])
    entropy = - torch.sum(pred_list[i][0] * log_outputs)
    if entropy> 0.05:
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

In [162]:
real_label_list = list(df_155655.loc[17295*6:17295*7-1]['failureNum'].values)

In [163]:
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 2657160.72it/s]


128

In [164]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

In [165]:
print(engineer_label_count)
print(wrong_cnt_list)

[6454, 5932, 2204, 1236, 1007, 1152, 1988]
[71, 29, 54, 22, 87, 37, 128]


In [166]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

### Phase 8

In [167]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
train_folder_dataset.transform = train_transformation
train_dl = DataLoader(train_folder_dataset, batch_size=32, shuffle=True)

In [168]:
train_folder_dataset

Dataset ImageFolder
    Number of datapoints: 138360
    Root location: ../Data/Data_17295/Labeled/

In [169]:
test_dir = '../Data/Data_17295/Unlabeled/'
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)
original_df = pd.read_csv('../csv/data_172950.csv')
tmp = all_imgs[:17295]
for i in range(len(tmp)):
    idx = int(tmp[i][:-4])
    fail_num = str(original_df[original_df['index'] == idx]['failureNum'].values[0])
    image_file = str(original_df[original_df['index'] == idx]['index'].values[0]) + '.png'
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(image_file), '../Data/Data_17295/valid/{0}/{1}'.format(fail_num,image_file))

In [170]:
valid_dir = '../Data/Data_17295/valid/'
valid_folder_dataset = dset.ImageFolder(root=valid_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
valid_folder_dataset.transform = train_transformation
valid_dl = DataLoader(valid_folder_dataset, batch_size=32, shuffle=True)

In [171]:
valid_folder_dataset

Dataset ImageFolder
    Number of datapoints: 17295
    Root location: ../Data/Data_17295/valid/

In [172]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [173]:
model, loss_hist, metric_hist = train_val(model, params_train, 19)

Epoch 0/18, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.199729, val loss: 0.096518, accuracy: 97.42, time: 14.1429 min
----------
Epoch 1/18, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.126209, val loss: 0.077243, accuracy: 97.77, time: 27.3181 min
----------
Epoch 2/18, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.101481, val loss: 0.066444, accuracy: 97.94, time: 40.4892 min
----------
Epoch 3/18, current lr=0.001
train loss: 0.086803, val loss: 0.077191, accuracy: 97.81, time: 53.6364 min
----------
Epoch 4/18, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.074607, val loss: 0.057876, accuracy: 98.13, time: 66.8341 min
----------
Epoch 5/18, current lr=0.001
train loss: 0.064890, val loss: 0.062287, accuracy: 98.01, time: 79.9601 min
----------
Epoch 6/18, current lr=0.001
train loss: 0.057963, val loss: 0.067916, accuracy: 97.70, time: 93.1192 min
----------
E

In [174]:
for i in range(0,9):
    tmp_label = str(i)
    tmp_dir = '../Data/Data_17295/valid/{0}'.format(tmp_label)
    tmp_imgs = os.listdir(tmp_dir)
    for i in range(len(tmp_imgs)):
        shutil.move('../Data/Data_17295/valid/{0}/{1}'.format(tmp_label,tmp_imgs[i]), '../Data/Data_17295/Unlabeled/{0}'.format(tmp_imgs[i]))

In [175]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
my_dataset = CustomDataSet(test_dir, transform=train_transformation, num=0)
test_loader = DataLoader(my_dataset , batch_size=1, shuffle=False)
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [176]:
pred_list =[]
size = len(test_loader.dataset)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [177]:
cnt = 0
label_list = []
label = pd.read_csv('../csv/data_172950.csv', index_col=0)

img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    log_outputs = torch.log(pred_list[i][0])
    entropy = - torch.sum(pred_list[i][0] * log_outputs)
    if entropy> 0.05:
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

In [178]:
real_label_list = list(df_155655.loc[17295*7:17295*8-1]['failureNum'].values)

In [179]:
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 2647366.43it/s]


88

In [180]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

In [181]:
print(engineer_label_count)
print(wrong_cnt_list)

[6454, 5932, 2204, 1236, 1007, 1152, 1988, 609]
[71, 29, 54, 22, 87, 37, 128, 88]


In [182]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))

### Phase 9

In [183]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
train_folder_dataset.transform = train_transformation
train_dl = DataLoader(train_folder_dataset, batch_size=32, shuffle=True)

In [184]:
train_folder_dataset

Dataset ImageFolder
    Number of datapoints: 155655
    Root location: ../Data/Data_17295/Labeled/

In [185]:
test_dir = '../Data/Data_17295/Unlabeled/'
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)
original_df = pd.read_csv('../csv/data_172950.csv')
tmp = all_imgs[:17295]
for i in range(len(tmp)):
    idx = int(tmp[i][:-4])
    fail_num = str(original_df[original_df['index'] == idx]['failureNum'].values[0])
    image_file = str(original_df[original_df['index'] == idx]['index'].values[0]) + '.png'
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(image_file), '../Data/Data_17295/valid/{0}/{1}'.format(fail_num,image_file))

In [186]:
valid_dir = '../Data/Data_17295/valid/'
valid_folder_dataset = dset.ImageFolder(root=valid_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
valid_folder_dataset.transform = train_transformation
valid_dl = DataLoader(valid_folder_dataset, batch_size=32, shuffle=True)

In [187]:
valid_folder_dataset

Dataset ImageFolder
    Number of datapoints: 17295
    Root location: ../Data/Data_17295/valid/

In [188]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [189]:
model, loss_hist, metric_hist = train_val(model, params_train, 21)

Epoch 0/20, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.187489, val loss: 0.144100, accuracy: 95.83, time: 15.7593 min
----------
Epoch 1/20, current lr=0.001
train loss: 0.116582, val loss: 0.174545, accuracy: 95.61, time: 30.4359 min
----------
Epoch 2/20, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.093520, val loss: 0.121273, accuracy: 96.37, time: 45.2595 min
----------
Epoch 3/20, current lr=0.001
train loss: 0.078952, val loss: 0.128090, accuracy: 96.44, time: 60.0077 min
----------
Epoch 4/20, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.068578, val loss: 0.101185, accuracy: 96.61, time: 74.7555 min
----------
Epoch 5/20, current lr=0.001
train loss: 0.062099, val loss: 0.103312, accuracy: 96.65, time: 89.4746 min
----------
Epoch 6/20, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.055929, val loss: 0.084778, accuracy: 97.27, time: 104.2537 min
----------


In [190]:
for i in range(0,9):
    tmp_label = str(i)
    tmp_dir = '../Data/Data_17295/valid/{0}'.format(tmp_label)
    tmp_imgs = os.listdir(tmp_dir)
    for i in range(len(tmp_imgs)):
        shutil.move('../Data/Data_17295/valid/{0}/{1}'.format(tmp_label,tmp_imgs[i]), '../Data/Data_17295/Unlabeled/{0}'.format(tmp_imgs[i]))

In [191]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
my_dataset = CustomDataSet(test_dir, transform=train_transformation, num=0)
test_loader = DataLoader(my_dataset , batch_size=1, shuffle=False)
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [192]:
pred_list =[]
size = len(test_loader.dataset)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [193]:
cnt = 0
label_list = []
label = pd.read_csv('../csv/data_172950.csv', index_col=0)

img_folder = natsort.natsorted(all_imgs)
for i in range(0,17295):
    log_outputs = torch.log(pred_list[i][0])
    entropy = - torch.sum(pred_list[i][0] * log_outputs)
    if entropy> 0.05:
        idx = int(img_folder[i][:-4])
        engineerlabel = int(label[label['index']==idx]['failureNum'])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(pred_list[i][0].argmax()))

In [194]:
real_label_list = list(df_155655.loc[17295*8:17295*9-1]['failureNum'].values)

In [195]:
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 2658816.39it/s]


158

In [196]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

# Result

In [197]:
print(engineer_label_count)
print(wrong_cnt_list)

[6454, 5932, 2204, 1236, 1007, 1152, 1988, 609, 804]
[71, 29, 54, 22, 87, 37, 128, 88, 158]


In [198]:
for i in range(0,17295):
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(img_folder[i]),'../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i],img_folder[i]))