# Uncertainty-Based Labeling Methodology
- Uncertainty measure 2: Least Margin
- Model: ResNet50

In [1]:
import torch
import torchvision.datasets as dset
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torch.nn as nn
import time
from torch import optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torchvision
import os
import natsort
import pandas as pd
from PIL import Image
from tqdm import tqdm
import shutil
import pandas as pd
import numpy as np

# Device Setting

In [2]:
# Module-level compute device (CUDA when available, otherwise CPU); read by
# loss_epoch and by the inference cells when moving batches with `.to(device)`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
class gpu_setting:
    """Namespace exposing the compute device used when building models."""

    # Resolved once, when the class body runs: CUDA if present, else CPU.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Model(ResNet50)

### Model

In [4]:
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions.

    Args:
        in_channels: channel count of the incoming feature map.
        out_channels: channel count produced by the block (times ``expansion``).
        stride: stride of the first convolution and of the projection shortcut.
    """

    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        expanded = out_channels * BasicBlock.expansion

        # Main path: conv3x3 -> BN -> ReLU -> conv3x3 -> BN.
        main_path = [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Conv2d(out_channels, expanded, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(expanded),
        ]
        self.residual_function = nn.Sequential(*main_path)

        if stride == 1 and in_channels == expanded:
            # Identity mapping: used when input and output share the same
            # feature-map size and number of filters.
            self.shortcut = nn.Sequential()
        else:
            # Projection mapping with a 1x1 convolution to match shapes.
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, expanded, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(expanded),
            )

        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(residual(x) + shortcut(x))``."""
        summed = self.residual_function(x) + self.shortcut(x)
        return self.relu(summed)


class BottleNeck(nn.Module):
    """Bottleneck residual block: 1x1 -> 3x3 -> 1x1 convolutions.

    The block outputs ``out_channels * expansion`` channels.

    Args:
        in_channels: channel count of the incoming feature map.
        out_channels: width of the two inner convolutions.
        stride: stride of the 3x3 convolution and of the projection shortcut.
    """

    expansion = 4

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        wide_channels = out_channels * BottleNeck.expansion

        # Main path: reduce (1x1), transform (3x3), expand (1x1).
        bottleneck_layers = [
            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Conv2d(out_channels, wide_channels, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(wide_channels),
        ]
        self.residual_function = nn.Sequential(*bottleneck_layers)

        if stride == 1 and in_channels == wide_channels:
            # Shapes already agree, so the skip connection is the identity.
            self.shortcut = nn.Sequential()
        else:
            # 1x1 projection so the skip connection matches the main path.
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, wide_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(wide_channels),
            )

        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(residual(x) + shortcut(x))``."""
        summed = self.residual_function(x) + self.shortcut(x)
        return self.relu(summed)

class ResNet(nn.Module):
    """ResNet backbone with a linear classification head.

    Args:
        block: residual block class; called as ``block(in_channels,
            out_channels, stride)`` and must expose an ``expansion`` attribute.
        num_block: four integers, the number of blocks in each stage.
        num_classes: size of the output layer.
        init_weights: when true, run ``_initialize_weights`` after construction.
    """

    def __init__(self, block, num_block, num_classes=9, init_weights=True):
        super().__init__()

        # Running channel count; updated by _make_layer as stages are stacked.
        self.in_channels = 64

        stem = [
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        ]
        self.conv1 = nn.Sequential(*stem)

        # Four residual stages; every stage after the first halves the resolution.
        self.conv2_x = self._make_layer(block, 64, num_block[0], 1)
        self.conv3_x = self._make_layer(block, 128, num_block[1], 2)
        self.conv4_x = self._make_layer(block, 256, num_block[2], 2)
        self.conv5_x = self._make_layer(block, 512, num_block[3], 2)

        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # Weight initialization
        if init_weights:
            self._initialize_weights()

    def _make_layer(self, block, out_channels, num_blocks, stride):
        """Stack blocks into one stage; only the first block uses ``stride``."""
        stage = []
        for block_stride in [stride, *([1] * (num_blocks - 1))]:
            stage.append(block(self.in_channels, out_channels, block_stride))
            self.in_channels = out_channels * block.expansion
        return nn.Sequential(*stage)

    def forward(self, x):
        """Run the stem, the four stages, global average pooling and the head."""
        features = self.conv1(x)
        for stage in (self.conv2_x, self.conv3_x, self.conv4_x, self.conv5_x):
            features = stage(features)
        pooled = self.avg_pool(features)
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)

    def _initialize_weights(self):
        """Kaiming-normal convs, unit/zero batch norm, small-normal linear layers."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, 0, 0.01)
                nn.init.constant_(module.bias, 0)

def resnet18():
    """Build a ResNet from BasicBlock with stage depths [2, 2, 2, 2]."""
    return ResNet(block=BasicBlock, num_block=[2, 2, 2, 2])


def resnet34():
    """Build a ResNet from BasicBlock with stage depths [3, 4, 6, 3]."""
    return ResNet(block=BasicBlock, num_block=[3, 4, 6, 3])


def resnet50():
    """Build a ResNet from BottleNeck with stage depths [3, 4, 6, 3]."""
    return ResNet(block=BottleNeck, num_block=[3, 4, 6, 3])


def resnet101():
    """Build a ResNet from BottleNeck with stage depths [3, 4, 23, 3]."""
    return ResNet(block=BottleNeck, num_block=[3, 4, 23, 3])


def resnet152():
    """Build a ResNet from BottleNeck with stage depths [3, 8, 36, 3]."""
    return ResNet(block=BottleNeck, num_block=[3, 8, 36, 3])

### Parameters

In [5]:
def ResNetParameters(model, train_dl, valid_dl):
    """Build a ResNet and the training configuration consumed by ``train_val``.

    Args:
        model: architecture name, case-insensitive. One of 'resnet18',
            'resnet34', 'resnet50', 'resnet101', 'resnet152'.
        train_dl: data loader stored under ``params_train['train_dl']``.
        valid_dl: data loader stored under ``params_train['val_dl']``.

    Returns:
        ``(network, params_train)``: the network moved to
        ``gpu_setting.device`` and a dict holding the optimizer, loss,
        loaders, LR scheduler and checkpoint path.

    Raises:
        ValueError: if ``model`` is not a known architecture name. Previously
            an unknown name stayed a ``str`` and crashed later at
            ``model.parameters()``.
    """
    builders = {
        'resnet18': resnet18,
        'resnet34': resnet34,
        'resnet50': resnet50,
        'resnet101': resnet101,
        'resnet152': resnet152,
    }
    name = model.lower()
    if name not in builders:
        raise ValueError(
            'Unknown model {0!r}; expected one of {1}'.format(model, sorted(builders))
        )

    device = gpu_setting.device
    network = builders[name]().to(device)

    # reduction='sum' because loss_epoch divides the accumulated loss by the
    # dataset size itself.
    loss_func = nn.CrossEntropyLoss(reduction='sum')
    opt = optim.Adam(network.parameters(), lr=0.001)
    lr_scheduler = ReduceLROnPlateau(opt, mode='min', factor=0.1, patience=10)

    # Define the training parameters
    params_train = {
        'num_epochs': 5,
        'optimizer': opt,
        'loss_func': loss_func,
        'train_dl': train_dl,
        'val_dl': valid_dl,
        'sanity_check': False,
        'lr_scheduler': lr_scheduler,
        'path2weights': './trained_model/weights_original_res.pt',  # change this path as needed
    }
    return network, params_train

# Function

In [6]:
def get_lr(opt):
    """Return the learning rate of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter groups.
    """
    groups = iter(opt.param_groups)
    try:
        first_group = next(groups)
    except StopIteration:
        return None
    return first_group['lr']
def metric_batch(output, target):
    """Count the samples in a batch whose arg-max prediction equals the target.

    Args:
        output: per-class scores; the arg-max is taken over dimension 1.
        target: class indices, one per sample.

    Returns:
        The number of correct predictions as a Python int.
    """
    predicted = output.argmax(dim=1, keepdim=True)
    matches = predicted.eq(target.view_as(predicted))
    return matches.sum().item()
def loss_batch(loss_func, output, target, opt=None):
    """Compute loss and correct-count for one batch, optionally taking a step.

    Args:
        loss_func: callable returning the batch loss tensor.
        output: model output for the batch.
        target: ground-truth labels for the batch.
        opt: optimizer; when given, gradients are zeroed, the loss is
            back-propagated and one optimizer step is taken.

    Returns:
        ``(loss_value, correct_count)`` with the loss as a Python float.
    """
    batch_loss = loss_func(output, target)
    n_correct = metric_batch(output, target)

    if opt is None:
        # Evaluation: nothing to update.
        return batch_loss.item(), n_correct

    opt.zero_grad()
    batch_loss.backward()
    opt.step()
    return batch_loss.item(), n_correct
def loss_epoch(model, loss_func, dataset_dl, sanity_check=False, opt=None):
    """Run one pass over a data loader and return mean loss and accuracy.

    Args:
        model: network applied to each input batch.
        loss_func: loss callable forwarded to ``loss_batch``.
        dataset_dl: loader yielding ``(inputs, targets)`` pairs.
        sanity_check: when ``True``, stop after the first batch.
        opt: optimizer forwarded to ``loss_batch``; ``None`` means no updates.

    Returns:
        ``(loss, metric)``: accumulated loss and accumulated correct count,
        each divided by ``len(dataset_dl.dataset)``.
    """
    total_loss, total_correct = 0.0, 0.0
    n_samples = len(dataset_dl.dataset)

    for inputs, targets in dataset_dl:
        inputs = inputs.to(device)
        targets = targets.to(device)

        batch_loss, batch_correct = loss_batch(loss_func, model(inputs), targets, opt)
        total_loss += batch_loss
        if batch_correct is not None:
            total_correct += batch_correct

        if sanity_check is True:
            break

    return total_loss / n_samples, total_correct / n_samples

def train_val(model, params, epoch):
    """Train and validate ``model``, checkpointing the best validation loss.

    Args:
        model: network to train (updated in place).
        params: dict with keys ``loss_func``, ``optimizer``, ``train_dl``,
            ``val_dl``, ``sanity_check``, ``lr_scheduler`` and
            ``path2weights``.
        epoch: number of epochs to run (``params['num_epochs']`` is not read).

    Returns:
        ``(model, loss_history, metric_history)``. The histories are dicts
        with ``'train'`` and ``'val'`` lists holding one value per epoch.
        When at least one checkpoint was written, ``model`` carries the
        best-validation-loss weights.
    """
    num_epochs = epoch
    loss_func = params["loss_func"]
    opt = params["optimizer"]
    train_dl = params["train_dl"]
    val_dl = params["val_dl"]
    sanity_check = params["sanity_check"]
    lr_scheduler = params["lr_scheduler"]
    path2weights = params["path2weights"]

    # torch.save does not create missing folders; make sure the target exists.
    weights_dir = os.path.dirname(path2weights)
    if weights_dir:
        os.makedirs(weights_dir, exist_ok=True)

    loss_history = {'train': [], 'val': []}
    metric_history = {'train': [], 'val': []}

    best_loss = float('inf')
    saved_best = False

    start_time = time.time()

    # A distinct loop name keeps the `epoch` argument from being shadowed.
    for epoch_idx in range(num_epochs):
        current_lr = get_lr(opt)
        print('Epoch {}/{}, current lr={}'.format(epoch_idx, num_epochs-1, current_lr))

        model.train()
        train_loss, train_metric = loss_epoch(model, loss_func, train_dl, sanity_check, opt)
        loss_history['train'].append(train_loss)
        metric_history['train'].append(train_metric)

        model.eval()
        with torch.no_grad():
            val_loss, val_metric = loss_epoch(model, loss_func, val_dl, sanity_check)
        loss_history['val'].append(val_loss)
        metric_history['val'].append(val_metric)

        if val_loss < best_loss:
            best_loss = val_loss

            torch.save(model.state_dict(), path2weights)
            saved_best = True
            print('Copied best model weights!')
            print('Get best val_loss')

        lr_scheduler.step(val_loss)

        print('train loss: %.6f, val loss: %.6f, accuracy: %.2f, time: %.4f min' %(train_loss, val_loss, 100*val_metric, (time.time()-start_time)/60))
        print('-'*10)

    # Restore the best weights from the checkpoint on disk. The in-memory
    # deepcopy that used to do this was disabled because it ran out of GPU
    # memory; loading onto the CPU first avoids a second copy on the GPU.
    if saved_best:
        model.load_state_dict(torch.load(path2weights, map_location='cpu'))

    return model, loss_history, metric_history

# CustomDataset

In [7]:
class CustomDataSet():
    """Label-free image dataset over one chunk of a flat directory.

    The directory listing is put in natural-sort order and the chunk
    ``[chunk_size * num, chunk_size * (num + 1))`` of it is kept.

    Args:
        main_dir: directory containing the image files.
        transform: callable applied to each RGB PIL image.
        num: zero-based index of the chunk to expose.
        chunk_size: number of files per chunk; defaults to 17295, the value
            that was previously hard-coded.
    """

    def __init__(self, main_dir, transform, num, chunk_size=17295):
        self.main_dir = main_dir
        self.transform = transform
        self.num = num
        self.chunk_size = chunk_size
        ordered_names = natsort.natsorted(os.listdir(main_dir))
        self.total_imgs = ordered_names[chunk_size * num:chunk_size * (num + 1)]

    def __len__(self):
        """Return the number of images in the selected chunk."""
        return len(self.total_imgs)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB and return the transformed result."""
        img_loc = os.path.join(self.main_dir, self.total_imgs[idx])
        # convert() returns a fully loaded copy, so the file handle can be
        # released as soon as the with-block exits.
        with Image.open(img_loc) as raw_image:
            image = raw_image.convert("RGB")
        return self.transform(image)

In [8]:
# Per-phase tallies, appended to at the end of every phase:
#   engineer_label_count - samples whose label was taken from the engineer (CSV)
#   wrong_cnt_list       - assigned labels that disagree with the ground truth
engineer_label_count = []
wrong_cnt_list = []

In [9]:
# Ground-truth table; its 'failureNum' column is sliced per phase and compared
# against the labels assigned in that phase.
df_155655 = pd.read_csv('../csv/data_155655.csv')

In [10]:
df_155655.shape

(155655, 10)

# Experiment

### Phase 1

In [11]:
# Training loader for this phase: every image currently under Labeled/<class>/.
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # Per-channel normalization is currently disabled:
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
# The transform is attached after construction rather than passed to ImageFolder.
train_folder_dataset.transform = train_transformation
train_dl = DataLoader(train_folder_dataset, batch_size=32, shuffle=True)

In [12]:
train_folder_dataset

Dataset ImageFolder
    Number of datapoints: 17295
    Root location: ../Data/Data_17295/Labeled/

In [13]:
# Stage the first 17295 unlabeled images (natural-sort order) as this phase's
# validation set, filing each one under valid/<failureNum>/ for ImageFolder.
test_dir = '../Data/Data_17295/Unlabeled/'
valid_root = '../Data/Data_17295/valid/'
all_imgs = natsort.natsorted(os.listdir(test_dir))
original_df = pd.read_csv('../csv/data_172950.csv')
# Build the index -> failureNum table once instead of filtering the whole frame
# twice per image; keep='first' matches the old `.values[0]` choice.
fail_num_by_index = (
    original_df.drop_duplicates(subset='index', keep='first')
    .set_index('index')['failureNum']
    .to_dict()
)
tmp = all_imgs[:17295]
for image_file in tmp:
    idx = int(os.path.splitext(image_file)[0])  # file names are '<index>.png'
    fail_num = str(fail_num_by_index[idx])
    class_dir = os.path.join(valid_root, fail_num)
    os.makedirs(class_dir, exist_ok=True)  # shutil.move does not create missing folders
    shutil.move(os.path.join(test_dir, image_file), os.path.join(class_dir, image_file))

In [14]:
# Validation loader over the images staged under valid/<class>/.
valid_dir = '../Data/Data_17295/valid/'
valid_folder_dataset = dset.ImageFolder(root=valid_dir)
# Same preprocessing as the training loader (the variable name is reused).
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # Per-channel normalization is currently disabled:
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
valid_folder_dataset.transform = train_transformation
valid_dl = DataLoader(valid_folder_dataset, batch_size=32, shuffle=True)

In [15]:
valid_folder_dataset

Dataset ImageFolder
    Number of datapoints: 17295
    Root location: ../Data/Data_17295/valid/

In [16]:
# Build a newly initialized ResNet-50 plus its optimizer/scheduler settings for this phase.
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [17]:
# Train for 5 epochs; train_val writes a checkpoint to params_train['path2weights']
# whenever the validation loss improves.
model, loss_hist, metric_hist = train_val(model, params_train, 5)

Epoch 0/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.344339, val loss: 0.505448, accuracy: 87.68, time: 3.9003 min
----------
Epoch 1/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.221236, val loss: 0.290321, accuracy: 92.15, time: 5.5265 min
----------
Epoch 2/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.205943, val loss: 0.277957, accuracy: 92.14, time: 7.1047 min
----------
Epoch 3/4, current lr=0.001
train loss: 0.175019, val loss: 0.400956, accuracy: 86.86, time: 8.6751 min
----------
Epoch 4/4, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.166867, val loss: 0.265863, accuracy: 91.22, time: 10.2692 min
----------


In [18]:
# Validation is finished: move every image from valid/<label>/ back into the
# Unlabeled pool so the labeling step below can pick it up.
for class_id in range(0,9):
    tmp_label = str(class_id)
    tmp_dir = '../Data/Data_17295/valid/{0}'.format(tmp_label)
    tmp_imgs = os.listdir(tmp_dir)
    # Use a separate name for the file so the class counter is not overwritten.
    for img_name in tmp_imgs:
        shutil.move('../Data/Data_17295/valid/{0}/{1}'.format(tmp_label,img_name), '../Data/Data_17295/Unlabeled/{0}'.format(img_name))

In [19]:
# Inference loader over chunk 0 (the first 17295 files, natural-sort order) of
# the Unlabeled pool. No labels are returned by CustomDataSet.
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # Per-channel normalization is currently disabled:
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
my_dataset = CustomDataSet(test_dir, transform=train_transformation, num=0)
# batch_size=1 and shuffle=False: predictions come out one per file, in file order.
test_loader = DataLoader(my_dataset , batch_size=1, shuffle=False)
# Same listing and slice as CustomDataSet, kept to map predictions back to file names.
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [20]:
# Collect the softmax class probabilities for every image in test_loader;
# each pred_list entry is the output for one batch (batch_size is 1).
pred_list =[]
size = len(test_loader.dataset)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        # Turn the raw model outputs into probabilities over dimension 1 (classes).
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [21]:
# Highest class probability of every prediction. Iterating pred_list directly
# removes the hard-coded 17295 sample count.
max_list = [float(probs[0].max()) for probs in pred_list]

In [23]:
# Least-margin labeling: a sample whose top-1/top-2 probability gap is at most
# the threshold is treated as uncertain and takes the engineer's label from the
# CSV; otherwise the model's own prediction is kept.
MARGIN_THRESHOLD = 0.98
cnt = 0  # number of samples routed to the engineer
label_list = []
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
# Build the index -> failureNum table once, instead of filtering the whole
# frame (and calling the deprecated int() on a Series) for every uncertain sample.
engineer_label_by_index = (
    label.drop_duplicates(subset='index', keep='first')
    .set_index('index')['failureNum']
    .to_dict()
)

img_folder = natsort.natsorted(all_imgs)
if len(img_folder) != len(pred_list):
    raise ValueError('Got {0} images but {1} predictions'.format(len(img_folder), len(pred_list)))
for file_name, probs in zip(img_folder, pred_list):
    class_probs = probs[0]  # the loader uses batch_size=1, so row 0 is the sample
    top2 = torch.topk(class_probs, k=2).values  # one top-k instead of two full sorts
    if bool(top2[0] - top2[1] <= MARGIN_THRESHOLD):
        idx = int(file_name[:-4])  # drop the 4-character extension
        engineerlabel = int(engineer_label_by_index[idx])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(class_probs.argmax()))

In [24]:
# Ground-truth labels for this phase's chunk. `.loc` slices include the end
# label, hence the -1.
# NOTE(review): assumes df_155655 has a default 0..N-1 index and is ordered like
# the natural-sorted image files - confirm.
real_label_list = list(df_155655.loc[17295*0:17295*1-1]['failureNum'].values)

In [25]:
# Number of assigned labels in this phase that disagree with the ground truth.
wrong_cnt = sum(
    1 for i in tqdm(range(0,17295)) if label_list[i] != real_label_list[i]
)

wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 2479084.37it/s]


44

In [26]:
# Record this phase's engineer-labeled count and wrong-label count.
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

In [27]:
# Show the running per-phase tallies.
print(engineer_label_count)
print(wrong_cnt_list)

[5839]
[44]


In [28]:
# Promote this phase's images to the labeled pool, filing each one under the
# label assigned above. Pairing names with labels removes the hard-coded count.
for file_name, assigned_label in zip(img_folder, label_list):
    target_dir = '../Data/Data_17295/Labeled/{0}'.format(assigned_label)
    os.makedirs(target_dir, exist_ok=True)  # shutil.move does not create missing folders
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(file_name), '{0}/{1}'.format(target_dir, file_name))

### Phase 2

In [29]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
train_folder_dataset.transform = train_transformation
train_dl = DataLoader(train_folder_dataset, batch_size=32, shuffle=True)

In [30]:
train_folder_dataset

Dataset ImageFolder
    Number of datapoints: 34590
    Root location: ../Data/Data_17295/Labeled/

In [31]:
# Stage the first 17295 unlabeled images (natural-sort order) as this phase's
# validation set, filing each one under valid/<failureNum>/ for ImageFolder.
test_dir = '../Data/Data_17295/Unlabeled/'
valid_root = '../Data/Data_17295/valid/'
all_imgs = natsort.natsorted(os.listdir(test_dir))
original_df = pd.read_csv('../csv/data_172950.csv')
# Build the index -> failureNum table once instead of filtering the whole frame
# twice per image; keep='first' matches the old `.values[0]` choice.
fail_num_by_index = (
    original_df.drop_duplicates(subset='index', keep='first')
    .set_index('index')['failureNum']
    .to_dict()
)
tmp = all_imgs[:17295]
for image_file in tmp:
    idx = int(os.path.splitext(image_file)[0])  # file names are '<index>.png'
    fail_num = str(fail_num_by_index[idx])
    class_dir = os.path.join(valid_root, fail_num)
    os.makedirs(class_dir, exist_ok=True)  # shutil.move does not create missing folders
    shutil.move(os.path.join(test_dir, image_file), os.path.join(class_dir, image_file))

In [32]:
valid_dir = '../Data/Data_17295/valid/'
valid_folder_dataset = dset.ImageFolder(root=valid_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
valid_folder_dataset.transform = train_transformation
valid_dl = DataLoader(valid_folder_dataset, batch_size=32, shuffle=True)

In [33]:
valid_folder_dataset

Dataset ImageFolder
    Number of datapoints: 17295
    Root location: ../Data/Data_17295/valid/

In [34]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [35]:
model, loss_hist, metric_hist = train_val(model, params_train, 7)

Epoch 0/6, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.317448, val loss: 0.438243, accuracy: 86.58, time: 4.0013 min
----------
Epoch 1/6, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.191020, val loss: 0.347640, accuracy: 89.30, time: 6.5525 min
----------
Epoch 2/6, current lr=0.001
train loss: 0.152808, val loss: 0.580875, accuracy: 86.22, time: 8.9706 min
----------
Epoch 3/6, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.137607, val loss: 0.294596, accuracy: 90.44, time: 11.2738 min
----------
Epoch 4/6, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.127160, val loss: 0.249808, accuracy: 91.46, time: 13.6026 min
----------
Epoch 5/6, current lr=0.001
train loss: 0.110110, val loss: 0.297326, accuracy: 90.62, time: 15.9595 min
----------
Epoch 6/6, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.099325, val loss: 0.214200, accuracy: 93

In [36]:
# Validation is finished: move every image from valid/<label>/ back into the
# Unlabeled pool so the labeling step below can pick it up.
for class_id in range(0,9):
    tmp_label = str(class_id)
    tmp_dir = '../Data/Data_17295/valid/{0}'.format(tmp_label)
    tmp_imgs = os.listdir(tmp_dir)
    # Use a separate name for the file so the class counter is not overwritten.
    for img_name in tmp_imgs:
        shutil.move('../Data/Data_17295/valid/{0}/{1}'.format(tmp_label,img_name), '../Data/Data_17295/Unlabeled/{0}'.format(img_name))

In [37]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
my_dataset = CustomDataSet(test_dir, transform=train_transformation, num=0)
test_loader = DataLoader(my_dataset , batch_size=1, shuffle=False)
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [38]:
pred_list =[]
size = len(test_loader.dataset)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [39]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

In [41]:
# Least-margin labeling: a sample whose top-1/top-2 probability gap is at most
# the threshold is treated as uncertain and takes the engineer's label from the
# CSV; otherwise the model's own prediction is kept.
MARGIN_THRESHOLD = 0.98
cnt = 0  # number of samples routed to the engineer
label_list = []
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
# Build the index -> failureNum table once, instead of filtering the whole
# frame (and calling the deprecated int() on a Series) for every uncertain sample.
engineer_label_by_index = (
    label.drop_duplicates(subset='index', keep='first')
    .set_index('index')['failureNum']
    .to_dict()
)

img_folder = natsort.natsorted(all_imgs)
if len(img_folder) != len(pred_list):
    raise ValueError('Got {0} images but {1} predictions'.format(len(img_folder), len(pred_list)))
for file_name, probs in zip(img_folder, pred_list):
    class_probs = probs[0]  # the loader uses batch_size=1, so row 0 is the sample
    top2 = torch.topk(class_probs, k=2).values  # one top-k instead of two full sorts
    if bool(top2[0] - top2[1] <= MARGIN_THRESHOLD):
        idx = int(file_name[:-4])  # drop the 4-character extension
        engineerlabel = int(engineer_label_by_index[idx])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(class_probs.argmax()))

In [42]:
real_label_list = list(df_155655.loc[17295*1:17295*2-1]['failureNum'].values)

In [43]:
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 2478999.65it/s]


53

In [44]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

In [45]:
print(engineer_label_count)
print(wrong_cnt_list)

[5839, 4423]
[44, 53]


In [46]:
# Promote this phase's images to the labeled pool, filing each one under the
# label assigned above. Pairing names with labels removes the hard-coded count.
for file_name, assigned_label in zip(img_folder, label_list):
    target_dir = '../Data/Data_17295/Labeled/{0}'.format(assigned_label)
    os.makedirs(target_dir, exist_ok=True)  # shutil.move does not create missing folders
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(file_name), '{0}/{1}'.format(target_dir, file_name))

### Phase 3

In [47]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
train_folder_dataset.transform = train_transformation
train_dl = DataLoader(train_folder_dataset, batch_size=32, shuffle=True)

In [48]:
train_folder_dataset

Dataset ImageFolder
    Number of datapoints: 51885
    Root location: ../Data/Data_17295/Labeled/

In [49]:
# Stage the first 17295 unlabeled images (natural-sort order) as this phase's
# validation set, filing each one under valid/<failureNum>/ for ImageFolder.
test_dir = '../Data/Data_17295/Unlabeled/'
valid_root = '../Data/Data_17295/valid/'
all_imgs = natsort.natsorted(os.listdir(test_dir))
original_df = pd.read_csv('../csv/data_172950.csv')
# Build the index -> failureNum table once instead of filtering the whole frame
# twice per image; keep='first' matches the old `.values[0]` choice.
fail_num_by_index = (
    original_df.drop_duplicates(subset='index', keep='first')
    .set_index('index')['failureNum']
    .to_dict()
)
tmp = all_imgs[:17295]
for image_file in tmp:
    idx = int(os.path.splitext(image_file)[0])  # file names are '<index>.png'
    fail_num = str(fail_num_by_index[idx])
    class_dir = os.path.join(valid_root, fail_num)
    os.makedirs(class_dir, exist_ok=True)  # shutil.move does not create missing folders
    shutil.move(os.path.join(test_dir, image_file), os.path.join(class_dir, image_file))

In [50]:
valid_dir = '../Data/Data_17295/valid/'
valid_folder_dataset = dset.ImageFolder(root=valid_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
valid_folder_dataset.transform = train_transformation
valid_dl = DataLoader(valid_folder_dataset, batch_size=32, shuffle=True)

In [51]:
valid_folder_dataset

Dataset ImageFolder
    Number of datapoints: 17295
    Root location: ../Data/Data_17295/valid/

In [52]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [53]:
model, loss_hist, metric_hist = train_val(model, params_train, 9)

Epoch 0/8, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.318775, val loss: 0.180838, accuracy: 95.17, time: 3.8731 min
----------
Epoch 1/8, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.194050, val loss: 0.135126, accuracy: 95.60, time: 7.2562 min
----------
Epoch 2/8, current lr=0.001
train loss: 0.161656, val loss: 0.144084, accuracy: 95.50, time: 10.6268 min
----------
Epoch 3/8, current lr=0.001
train loss: 0.140839, val loss: 0.144980, accuracy: 96.19, time: 14.0310 min
----------
Epoch 4/8, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.119237, val loss: 0.131449, accuracy: 96.22, time: 17.4627 min
----------
Epoch 5/8, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.104939, val loss: 0.116134, accuracy: 96.66, time: 20.8698 min
----------
Epoch 6/8, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.090749, val loss: 0.113959, accuracy: 9

In [54]:
# Validation is finished: move every image from valid/<label>/ back into the
# Unlabeled pool so the labeling step below can pick it up.
for class_id in range(0,9):
    tmp_label = str(class_id)
    tmp_dir = '../Data/Data_17295/valid/{0}'.format(tmp_label)
    tmp_imgs = os.listdir(tmp_dir)
    # Use a separate name for the file so the class counter is not overwritten.
    for img_name in tmp_imgs:
        shutil.move('../Data/Data_17295/valid/{0}/{1}'.format(tmp_label,img_name), '../Data/Data_17295/Unlabeled/{0}'.format(img_name))

In [55]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
my_dataset = CustomDataSet(test_dir, transform=train_transformation, num=0)
test_loader = DataLoader(my_dataset , batch_size=1, shuffle=False)
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [56]:
pred_list =[]
size = len(test_loader.dataset)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [57]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

In [59]:
# Least-margin labeling: a sample whose top-1/top-2 probability gap is at most
# the threshold is treated as uncertain and takes the engineer's label from the
# CSV; otherwise the model's own prediction is kept.
MARGIN_THRESHOLD = 0.98
cnt = 0  # number of samples routed to the engineer
label_list = []
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
# Build the index -> failureNum table once, instead of filtering the whole
# frame (and calling the deprecated int() on a Series) for every uncertain sample.
engineer_label_by_index = (
    label.drop_duplicates(subset='index', keep='first')
    .set_index('index')['failureNum']
    .to_dict()
)

img_folder = natsort.natsorted(all_imgs)
if len(img_folder) != len(pred_list):
    raise ValueError('Got {0} images but {1} predictions'.format(len(img_folder), len(pred_list)))
for file_name, probs in zip(img_folder, pred_list):
    class_probs = probs[0]  # the loader uses batch_size=1, so row 0 is the sample
    top2 = torch.topk(class_probs, k=2).values  # one top-k instead of two full sorts
    if bool(top2[0] - top2[1] <= MARGIN_THRESHOLD):
        idx = int(file_name[:-4])  # drop the 4-character extension
        engineerlabel = int(engineer_label_by_index[idx])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(class_probs.argmax()))

In [60]:
real_label_list = list(df_155655.loc[17295*2:17295*3-1]['failureNum'].values)

In [61]:
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 2479253.83it/s]


148

In [62]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

In [63]:
print(engineer_label_count)
print(wrong_cnt_list)

[5839, 4423, 1305]
[44, 53, 148]


In [64]:
# Promote this phase's images to the labeled pool, filing each one under the
# label assigned above. Pairing names with labels removes the hard-coded count.
for file_name, assigned_label in zip(img_folder, label_list):
    target_dir = '../Data/Data_17295/Labeled/{0}'.format(assigned_label)
    os.makedirs(target_dir, exist_ok=True)  # shutil.move does not create missing folders
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(file_name), '{0}/{1}'.format(target_dir, file_name))

### Phase 4

In [65]:
train_dir = '../Data/Data_17295/Labeled/'
train_folder_dataset = dset.ImageFolder(root=train_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
train_folder_dataset.transform = train_transformation
train_dl = DataLoader(train_folder_dataset, batch_size=32, shuffle=True)

In [66]:
train_folder_dataset

Dataset ImageFolder
    Number of datapoints: 69180
    Root location: ../Data/Data_17295/Labeled/

In [67]:
# Stage the first 17295 unlabeled images (natural-sort order) as this phase's
# validation set, filing each one under valid/<failureNum>/ for ImageFolder.
test_dir = '../Data/Data_17295/Unlabeled/'
valid_root = '../Data/Data_17295/valid/'
all_imgs = natsort.natsorted(os.listdir(test_dir))
original_df = pd.read_csv('../csv/data_172950.csv')
# Build the index -> failureNum table once instead of filtering the whole frame
# twice per image; keep='first' matches the old `.values[0]` choice.
fail_num_by_index = (
    original_df.drop_duplicates(subset='index', keep='first')
    .set_index('index')['failureNum']
    .to_dict()
)
tmp = all_imgs[:17295]
for image_file in tmp:
    idx = int(os.path.splitext(image_file)[0])  # file names are '<index>.png'
    fail_num = str(fail_num_by_index[idx])
    class_dir = os.path.join(valid_root, fail_num)
    os.makedirs(class_dir, exist_ok=True)  # shutil.move does not create missing folders
    shutil.move(os.path.join(test_dir, image_file), os.path.join(class_dir, image_file))

In [68]:
valid_dir = '../Data/Data_17295/valid/'
valid_folder_dataset = dset.ImageFolder(root=valid_dir)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
valid_folder_dataset.transform = train_transformation
valid_dl = DataLoader(valid_folder_dataset, batch_size=32, shuffle=True)

In [69]:
valid_folder_dataset

Dataset ImageFolder
    Number of datapoints: 17295
    Root location: ../Data/Data_17295/valid/

In [70]:
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [71]:
model, loss_hist, metric_hist = train_val(model, params_train, 11)

Epoch 0/10, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.271770, val loss: 0.185292, accuracy: 94.85, time: 4.3832 min
----------
Epoch 1/10, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.165723, val loss: 0.063778, accuracy: 98.10, time: 8.9465 min
----------
Epoch 2/10, current lr=0.001
train loss: 0.136450, val loss: 0.096489, accuracy: 97.40, time: 13.7088 min
----------
Epoch 3/10, current lr=0.001
train loss: 0.117684, val loss: 0.090777, accuracy: 97.31, time: 18.6062 min
----------
Epoch 4/10, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.100418, val loss: 0.051193, accuracy: 98.51, time: 22.9458 min
----------
Epoch 5/10, current lr=0.001
train loss: 0.089916, val loss: 0.054008, accuracy: 98.51, time: 27.3228 min
----------
Epoch 6/10, current lr=0.001
train loss: 0.079049, val loss: 0.062908, accuracy: 98.22, time: 31.6674 min
----------
Epoch 7/10, current lr=0.001
train loss: 0.07032

In [72]:
# Validation is finished: move every image from valid/<label>/ back into the
# Unlabeled pool so the labeling step below can pick it up.
for class_id in range(0,9):
    tmp_label = str(class_id)
    tmp_dir = '../Data/Data_17295/valid/{0}'.format(tmp_label)
    tmp_imgs = os.listdir(tmp_dir)
    # Use a separate name for the file so the class counter is not overwritten.
    for img_name in tmp_imgs:
        shutil.move('../Data/Data_17295/valid/{0}/{1}'.format(tmp_label,img_name), '../Data/Data_17295/Unlabeled/{0}'.format(img_name))

In [73]:
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
train_transformation = transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                # transforms.Normalize([train_meanR, train_meanG, train_meanB],[train_stdR, train_stdG, train_stdB]),
                ])
my_dataset = CustomDataSet(test_dir, transform=train_transformation, num=0)
test_loader = DataLoader(my_dataset , batch_size=1, shuffle=False)
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)[17295*num:17295*(num+1)]

In [74]:
pred_list =[]
size = len(test_loader.dataset)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = torch.nn.functional.softmax(pred, dim=1)
        pred_list.append(sft)

In [75]:
max_list = []
for i in range(0,17295):
    max_list.append(float(pred_list[i][0].max()))

In [77]:
# Least-margin labeling: a sample whose top-1/top-2 probability gap is at most
# the threshold is treated as uncertain and takes the engineer's label from the
# CSV; otherwise the model's own prediction is kept.
MARGIN_THRESHOLD = 0.98
cnt = 0  # number of samples routed to the engineer
label_list = []
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
# Build the index -> failureNum table once, instead of filtering the whole
# frame (and calling the deprecated int() on a Series) for every uncertain sample.
engineer_label_by_index = (
    label.drop_duplicates(subset='index', keep='first')
    .set_index('index')['failureNum']
    .to_dict()
)

img_folder = natsort.natsorted(all_imgs)
if len(img_folder) != len(pred_list):
    raise ValueError('Got {0} images but {1} predictions'.format(len(img_folder), len(pred_list)))
for file_name, probs in zip(img_folder, pred_list):
    class_probs = probs[0]  # the loader uses batch_size=1, so row 0 is the sample
    top2 = torch.topk(class_probs, k=2).values  # one top-k instead of two full sorts
    if bool(top2[0] - top2[1] <= MARGIN_THRESHOLD):
        idx = int(file_name[:-4])  # drop the 4-character extension
        engineerlabel = int(engineer_label_by_index[idx])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(class_probs.argmax()))

In [78]:
real_label_list = list(df_155655.loc[17295*3:17295*4-1]['failureNum'].values)

In [79]:
wrong_cnt = 0
for i in tqdm(range(0,17295)):
    if label_list[i] != real_label_list[i]:
        wrong_cnt = wrong_cnt + 1
        
wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 2892363.94it/s]


35

In [80]:
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

In [81]:
print(engineer_label_count)
print(wrong_cnt_list)

[5839, 4423, 1305, 2325]
[44, 53, 148, 35]


In [82]:
# Promote this phase's images to the labeled pool, filing each one under the
# label assigned above. Pairing names with labels removes the hard-coded count.
for file_name, assigned_label in zip(img_folder, label_list):
    target_dir = '../Data/Data_17295/Labeled/{0}'.format(assigned_label)
    os.makedirs(target_dir, exist_ok=True)  # shutil.move does not create missing folders
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(file_name), '{0}/{1}'.format(target_dir, file_name))

### Phase 5

In [83]:
# Training set: every image labeled so far, one sub-folder per class.
train_dir = '../Data/Data_17295/Labeled/'
# Resize and tensor conversion only; the Normalize step is not applied in this notebook.
train_transformation = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])
train_folder_dataset = dset.ImageFolder(root=train_dir)
# The transform is attached after construction rather than passed to the constructor.
train_folder_dataset.transform = train_transformation
train_dl = DataLoader(train_folder_dataset, batch_size=32, shuffle=True)

In [84]:
# Show the training dataset summary (size and root folder) as the cell output.
train_folder_dataset

Dataset ImageFolder
    Number of datapoints: 86475
    Root location: ../Data/Data_17295/Labeled/

In [85]:
# Build a validation set: take the first 17295 natural-sorted files from the
# unlabeled pool and move each into valid/<failureNum>/, using the CSV label.
test_dir = '../Data/Data_17295/Unlabeled/'
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)
original_df = pd.read_csv('../csv/data_172950.csv')
tmp = all_imgs[:17295]
for file_name in tmp:
    idx = int(file_name[:-4])  # numeric id: file name minus its 4-character extension
    # Filter the frame once per image and reuse the match for both columns.
    matched = original_df[original_df['index'] == idx]
    fail_num = str(matched['failureNum'].values[0])
    image_file = str(matched['index'].values[0]) + '.png'
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(image_file), '../Data/Data_17295/valid/{0}/{1}'.format(fail_num,image_file))

In [86]:
# Validation set: the images moved into valid/, one sub-folder per class.
valid_dir = '../Data/Data_17295/valid/'
# Same preprocessing as training (this rebinds the shared train_transformation name).
train_transformation = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])
valid_folder_dataset = dset.ImageFolder(root=valid_dir)
# The transform is attached after construction rather than passed to the constructor.
valid_folder_dataset.transform = train_transformation
valid_dl = DataLoader(valid_folder_dataset, batch_size=32, shuffle=True)

In [87]:
# Show the validation dataset summary (size and root folder) as the cell output.
valid_folder_dataset

Dataset ImageFolder
    Number of datapoints: 17295
    Root location: ../Data/Data_17295/valid/

In [88]:
# Build the model and its training parameters for this phase from the current loaders.
# NOTE(review): ResNetParameters is defined earlier in the notebook; whether it starts from fresh weights is not visible here — confirm.
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [89]:
# Train and validate the model; with the argument 13 the log below counts epochs 0 through 12.
model, loss_hist, metric_hist = train_val(model, params_train, 13)

Epoch 0/12, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.233020, val loss: 0.118947, accuracy: 96.85, time: 5.0767 min
----------
Epoch 1/12, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.143368, val loss: 0.096510, accuracy: 97.06, time: 10.1035 min
----------
Epoch 2/12, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.120339, val loss: 0.086390, accuracy: 97.62, time: 15.3713 min
----------
Epoch 3/12, current lr=0.001
train loss: 0.102939, val loss: 0.119251, accuracy: 97.09, time: 20.5951 min
----------
Epoch 4/12, current lr=0.001
train loss: 0.090331, val loss: 0.104038, accuracy: 97.39, time: 25.8755 min
----------
Epoch 5/12, current lr=0.001
train loss: 0.077687, val loss: 0.097925, accuracy: 97.54, time: 31.3380 min
----------
Epoch 6/12, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.070241, val loss: 0.080663, accuracy: 97.77, time: 36.4658 min
----------
Ep

In [90]:
# Return every validation image to the unlabeled pool (class folders 0 through 8).
for class_idx in range(0, 9):
    tmp_label = str(class_idx)
    tmp_dir = '../Data/Data_17295/valid/{0}'.format(tmp_label)
    tmp_imgs = os.listdir(tmp_dir)
    # Distinct names for the outer and inner loop variables so neither shadows the other.
    for img_name in tmp_imgs:
        shutil.move('../Data/Data_17295/valid/{0}/{1}'.format(tmp_label, img_name), '../Data/Data_17295/Unlabeled/{0}'.format(img_name))

In [91]:
# Set up inference over the unlabeled pool for this phase.
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
# Resize and tensor conversion only; the Normalize step is not applied in this notebook.
train_transformation = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])
my_dataset = CustomDataSet(test_dir, transform=train_transformation, num=0)
# One image per batch, in a fixed order.
test_loader = DataLoader(my_dataset, batch_size=1, shuffle=False)
# Natural-sorted file names of slice `num` (17295 files per slice).
all_imgs = natsort.natsorted(os.listdir(test_dir))[17295*num:17295*(num+1)]

In [92]:
# Run the current model over the unlabeled loader and keep one softmax output per batch.
model.eval()  # inference mode
size = len(test_loader.dataset)
test_loss, correct = 0, 0
pred_list = []
with torch.no_grad():  # no gradients are needed for prediction
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = pred.softmax(dim=1)  # class probabilities along the class axis
        pred_list.append(sft)

In [93]:
# Highest softmax probability of each of the 17295 predictions, as plain floats.
max_list = [float(pred_list[i][0].max()) for i in range(0, 17295)]

In [95]:
# Least-margin labeling for the 17295 images of this phase.
# When the gap between the two largest softmax probabilities is at most 0.98,
# the label is read from the CSV ("engineer" label); otherwise the model's
# argmax is used as the label.
cnt = 0  # number of images that needed an engineer label
label_list = []  # final label per image, in the same order as img_folder
label = pd.read_csv('../csv/data_172950.csv', index_col=0)

img_folder = natsort.natsorted(all_imgs)
for i in range(0, 17295):
    probs = pred_list[i][0]  # first row of the stored softmax output
    # One top-k call yields the two largest probabilities (descending),
    # instead of fully sorting the same tensor twice.
    top2 = torch.topk(probs, k=2).values
    margin = top2[0] - top2[1]
    if bool(margin <= 0.98):
        idx = int(img_folder[i][:-4])  # file name without its 4-character extension
        # Extract the scalar explicitly: int() on a Series is deprecated in recent pandas.
        engineerlabel = int(label[label['index'] == idx]['failureNum'].iloc[0])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(probs.argmax()))

In [96]:
# Ground-truth labels for this phase: rows 17295*4 .. 17295*5-1 of df_155655.
# `.loc` slices by index label and includes the end label, hence the `-1`.
# NOTE(review): presumably df_155655 has a 0..N-1 index in the same order as img_folder — confirm.
real_label_list = list(df_155655.loc[17295*4:17295*5-1]['failureNum'].values)

In [97]:
# Count how many final labels differ from the ground truth for this phase.
wrong_cnt = 0
for i in tqdm(range(0, 17295)):
    # A mismatch contributes 1, a match contributes 0.
    wrong_cnt += int(label_list[i] != real_label_list[i])

wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 2478999.65it/s]


128

In [98]:
# Record this phase's results: engineer-provided label count and wrong final-label count.
# NOTE: re-running this cell appends the same values again.
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

In [99]:
# Per-phase history so far: engineer label counts, then wrong-label counts.
print(engineer_label_count)
print(wrong_cnt_list)

[5839, 4423, 1305, 2325, 839]
[44, 53, 148, 35, 128]


In [100]:
# File every image of this phase under Labeled/<assigned label>/.
for i in range(0, 17295):
    img_name = img_folder[i]
    src_path = '../Data/Data_17295/Unlabeled/{0}'.format(img_name)
    dst_path = '../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i], img_name)
    shutil.move(src_path, dst_path)

### Phase 6

In [101]:
# Training set: every image labeled so far, one sub-folder per class.
train_dir = '../Data/Data_17295/Labeled/'
# Resize and tensor conversion only; the Normalize step is not applied in this notebook.
train_transformation = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])
train_folder_dataset = dset.ImageFolder(root=train_dir)
# The transform is attached after construction rather than passed to the constructor.
train_folder_dataset.transform = train_transformation
train_dl = DataLoader(train_folder_dataset, batch_size=32, shuffle=True)

In [102]:
# Show the training dataset summary (size and root folder) as the cell output.
train_folder_dataset

Dataset ImageFolder
    Number of datapoints: 103770
    Root location: ../Data/Data_17295/Labeled/

In [103]:
# Build a validation set: take the first 17295 natural-sorted files from the
# unlabeled pool and move each into valid/<failureNum>/, using the CSV label.
test_dir = '../Data/Data_17295/Unlabeled/'
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)
original_df = pd.read_csv('../csv/data_172950.csv')
tmp = all_imgs[:17295]
for file_name in tmp:
    idx = int(file_name[:-4])  # numeric id: file name minus its 4-character extension
    # Filter the frame once per image and reuse the match for both columns.
    matched = original_df[original_df['index'] == idx]
    fail_num = str(matched['failureNum'].values[0])
    image_file = str(matched['index'].values[0]) + '.png'
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(image_file), '../Data/Data_17295/valid/{0}/{1}'.format(fail_num,image_file))

In [104]:
# Validation set: the images moved into valid/, one sub-folder per class.
valid_dir = '../Data/Data_17295/valid/'
# Same preprocessing as training (this rebinds the shared train_transformation name).
train_transformation = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])
valid_folder_dataset = dset.ImageFolder(root=valid_dir)
# The transform is attached after construction rather than passed to the constructor.
valid_folder_dataset.transform = train_transformation
valid_dl = DataLoader(valid_folder_dataset, batch_size=32, shuffle=True)

In [105]:
# Show the validation dataset summary (size and root folder) as the cell output.
valid_folder_dataset

Dataset ImageFolder
    Number of datapoints: 17295
    Root location: ../Data/Data_17295/valid/

In [106]:
# Build the model and its training parameters for this phase from the current loaders.
# NOTE(review): ResNetParameters is defined earlier in the notebook; whether it starts from fresh weights is not visible here — confirm.
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [107]:
# Train and validate the model; with the argument 15 the log below counts epochs out of 14 (0-based).
model, loss_hist, metric_hist = train_val(model, params_train, 15)

Epoch 0/14, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.211947, val loss: 0.105341, accuracy: 96.84, time: 5.9088 min
----------
Epoch 1/14, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.132811, val loss: 0.094891, accuracy: 97.17, time: 15.5801 min
----------
Epoch 2/14, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.108700, val loss: 0.085866, accuracy: 97.54, time: 23.5696 min
----------
Epoch 3/14, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.091785, val loss: 0.080731, accuracy: 97.86, time: 29.9129 min
----------
Epoch 4/14, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.077703, val loss: 0.070102, accuracy: 97.83, time: 35.9348 min
----------
Epoch 5/14, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.067913, val loss: 0.063139, accuracy: 98.07, time: 41.7385 min
----------
Epoch 6/14, current lr=0.001


In [108]:
# Return every validation image to the unlabeled pool (class folders 0 through 8).
for class_idx in range(0, 9):
    tmp_label = str(class_idx)
    tmp_dir = '../Data/Data_17295/valid/{0}'.format(tmp_label)
    tmp_imgs = os.listdir(tmp_dir)
    # Distinct names for the outer and inner loop variables so neither shadows the other.
    for img_name in tmp_imgs:
        shutil.move('../Data/Data_17295/valid/{0}/{1}'.format(tmp_label, img_name), '../Data/Data_17295/Unlabeled/{0}'.format(img_name))

In [109]:
# Set up inference over the unlabeled pool for this phase.
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
# Resize and tensor conversion only; the Normalize step is not applied in this notebook.
train_transformation = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])
my_dataset = CustomDataSet(test_dir, transform=train_transformation, num=0)
# One image per batch, in a fixed order.
test_loader = DataLoader(my_dataset, batch_size=1, shuffle=False)
# Natural-sorted file names of slice `num` (17295 files per slice).
all_imgs = natsort.natsorted(os.listdir(test_dir))[17295*num:17295*(num+1)]

In [110]:
# Run the current model over the unlabeled loader and keep one softmax output per batch.
model.eval()  # inference mode
size = len(test_loader.dataset)
test_loss, correct = 0, 0
pred_list = []
with torch.no_grad():  # no gradients are needed for prediction
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = pred.softmax(dim=1)  # class probabilities along the class axis
        pred_list.append(sft)

In [111]:
# Highest softmax probability of each of the 17295 predictions, as plain floats.
max_list = [float(pred_list[i][0].max()) for i in range(0, 17295)]

In [113]:
# Least-margin labeling for the 17295 images of this phase.
# When the gap between the two largest softmax probabilities is at most 0.98,
# the label is read from the CSV ("engineer" label); otherwise the model's
# argmax is used as the label.
cnt = 0  # number of images that needed an engineer label
label_list = []  # final label per image, in the same order as img_folder
label = pd.read_csv('../csv/data_172950.csv', index_col=0)

img_folder = natsort.natsorted(all_imgs)
for i in range(0, 17295):
    probs = pred_list[i][0]  # first row of the stored softmax output
    # One top-k call yields the two largest probabilities (descending),
    # instead of fully sorting the same tensor twice.
    top2 = torch.topk(probs, k=2).values
    margin = top2[0] - top2[1]
    if bool(margin <= 0.98):
        idx = int(img_folder[i][:-4])  # file name without its 4-character extension
        # Extract the scalar explicitly: int() on a Series is deprecated in recent pandas.
        engineerlabel = int(label[label['index'] == idx]['failureNum'].iloc[0])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(probs.argmax()))

In [114]:
# Ground-truth labels for this phase: rows 17295*5 .. 17295*6-1 of df_155655.
# `.loc` slices by index label and includes the end label, hence the `-1`.
# NOTE(review): presumably df_155655 has a 0..N-1 index in the same order as img_folder — confirm.
real_label_list = list(df_155655.loc[17295*5:17295*6-1]['failureNum'].values)

In [115]:
# Count how many final labels differ from the ground truth for this phase.
wrong_cnt = 0
for i in tqdm(range(0, 17295)):
    # A mismatch contributes 1, a match contributes 0.
    wrong_cnt += int(label_list[i] != real_label_list[i])

wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 2479084.37it/s]


98

In [116]:
# Record this phase's results: engineer-provided label count and wrong final-label count.
# NOTE: re-running this cell appends the same values again.
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

In [117]:
# Per-phase history so far: engineer label counts, then wrong-label counts.
print(engineer_label_count)
print(wrong_cnt_list)

[5839, 4423, 1305, 2325, 839, 868]
[44, 53, 148, 35, 128, 98]


In [118]:
# File every image of this phase under Labeled/<assigned label>/.
for i in range(0, 17295):
    img_name = img_folder[i]
    src_path = '../Data/Data_17295/Unlabeled/{0}'.format(img_name)
    dst_path = '../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i], img_name)
    shutil.move(src_path, dst_path)

### Phase 7

In [119]:
# Training set: every image labeled so far, one sub-folder per class.
train_dir = '../Data/Data_17295/Labeled/'
# Resize and tensor conversion only; the Normalize step is not applied in this notebook.
train_transformation = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])
train_folder_dataset = dset.ImageFolder(root=train_dir)
# The transform is attached after construction rather than passed to the constructor.
train_folder_dataset.transform = train_transformation
train_dl = DataLoader(train_folder_dataset, batch_size=32, shuffle=True)

In [120]:
# Show the training dataset summary (size and root folder) as the cell output.
train_folder_dataset

Dataset ImageFolder
    Number of datapoints: 121065
    Root location: ../Data/Data_17295/Labeled/

In [121]:
# Build a validation set: take the first 17295 natural-sorted files from the
# unlabeled pool and move each into valid/<failureNum>/, using the CSV label.
test_dir = '../Data/Data_17295/Unlabeled/'
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)
original_df = pd.read_csv('../csv/data_172950.csv')
tmp = all_imgs[:17295]
for file_name in tmp:
    idx = int(file_name[:-4])  # numeric id: file name minus its 4-character extension
    # Filter the frame once per image and reuse the match for both columns.
    matched = original_df[original_df['index'] == idx]
    fail_num = str(matched['failureNum'].values[0])
    image_file = str(matched['index'].values[0]) + '.png'
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(image_file), '../Data/Data_17295/valid/{0}/{1}'.format(fail_num,image_file))

In [122]:
# Validation set: the images moved into valid/, one sub-folder per class.
valid_dir = '../Data/Data_17295/valid/'
# Same preprocessing as training (this rebinds the shared train_transformation name).
train_transformation = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])
valid_folder_dataset = dset.ImageFolder(root=valid_dir)
# The transform is attached after construction rather than passed to the constructor.
valid_folder_dataset.transform = train_transformation
valid_dl = DataLoader(valid_folder_dataset, batch_size=32, shuffle=True)

In [123]:
# Show the validation dataset summary (size and root folder) as the cell output.
valid_folder_dataset

Dataset ImageFolder
    Number of datapoints: 17295
    Root location: ../Data/Data_17295/valid/

In [124]:
# Build the model and its training parameters for this phase from the current loaders.
# NOTE(review): ResNetParameters is defined earlier in the notebook; whether it starts from fresh weights is not visible here — confirm.
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [125]:
# Train and validate the model; with the argument 17 the log below counts epochs out of 16 (0-based).
model, loss_hist, metric_hist = train_val(model, params_train, 17)

Epoch 0/16, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.198689, val loss: 0.181273, accuracy: 94.57, time: 7.3541 min
----------
Epoch 1/16, current lr=0.001
train loss: 0.124463, val loss: 0.190708, accuracy: 94.54, time: 14.0413 min
----------
Epoch 2/16, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.101433, val loss: 0.162898, accuracy: 94.91, time: 21.1244 min
----------
Epoch 3/16, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.086382, val loss: 0.126553, accuracy: 95.69, time: 28.1433 min
----------
Epoch 4/16, current lr=0.001
train loss: 0.073492, val loss: 0.140117, accuracy: 95.57, time: 34.9194 min
----------
Epoch 5/16, current lr=0.001
train loss: 0.063774, val loss: 0.144440, accuracy: 95.22, time: 42.2558 min
----------
Epoch 6/16, current lr=0.001
train loss: 0.056675, val loss: 0.154556, accuracy: 95.77, time: 49.1299 min
----------
Epoch 7/16, current lr=0.001
train loss: 0.0502

In [126]:
# Return every validation image to the unlabeled pool (class folders 0 through 8).
for class_idx in range(0, 9):
    tmp_label = str(class_idx)
    tmp_dir = '../Data/Data_17295/valid/{0}'.format(tmp_label)
    tmp_imgs = os.listdir(tmp_dir)
    # Distinct names for the outer and inner loop variables so neither shadows the other.
    for img_name in tmp_imgs:
        shutil.move('../Data/Data_17295/valid/{0}/{1}'.format(tmp_label, img_name), '../Data/Data_17295/Unlabeled/{0}'.format(img_name))

In [127]:
# Set up inference over the unlabeled pool for this phase.
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
# Resize and tensor conversion only; the Normalize step is not applied in this notebook.
train_transformation = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])
my_dataset = CustomDataSet(test_dir, transform=train_transformation, num=0)
# One image per batch, in a fixed order.
test_loader = DataLoader(my_dataset, batch_size=1, shuffle=False)
# Natural-sorted file names of slice `num` (17295 files per slice).
all_imgs = natsort.natsorted(os.listdir(test_dir))[17295*num:17295*(num+1)]

In [128]:
# Run the current model over the unlabeled loader and keep one softmax output per batch.
model.eval()  # inference mode
size = len(test_loader.dataset)
test_loss, correct = 0, 0
pred_list = []
with torch.no_grad():  # no gradients are needed for prediction
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = pred.softmax(dim=1)  # class probabilities along the class axis
        pred_list.append(sft)

In [129]:
# Highest softmax probability of each of the 17295 predictions, as plain floats.
max_list = [float(pred_list[i][0].max()) for i in range(0, 17295)]

In [131]:
# Least-margin labeling for the 17295 images of this phase.
# When the gap between the two largest softmax probabilities is at most 0.98,
# the label is read from the CSV ("engineer" label); otherwise the model's
# argmax is used as the label.
cnt = 0  # number of images that needed an engineer label
label_list = []  # final label per image, in the same order as img_folder
label = pd.read_csv('../csv/data_172950.csv', index_col=0)

img_folder = natsort.natsorted(all_imgs)
for i in range(0, 17295):
    probs = pred_list[i][0]  # first row of the stored softmax output
    # One top-k call yields the two largest probabilities (descending),
    # instead of fully sorting the same tensor twice.
    top2 = torch.topk(probs, k=2).values
    margin = top2[0] - top2[1]
    if bool(margin <= 0.98):
        idx = int(img_folder[i][:-4])  # file name without its 4-character extension
        # Extract the scalar explicitly: int() on a Series is deprecated in recent pandas.
        engineerlabel = int(label[label['index'] == idx]['failureNum'].iloc[0])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(probs.argmax()))

In [132]:
# Ground-truth labels for this phase: rows 17295*6 .. 17295*7-1 of df_155655.
# `.loc` slices by index label and includes the end label, hence the `-1`.
# NOTE(review): presumably df_155655 has a 0..N-1 index in the same order as img_folder — confirm.
real_label_list = list(df_155655.loc[17295*6:17295*7-1]['failureNum'].values)

In [133]:
# Count how many final labels differ from the ground truth for this phase.
wrong_cnt = 0
for i in tqdm(range(0, 17295)):
    # A mismatch contributes 1, a match contributes 0.
    wrong_cnt += int(label_list[i] != real_label_list[i])

wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 2892248.62it/s]


240

In [134]:
# Record this phase's results: engineer-provided label count and wrong final-label count.
# NOTE: re-running this cell appends the same values again.
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

In [135]:
# Per-phase history so far: engineer label counts, then wrong-label counts.
print(engineer_label_count)
print(wrong_cnt_list)

[5839, 4423, 1305, 2325, 839, 868, 1220]
[44, 53, 148, 35, 128, 98, 240]


In [136]:
# File every image of this phase under Labeled/<assigned label>/.
for i in range(0, 17295):
    img_name = img_folder[i]
    src_path = '../Data/Data_17295/Unlabeled/{0}'.format(img_name)
    dst_path = '../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i], img_name)
    shutil.move(src_path, dst_path)

### Phase 8

In [137]:
# Training set: every image labeled so far, one sub-folder per class.
train_dir = '../Data/Data_17295/Labeled/'
# Resize and tensor conversion only; the Normalize step is not applied in this notebook.
train_transformation = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])
train_folder_dataset = dset.ImageFolder(root=train_dir)
# The transform is attached after construction rather than passed to the constructor.
train_folder_dataset.transform = train_transformation
train_dl = DataLoader(train_folder_dataset, batch_size=32, shuffle=True)

In [138]:
# Show the training dataset summary (size and root folder) as the cell output.
train_folder_dataset

Dataset ImageFolder
    Number of datapoints: 138360
    Root location: ../Data/Data_17295/Labeled/

In [139]:
# Build a validation set: take the first 17295 natural-sorted files from the
# unlabeled pool and move each into valid/<failureNum>/, using the CSV label.
test_dir = '../Data/Data_17295/Unlabeled/'
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)
original_df = pd.read_csv('../csv/data_172950.csv')
tmp = all_imgs[:17295]
for file_name in tmp:
    idx = int(file_name[:-4])  # numeric id: file name minus its 4-character extension
    # Filter the frame once per image and reuse the match for both columns.
    matched = original_df[original_df['index'] == idx]
    fail_num = str(matched['failureNum'].values[0])
    image_file = str(matched['index'].values[0]) + '.png'
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(image_file), '../Data/Data_17295/valid/{0}/{1}'.format(fail_num,image_file))

In [140]:
# Validation set: the images moved into valid/, one sub-folder per class.
valid_dir = '../Data/Data_17295/valid/'
# Same preprocessing as training (this rebinds the shared train_transformation name).
train_transformation = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])
valid_folder_dataset = dset.ImageFolder(root=valid_dir)
# The transform is attached after construction rather than passed to the constructor.
valid_folder_dataset.transform = train_transformation
valid_dl = DataLoader(valid_folder_dataset, batch_size=32, shuffle=True)

In [141]:
# Show the validation dataset summary (size and root folder) as the cell output.
valid_folder_dataset

Dataset ImageFolder
    Number of datapoints: 17295
    Root location: ../Data/Data_17295/valid/

In [142]:
# Build the model and its training parameters for this phase from the current loaders.
# NOTE(review): ResNetParameters is defined earlier in the notebook; whether it starts from fresh weights is not visible here — confirm.
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [143]:
# Train and validate the model; with the argument 19 the log below counts epochs out of 18 (0-based).
model, loss_hist, metric_hist = train_val(model, params_train, 19)

Epoch 0/18, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.189066, val loss: 0.108915, accuracy: 97.17, time: 8.3022 min
----------
Epoch 1/18, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.119554, val loss: 0.079677, accuracy: 97.66, time: 16.1214 min
----------
Epoch 2/18, current lr=0.001
train loss: 0.095237, val loss: 0.080740, accuracy: 97.91, time: 23.5240 min
----------
Epoch 3/18, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.079484, val loss: 0.061188, accuracy: 98.05, time: 31.5035 min
----------
Epoch 4/18, current lr=0.001
train loss: 0.069462, val loss: 0.074445, accuracy: 97.79, time: 39.7060 min
----------
Epoch 5/18, current lr=0.001
train loss: 0.061647, val loss: 0.069349, accuracy: 97.80, time: 47.4637 min
----------
Epoch 6/18, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.053223, val loss: 0.056571, accuracy: 98.31, time: 54.9693 min
----------
Ep

In [144]:
# Return every validation image to the unlabeled pool (class folders 0 through 8).
for class_idx in range(0, 9):
    tmp_label = str(class_idx)
    tmp_dir = '../Data/Data_17295/valid/{0}'.format(tmp_label)
    tmp_imgs = os.listdir(tmp_dir)
    # Distinct names for the outer and inner loop variables so neither shadows the other.
    for img_name in tmp_imgs:
        shutil.move('../Data/Data_17295/valid/{0}/{1}'.format(tmp_label, img_name), '../Data/Data_17295/Unlabeled/{0}'.format(img_name))

In [145]:
# Set up inference over the unlabeled pool for this phase.
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
# Resize and tensor conversion only; the Normalize step is not applied in this notebook.
train_transformation = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])
my_dataset = CustomDataSet(test_dir, transform=train_transformation, num=0)
# One image per batch, in a fixed order.
test_loader = DataLoader(my_dataset, batch_size=1, shuffle=False)
# Natural-sorted file names of slice `num` (17295 files per slice).
all_imgs = natsort.natsorted(os.listdir(test_dir))[17295*num:17295*(num+1)]

In [146]:
# Run the current model over the unlabeled loader and keep one softmax output per batch.
model.eval()  # inference mode
size = len(test_loader.dataset)
test_loss, correct = 0, 0
pred_list = []
with torch.no_grad():  # no gradients are needed for prediction
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = pred.softmax(dim=1)  # class probabilities along the class axis
        pred_list.append(sft)

In [147]:
# Highest softmax probability of each of the 17295 predictions, as plain floats.
max_list = [float(pred_list[i][0].max()) for i in range(0, 17295)]

In [149]:
# Least-margin labeling for the 17295 images of this phase.
# When the gap between the two largest softmax probabilities is at most 0.98,
# the label is read from the CSV ("engineer" label); otherwise the model's
# argmax is used as the label.
cnt = 0  # number of images that needed an engineer label
label_list = []  # final label per image, in the same order as img_folder
label = pd.read_csv('../csv/data_172950.csv', index_col=0)

img_folder = natsort.natsorted(all_imgs)
for i in range(0, 17295):
    probs = pred_list[i][0]  # first row of the stored softmax output
    # One top-k call yields the two largest probabilities (descending),
    # instead of fully sorting the same tensor twice.
    top2 = torch.topk(probs, k=2).values
    margin = top2[0] - top2[1]
    if bool(margin <= 0.98):
        idx = int(img_folder[i][:-4])  # file name without its 4-character extension
        # Extract the scalar explicitly: int() on a Series is deprecated in recent pandas.
        engineerlabel = int(label[label['index'] == idx]['failureNum'].iloc[0])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(probs.argmax()))

In [150]:
# Ground-truth labels for this phase: rows 17295*7 .. 17295*8-1 of df_155655.
# `.loc` slices by index label and includes the end label, hence the `-1`.
# NOTE(review): presumably df_155655 has a 0..N-1 index in the same order as img_folder — confirm.
real_label_list = list(df_155655.loc[17295*7:17295*8-1]['failureNum'].values)

In [151]:
# Count how many final labels differ from the ground truth for this phase.
wrong_cnt = 0
for i in tqdm(range(0, 17295)):
    # A mismatch contributes 1, a match contributes 0.
    wrong_cnt += int(label_list[i] != real_label_list[i])

wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 2479338.56it/s]


100

In [152]:
# Record this phase's results: engineer-provided label count and wrong final-label count.
# NOTE: re-running this cell appends the same values again.
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

In [153]:
# Per-phase history so far: engineer label counts, then wrong-label counts.
print(engineer_label_count)
print(wrong_cnt_list)

[5839, 4423, 1305, 2325, 839, 868, 1220, 569]
[44, 53, 148, 35, 128, 98, 240, 100]


In [154]:
# File every image of this phase under Labeled/<assigned label>/.
for i in range(0, 17295):
    img_name = img_folder[i]
    src_path = '../Data/Data_17295/Unlabeled/{0}'.format(img_name)
    dst_path = '../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i], img_name)
    shutil.move(src_path, dst_path)

### Phase 9

In [155]:
# Training set: every image labeled so far, one sub-folder per class.
train_dir = '../Data/Data_17295/Labeled/'
# Resize and tensor conversion only; the Normalize step is not applied in this notebook.
train_transformation = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])
train_folder_dataset = dset.ImageFolder(root=train_dir)
# The transform is attached after construction rather than passed to the constructor.
train_folder_dataset.transform = train_transformation
train_dl = DataLoader(train_folder_dataset, batch_size=32, shuffle=True)

In [156]:
# Show the training dataset summary (size and root folder) as the cell output.
train_folder_dataset

Dataset ImageFolder
    Number of datapoints: 155655
    Root location: ../Data/Data_17295/Labeled/

In [157]:
# Build a validation set: take the first 17295 natural-sorted files from the
# unlabeled pool and move each into valid/<failureNum>/, using the CSV label.
test_dir = '../Data/Data_17295/Unlabeled/'
all_imgs = os.listdir(test_dir)
all_imgs = natsort.natsorted(all_imgs)
original_df = pd.read_csv('../csv/data_172950.csv')
tmp = all_imgs[:17295]
for file_name in tmp:
    idx = int(file_name[:-4])  # numeric id: file name minus its 4-character extension
    # Filter the frame once per image and reuse the match for both columns.
    matched = original_df[original_df['index'] == idx]
    fail_num = str(matched['failureNum'].values[0])
    image_file = str(matched['index'].values[0]) + '.png'
    shutil.move('../Data/Data_17295/Unlabeled/{0}'.format(image_file), '../Data/Data_17295/valid/{0}/{1}'.format(fail_num,image_file))

In [158]:
# Validation set: the images moved into valid/, one sub-folder per class.
valid_dir = '../Data/Data_17295/valid/'
# Same preprocessing as training (this rebinds the shared train_transformation name).
train_transformation = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])
valid_folder_dataset = dset.ImageFolder(root=valid_dir)
# The transform is attached after construction rather than passed to the constructor.
valid_folder_dataset.transform = train_transformation
valid_dl = DataLoader(valid_folder_dataset, batch_size=32, shuffle=True)

In [159]:
# Show the validation dataset summary (size and root folder) as the cell output.
valid_folder_dataset

Dataset ImageFolder
    Number of datapoints: 17295
    Root location: ../Data/Data_17295/valid/

In [160]:
# Build the model and its training parameters for this phase from the current loaders.
# NOTE(review): ResNetParameters is defined earlier in the notebook; whether it starts from fresh weights is not visible here — confirm.
model, params_train = ResNetParameters('resnet50', train_dl, valid_dl)

In [161]:
# Train and validate the model; with the argument 21 the log below counts epochs out of 20 (0-based).
model, loss_hist, metric_hist = train_val(model, params_train, 21)

Epoch 0/20, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.180848, val loss: 0.124646, accuracy: 96.10, time: 9.2071 min
----------
Epoch 1/20, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.112200, val loss: 0.103022, accuracy: 96.73, time: 17.7909 min
----------
Epoch 2/20, current lr=0.001
train loss: 0.090029, val loss: 0.119757, accuracy: 96.55, time: 26.7230 min
----------
Epoch 3/20, current lr=0.001
train loss: 0.075896, val loss: 0.111839, accuracy: 96.28, time: 35.7249 min
----------
Epoch 4/20, current lr=0.001
train loss: 0.066329, val loss: 0.121576, accuracy: 96.36, time: 44.2725 min
----------
Epoch 5/20, current lr=0.001
train loss: 0.057530, val loss: 0.105744, accuracy: 96.80, time: 53.3541 min
----------
Epoch 6/20, current lr=0.001
Copied best model weights!
Get best val_loss
train loss: 0.051797, val loss: 0.091022, accuracy: 97.10, time: 61.7009 min
----------
Epoch 7/20, current lr=0.001
train loss: 0.0457

In [162]:
# Return every validation image to the unlabeled pool (class folders 0 through 8).
for class_idx in range(0, 9):
    tmp_label = str(class_idx)
    tmp_dir = '../Data/Data_17295/valid/{0}'.format(tmp_label)
    tmp_imgs = os.listdir(tmp_dir)
    # Distinct names for the outer and inner loop variables so neither shadows the other.
    for img_name in tmp_imgs:
        shutil.move('../Data/Data_17295/valid/{0}/{1}'.format(tmp_label, img_name), '../Data/Data_17295/Unlabeled/{0}'.format(img_name))

In [163]:
# Set up inference over the unlabeled pool for this phase.
num = 0
test_dir = '../Data/Data_17295/Unlabeled/'
label = pd.read_csv('../csv/data_172950.csv', index_col=0)
# Resize and tensor conversion only; the Normalize step is not applied in this notebook.
train_transformation = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])
my_dataset = CustomDataSet(test_dir, transform=train_transformation, num=0)
# One image per batch, in a fixed order.
test_loader = DataLoader(my_dataset, batch_size=1, shuffle=False)
# Natural-sorted file names of slice `num` (17295 files per slice).
all_imgs = natsort.natsorted(os.listdir(test_dir))[17295*num:17295*(num+1)]

In [164]:
# Run the current model over the unlabeled loader and keep one softmax output per batch.
model.eval()  # inference mode
size = len(test_loader.dataset)
test_loss, correct = 0, 0
pred_list = []
with torch.no_grad():  # no gradients are needed for prediction
    for X in test_loader:
        X = X.to(device)
        pred = model(X)
        sft = pred.softmax(dim=1)  # class probabilities along the class axis
        pred_list.append(sft)

In [165]:
# Highest softmax probability of each of the 17295 predictions, as plain floats.
max_list = [float(pred_list[i][0].max()) for i in range(0, 17295)]

In [167]:
# Least-margin labeling for the 17295 images of this phase.
# When the gap between the two largest softmax probabilities is at most 0.98,
# the label is read from the CSV ("engineer" label); otherwise the model's
# argmax is used as the label.
cnt = 0  # number of images that needed an engineer label
label_list = []  # final label per image, in the same order as img_folder
label = pd.read_csv('../csv/data_172950.csv', index_col=0)

img_folder = natsort.natsorted(all_imgs)
for i in range(0, 17295):
    probs = pred_list[i][0]  # first row of the stored softmax output
    # One top-k call yields the two largest probabilities (descending),
    # instead of fully sorting the same tensor twice.
    top2 = torch.topk(probs, k=2).values
    margin = top2[0] - top2[1]
    if bool(margin <= 0.98):
        idx = int(img_folder[i][:-4])  # file name without its 4-character extension
        # Extract the scalar explicitly: int() on a Series is deprecated in recent pandas.
        engineerlabel = int(label[label['index'] == idx]['failureNum'].iloc[0])
        label_list.append(engineerlabel)
        cnt = cnt + 1
    else:
        label_list.append(int(probs.argmax()))

In [168]:
# Ground-truth labels for this phase: rows 17295*8 .. 17295*9-1 of df_155655.
# `.loc` slices by index label and includes the end label, hence the `-1`.
# NOTE(review): presumably df_155655 has a 0..N-1 index in the same order as img_folder — confirm.
real_label_list = list(df_155655.loc[17295*8:17295*9-1]['failureNum'].values)

In [169]:
# Count how many final labels differ from the ground truth for this phase.
wrong_cnt = 0
for i in tqdm(range(0, 17295)):
    # A mismatch contributes 1, a match contributes 0.
    wrong_cnt += int(label_list[i] != real_label_list[i])

wrong_cnt

100%|███████████████████████████████████████████████████████████████████████| 17295/17295 [00:00<00:00, 2478914.93it/s]


182

In [170]:
# Record this phase's results: engineer-provided label count and wrong final-label count.
# NOTE: re-running this cell appends the same values again.
engineer_label_count.append(cnt)
wrong_cnt_list.append(wrong_cnt)

# Result

In [171]:
# Final per-phase history: engineer label counts, then wrong-label counts.
print(engineer_label_count)
print(wrong_cnt_list)

[5839, 4423, 1305, 2325, 839, 868, 1220, 569, 799]
[44, 53, 148, 35, 128, 98, 240, 100, 182]


In [172]:
# File every image of this phase under Labeled/<assigned label>/.
for i in range(0, 17295):
    img_name = img_folder[i]
    src_path = '../Data/Data_17295/Unlabeled/{0}'.format(img_name)
    dst_path = '../Data/Data_17295/Labeled/{0}/{1}'.format(label_list[i], img_name)
    shutil.move(src_path, dst_path)