#### ViT Model for Classification of All Defects (including nones)
ViT Model from Google [paper](https://arxiv.org/abs/2010.11929)
using Keras implementation from faustomorales [GitHub](https://github.com/faustomorales/vit-keras)

Data preprocessing:
- Resized to 224x224 with no filters
- None is randomly undersampled to 30,000.

ViT model:
- Size = B16
- Patch size = 16
- Using included top

In [None]:
### RUN THE FOLLOWING ON TERMINAL FIRST ###
# pip install --upgrade pip
# pip install --upgrade tensorflow
# pip install tensorflow_addons
# pip install vit-keras
# pip install pickle5

In [1]:
# import libraries
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import datasets, layers, models, losses, optimizers, regularizers, callbacks
from vit_keras import vit, utils

import os
import time
import numpy as np
import pandas as pd
import cv2
from skimage.morphology import skeletonize, thin

import helpers as helper
from keras_model_s3_wrapper import *

import boto3
import pickle5 as pickle
s3 = boto3.resource('s3')
bucket_name = 'wafer-capstone'
my_bucket = s3.Bucket(bucket_name)

In [2]:
tf.__version__

'2.4.0'

In [3]:
tf.config.list_physical_devices(device_type=None)

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU')]

In [4]:
# specify variables for model
path = 'processed_data/WM-clean224'
result_path = 'results_vit'

filename = 'WM-clean224'
map_column = 'waferMap224'
binarized = False

model_id = 'vit_b16_includedtop'
data_id = '224'
note = '' # -optional

In [5]:
# load dev and test sets
# directly from S3 (using boto3 resource)
start = time.time()

dev_key = f'{path}/{filename}-dev.pkl'
test_key = f'{path}/{filename}-test.pkl'

dev = pickle.loads(my_bucket.Object(dev_key).get()['Body'].read())
test = pickle.loads(my_bucket.Object(test_key).get()['Body'].read())

print("Wall time: {:.2f} seconds".format(time.time() - start))
print(f"Train: {len(dev)}")
print(f"Dev: {len(test)}")

print(f"Sanity check: {np.unique(dev[map_column][0])}")

Wall time: 46.35 seconds
Train: 25942
Dev: 25943
Sanity check: [0 1 2]


In [6]:
# load augmented 224x224 train dataset
# # need to binarize and apply n=2 morphological thinning
start = time.time()

train_key = f'processed_data/WM-clean224/WM-clean224-train-all.pkl'
train = pickle.loads(my_bucket.Object(train_key).get()['Body'].read())

# remove augmented samples
train = train[train.ID != 'A'].reset_index(drop=True)

# def preprocess(x):
#     ret, thresh_img = cv2.threshold(x, 1, 1, cv2.THRESH_BINARY)
#     y = thin(thresh_img, 2)
#     return y

# train['thinMap2'] = train.waferMap224.apply(lambda x: preprocess(x))
# train['thinMap2'] = train.thinMap2.apply(lambda x: x.astype(np.uint8))

print("Wall time: {:.2f} seconds".format(time.time() - start))
print(f"Train: {len(train)}")
print(f"Sanity check: {np.unique(train[map_column][0])}")

Wall time: 77.08 seconds
Train: 47863
Sanity check: [0 1 2]


#### Quick EDA

In [None]:
# train failure type distribution
helper.defect_distribution(train, note='Train Set')

In [None]:
# dev failure type distribution
helper.defect_distribution(dev, note='Dev Set')

In [None]:
# test failure type distribution
helper.defect_distribution(test, note='Test Set')

#### Data set-up

In [8]:
# prepare inputs
# convert maps to grayscale images with size (224, 224, 3)
start = time.time()

def grayscale_convert(x):
    if binarized:
        img = np.uint8(x*255)
    else:
        img = np.uint8(x/2*255)
    img_bgr = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    return img_bgr

train['bgr'] = train[map_column].apply(lambda x: grayscale_convert(x))
dev['bgr'] = dev[map_column].apply(lambda x: grayscale_convert(x))
test['bgr'] = test[map_column].apply(lambda x: grayscale_convert(x))

print("Wall time: {:.2f} seconds".format(time.time() - start))
# sanity check
print(train['bgr'].shape)

Wall time: 28.38 seconds
(47863,)


In [11]:
# prepare inputs
start = time.time()

x_train = np.stack(train['bgr'])
x_val = np.stack(dev['bgr'])
x_test = np.stack(test['bgr'])

print("Wall time: {:.2f} seconds".format(time.time() - start))
# sanity check
# expected: (#rows, 224, 224, 3)
print(x_train.shape)

Wall time: 6.02 seconds
(47863, 224, 224, 3)


In [None]:
# # expand tensor and create dummy dimension at axis 3
# # images in greyscale, so no channel dimension
# start = time.time()

# x_train = tf.expand_dims(x_train, axis=3, name=None)
# x_val = tf.expand_dims(x_val, axis=3, name=None)
# x_test = tf.expand_dims(x_test, axis=3, name=None)

# print("Wall time: {:.2f} seconds".format(time.time() - start))
# # sanity check
# # expected: TensorShape([#rows, xdim, ydim, 1])
# x_train.shape

In [12]:
# prepare labels for supervised learning
# note: make sure labels are integers if using sparse categorical cross entropy
start = time.time()

y_train = np.asarray(train['classifyLabels']).astype(np.uint8)
y_val = np.asarray(dev['classifyLabels']).astype(np.uint8)
y_test = np.asarray(test['classifyLabels']).astype(np.uint8)

print("Wall time: {:.2f} seconds".format(time.time() - start))
# sanity check
# expected: type = int, min = 0, max = 7
print(type(y_train[0]))
print(min(y_train), min(y_val), min(y_test))
print(max(y_train), max(y_val), max(y_test))

Wall time: 0.00 seconds
<class 'numpy.uint8'>
0 0 0
8 8 8


#### Model

In [13]:
# fine-tune using pre-trained vit model
image_size = 224
model = vit.vit_b16(
    image_size=image_size,
    activation='sigmoid',
    pretrained=True,
    include_top=True,
    pretrained_top=False,
    classes=9
)

model.summary()



Model: "vit-b16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
embedding (Conv2D)           (None, 14, 14, 768)       590592    
_________________________________________________________________
reshape (Reshape)            (None, 196, 768)          0         
_________________________________________________________________
class_token (ClassToken)     (None, 197, 768)          768       
_________________________________________________________________
Transformer/posembed_input ( (None, 197, 768)          151296    
_________________________________________________________________
Transformer/encoderblock_0 ( ((None, 197, 768), (None, 7087872   
_________________________________________________________________
Transformer/encoderblock_1 ( ((None, 197, 768), (None, 7087

In [14]:
# set model optimizer and metrics
opt = optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer='adam', loss=losses.sparse_categorical_crossentropy, metrics=['accuracy'])

In [None]:
# run model
start = time.time()

history = model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=128, epochs=30)

print("Wall time: {:.2f} seconds".format(time.time() - start))

Epoch 1/30


In [None]:
# visualize accuracy and loss history
fig, axs = plt.subplots(2, 1, figsize=(15,15))

axs[0].plot(history.history['loss'])
axs[0].plot(history.history['val_loss'])
axs[0].title.set_text('Training Loss vs Validation Loss')
axs[0].legend(['Train', 'Val'])

axs[1].plot(history.history['accuracy'])
axs[1].plot(history.history['val_accuracy'])
axs[1].title.set_text('Training Accuracy vs Validation Accuracy')
axs[1].legend(['Train', 'Val'])

#### Model results

In [None]:
# save model to S3
s3_save_keras_model(model, f'{model_id}-{data_id}{note}')

In [None]:
# compute model results on test set
start = time.time()
results = model.evaluate(x_test, y_test)
print("Wall time: {:.2f} seconds".format(time.time() - start))
print(results)

In [None]:
# generate predictions for model analysis
start = time.time()
y_pred = model.predict(x_test)
y_max = np.argmax(y_pred, axis=1).astype(np.uint8)
predictions = [y_max, y_pred]
print("Wall time: {:.2f} seconds".format(time.time() - start))

In [None]:
# save predictions to local instance
with open(f'{result_path}/{model_id}-{data_id}{note}.pkl', "wb") as f:
    pickle.dump(predictions, f)

In [None]:
# plot confusion matrix
helper.plot_confusion_matrix(y_test, y_max, mode='all', normalize=True)

In [None]:
# plot confusion matrix counts
helper.plot_confusion_matrix(y_test, y_max, mode='all', normalize=False)