## Model Pipeline
**Data preprocessing**: Resize to 60x60

**Model**: GoogLeNet one-step

In [None]:
!pip install pickle5

In [None]:
# import libraries
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import datasets, layers, models, losses, optimizers, regularizers, callbacks

import os
import time
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

import cv2
from scipy.ndimage import median_filter
from skimage.transform import resize as sk_resize
from skimage.util import img_as_ubyte
from skimage.morphology import skeletonize, thin

import helpers as helper
from keras_model_s3_wrapper import *

import boto3
import pickle5 as pickle
# S3 resource and bucket handle; `my_bucket` is what the data-loading cell reads from
s3 = boto3.resource('s3')
bucket_name = 'wafer-capstone'
my_bucket = s3.Bucket(bucket_name)

In [None]:
# display the installed TensorFlow version
tf.__version__

In [None]:
# list the physical devices TensorFlow can see; device_type=None applies no device-type filter
tf.config.list_physical_devices(device_type=None)

### Load the data
Dataset must have the following columns: 
- **waferMap**: defect data of wafer map where 0 = blank spot, 1 = normal die (passed the electrical test), and 2 = broken die (failed electrical test); data type must be np.uint8
- **ID**: unique identification for each waferMap, separate from dataframe index

If labeled, dataset must have the following columns:
- **detectLabels**: for evaluating the detect model, where 0 = no defect, 1 = defect
- **classifyLabels**: for evaluating the classify model, where 0 = Loc, 1 = Edge-Loc, 2 = Center, 3 = Edge-Ring, 4 = Scratch, 5 = Random, 6 = Near-full, 7 = Donut, 8 = none

In [None]:
# specify variables

# specify data to load
# the S3 object key is built later as f'{path}/{filename}.pkl', so give `path`
# without a trailing slash and `filename` without the '.pkl' extension
path = '' # S3 folder where data lives
filename = '' # data filename in S3
labeled = True # True enables the "IF LABELED" cells (distribution, accuracy, confusion matrices)

# where to save results
# NOTE(review): result_path / result_filename are not read by any cell visible in this section
result_path = '' # folder in local instance to save results
result_filename = '' # filename to save the results as

# which model to run
# this name is passed to s3_get_keras_model() when the model is loaded
model = 'googlenet-all60'

In [None]:
# load data directly from S3 (using boto3 resource)
start = time.time()

# object key inside the bucket: <path>/<filename>.pkl
data_key = f'{path}/{filename}.pkl'
# download the whole object into memory and unpickle it
# NOTE: unpickling can execute arbitrary code -- only load objects from a trusted bucket
data = pickle.loads(my_bucket.Object(data_key).get()['Body'].read())

print("Wall time: {:.2f} seconds".format(time.time() - start))
print(f"Dataset length: {len(data)}")

In [None]:
# IF LABELED
# show failure type distribution
if labeled:
    # keep only the rows flagged as defective (detectLabels == 1)
    data_defects = data[data.detectLabels == 1]
    # the filename is passed as a note -- presumably used to annotate the plot; confirm in helpers
    helper.defect_distribution(data_defects, note=f'({filename})')

### Data preprocessing

In [None]:
# resize to 60x60
start = time.time()

def preprocess(x):
    """Resize one wafer map to 60x60 and return it as a uint8 image.

    Args:
        x: a single wafer map (2-D array; the notebook expects np.uint8
            values 0/1/2).

    Returns:
        The 60x60 map produced by skimage's ``resize`` (called with its
        default settings) and converted back to 8-bit with ``img_as_ubyte``.
    """
    resized = sk_resize(x, [60, 60])
    return img_as_ubyte(resized)

# NOTE: the column is named 'waferMap224' although the maps are 60x60; the
# name is kept because later cells read this column.
data['waferMap224'] = data.waferMap.apply(preprocess)

print("Wall time: {:.2f} seconds".format(time.time() - start))
print("Sanity checks:")
# .iloc[0] is positional, so this works even when the dataframe index has no
# label 0 (e.g. a filtered or re-indexed dataset)
print(f'Map shape: {data.waferMap224.iloc[0].shape}')

In [None]:
# show the first resized wafer map; .iloc[0] is positional, so it does not
# depend on the dataframe index containing the label 0
plt.imshow(data.waferMap224.iloc[0])

### GoogLeNet Model

##### Data set-up

In [None]:
# prepare inputs
start = time.time()

# stack the per-wafer arrays into one array with a leading row axis
x_det = np.stack(data['waferMap224'])
# add a trailing channel axis, then repeat it 3 times along that axis
x_det = tf.expand_dims(x_det, axis=3, name=None)
x_det = tf.repeat(x_det, 3, axis=3)

print("Wall time: {:.2f} seconds".format(time.time() - start))
# sanity check
# expected: TensorShape([#rows, xdim, ydim, 3])
x_det.shape

In [None]:
# IF LABELED
# prepare labels for evaluating results
if labeled:
    # classifyLabels as a uint8 array
    # NOTE(review): y_det is not read by the cells visible here; the evaluation cell uses y_test
    y_det = np.asarray(data['classifyLabels']).astype(np.uint8)

##### Load and run model

In [None]:
# load the saved model named by `model` and print its architecture summary
# (s3_get_keras_model presumably comes from keras_model_s3_wrapper -- confirm)
start = time.time()

googlenet = s3_get_keras_model(model)
googlenet.summary()

print("Wall time: {:.2f} seconds".format(time.time() - start))

In [None]:
# generate predictions
start = time.time()

googlenet_pred = googlenet.predict(x_det)
# take the first element of the prediction output and pick the highest-scoring
# class per row
# NOTE(review): the [0] assumes predict() returns several outputs (e.g. a main
# head plus auxiliary heads) with the main one first -- confirm against the model
googlenet_labels = np.argmax(googlenet_pred[0], axis=1).astype(np.uint8)

print("Wall time: {:.2f} seconds".format(time.time() - start))

In [None]:
# IF LABELED
if labeled: 
    y_test = data['classifyLabels'].tolist()
    
    # manually compute overall accuracy:
    # correct predictions (confusion-matrix diagonal) / number of samples
    googlenet_cm = confusion_matrix(y_test, googlenet_labels)

    # np.trace sums the diagonal whatever the matrix size; confusion_matrix
    # only has one row/column per class that actually occurs, so indexing a
    # fixed 0..8 range fails when some classes are absent
    num = np.trace(googlenet_cm)

    overall_accuracy = num / len(y_test) * 100
    print(f'Overall Model Accuracy: {overall_accuracy:.2f}%') 

    # plot confusion matrix
    helper.plot_confusion_matrix(y_test, googlenet_labels, mode='all', normalize=True)

### Error Analysis

In [None]:
# IF LABELED
# plot confusion matrix counts
if labeled:
    # same helper call as in the accuracy cell, but with normalize=False
    helper.plot_confusion_matrix(y_test, googlenet_labels, mode='all', normalize=False)

##### Optional visualization of misclassified wafers
Parameters:
- **true_label**: true label of the wafer
- **pred_label**: label predicted by the model
- **n**: number of samples to visualize (note: must be less than or equal to the total number in confusion matrix)

0 = Loc, 1 = Edge-Loc, 2 = Center, 3 = Edge-Ring, 4 = Scratch, 5 = Random, 6 = Near-full, 7 = Donut, 8 = none

In [None]:
# # plot misclassified wafers
# print('Scratch mislabeled as None')
# helper.visualize_misclassified(data, y_test, googlenet_labels, true_label=4, pred_label=8, n=9, 
#                         figsize=(5,5), col='waferMap', cmap='gray_r')