In [1]:
# import libraries
import math
import os
import time
from random import sample, seed

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle5 as pickle
from matplotlib import colors
from scipy.ndimage import median_filter
from skimage.morphology import skeletonize, thin
from skimage.transform import resize as sk_resize
from skimage.util import img_as_ubyte

In [2]:
# load data
# NOTE(review): unpickling can execute arbitrary code, so 'WM-clean.pkl' must
# come from a trusted source
with open('WM-clean.pkl', mode="rb") as pickle_file:
    data = pickle.load(pickle_file)

# report the shape of the loaded table, then preview its first rows
print(f'Raw data shape: {data.shape}')
data.head()

Raw data shape: (172950, 11)


Unnamed: 0,waferMap,dieSize,lotName,waferIndex,failureType,encoding,x_,y_,shape,labels,dataset
0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,1.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train
1,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,2.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",test
2,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,3.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",test
3,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,4.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train
4,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,5.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train


In [3]:
# add index column to identify specific wafers:
# the current row index is turned into an explicit 'ID' column
data = data.reset_index().rename(columns={'index': 'ID'})

# add classification model labels: one integer class per failure-type name
fail_dict = {
    'none': 8,
    'Loc': 0,
    'Edge-Loc': 1,
    'Center': 2,
    'Edge-Ring': 3,
    'Scratch': 4,
    'Random': 5,
    'Near-full': 6,
    'Donut': 7,
}
# direct dictionary lookup: a failure type missing from fail_dict raises KeyError
data['classifyLabels'] = data['failureType'].apply(fail_dict.__getitem__)

# keep only the columns used by the following cells
keep_columns = ['ID', 'waferMap', 'classifyLabels', 'dataset']
data = data[keep_columns]

data.head()

Unnamed: 0,ID,waferMap,classifyLabels,dataset
0,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",8,train
1,1,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",8,test
2,2,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",8,test
3,3,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",8,train
4,4,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",8,train


In [4]:
# create undersampled dataset: keep every defect wafer and every dev/test wafer,
# but only a random subset of the defect-free ('none') training wafers
NONE_LABEL = 8        # classifyLabels value that fail_dict assigns to 'none'
N_NONE_KEEP = 30000   # number of 'none' training wafers kept after undersampling
SAMPLE_SEED = 424     # fixed seed so the same subset is drawn on every run

# make list of none indices (training split only)
none_df = data[(data.classifyLabels == NONE_LABEL) & (data.dataset == 'train')]
none_list = none_df.index.tolist()
print(f'None count: {len(none_list)}')

# make list of defect indices plus dev and test
def_df = data[(data.classifyLabels != NONE_LABEL) | (data.dataset == 'dev') | (data.dataset == 'test')]
def_list = def_df.index.tolist()
print(f'Defect count: {len(def_list)}')

# randomly undersample none list
# (sample raises ValueError if fewer than N_NONE_KEEP 'none' wafers are available)
seed(SAMPLE_SEED)
under_none = sample(none_list, N_NONE_KEEP)

# recombine defect indices with undersampled none indices
new_list = sorted(under_none + def_list)
print(f'Undersampled dataset count: {len(new_list)}')

# undersample data
# the lists above hold index *labels*, so rows are selected with .loc (label-based);
# .iloc (position-based) only agrees with them while the index is an untouched RangeIndex
undersampled = data.loc[new_list].reset_index(drop=True)
print(undersampled.shape)
undersampled.head()

None count: 103202
Defect count: 69748
Undersampled dataset count: 99748
(99748, 4)


Unnamed: 0,ID,waferMap,classifyLabels,dataset
0,1,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",8,test
1,2,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",8,test
2,5,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",8,test
3,9,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",8,train
4,10,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",8,test


In [5]:
# resize to 224x224
start = time.time()  # wall-clock reference for the timing report printed after the apply


def resize(x):
    """Return wafer map `x` as a 224x224 three-channel uint8 image.

    The map is resampled to 224x224, converted to 8-bit, multiplied by 255/2
    (so a pixel value of 2 lands on 255) and expanded from grayscale to BGR.
    """
    resized = sk_resize(x, [224, 224])
    resized_u8 = img_as_ubyte(resized)
    # NOTE(review): the /2 presumably reflects a maximum wafer-map value of 2 — confirm
    gray = np.uint8(resized_u8 / 2 * 255)
    return cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
    
# new column: the 224x224 BGR rendering of every wafer map
undersampled['waferMap224'] = undersampled['waferMap'].apply(resize)

# report the elapsed time and sanity-check the shape of one resized image
elapsed = time.time() - start
print('Wall time: {:.3f} seconds'.format(elapsed))
print(undersampled['waferMap224'][2].shape)

Wall time: 210.602 seconds
(224, 224, 3)


In [6]:
# binarize and apply n=2 morphological thinning
start = time.time()  # wall-clock reference for the timing report printed after the apply


def preprocess(x):
    """Return a thinned binary version of wafer map `x` as a 224x224 BGR image.

    The map is resampled to 224x224 and converted to 8-bit, pixels greater
    than 1 are set to 1 (all others to 0), the resulting mask is thinned for
    at most two iterations, and the 0/1 mask is scaled to 0/255 and expanded
    from grayscale to three channels.
    """
    resized_u8 = img_as_ubyte(sk_resize(x, [224, 224]))
    # cv2.threshold returns (threshold_used, image); only the image is needed
    binary = cv2.threshold(resized_u8, 1, 1, cv2.THRESH_BINARY)[1]
    # second positional argument caps the number of thinning iterations at 2
    thinned = thin(binary, 2).astype(np.uint8)
    mask = np.uint8(thinned * 255)
    return cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)
    
# new column: the thinned binary 224x224 BGR image of every wafer map
undersampled['thinMap2'] = undersampled['waferMap'].apply(preprocess)

# report the elapsed time, then check one result: its shape and the set of pixel values it contains
elapsed = time.time() - start
print('Wall time: {:.3f} seconds'.format(elapsed))
thin_example = undersampled['thinMap2'][2]
print(thin_example.shape)
print(np.unique(thin_example))

Wall time: 521.029 seconds
(224, 224, 3)
[  0 255]


In [7]:
# create numpy arrays for saving
ids = undersampled['ID'].to_numpy()
labels = undersampled['classifyLabels'].to_numpy().astype(np.uint8)

# store the split names as a fixed-width unicode array rather than an object
# array: object arrays are pickled inside the .npz and can then only be read
# back with np.load(..., allow_pickle=True)
# NOTE(review): assumes the 'dataset' column holds plain strings with no missing values — confirm
dataset = undersampled['dataset'].to_numpy().astype(str)

# stack the per-row images into one (N, 224, 224, 3) array per image column;
# at the 99748 rows reported above each stacked uint8 array takes roughly 15 GB
wafermap = np.stack(undersampled['waferMap224'].to_numpy())
thinmap = np.stack(undersampled['thinMap2'].to_numpy())

In [8]:
# save resized undersampled dataset as npz file
# create the output folder first: np.savez_compressed does not create missing
# directories, and failing here would throw away the long preprocessing run
os.makedirs('vit', exist_ok=True)
np.savez_compressed('vit/WM-clean-vit224-undersampled.npz',
                    ids=ids, labels=labels, dataset=dataset, wafermap=wafermap)

In [9]:
# save thinned undersampled dataset as npz file
# create the output folder first: np.savez_compressed does not create missing
# directories, and failing here would throw away the long preprocessing run
os.makedirs('vit', exist_ok=True)
np.savez_compressed('vit/WM-clean-vit224thin-undersampled.npz',
                    ids=ids, labels=labels, dataset=dataset, thinmap=thinmap)