In [1]:
# import libraries
import pandas as pd
import numpy as np
import pickle5 as pickle
import math
import matplotlib.pyplot as plt
from matplotlib import colors
from skimage.transform import resize as sk_resize

from scipy import ndimage
import cv2

from helpers import *

#### Unfiltered datasets
- Create detect and classify labels 
- Add ID from index for identifying specific wafers
- Pre-split into train, dev, test

In [3]:
# Read the pickled wafer table from disk and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
raw_pickle_path = '../../WM-clean.pkl'
with open(raw_pickle_path, mode="rb") as pickle_file:
    raw_data = pickle.load(pickle_file)
raw_shape = raw_data.shape
print(f'Raw data shape: {raw_shape}')
raw_data.head()

Raw data shape: (172950, 11)


Unnamed: 0,waferMap,dieSize,lotName,waferIndex,failureType,encoding,x_,y_,shape,labels,dataset
0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,1.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train
1,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,2.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",test
2,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,3.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",test
3,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,4.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train
4,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,5.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train


In [4]:
# add index column to identify specific wafers
# Guarded so the cell can be re-run safely: an unguarded reset_index + rename
# inserts a fresh 'index' column on every run and leaves duplicate 'ID' columns.
if 'ID' not in raw_data.columns:
    raw_data = raw_data.reset_index().rename(columns={'index': 'ID'})

# add detection model labels: 0 when failureType is 'none', 1 for any other value
raw_data['detectLabels'] = raw_data['failureType'].ne('none').astype('int64')

# add classification model labels
# Codes are assigned in the order failure types first appear in the data
# (Series.unique() keeps order of appearance), so the mapping depends on the
# row order of this file; the printed dict is the reference for decoding.
fail_types = raw_data['failureType'].unique()
fail_dict = {fail_type: code for code, fail_type in enumerate(fail_types)}
print(fail_dict)

raw_data['classifyLabels'] = raw_data['failureType'].apply(lambda fail_type: fail_dict[fail_type])

raw_data.head()

{'none': 0, 'Loc': 1, 'Edge-Loc': 2, 'Center': 3, 'Edge-Ring': 4, 'Scratch': 5, 'Random': 6, 'Near-full': 7, 'Donut': 8}


Unnamed: 0,ID,waferMap,dieSize,lotName,waferIndex,failureType,encoding,x_,y_,shape,labels,dataset,detectLabels,classifyLabels
0,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,1.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train,0,0
1,1,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,2.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",test,0,0
2,2,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,3.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",test,0,0
3,3,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,4.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train,0,0
4,4,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,5.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train,0,0


In [None]:
# save raw data with added columns (ID, detectLabels, classifyLabels)
# Disabled: nothing is written unless the two lines below are uncommented.
# with open('WM-clean-id.pkl', "wb") as f:
#     pickle.dump(raw_data, f)

In [None]:
# Partition the labelled rows by their pre-assigned 'dataset' value
# ('train' / 'dev' / 'test'), renumbering each partition from 0.
raw_split = raw_data['dataset']
raw_train = raw_data.loc[raw_split == 'train'].reset_index(drop=True)
raw_dev = raw_data.loc[raw_split == 'dev'].reset_index(drop=True)
raw_test = raw_data.loc[raw_split == 'test'].reset_index(drop=True)

# report the size of each partition
n_train, n_dev, n_test = len(raw_train), len(raw_dev), len(raw_test)
print(f"Train: {n_train}")
print(f"Dev: {n_dev}")
print(f"Test: {n_test}")

In [None]:
# save raw data with added columns (ID, detectLabels, classifyLabels)
# pre-split into test, dev, train
# Train split (raw_train). Disabled: uncomment to write 'WM-clean-id-train.pkl'.
# with open('WM-clean-id-train.pkl', "wb") as f:
#     pickle.dump(raw_train, f)

In [None]:
# save the dev split of the raw data (raw_dev)
# Disabled: uncomment to write 'WM-clean-id-dev.pkl'.
# with open('WM-clean-id-dev.pkl', "wb") as f:
#     pickle.dump(raw_dev, f)

In [None]:
# save the test split of the raw data (raw_test)
# Disabled: uncomment to write 'WM-clean-id-test.pkl'.
# with open('WM-clean-id-test.pkl', "wb") as f:
#     pickle.dump(raw_test, f)

#### Unfiltered resized datasets
- waferMap resized to 224x224 --> waferMap224
- Contains detect and classify labels, ID 
- Pre-split into train, dev, test
- Compressed to save space

In [None]:
# Build the 224x224 version of every wafer map in a new 'waferMap224' column.
def _halve_and_resize(wafer_map):
    """Divide the map by 2, then resize it to 224x224.

    order=0 selects nearest-neighbour interpolation and anti-aliasing is off;
    preserve_range=True keeps the halved values on their own scale.
    The division is meant to normalise values to [0, 1], which assumes the raw
    values lie in 0..2 — TODO confirm.
    """
    return sk_resize(
        wafer_map / 2,
        [224, 224],
        order=0,
        preserve_range=True,
        anti_aliasing=False,
    )

raw_data['waferMap224'] = raw_data['waferMap'].apply(_halve_and_resize)

In [None]:
# Spot-check a single resized map (row label 2738); its shape is the cell output.
raw_data.loc[2738, 'waferMap224'].shape

In [None]:
# Columns retained for single defect modeling; all other columns of raw_data
# (including the original-size waferMap) are not carried over.
sdm_columns = [
    'ID',
    'waferMap224',
    'dieSize',
    'lotName',
    'failureType',
    'dataset',
    'detectLabels',
    'classifyLabels',
]
sdm_data = raw_data[sdm_columns]
sdm_data.head()

In [None]:
# save resized data
# Disabled: uncomment to write the full resized frame (sdm_data) to 'WM-clean-id-224.pkl'.
# with open('WM-clean-id-224.pkl', "wb") as f:
#     pickle.dump(sdm_data, f)

In [None]:
# Partition the resized rows by their pre-assigned 'dataset' value,
# renumbering each partition from 0.
sdm_split = sdm_data['dataset']
sdm_train = sdm_data[sdm_split.eq('train')].reset_index(drop=True)
sdm_dev = sdm_data[sdm_split.eq('dev')].reset_index(drop=True)
sdm_test = sdm_data[sdm_split.eq('test')].reset_index(drop=True)

# report the size of each partition
n_sdm_train, n_sdm_dev, n_sdm_test = len(sdm_train), len(sdm_dev), len(sdm_test)
print(f"Train: {n_sdm_train}")
print(f"Dev: {n_sdm_dev}")
print(f"Test: {n_sdm_test}")

In [None]:
# save resized data
# pre-split into train, dev, test
# Train split (sdm_train). Disabled: two alternatives are kept below, a plain
# pickle dump and `save(...)`. `save` is not defined in this notebook
# (presumably provided by `from helpers import *`).
# NOTE(review): a later cell calls load('WM-clean-id224-train.zip'), presumably
# the file this `save` call writes, so it must already exist on a fresh run.
# with open('WM-clean-id224-train.pkl', "wb") as f:
#     pickle.dump(sdm_train, f)
# save(sdm_train, 'WM-clean-id224-train.zip')

In [None]:
# save the dev split of the resized data (sdm_dev)
# Disabled: uncomment either the pickle dump or the `save(...)` call.
# NOTE(review): a later cell calls load('WM-clean-id224-dev.zip'), presumably
# the file this `save` call writes, so it must already exist on a fresh run.
# with open('WM-clean-id224-dev.pkl', "wb") as f:
#     pickle.dump(sdm_dev, f)
# save(sdm_dev, 'WM-clean-id224-dev.zip')

In [None]:
# save the test split of the resized data (sdm_test)
# Disabled: uncomment either the pickle dump or the `save(...)` call.
# NOTE(review): a later cell calls load('WM-clean-id224-test.zip'), presumably
# the file this `save` call writes, so it must already exist on a fresh run.
# with open('WM-clean-id224-test.pkl', "wb") as f:
#     pickle.dump(sdm_test, f)
# save(sdm_test, 'WM-clean-id224-test.zip')

#### Filtered resized datasets
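- Applied a 7x7 median filter to each resized map --> filterMap7
- filterMap7: filtered version of waferMap224 (resized to 224x224); waferMap224 itself is dropped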
- Contains detect and classify labels, ID 
- Pre-split into train, dev, test
- Compressed to save space

In [None]:
# Reload the resized train split from its archive and preview it.
# `load` is not defined in this notebook (presumably provided by `from helpers import *`).
# Plain-pickle alternative, kept for reference:
# with open('WM-clean-id224-train.pkl', "rb") as fh:
#     sdm_train = pickle.load(fh)
train_archive = 'WM-clean-id224-train.zip'
sdm_train = load(train_archive)

print(sdm_train.shape)
sdm_train.head()

In [None]:
# Smooth every resized train map with a 7x7 median filter; the result is
# stored next to the source map in a new 'filterMap7' column.
def _median_7x7_train(wafer_map):
    return ndimage.median_filter(wafer_map, size=7)

sdm_train['filterMap7'] = sdm_train['waferMap224'].apply(_median_7x7_train)

# The filtered frame keeps filterMap7 and leaves out the unfiltered waferMap224.
filtered_train = sdm_train.drop(columns=['waferMap224'])
print(filtered_train.shape)
filtered_train.head()

In [None]:
# save filtered data
# Train split (filtered_train). Disabled: uncomment either the pickle dump or
# the `save(...)` call. `save` is not defined in this notebook
# (presumably provided by `from helpers import *`).
# with open('WM-clean-id224filter7-train.pkl', "wb") as f:
#     pickle.dump(filtered_train, f)
# save(filtered_train, 'WM-clean-id224filter7-train.zip')

In [None]:
# Reload the resized dev split from its archive and preview it.
# `load` is not defined in this notebook (presumably provided by `from helpers import *`).
# Plain-pickle alternative, kept for reference:
# with open('WM-clean-id224-dev.pkl', "rb") as fh:
#     sdm_dev = pickle.load(fh)
dev_archive = 'WM-clean-id224-dev.zip'
sdm_dev = load(dev_archive)

print(sdm_dev.shape)
sdm_dev.head()

In [None]:
# Smooth every resized dev map with a 7x7 median filter; the result is
# stored next to the source map in a new 'filterMap7' column.
def _median_7x7_dev(wafer_map):
    return ndimage.median_filter(wafer_map, size=7)

sdm_dev['filterMap7'] = sdm_dev['waferMap224'].apply(_median_7x7_dev)

# The filtered frame keeps filterMap7 and leaves out the unfiltered waferMap224.
filtered_dev = sdm_dev.drop(columns=['waferMap224'])
print(filtered_dev.shape)
filtered_dev.head()

In [None]:
# save the dev split of the filtered data (filtered_dev)
# Disabled: uncomment either the pickle dump or the `save(...)` call.
# with open('WM-clean-id224filter7-dev.pkl', "wb") as f:
#     pickle.dump(filtered_dev, f)
# save(filtered_dev, 'WM-clean-id224filter7-dev.zip')

In [3]:
# Reload the resized test split from its archive and preview it.
# `load` is not defined in this notebook (presumably provided by `from helpers import *`).
# Plain-pickle alternative, kept for reference:
# with open('WM-clean-id224-test.pkl', "rb") as fh:
#     sdm_test = pickle.load(fh)
test_archive = 'WM-clean-id224-test.zip'
sdm_test = load(test_archive)

print(sdm_test.shape)
sdm_test.head()

(25943, 8)


Unnamed: 0,ID,waferMap224,dieSize,lotName,failureType,dataset,detectLabels,classifyLabels
0,1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1683.0,lot1,none,test,0,0
1,2,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1683.0,lot1,none,test,0,0
2,5,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1683.0,lot1,none,test,0,0
3,10,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1683.0,lot1,none,test,0,0
4,11,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1683.0,lot1,none,test,0,0


In [4]:
# Smooth every resized test map with a 7x7 median filter; the result is
# stored next to the source map in a new 'filterMap7' column.
def _median_7x7_test(wafer_map):
    return ndimage.median_filter(wafer_map, size=7)

sdm_test['filterMap7'] = sdm_test['waferMap224'].apply(_median_7x7_test)

# The filtered frame keeps filterMap7 and leaves out the unfiltered waferMap224,
# so the column count is unchanged.
filtered_test = sdm_test.drop(columns=['waferMap224'])
print(filtered_test.shape)
filtered_test.head()

(25943, 8)


Unnamed: 0,ID,dieSize,lotName,failureType,dataset,detectLabels,classifyLabels,filterMap7
0,1,1683.0,lot1,none,test,0,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,2,1683.0,lot1,none,test,0,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,5,1683.0,lot1,none,test,0,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,10,1683.0,lot1,none,test,0,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,11,1683.0,lot1,none,test,0,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [6]:
# save the test split of the filtered data (filtered_test)
# Disabled: uncomment either the pickle dump or the `save(...)` call.
# with open('WM-clean-id224filter7-test.pkl', "wb") as f:
#     pickle.dump(filtered_test, f)
# save(filtered_test, 'WM-clean-id224filter7-test.zip')