In [1]:
# import libraries
import pandas as pd
import numpy as np
import pickle5 as pickle
import math
import matplotlib.pyplot as plt
from matplotlib import colors
from skimage.transform import resize as sk_resize

from scipy import ndimage
import cv2

from helpers import *

In [2]:
# load data
with open('../../WM-clean.pkl', "rb") as fh:
    data = pickle.load(fh)
print(f'Raw data shape: {data.shape}')
data.head()

Raw data shape: (172950, 11)


Unnamed: 0,waferMap,dieSize,lotName,waferIndex,failureType,encoding,x_,y_,shape,labels,dataset
0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,1.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train
1,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,2.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",test
2,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,3.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",test
3,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,4.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train
4,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,5.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train


In [3]:
# add index column to identify specific wafers 
data.reset_index(inplace=True)
data = data.rename(columns={'index':'ID'})

# add detection model labels
data['detectLabels'] = data['failureType'].apply(lambda x: 0 if x == 'none' else 1)

# add classification model labels
fail_dict = {'none': 8, 'Loc': 0, 'Edge-Loc': 1, 
             'Center': 2, 'Edge-Ring': 3, 'Scratch': 4, 
             'Random': 5, 'Near-full': 6, 'Donut': 7}

data['classifyLabels'] = data['failureType'].apply(lambda x: fail_dict[x])

data.head()

Unnamed: 0,ID,waferMap,dieSize,lotName,waferIndex,failureType,encoding,x_,y_,shape,labels,dataset,detectLabels,classifyLabels
0,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,1.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train,0,8
1,1,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,2.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",test,0,8
2,2,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,3.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",test,0,8
3,3,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,4.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train,0,8
4,4,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,5.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train,0,8


In [4]:
# split raw data into train, dev, and test sets
train = data[data.dataset == 'train'].reset_index(drop=True)
dev = data[data.dataset == 'dev'].reset_index(drop=True)
test = data[data.dataset == 'test'].reset_index(drop=True)
print(f"Train: {len(train)}")
print(f"Dev: {len(dev)}")
print(f"Test: {len(test)}")

Train: 121065
Dev: 25942
Test: 25943


In [5]:
# save train, dev, test sets
with open('WM-clean2-train.pkl', "wb") as f:
    pickle.dump(train, f)
with open('WM-clean2-dev.pkl', "wb") as f:
    pickle.dump(dev, f)
with open('WM-clean2-test.pkl', "wb") as f:
    pickle.dump(test, f)

#### Create Dataset for Detect Model
- Undersample raw train dataset
- Keep only 30,000 none samples randomly

In [6]:
# make list of none indices
none_df = train[train.detectLabels == 0]
none_list = none_df.index.tolist()
print(f'None count: {len(none_list)}')

# make list of defect indices
def_df = train[train.detectLabels == 1]
def_list = def_df.index.tolist()
print(f'Defect count: {len(def_list)}')

None count: 103202
Defect count: 17863


In [7]:
# randomly undersample none list
from random import sample
under_none = sample(none_list, 30000)

# sanity check
len(under_none)

30000

In [8]:
# recombine defect indices with undersampled none indices
new_list = sorted(under_none + def_list)
print(f'Undersampled dataset count: {len(new_list)}')

Undersampled dataset count: 47863


In [9]:
# undersample data
undersampled = data.iloc[new_list].reset_index(drop=True)
print(undersampled.shape)
undersampled.head()

(47863, 14)


Unnamed: 0,ID,waferMap,dieSize,lotName,waferIndex,failureType,encoding,x_,y_,shape,labels,dataset,detectLabels,classifyLabels
0,9,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,10.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train,0,8
1,11,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,12.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",test,0,8
2,12,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,13.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train,0,8
3,21,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,22.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train,0,8
4,24,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,25.0,none,1,45,48,"(45, 48)","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",dev,0,8


In [10]:
# save undersampled dataset
with open('WM-clean2-train-detund.pkl', "wb") as f:
    pickle.dump(undersampled, f)

#### Create Dataset for Classify Model
Random transformations (flips, rotations) on dataset as-is

In [6]:
# df for classification model
def_df = train[train.detectLabels == 1]
print(len(def_df))
def_df.head()

17863


Unnamed: 0,ID,waferMap,dieSize,lotName,waferIndex,failureType,encoding,x_,y_,shape,labels,dataset,detectLabels,classifyLabels
25,36,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",2460.0,lot2,13.0,Edge-Loc,3,53,58,"(53, 58)","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train,1,1
26,37,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",2460.0,lot2,14.0,Edge-Loc,3,53,58,"(53, 58)","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train,1,1
27,38,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",2460.0,lot2,15.0,Edge-Loc,3,53,58,"(53, 58)","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train,1,1
28,39,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",2460.0,lot2,16.0,Loc,5,53,58,"(53, 58)","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",train,1,0
29,41,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",2460.0,lot2,18.0,Edge-Loc,3,53,58,"(53, 58)","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",train,1,1


In [7]:
# figure out max count for balancing
max_count = max(def_df.groupby('classifyLabels')['ID'].count().tolist())

# dataframe for collecting all samples
aug_df = pd.DataFrame()

for i in range(8):
    labels = def_df[def_df.classifyLabels == (i)].index.tolist()
    label_df = def_df[def_df.classifyLabels == (i)][['ID', 'failureType', 'classifyLabels', 'waferMap']]
    defect = label_df.failureType.unique()[0]
    
    # calculate how many samples are needed to balance with max class
    n = max_count - len(label_df)
    
    print(f'Augmenting class label {i} ({defect}) with {n} samples')
    print(f'Starting count: {len(label_df)}')
    
    if n > 0:
        
        # create new samples by flipping and rotating
        # using helper function from helpers
        new_df = flip_rotate(df=label_df, col='waferMap', defect=defect, classLabel=(i+1), labels=labels, number=n)
        print(f'Samples created: {len(new_df)}')
        
        # concatenate augmented samples with original samples
        new_df2 = pd.concat([new_df, label_df], ignore_index=True, axis=0)
        print(f'New count: {len(new_df2)}')
        
        # collect with samples from other labels
        aug_df = pd.concat([aug_df, new_df2], ignore_index=True, axis=0)
        print(f'Total augmented df: {len(aug_df)}')
    
    else:
        
        # collect max class with samples from other labels without augmentation
        aug_df = pd.concat([aug_df, label_df], ignore_index=True, axis=0)
        print(f'Total augmented df: {len(aug_df)}')

Augmenting class label 0 (Loc) with 4261 samples
Starting count: 2515
Samples created: 4261
New count: 6776
Total augmented df: 6776
Augmenting class label 1 (Edge-Loc) with 3144 samples
Starting count: 3632
Samples created: 3144
New count: 6776
Total augmented df: 13552
Augmenting class label 2 (Center) with 3770 samples
Starting count: 3006
Samples created: 3770
New count: 6776
Total augmented df: 20328
Augmenting class label 3 (Edge-Ring) with 0 samples
Starting count: 6776
Total augmented df: 27104
Augmenting class label 4 (Scratch) with 5941 samples
Starting count: 835
Samples created: 5941
New count: 6776
Total augmented df: 33880
Augmenting class label 5 (Random) with 6170 samples
Starting count: 606
Samples created: 6170
New count: 6776
Total augmented df: 40656
Augmenting class label 6 (Near-full) with 6672 samples
Starting count: 104
Samples created: 6672
New count: 6776
Total augmented df: 47432
Augmenting class label 7 (Donut) with 6387 samples
Starting count: 389
Samples c

In [10]:
# randomly shuffle augmented df
shuffled_df = aug_df.sample(frac=1).reset_index(drop=True)
shuffled_df['classifyLabels'] = shuffled_df.classifyLabels.astype('int')
print(shuffled_df.shape)
shuffled_df.head()

(54208, 4)


Unnamed: 0,ID,classifyLabels,failureType,waferMap
0,A,1,Loc,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,A,8,Donut,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,127689,2,Center,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,..."
3,A,5,Scratch,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,A,8,Donut,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [11]:
# save augmented dataset
with open('WM-clean2-train-clsaug.pkl', "wb") as f:
    pickle.dump(shuffled_df, f)