In [1]:
# import libraries
import pandas as pd
import numpy as np
import pickle5 as pickle
import math
import matplotlib.pyplot as plt
from matplotlib import colors
import time
import random

In [2]:
# load data
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects, so `path`
# should only ever point at a file from a trusted source.
path = '../../data/raw/LSWMD.pkl'
raw_data = pd.read_pickle(path)
# Last expression of the cell: displays the first rows as the cell output.
raw_data.head()

Unnamed: 0,waferMap,dieSize,lotName,waferIndex,trianTestLabel,failureType
0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,1.0,[[Training]],[[none]]
1,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,2.0,[[Training]],[[none]]
2,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,3.0,[[Training]],[[none]]
3,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,4.0,[[Training]],[[none]]
4,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1683.0,lot1,5.0,[[Training]],[[none]]


In [3]:
# Numeric code for every stringified failureType value.
# The position in the list is the code: '[]' (no label) -> 0, ..., Near-full -> 9.
d_lables = {
    label: code
    for code, label in enumerate([
        '[]',
        "[['none']]",
        "[['Edge-Ring']]",
        "[['Edge-Loc']]",
        "[['Center']]",
        "[['Loc']]",
        "[['Scratch']]",
        "[['Random']]",
        "[['Donut']]",
        "[['Near-full']]",
    ])
}

# Convert the labels to their string form so they can be looked up in d_lables,
# then store the resulting code in a new 'encoding' column.
raw_data['failureType'] = raw_data['failureType'].astype(str)
raw_data['encoding'] = raw_data['failureType'].map(d_lables)

In [4]:
# Keep the rows whose encoding is 0 (the '[]' label), renumber them from 0,
# and cast the two float-typed columns to integers in a single pass.
data = (
    raw_data.loc[raw_data['encoding'].eq(0)]
    .reset_index(drop=True)
    .astype({'dieSize': int, 'waferIndex': int})
)

print(data.shape)
data.head()

(638507, 7)


Unnamed: 0,waferMap,dieSize,lotName,waferIndex,trianTestLabel,failureType,encoding
0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",2460,lot2,11,[],[],0
1,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",2460,lot2,21,[],[],0
2,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",2460,lot3,20,[],[],0
3,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",2460,lot5,21,[],[],0
4,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 1, 2,...",533,lot7,9,[],[],0


In [5]:
# Drop the label-related columns, then turn the row index into an 'ID'
# column so that a specific wafer can be identified later on.
data = (
    data
    .drop(columns=['trianTestLabel', 'failureType', 'encoding'])
    .reset_index()
    .rename(columns={'index': 'ID'})
)

print(data.shape)
data.head()

(638507, 5)


Unnamed: 0,ID,waferMap,dieSize,lotName,waferIndex
0,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",2460,lot2,11
1,1,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",2460,lot2,21
2,2,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",2460,lot3,20
3,3,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",2460,lot5,21
4,4,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 1, 2,...",533,lot7,9


In [6]:
# # save unlabeled data
# with open('../../data/WM-unlabeled.pkl', "wb") as f:
#     pickle.dump(data, f)

In [7]:
# generate a list of lots w/ 25 wafers
# Count the waferMap entries of each lot once and reuse the result, instead of
# running the same groupby/count over every column twice.
wafers_per_lot = data.groupby('lotName')['waferMap'].count()
full_lots = wafers_per_lot[wafers_per_lot == 25].index.tolist()
print(len(full_lots))
full_lots[:3]

13159


['lot10003', 'lot10004', 'lot10005']

In [8]:
# pick random 50 lots
random.seed(0)
random_full = random.sample(full_lots, 50)

# collect indices for random lots
# Vectorized membership test: avoids rebuilding set(random_full) and doing a
# scalar label lookup for every row of `data`, and does not depend on the
# index of `data` being exactly 0..len(data)-1.
indices = data.index[data.lotName.isin(random_full)].tolist()

# create df of random lots
random_df = data.loc[indices].reset_index(drop=True)
len(random_df)

1250

In [9]:
# # save random dataset
# with open('../../data/WM-unlabeled-random50-0.pkl', "wb") as f:
#     pickle.dump(random_df, f)

In [10]:
# pick random 50 lots
random.seed(424)
random_full = random.sample(full_lots, 50)

# collect indices for random lots
# Vectorized membership test: avoids rebuilding set(random_full) and doing a
# scalar label lookup for every row of `data`, and does not depend on the
# index of `data` being exactly 0..len(data)-1.
indices = data.index[data.lotName.isin(random_full)].tolist()

# create df of random lots
random_df = data.loc[indices].reset_index(drop=True)
len(random_df)

1250

In [11]:
# # save random dataset
# with open('../../data/WM-unlabeled-random50-424.pkl', "wb") as f:
#     pickle.dump(random_df, f)

In [12]:
# pick random 50 lots
random.seed(314)
random_full = random.sample(full_lots, 50)

# collect indices for random lots
# Vectorized membership test: avoids rebuilding set(random_full) and doing a
# scalar label lookup for every row of `data`, and does not depend on the
# index of `data` being exactly 0..len(data)-1.
indices = data.index[data.lotName.isin(random_full)].tolist()

# create df of random lots
random_df = data.loc[indices].reset_index(drop=True)
len(random_df)

1250

In [13]:
# # save random dataset
# with open('../../data/WM-unlabeled-random50-314.pkl', "wb") as f:
#     pickle.dump(random_df, f)

In [None]:
# generate a list of lots w/ certain dieSize
# NOTE(review): this cell has no execution count and looks like a leftover
# draft. The expression below applies no dieSize filter: it is the same
# "lots with 25 wafers" selection that builds full_lots, stored under the name
# lots_4143. The following cell reassigns lots_4143 with a dieSize-filtered
# list, so this value is overwritten on a top-to-bottom run.
# TODO confirm this cell can be removed.
lots_4143 = (data.groupby('lotName').count().waferMap == 25).index[data.groupby('lotName').count().waferMap == 25].tolist()
# NOTE(review): prints and displays full_lots, not the lots_4143 list built above.
print(len(full_lots))
full_lots[:3]

In [18]:
# generate list of lots with dieSize 4143
# A boolean mask selects the wafers of this die size directly; there is no
# need to group the whole frame by dieSize just to pull out one group.
dieSize4143 = data[data.dieSize == 4143].reset_index(drop=True)

# Count the wafers of each lot once and keep the lots that have all 25.
wafers_per_lot_4143 = dieSize4143.groupby('lotName')['waferMap'].count()
lots_4143 = wafers_per_lot_4143[wafers_per_lot_4143 == 25].index.tolist()
print(len(lots_4143))

# collect indices for 4143 lots
# Vectorized membership test instead of rebuilding set(lots_4143) for every row.
indices = data.index[data.lotName.isin(lots_4143)].tolist()

# create df of 4143 lots
size_df = data.loc[indices].reset_index(drop=True)
len(size_df)

44


1100

In [19]:
# # save 4143 dataset
# with open('../../data/customer/WM-unlabeled-size4143.pkl', "wb") as f:
#     pickle.dump(size_df, f)

In [20]:
# generate list of lots with dieSize 515
# A boolean mask selects the wafers of this die size directly; there is no
# need to group the whole frame by dieSize just to pull out one group.
dieSize515 = data[data.dieSize == 515].reset_index(drop=True)

# Count the wafers of each lot once and keep the lots that have all 25.
wafers_per_lot_515 = dieSize515.groupby('lotName')['waferMap'].count()
lots_515 = wafers_per_lot_515[wafers_per_lot_515 == 25].index.tolist()
print(len(lots_515))

# pick random 50 lots
random.seed(314)
random_size = random.sample(lots_515, 50)

# collect indices for random lots
# Vectorized membership test instead of rebuilding set(random_size) for every row.
indices = data.index[data.lotName.isin(random_size)].tolist()

# create df of random lots
random_df = data.loc[indices].reset_index(drop=True)
len(random_df)

415


1250

In [21]:
# # save 515 dataset
# with open('../../data/customer/WM-unlabeled-size515.pkl', "wb") as f:
#     pickle.dump(random_df, f)