## Data Pre-processing : Elliminating possible sources of error from data

### Facts
1. data is already separated into train and val
2. Data is verified at the filename level before storing 
3. There are some errenous files (For eg. 161244.jpg,143441.jpg in trainWaymo folder = 0 bytes)
4. Label and target mean the same thing here
5. Although numbers in image names go to a higher value, the number of images may be less

### Sources of errors that are nullified: 
1. Images with no labels
2. Curropt images (un-openable or 0 bytes) 
3. More labels than corresponding images and vice versa
4. Errenous sequence/ order of labels

__Clean Data is saved as npz files in the end__

In [1]:
import glob
import random
import numpy as np
import pandas as pd

import PIL
from PIL import Image

In [2]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# This may change based on how you position the data directory
def name(x):
    return(int(x.split('/')[-1].split('.')[0]))

directory = "."

In [4]:
train_honda = directory + "/trainHonda100k/"
val_honda = directory + "/valHonda100k/"

train_images_honda = sorted(glob.glob(train_honda+"*.jpg"), key = name)
val_images_honda = sorted(glob.glob(val_honda+"*.jpg"), key = name)

len_train_honda = len(train_images_honda)
len_val_honda = len(val_images_honda)

train_images = []
val_images = []

train_targets =[]
val_targets = []

print("Total Honda file paths to be loaded : {}".format((len_train_honda  +len_val_honda )))

#sample (see they are sorted)
for k in train_images_honda[0:5]:print(k, end="\n")
for k in train_images_honda[-5:]:print(k, end="\n")
    
problem_file_paths=[]

Total Honda file paths to be loaded : 110000
./trainHonda100k/0.jpg
./trainHonda100k/1.jpg
./trainHonda100k/2.jpg
./trainHonda100k/3.jpg
./trainHonda100k/4.jpg
./trainHonda100k/109994.jpg
./trainHonda100k/109995.jpg
./trainHonda100k/109996.jpg
./trainHonda100k/109997.jpg
./trainHonda100k/109998.jpg


In [5]:
train_waymo = directory + "/trainWaymo/"
val_waymo = directory + "/valWaymo/"

train_images_waymo = sorted(glob.glob(train_waymo+"*.jpg"), key = name)
val_images_waymo = sorted(glob.glob(val_waymo+"*.jpg"), key = name)

len_train_waymo = len(train_images_waymo)
len_val_waymo = len(val_images_waymo)

print("Total Waymo file paths to be loaded : {}".format((len_train_waymo +len_val_waymo)))

for k in val_images_waymo[0:5]:print(k, end="\n")
for k in val_images_waymo[-5:]:print(k, end="\n")

Total Waymo file paths to be loaded : 141456
./valWaymo/0.jpg
./valWaymo/1.jpg
./valWaymo/2.jpg
./valWaymo/3.jpg
./valWaymo/4.jpg
./valWaymo/33413.jpg
./valWaymo/33414.jpg
./valWaymo/33415.jpg
./valWaymo/33416.jpg
./valWaymo/33417.jpg


In [6]:
# The name of the last image in honda is 10998.jpg but there are only a total of 100000 of them
print(len(train_images_honda))

100000


In [7]:
#Waymo
print(len(train_images_waymo))

108038


In [8]:
print("\n--------------------------------")
print("Loading Images......")
pop_count_tr_honda=0
# Run twice to verify first pop worked correctly (if any)
for a in range(2):
    pop_tr_honda=[]
    im_honda_tr =[]
    print("Run :", a)
    for i in range(len(train_images_honda)):
        try: 
            path = train_images_honda[i]
            im_honda_tr.append(np.asarray(Image.open(path)))

        except PIL.UnidentifiedImageError:
            pop_tr_honda.append(i)
            problem_file_paths.append(path)
            print("To be popped :",i)

    shift = 0
    if pop_tr_honda:
        for j in range(len(pop_tr_honda)):
            train_images_honda.pop(pop_tr_honda[j] - shift)
            shift+=1
            pop_count_tr_honda+=1
            
print("Honda (Train): {} loaded, {} popped".format(len(im_honda_tr), pop_count_tr_honda))


--------------------------------
Loading Images......
Run : 0
Run : 1
Honda (Train): 100000 loaded, 0 popped


In [9]:
print("\n--------------------------------")
print("Loading Images......")
pop_count_tr_waymo=0
# Run twice to verify first pop worked correctly (if any)
for a in range(2):
    pop_tr_waymo=[]
    im_waymo_tr =[]
    print("Run :", a)
    for i in range(len(train_images_waymo)):
        try: 
            path = train_images_waymo[i]
            im_waymo_tr.append(np.asarray(Image.open(path)))

        except PIL.UnidentifiedImageError:
            pop_tr_waymo.append(i)
            problem_file_paths.append(path)
            print("To be popped :",i)

    shift = 0
    if pop_tr_waymo:
        for j in range(len(pop_tr_waymo)):
            train_images_waymo.pop(pop_tr_waymo[j] - shift)
            shift+=1
            pop_count_tr_waymo+=1
            
print("Waymo (Train): {} loaded, {} popped".format(len(im_waymo_tr), pop_count_tr_waymo))


--------------------------------
Loading Images......
Run : 0
To be popped : 95024
To be popped : 106847
Run : 1
Waymo (Train): 108036 loaded, 2 popped


In [10]:
print("\n--------------------------------")
print("Loading Images......")
pop_count_val_honda =0
# Run twice to verify first pop worked correctly (if any)
for a in range(2):
    pop_val_honda=[]
    im_honda_val =[]
    print("Run :", a)
    for i in range(len(val_images_honda)):
        try: 
            path = val_images_honda[i]
            im_honda_val.append(np.asarray(Image.open(path)))

        except PIL.UnidentifiedImageError:
            pop_val_honda.append(i)
            problem_file_paths.append(path)
            print("To be popped :",i)

    shift = 0
    if pop_val_honda:
        for j in range(len(pop_val_honda)):
            val_images_honda.pop(pop_val_honda[j] - shift)
            shift+=1
            pop_count_val_honda+=1
            
print("Honda (Val): {} loaded, {} popped".format(len(im_honda_val), pop_count_val_honda))


--------------------------------
Loading Images......
Run : 0
Run : 1
Honda (Val): 10000 loaded, 0 popped


In [11]:
print("\n--------------------------------")
print("Loading Images......")
pop_count_val_waymo=0
# Run twice to verify first pop worked correctly (if any)
for a in range(2):
    pop_val_waymo=[]
    im_waymo_val =[]
    print("Run :", a)
    for i in range(len(val_images_waymo)):
        try: 
            path = val_images_waymo[i]
            im_waymo_val.append(np.asarray(Image.open(path)))

        except PIL.UnidentifiedImageError:
            pop_val_waymo.append(i)
            problem_file_paths.append(path)
            print("To be popped :",i)

    shift = 0
    if pop_val_waymo:
        for j in range(len(pop_val_waymo)):
            val_images_waymo.pop(pop_val_waymo[j] - shift)
            shift+=1
            pop_count_val_waymo+=1
            
print("Waymo (Val): {} loaded, {} popped".format(len(im_waymo_val), pop_count_val_waymo))


--------------------------------
Loading Images......
Run : 0
Run : 1
Waymo (Val): 33418 loaded, 0 popped


In [12]:
print("\n--------------------------------")
print("Loading Labels......")

train_labels_honda = pd.read_csv(directory+ '/labelsHonda100k_train.csv', delimiter=",,,", header= None)
train_labels_waymo = pd.read_csv(directory+ '/labelsWaymo_train.csv', delimiter=",,,", header= None)
val_labels_honda = pd.read_csv(directory+ '/labelsHonda100k_val.csv', delimiter=",,,", header= None)
val_labels_waymo = pd.read_csv(directory+ '/labelsWaymo_val.csv', delimiter=",,,", header= None)


--------------------------------
Loading Labels......


  return func(*args, **kwargs)


In [13]:
print(train_labels_honda.shape)
print(train_labels_waymo.shape)
print(val_labels_honda.shape)
print(val_labels_waymo.shape)

train_labels_honda.head

# Waymo (train labels) has way more values than it should have had

(100000, 2)
(162916, 2)
(10000, 2)
(33418, 2)


<bound method NDFrame.head of                 0         1
0           0.jpg -0.252324
1           1.jpg -0.252324
2           2.jpg -0.205843
3           3.jpg -0.146082
4           4.jpg -0.139442
...           ...       ...
99995  109994.jpg -0.026560
99996  109995.jpg -0.066401
99997  109996.jpg -0.139442
99998  109997.jpg -0.139442
99999  109998.jpg -0.119522

[100000 rows x 2 columns]>

In [14]:
true_labels_honda_train = np.zeros(len(im_honda_tr))
images_no_labels_ht = []
# Go through all the images
for i in range(len(im_honda_tr)):
    # Get the name of the image that was stored (numbers may be skipped, thats why)
    name = train_images_honda[i].split('/')[-1]
    # Go through all the labels and find the name
    df=train_labels_honda[train_labels_honda[0]==name]
    
    if df.empty: 
        #print("Found no label")
        images_no_labels_ht.append(name)
        continue
    else:
        # Get the label
        true_labels_honda_train[i] = df[1]
        
print("Total labels not found:", len(images_no_labels_ht))

Total labels not found: 0


In [15]:
true_labels_waymo_train = np.zeros(len(im_waymo_tr))
images_no_labels_wt = []
# Go through all the images
for i in range(len(im_waymo_tr)):
    # Get the name of the image that was stored (numbers may be skipped, thats why)
    name = train_images_waymo[i].split('/')[-1]
    # Go through all the labels and find the name
    df=train_labels_waymo[train_labels_waymo[0]==name]
    
    if df.empty: 
        #print("Found no label")
        images_no_labels_wt.append(name)
        continue
    else: 
        # Get the label
        true_labels_waymo_train[i] = df[1]
        
print("Total labels not found:", len(images_no_labels_wt))

Total labels not found: 91


In [16]:
true_labels_honda_val = np.zeros(len(im_honda_val))
images_no_labels_hv =[]
# Go through all the images
for i in range(len(im_honda_val)):
    # Get the name of the image that was stored (numbers may be skipped, thats why)
    name = val_images_honda[i].split('/')[-1]
    # Go through all the labels and find the name
    df=val_labels_honda[val_labels_honda[0]==name]
    
    if df.empty:
        #print("Found no label")
        images_no_labels_hv.append(name)
        continue
    else:
        true_labels_honda_val[i] = df[1]
        
print("Total labels not found:", len(images_no_labels_hv))

Total labels not found: 0


In [17]:
true_labels_waymo_val = np.zeros(len(im_waymo_val))
images_no_labels_wv=[]
# Go through all the images
for i in range(len(im_waymo_val)):
    # Get the name of the image that was stored (numbers may be skipped, thats why)
    name = val_images_waymo[i].split('/')[-1]
    # Go through all the labels and find the name
    df=val_labels_waymo[val_labels_waymo[0]==name]
    
    if df.empty: 
        #print("Found no label")
        images_no_labels_wv.append(name)
        continue
    else:
        true_labels_waymo_val[i] = df[1]
        
print("Total labels not found:", len(images_no_labels_wv))

Total labels not found: 0


In [18]:
print(type(true_labels_honda_train))
print(type(true_labels_waymo_val[0]))
print(true_labels_honda_val[0])

<class 'numpy.ndarray'>
<class 'numpy.float64'>
-0.1394422310756972


In [19]:
# Remove the images with no labels (Doing just for waymo because we know that it only occured here)
# Paths of images (that contain names) are in: train_images_waymo
# Actual images are in: im_waymo_tr

print("Current Length:",len(im_waymo_tr), len(train_images_waymo))
# Go through all images with no labels
for i in images_no_labels_wt: 
    # Go through all the paths to locate that name
    for j in range(len(train_images_waymo)):
        name_in_path = train_images_waymo[j].split('/')[-1]
        if i == name_in_path: 
        # Get the index if name found:
            index = j
            break
    # Delete image in that index and also the path
    print("Item deleted: {} at index: {}".format(train_images_waymo[index], index))
    del im_waymo_tr[index]
    del train_images_waymo[index]
    
print("New Length:",len(im_waymo_tr), len(train_images_waymo))

Current Length: 108036 108036
Item deleted: ./trainWaymo/19301.jpg at index: 12650
Item deleted: ./trainWaymo/19302.jpg at index: 12650
Item deleted: ./trainWaymo/19303.jpg at index: 12650
Item deleted: ./trainWaymo/19305.jpg at index: 12650
Item deleted: ./trainWaymo/19308.jpg at index: 12650
Item deleted: ./trainWaymo/19310.jpg at index: 12650
Item deleted: ./trainWaymo/19311.jpg at index: 12650
Item deleted: ./trainWaymo/19312.jpg at index: 12650
Item deleted: ./trainWaymo/19315.jpg at index: 12650
Item deleted: ./trainWaymo/19316.jpg at index: 12650
Item deleted: ./trainWaymo/19317.jpg at index: 12650
Item deleted: ./trainWaymo/19321.jpg at index: 12650
Item deleted: ./trainWaymo/19322.jpg at index: 12650
Item deleted: ./trainWaymo/19323.jpg at index: 12650
Item deleted: ./trainWaymo/19324.jpg at index: 12650
Item deleted: ./trainWaymo/19325.jpg at index: 12650
Item deleted: ./trainWaymo/19326.jpg at index: 12650
Item deleted: ./trainWaymo/19327.jpg at index: 12650
Item deleted: ./

In [20]:
# Redo waymo train, because we removed images (for which we dont want labels): 
true_labels_waymo_train = np.zeros(len(im_waymo_tr))
images_no_labels_wt = []
# Go through all the images
for i in range(len(im_waymo_tr)):
    # Get the name of the image that was stored (numbers may be skipped, thats why)
    name = train_images_waymo[i].split('/')[-1]
    # Go through all the labels and find the name
    df=train_labels_waymo[train_labels_waymo[0]==name]
    
    if df.empty: 
        #print("Found no label")
        images_no_labels_wt.append(name)
        continue
    else: 
        # Get the label
        true_labels_waymo_train[i] = df[1]
        
print("Total labels not found:", len(images_no_labels_wt))

Total labels not found: 0


In [21]:
# Count the zeros (the ones that were not replaced after being init to 0)
# Why is one of each left here?
print("Zeros left: {}, {}, {}, {}".format(len(np.where( true_labels_honda_train == 0)),
                                          len(np.where( true_labels_waymo_train == 0)),
                                          len(np.where( true_labels_honda_val == 0)),
                                          len(np.where( true_labels_waymo_val == 0))))

train_targets_honda = np.asarray(true_labels_honda_train)
train_targets_waymo = np.asarray(true_labels_waymo_train)

val_targets_honda = np.asarray(true_labels_honda_val)
val_targets_waymo = np.asarray(true_labels_waymo_val)

print(train_targets_honda)
print(val_targets_waymo)

Zeros left: 1, 1, 1, 1
[-0.25232404 -0.25232404 -0.20584329 ... -0.13944223 -0.13944223
 -0.11952191]
[-0.15680866 -0.16177003  0.08695273 ... -0.42162678 -0.03659157
  0.37296704]


In [22]:
# Images that would not load
print("Files with problems:\n")
for m in problem_file_paths: print(m, end="\n")
    
# Append lists --> arrays
train_images_honda = np.asarray(im_honda_tr)
train_images_waymo = np.asarray(im_waymo_tr)

val_images_honda = np.asarray(im_honda_val) 
val_images_waymo = np.asarray(im_waymo_val)

print(type(train_images_honda))
print(type(train_images_honda))

Files with problems:

./trainWaymo/143441.jpg
./trainWaymo/161244.jpg
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [23]:
# See if all images and labels sizes check out
print("Secured Images Honda: train:{} , val:{}".format(train_images_honda.shape, val_images_honda.shape))
print("Secured Images Waymo: train:{} , val:{}".format(train_images_waymo.shape, val_images_waymo.shape))

print("Paths of Secured Honda Images: train:{} , val:{}".format(len(train_images_honda),len(val_images_honda)))
print("Paths of Secured Waymo Images: train:{} , val:{}".format(len(train_images_waymo), len(val_images_waymo)))

print("Secured Targets Honda: train:{} , val:{}".format(train_targets_honda.shape, val_targets_honda.shape))
print("Secured Targets Waymo: train:{} , val:{}".format(train_targets_waymo.shape, val_targets_waymo.shape))

Secured Images Honda: train:(100000, 66, 200, 3) , val:(10000, 66, 200, 3)
Secured Images Waymo: train:(107945, 66, 200, 3) , val:(33418, 66, 200, 3)
Paths of Secured Honda Images: train:100000 , val:10000
Paths of Secured Waymo Images: train:107945 , val:33418
Secured Targets Honda: train:(100000,) , val:(10000,)
Secured Targets Waymo: train:(107945,) , val:(33418,)


In [24]:
# Pytorch expects: [batch, channel, width, height], so reshape
train_images_honda = np.moveaxis(train_images_honda,-1,1)
val_images_honda = np.moveaxis(val_images_honda,-1,1)

train_images_waymo = np.moveaxis(train_images_waymo,-1,1)
val_images_waymo = np.moveaxis(val_images_waymo,-1,1)

# Targets dont need reshape
print("Reshaped Honda: Train {}, Val {}".format(train_images_honda.shape, val_images_honda.shape))
print("Reshaped Waymo: Train {}, Val {}".format(train_images_waymo.shape, val_images_waymo.shape))

# Release data without window
# Save as NPZ
np.savez('./train_honda.npz',train_images=train_images_honda, train_targets=train_targets_honda)
np.savez('./val_honda.npz',val_images=val_images_honda, val_targets=val_targets_honda)

np.savez('./train_waymo.npz',train_images=train_images_waymo, train_targets=train_targets_waymo)
np.savez('./val_waymo.npz',val_images=val_images_waymo, val_targets=val_targets_waymo)

Reshaped Honda: Train (100000, 3, 66, 200), Val (10000, 3, 66, 200)
Reshaped Waymo: Train (107945, 3, 66, 200), Val (33418, 3, 66, 200)
