***
ENGR 418 Project Stage 2


Group - 26


Stuart Watts - 94854395

Spencer Marchand - 42569939

***



Import needed modules
***

In [818]:
import os
import numpy as np
from numpy import asarray
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import confusion_matrix, accuracy_score
from PIL import Image, ImageFilter
import math

Function Definition
***

Function to get image data from specified folder 


In [819]:
def get_data(folder,im_width,n_samples,angle):
    """Load images from ``folder`` as flattened grayscale rows with class labels.

    Parameters
    ----------
    folder : str
        Directory holding the image files. A trailing separator is optional.
    im_width : int
        Every image is resized to ``im_width`` x ``im_width`` pixels.
    n_samples : int
        Number of entries of ``os.listdir(folder)`` to read, starting from the
        first one returned.
    angle : number
        Rotation passed to ``Image.rotate`` before resizing.

    Returns
    -------
    x : ndarray, shape (n_samples, im_width**2)
        One flattened grayscale image per row.
    y : ndarray, shape (n_samples, 1)
        0 for file names starting with 'c', 1 for names starting with 'r',
        2 for everything else.
    """
    file_names = os.listdir(folder)

    x = np.empty((n_samples, im_width**2))
    y = np.empty((n_samples, 1))

    for i in range(n_samples):
        name = file_names[i]
        # os.path.join only inserts a separator when folder lacks one, so
        # callers that already pass a trailing '/' get the same path as before
        path = os.path.join(folder, name)

        # the context manager closes the file handle; convert() returns a new
        # image, so the pixel data is still usable after the file is closed
        with Image.open(path) as source:
            im = source.convert('L').rotate(angle).resize((im_width, im_width))
        x[i, :] = asarray(im).reshape(-1)

        # the class label is taken from the first letter of the file name
        if name.startswith('c'):
            y[i, :] = 0
        elif name.startswith('r'):
            y[i, :] = 1
        else:
            y[i, :] = 2

    return x,y


Function to get the number of files in a folder


In [820]:
def get_samp_num(folder):
    """Return the number of entries that ``os.listdir`` reports for ``folder``."""
    return len(os.listdir(folder))

Function to return the sum of the edge data for each image


In [821]:
def process_image_pix_sum(image_array,edge_thresh,peak_thresh,im_width):
    """Return the totals of the column and row edge histograms of one image.

    The pixel values are stretched to the 0-255 range, run through PIL's
    FIND_EDGES filter, cropped by one pixel on every side and thresholded.
    The surviving edge values are summed per column and per row, and each of
    those two histograms is then added up.

    Parameters
    ----------
    image_array : ndarray holding im_width**2 pixel values.
    edge_thresh : edge values below this are set to 0.
    peak_thresh : not used by this function.
    im_width : side length of the square image.

    Returns
    -------
    (v_sum, h_sum) : total of the column histogram and total of the row
        histogram. Both add up the same thresholded edge map, so the two
        values are equal.
    """
    # stretch the pixel values to 0-255; a constant image has zero range and
    # would otherwise divide by zero and fill the array with NaN
    lowest = np.min(image_array)
    value_range = np.max(image_array) - lowest
    if value_range == 0:
        stretched = np.zeros(np.shape(image_array), dtype=float)
    else:
        stretched = (image_array - lowest) * 255 / value_range

    # filter the data to find edges
    im = Image.fromarray(stretched.reshape(im_width, im_width)).convert('L')
    edges_array = np.asarray(im.filter(ImageFilter.FIND_EDGES))

    # drop the outermost pixel on every side, then zero out weak edges
    edge_map = edges_array[1:im_width-1, 1:im_width-1].copy()
    edge_map[edge_map < edge_thresh] = 0

    # edge histograms: one total per column and one total per row
    edges_v = np.sum(edge_map, axis=0).astype(float)
    edges_h = np.sum(edge_map, axis=1).astype(float)

    return edges_v.sum(), edges_h.sum()

Function to return the number of non-zero columns and rows in the edge histograms of the image passed to it


In [822]:
def process_image_non0(image_array,edge_thresh,peak_thresh,im_width):
    """Count the columns and rows of one image that still contain edges.

    The pixel values are stretched to the 0-255 range, run through PIL's
    FIND_EDGES filter, cropped by one pixel on every side and thresholded.
    The surviving edge values are summed per column and per row, and the
    non-zero entries of each histogram are counted.

    Parameters
    ----------
    image_array : ndarray holding im_width**2 pixel values.
    edge_thresh : edge values below this are set to 0.
    peak_thresh : not used by this function.
    im_width : side length of the square image.

    Returns
    -------
    (v_count, h_count) : number of non-zero column totals and number of
        non-zero row totals.
    """
    # stretch the pixel values to 0-255; a constant image has zero range and
    # would otherwise divide by zero and fill the array with NaN
    lowest = np.min(image_array)
    value_range = np.max(image_array) - lowest
    if value_range == 0:
        stretched = np.zeros(np.shape(image_array), dtype=float)
    else:
        stretched = (image_array - lowest) * 255 / value_range

    # filter the data to find edges
    im = Image.fromarray(stretched.reshape(im_width, im_width)).convert('L')
    edges_array = np.asarray(im.filter(ImageFilter.FIND_EDGES))

    # drop the outermost pixel on every side, then zero out weak edges
    edge_map = edges_array[1:im_width-1, 1:im_width-1].copy()
    edge_map[edge_map < edge_thresh] = 0

    # edge histograms: one total per column and one total per row
    edges_v = np.sum(edge_map, axis=0).astype(float)
    edges_h = np.sum(edge_map, axis=1).astype(float)

    # count the non-zero columns and rows
    v_count = int(np.count_nonzero(edges_v))
    h_count = int(np.count_nonzero(edges_h))

    return v_count,h_count

Function to return the average of the non-zero column and row edge-histogram values of the image passed to it


In [823]:
def process_image_pix_ave(image_array,edge_thresh,peak_thresh, im_width):
    """Return the mean of the non-zero column and row edge-histogram values.

    The pixel values are stretched to the 0-255 range, run through PIL's
    FIND_EDGES filter, cropped by one pixel on every side and thresholded.
    The surviving edge values are summed per column and per row; each
    histogram's total is then divided by its number of non-zero entries.

    Parameters
    ----------
    image_array : ndarray holding im_width**2 pixel values.
    edge_thresh : edge values below this are set to 0.
    peak_thresh : not used by this function.
    im_width : side length of the square image.

    Returns
    -------
    (ave_v, ave_h) : average over the non-zero column totals and average over
        the non-zero row totals. 0.0 is returned for a histogram that has no
        non-zero entry, instead of dividing by zero.
    """
    # stretch the pixel values to 0-255; a constant image has zero range and
    # would otherwise divide by zero and fill the array with NaN
    lowest = np.min(image_array)
    value_range = np.max(image_array) - lowest
    if value_range == 0:
        stretched = np.zeros(np.shape(image_array), dtype=float)
    else:
        stretched = (image_array - lowest) * 255 / value_range

    # filter the data to find edges
    im = Image.fromarray(stretched.reshape(im_width, im_width)).convert('L')
    edges_array = np.asarray(im.filter(ImageFilter.FIND_EDGES))

    # drop the outermost pixel on every side, then zero out weak edges
    edge_map = edges_array[1:im_width-1, 1:im_width-1].copy()
    edge_map[edge_map < edge_thresh] = 0

    # edge histograms: one total per column and one total per row
    edges_v = np.sum(edge_map, axis=0).astype(float)
    edges_h = np.sum(edge_map, axis=1).astype(float)

    # number of non-zero entries in each histogram
    v_count = int(np.count_nonzero(edges_v))
    h_count = int(np.count_nonzero(edges_h))

    # averages over the non-zero entries; an image with no surviving edges
    # has nothing to average, so report 0.0 rather than produce NaN
    ave_v = edges_v.sum() / v_count if v_count else 0.0
    ave_h = edges_h.sum() / h_count if h_count else 0.0

    return ave_v,ave_h

Function to return the Euclidean norm of the column and row edge histograms of the image passed to it


In [824]:
def process_image_pix_norm(image_array,edge_thresh,peak_thresh, im_width):
    """Return the Euclidean norm of the column and row edge histograms.

    The pixel values are stretched to the 0-255 range, run through PIL's
    FIND_EDGES filter, cropped by one pixel on every side and thresholded.
    The surviving edge values are summed per column and per row; the square
    root of the sum of squares of each histogram is returned.

    Parameters
    ----------
    image_array : ndarray holding im_width**2 pixel values.
    edge_thresh : edge values below this are set to 0.
    peak_thresh : not used by this function.
    im_width : side length of the square image.

    Returns
    -------
    (norm_v, norm_h) : norm of the column histogram and norm of the row
        histogram.
    """
    # stretch the pixel values to 0-255; a constant image has zero range and
    # would otherwise divide by zero and fill the array with NaN
    lowest = np.min(image_array)
    value_range = np.max(image_array) - lowest
    if value_range == 0:
        stretched = np.zeros(np.shape(image_array), dtype=float)
    else:
        stretched = (image_array - lowest) * 255 / value_range

    # filter the data to find edges
    im = Image.fromarray(stretched.reshape(im_width, im_width)).convert('L')
    edges_array = np.asarray(im.filter(ImageFilter.FIND_EDGES))

    # drop the outermost pixel on every side, then zero out weak edges
    edge_map = edges_array[1:im_width-1, 1:im_width-1].copy()
    edge_map[edge_map < edge_thresh] = 0

    # edge histograms: one total per column and one total per row
    edges_v = np.sum(edge_map, axis=0).astype(float)
    edges_h = np.sum(edge_map, axis=1).astype(float)

    # root of the sum of the squares
    norm_v = math.sqrt(float(np.sum(edges_v ** 2)))
    norm_h = math.sqrt(float(np.sum(edges_h ** 2)))
    return norm_v,norm_h

Function to return the number of column and row edge-histogram values that are at or above the standard deviation of the non-zero histogram values of the image passed to it


In [825]:
def process_image_above_std(image_array,edge_thresh,peak_thresh, im_width):
    """Count the edge-histogram entries that reach the histogram's standard deviation.

    The pixel values are stretched to the 0-255 range, run through PIL's
    FIND_EDGES filter, cropped by one pixel on every side and thresholded.
    The surviving edge values are summed per column and per row. For each
    histogram the standard deviation is taken over its non-zero entries only,
    and every entry (zeros included) that is greater than or equal to that
    value is counted.

    Parameters
    ----------
    image_array : ndarray holding im_width**2 pixel values.
    edge_thresh : edge values below this are set to 0.
    peak_thresh : not used by this function.
    im_width : side length of the square image.

    Returns
    -------
    (v_std_above, h_std_above) : count for the column histogram and count for
        the row histogram.
    """
    # stretch the pixel values to 0-255; a constant image has zero range and
    # would otherwise divide by zero and fill the array with NaN
    lowest = np.min(image_array)
    value_range = np.max(image_array) - lowest
    if value_range == 0:
        stretched = np.zeros(np.shape(image_array), dtype=float)
    else:
        stretched = (image_array - lowest) * 255 / value_range

    # filter the data to find edges
    im = Image.fromarray(stretched.reshape(im_width, im_width)).convert('L')
    edges_array = np.asarray(im.filter(ImageFilter.FIND_EDGES))

    # drop the outermost pixel on every side, then zero out weak edges
    edge_map = edges_array[1:im_width-1, 1:im_width-1].copy()
    edge_map[edge_map < edge_thresh] = 0

    # edge histograms: one total per column and one total per row
    edges_v = np.sum(edge_map, axis=0).astype(float)
    edges_h = np.sum(edge_map, axis=1).astype(float)

    # standard deviation of the non-zero values (zeros are masked out)
    # NOTE(review): when a histogram is all zeros the result is fully masked;
    # the comparisons below then appear to evaluate as false, giving a count
    # of 0 - confirm
    v_std = np.std(np.ma.masked_equal(edges_v,0))
    h_std = np.std(np.ma.masked_equal(edges_h,0))

    # count the entries at or above the standard deviation; the comparison is
    # done one entry at a time so the masked case behaves as it always has
    v_std_above = 0
    for column_total in edges_v:
        if column_total >= v_std:
            v_std_above += 1

    h_std_above = 0
    for row_total in edges_h:
        if row_total >= h_std:
            h_std_above += 1

    return v_std_above,h_std_above

Function to test a model and present the confusion matrix and accuracy score


In [826]:
# evaluate a fitted model on the supplied data and print its scores
def test(test_data,test_class,model):
    """Print the accuracy (in percent) and confusion matrix of ``model``.

    ``model.predict`` is called on ``test_data`` and the predictions are
    compared with the true labels in ``test_class``. Nothing is returned.
    """
    predictions = model.predict(test_data)

    # accuracy as a percentage
    accuracy_percent = 100*accuracy_score(test_class,predictions)
    print("Accuracy Score - " , accuracy_percent,"%")

    # confusion matrix of true labels against predicted labels
    print("\nConfusion Matrix\n")
    matrix = confusion_matrix(test_class,predictions)
    print(matrix)

Variable setup
***

**IMPORTANT** Change the path of the data when running on a new machine
***

In [827]:
# side length, in pixels, that every image is resized to
im_width = 64

# root folder of the dataset: change this one line when running on a new
# machine or when the data is moved
dataset_root = 'C:/Users/stuar/Documents/4th year/1st semester/engr 418/Project/Lego_dataset_2/Lego_dataset_2'

# training and testing folders; the trailing '/' is kept because the folder
# string is used as a prefix for the file names
folder_training = dataset_root + '/training' + '/'
folder_testing = dataset_root + '/testing' + '/'

# number of files found in each folder
n_training = get_samp_num(folder_training)
n_testing = get_samp_num(folder_testing)

# thresholds handed to the process_image_* functions:
# ET is passed as edge_thresh, PT as peak_thresh
ET = 64
PT = 0.5

# load the image data and class labels (no rotation applied); get_data
# allocates the arrays itself, so nothing is pre-allocated here
x_train, y_train = get_data(folder_training, im_width, n_training, 0)
x_test, y_test = get_data(folder_testing, im_width, n_testing, 0)

# zero-filled tables for the engineered features, two columns per image
edge_features_train_pix_sum = np.zeros((n_training,2))
edge_features_test_pix_sum = np.zeros((n_testing,2))
edge_features_train_num_non0 = np.zeros((n_training,2))
edge_features_test_num_non0 = np.zeros((n_testing,2))
edge_features_train_pix_ave = np.zeros((n_training,2))
edge_features_test_pix_ave = np.zeros((n_testing,2))
edge_features_train_pix_norm = np.zeros((n_training,2))
edge_features_test_pix_norm = np.zeros((n_testing,2))
edge_features_train_above_std = np.zeros((n_training,2))
edge_features_test_above_std = np.zeros((n_testing,2))

Feature Engineering
***

Training features

In [828]:
# pair every training feature table with the function that fills it, in the
# order: histogram sum, non-zero count, non-zero average, norm, count above std
train_extractors = (
    (edge_features_train_pix_sum, process_image_pix_sum),
    (edge_features_train_num_non0, process_image_non0),
    (edge_features_train_pix_ave, process_image_pix_ave),
    (edge_features_train_pix_norm, process_image_pix_norm),
    (edge_features_train_above_std, process_image_above_std),
)

# fill row i of every table from training image i
for i in range(n_training):
    for feature_table, extract in train_extractors:
        feature_table[i,:] = extract(x_train[i,:],ET,PT,im_width)

print("Training features complete")

Training features complete


Testing features 

In [829]:
# pair every testing feature table with the function that fills it, in the
# order: histogram sum, non-zero count, non-zero average, norm, count above std
test_extractors = (
    (edge_features_test_pix_sum, process_image_pix_sum),
    (edge_features_test_num_non0, process_image_non0),
    (edge_features_test_pix_ave, process_image_pix_ave),
    (edge_features_test_pix_norm, process_image_pix_norm),
    (edge_features_test_above_std, process_image_above_std),
)

# fill row i of every table from testing image i
for i in range(n_testing):
    for feature_table, extract in test_extractors:
        feature_table[i,:] = extract(x_test[i,:],ET,PT,im_width)

print("Testing features complete")

Testing features complete


Feature Selection
***

In [830]:
# stack the five two-column feature tables side by side: ten columns per
# image, in the same order for training and testing
features_train = np.concatenate(
    [
        edge_features_train_pix_sum,
        edge_features_train_num_non0,
        edge_features_train_pix_ave,
        edge_features_train_pix_norm,
        edge_features_train_above_std,
    ],
    axis=1,
)
features_test = np.concatenate(
    [
        edge_features_test_pix_sum,
        edge_features_test_num_non0,
        edge_features_test_pix_ave,
        edge_features_test_pix_norm,
        edge_features_test_above_std,
    ],
    axis=1,
)

# define the logistic regression model
model = LogisticRegression(max_iter=100000)

# let the selector pick four of the ten features using the training data
sfs = SequentialFeatureSelector(model, n_features_to_select=4)
sfs.fit(features_train, y_train.ravel())

# boolean mask: True marks a feature column that was kept
print(sfs.get_support())

# keep only the selected columns in both feature arrays
train_features_selected = sfs.transform(features_train)
test_features_selected = sfs.transform(features_test)

[False False  True  True False False False  True  True False]


Model Training
***

In [831]:
# fit the classifier on the selected training features; the labels are
# flattened from a column to a 1-D array first
training_labels = y_train.ravel()
model.fit(train_features_selected, training_labels)

# sanity check: score the model on the data it was trained on
print("Training Data Test\n")
test(test_data=train_features_selected, test_class=y_train, model=model)


Training Data Test

Accuracy Score -  100.0 %

Confusion Matrix

[[27  0  0]
 [ 0 27  0]
 [ 0  0 27]]


Model Testing
***

In [832]:
# score the trained model on the selected features of the testing images
print("Testing Data Test\n")
test(test_data=test_features_selected, test_class=y_test, model=model)


Testing Data Test

Accuracy Score -  97.53086419753086 %

Confusion Matrix

[[26  0  1]
 [ 0 27  0]
 [ 0  1 26]]
