# Feature detection

During feature detection we load all the postal-code images and split each of them into four separate digit images. We binarize the images and extract features from every digit. Finally, we export the features to a CSV file.

In [124]:
# imports

from SimpleCV import *
# from IPython.display import HTML
import matplotlib.pyplot as plt
import pandas as pd
# import cv2
import glob

Possible additions (TODO)

- maybe change the matrix to (or add): how many pixels in each piece are black
- blob.contours -- number of corners
- use 8x8 instead of 4x4
- edge detection


### Image subtraction and splitting

- We load the images from the folder. We binarize, dilate, and erode them: binarizing makes them black and white so the number stands out, dilating makes the digits smoother and thicker, and eroding removes noise. Next we invert the result so that the numbers are black and the background is white, and embiggen it onto a 128x40 white canvas. We store the image and the inverted image together with the correct label.
- Next we split each postal code into four separate digit images of 32x32 (the inverted, padded copies are split into pieces of 32x40).

In [125]:
disp = Display(displaytype='notebook')

# Width and height (in pixels) of a single digit in the binarized image.
size = 32

# The inverted copy is padded onto a 128x40 canvas, so its digits are 32x40.
sizeWidth = 32
sizeHeight = 40

# Folder that holds the postal-code images; the label is read from the file
# name that follows this prefix.
imageDir = "../dataset-images/"

imagelist = []
horizontalSplit=[]

# Load every postal-code image and keep (label, binarized image, inverted image).
for filename in glob.glob(imageDir + "*.png"):
    originalImage = Image(filename)
    # Black/white conversion followed by dilate + erode to thicken the digits
    # and suppress noise (see the markdown description of this step).
    image = originalImage.binarize().dilate().erode()
    # Inverted copy, placed at offset (3,3) on a white 128x40 canvas.
    invertedImage = image.invert().embiggen( (128,40), Color.WHITE, (3,3))
    # The first four characters of the file name are the postal-code digits.
    answer = filename[len(imageDir):len(imageDir) + 4]
    imagelist.append((answer, image, invertedImage))

numbers = []

# Cut every postal code into its four digits: (digit label, image, inverted image).
for answer, postcodeImage, invertedPostcodeImage in imagelist:
    for i in range(4):
        digit = answer[i:i+1]
        # Every digit gets the same fixed height. The previous code multiplied
        # the height by (i+1), so the requested height grew with the digit
        # index (presumably masked by crop clipping to the image bounds).
        normalImage = postcodeImage.crop(i*size, 0, size, size)
        invertedImage = invertedPostcodeImage.crop(i*sizeWidth, 0, sizeWidth, sizeHeight)
        numbers.append((digit, normalImage, invertedImage))

In [126]:
# # divide number images into pieces of 8x8 and create matrix. image is 32X32
# def createMatrix(image):
#     size = 4
#     m = np.zeros((8,8))

#     pieces = []
#     for y in range(8):
#         for x in range(8):
#             crop = image.crop(x*size, y*size, size, size)
#             pieces.append(crop)
#             m[x][y] = 1 if crop.meanColor()[0] > 80 else 0
#     return m.T

## Functions for feature extraction

- createMatrix - divides the 32x32 digit image into an 8x8 grid of cells of 4x4 pixels. *crop.getNumpy* returns a numpy array of the pixels, from which we take one of the three RGB channels; because the image is black and white it doesn't matter which one we take. Dividing by 255 maps every pixel to one or zero: one is white and zero is black. Adding up these 0's and 1's gives the number of white pixels in each 4x4 cell, which results in an 8x8 matrix. The white pixels are the numbers and the black pixels are the background.
- isCentroidPixelBlack - checks whether the pixel at the centroid (the weighted center) is black, which means that the center of the number is empty. In the case of a 0, for example, the middle pixel should be black and thus empty.
- getBlob - finds the blob in the image and returns it.
- centroidTopBottom - splits the image horizontally into a top and a bottom half at the centroid. For each half it then checks the centroid of that half and returns 1 if it is black. For example with an 8, the centroids of both halves should be black.

In [127]:
# Divide a digit image (expected to be 32x32) into an 8x8 grid of 4x4-pixel
# cells and count the white pixels in every cell.
def createMatrix(image):
    """Return an 8x8 integer matrix with the number of white pixels per cell.

    Parameters:
        image: object offering ``crop(x, y, w, h)`` whose result offers
            ``getNumpy()`` returning a 3-D pixel array; only channel 0 is read.

    Returns:
        8x8 integer numpy array. Entry ``[y][x]`` is the count for the cell
        whose top-left corner is at pixel ``(x*4, y*4)``; every count lies
        between 0 and 16.
    """
    size = 4
    m = np.zeros((8,8))

    for x in range(8):
        for y in range(8):
            crop = image.crop(x*size, y*size, size, size)
            # Explicit floor division: only fully white pixels (255) count as
            # 1, everything else as 0. A plain "/" gives that result only
            # under Python 2 integer division; with true division grey pixels
            # would contribute fractions to the sum.
            pixels = crop.getNumpy()[:,:,0] // 255
            m[x][y] = pixels.sum()
    # m is indexed [x][y]; transpose so that rows correspond to y.
    return m.T.astype(int)

In [128]:
# Tell whether the pixel under the centroid is dark: 1 when dark, 0 when bright.
def isCentroidPixelBlack(image, centroid):
    """Return 1 if the single pixel at ``centroid`` is dark, otherwise 0.

    Parameters:
        image: object offering ``crop(x, y, w, h)``; the cropped result must
            offer ``meanColor()``.
        centroid: indexable pair ``(x, y)`` locating the pixel to inspect.

    Returns:
        0 when the first channel of the pixel's mean color is above 200,
        1 in every other case.
    """
    cx = centroid[0]
    cy = centroid[1]
    # A 1x1 crop isolates exactly the pixel at the centroid.
    pixel = image.crop(cx, cy, 1, 1)
    brightness = pixel.meanColor()[0]
    isBright = brightness > 200
    return int(not isBright)

In [129]:
# Find the blobs in the image and return the biggest one.
def getBlob(image):
    """Show the image, draw its blobs on it and return the largest blob.

    Parameters:
        image: object offering ``show()``, ``findBlobs()`` and
            ``draw(features, autocolor=...)``.

    Returns:
        The blob with the largest ``area()``. Selecting by area instead of
        taking index 0 keeps the result independent of the order in which
        ``findBlobs`` returns its blobs, so a small noise speck is never
        preferred over the digit.

    Raises:
        ValueError: if no blob is found in the image.
    """
    image.show()
    blobs = image.findBlobs()
    # NOTE(review): SimpleCV is documented to return None (not an empty set)
    # when nothing is found; fail with a clear message instead of a TypeError.
    if not blobs:
        raise ValueError("getBlob: no blobs found in image")
    image.draw(blobs, autocolor=True)
    return max(blobs, key=lambda blob: blob.area())

def centroidTopBottom(image, centroidY, height):
    """Split the image at ``centroidY`` and test the centroid pixel of each half.

    The image is cut horizontally into a top part (rows above ``centroidY``)
    and a bottom part (rows from ``centroidY`` down). For each part the blob
    is located and the pixel at that blob's centroid is tested.

    Parameters:
        image: digit image to split; must offer ``crop`` plus ``width`` and
            ``height`` attributes.
        centroidY: y-coordinate at which the image is split.
        height: no longer used, kept so existing calls keep working. It used
            to be passed as the crop *width* and as the lower bound of the
            bottom half, which truncated both halves whenever the value was
            smaller than the image (the visible caller passes the blob
            height).

    Returns:
        Tuple ``(pixelBlackTop, pixelBlackBottom)`` holding the
        ``isCentroidPixelBlack`` result for the top and the bottom half.
    """
    # Use one integer split row for both halves so they neither overlap nor
    # leave a gap.
    splitY = int(centroidY)

    # Top half: full image width, rows [0, splitY).
    croppedImageTop = image.crop(0, 0, image.width, splitY)
    blobTop = getBlob(croppedImageTop)
    # The centroid is expressed in the coordinates of the cropped half.
    centroidTop = blobTop.centroid()
    pixelBlackTop = isCentroidPixelBlack(croppedImageTop, centroidTop)

    # Bottom half: full image width, rows [splitY, image height).
    croppedImageBottom = image.crop(0, splitY, image.width, image.height - splitY)
    blobBottom = getBlob(croppedImageBottom)
    centroidBottom = blobBottom.centroid()
    pixelBlackBottom = isCentroidPixelBlack(croppedImageBottom, centroidBottom)

    return (pixelBlackTop, pixelBlackBottom)

In [137]:
# Column layout of the feature table: the blob features first, then the 8x8
# grid of white-pixel counts, and the digit label last.
blobFeatureNames = [
    "blob_area",
    "blob_centroid_x",
    "blob_centroid_y",
    "blob_angle",
    "blob_height",
    "blob_width",
    "blob_amount_contours",
    "number_of_holes",
    "centroid_is_empty",
    "centroid_top_is_empty",
    "centroid_bottom_is_empty",
]

gridNames = []
for x in range(8):
    for y in range(8):
        gridNames.append("grid_" + str(x) + "_" + str(y))

columns = blobFeatureNames + gridNames + ["label"]

# One list per column; every digit appends exactly one value to each list.
numbersDict = dict((name, []) for name in columns)

for digit, image, invertedImage in numbers:
    # Blobs found in the inverted image minus one (presumably the background
    # counts as one blob, leaving the holes of the digit).
    invertedBlobCount = len(invertedImage.findBlobs()) - 1
    blob = getBlob(image)
    matrix = createMatrix(image)
    centroid = blob.centroid()
    split = centroidTopBottom(image, centroid[1], blob.height())

    # createMatrix returns the grid indexed [y][x].
    for x in range(8):
        for y in range(8):
            numbersDict["grid_" + str(x) + "_" + str(y)].append(matrix[y][x])

    # Values listed in the same order as blobFeatureNames.
    blobFeatureValues = [
        blob.area(),
        centroid[0],
        centroid[1],
        blob.angle(),
        blob.height(),
        blob.width(),
        len(blob.contour()),
        invertedBlobCount,
        isCentroidPixelBlack(image, centroid),
        split[0],
        split[1],
    ]
    for name, value in zip(blobFeatureNames, blobFeatureValues):
        numbersDict[name].append(value)

    numbersDict["label"].append(digit)

# Fix the column order and sort the rows by digit label.
df = pd.DataFrame(numbersDict)[columns].sort_values("label")

In [138]:
# Write the feature table to disk as comma-separated UTF-8 text, leaving out
# the DataFrame index.
outputPath = "../dataset-numpy/dataset_features_contours.csv"
df.to_csv(outputPath, index=False, encoding='utf-8', sep=',')

In [None]:
# TODO: move this cell to feature analysis.

df.describe()

In [None]:
# TODO: move this cell to feature analysis.

# Count the samples for every combination of label and number of holes.
df.groupby(['label', 'number_of_holes']).size()