In [1]:
'''
MAIN FEATURE EXTRACTION -- CREATES CSV with all features
requires a separate folder from CSAFE database of writers
'''
import cv2
import numpy as np
import os
import pandas as pd
from skimage.feature import hog, local_binary_pattern
from skimage.morphology import skeletonize
from skimage.measure import label, regionprops

In [2]:
def preprocess_image(image_path):
    # Load the image in grayscale mode
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    # Apply a 3x3 median filter for noise removal
    denoised_image = cv2.medianBlur(image, 3)

    # Apply Otsu's thresholding for binarization
    _, binarized_image = cv2.threshold(denoised_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    return binarized_image

In [3]:
def crop_center(img, cropx, cropy):
    y, x = img.shape
    startx = x // 2 - cropx // 2
    starty = y // 2 - cropy // 2
    return img[starty:starty + cropy, startx:startx + cropx]

def calculate_pixel_percentages(img, m=100):
    cropped_img = crop_center(img, m, m)
    black_pixels = np.sum(cropped_img < 128)
    white_pixels = np.sum(cropped_img >= 128)
    total_pixels = m * m
    percentage_black_pixels = (black_pixels / total_pixels) * 100
    percentage_white_pixels = (white_pixels / total_pixels) * 100

    return percentage_black_pixels, percentage_white_pixels
def calculate_line_irregularity(binary_img):
    # Ensure binary_img is of type uint8
    binary_img = binary_img.astype(np.uint8) * 255
    kernel = np.ones((5, 5), np.uint8)
    dilated_img = cv2.dilate(binary_img, kernel, iterations=1)

    # Calculate projection profile as a sum of pixels along each row
    projection = np.sum(dilated_img, axis=1)
    
    # Make projection 2-D for processing with label and regionprops
    projection_2d = np.expand_dims(projection, axis=1)

    # Basic peak detection
    peaks = projection_2d > np.mean(projection)
    labeled_peaks, num_features = label(peaks, return_num=True)
    props = regionprops(labeled_peaks)

    # Calculate the standard deviation of the line heights
    heights = [prop.bbox[2] - prop.bbox[0] for prop in props]  # Height is max_row - min_row
    line_irregularity = np.std(heights) if heights else 0

    return line_irregularity

def extract_features(image):
    # Binary image for pixel percentage calculation
    _, binary_img = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    
    edges = cv2.Canny(image, 100, 200)
    sobelx = cv2.Sobel(image, cv2.CV_64F, 1, 0, ksize=5)
    sobely = cv2.Sobel(image, cv2.CV_64F, 0, 1, ksize=5)
    sobel = np.sqrt(sobelx**2 + sobely**2)
    hog_features, _ = hog(image, orientations=8, pixels_per_cell=(16, 16),
                          cells_per_block=(1, 1), visualize=True, feature_vector=True)
    
    numPoints = 24
    radius = 8
       # Local Binary Pattern
    lbp = local_binary_pattern(binary_img, numPoints, radius, method="uniform")
    lbp_hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, numPoints + 3), range=(0, numPoints + 2))
    lbp_hist = lbp_hist.astype("float")
    lbp_hist /= (lbp_hist.sum() + 1e-7)  # Normalize

        
        # Combine HOG and LBP embeddings
    embedding = np.append(hog_features, lbp_hist)


    skeleton = skeletonize(binary_img > 0)
    endpoints = np.array((skeleton & np.roll(~skeleton, shift=1, axis=0)) |
                         (skeleton & np.roll(~skeleton, shift=-1, axis=0)) |
                         (skeleton & np.roll(~skeleton, shift=1, axis=1)) |
                         (skeleton & np.roll(~skeleton, shift=-1, axis=1)))
    num_endpoints = np.sum(endpoints)

    dist_transform = cv2.distanceTransform(binary_img, cv2.DIST_L2, 5)
    avg_thickness = np.mean(dist_transform)

    y, x = np.nonzero(sobel)
    angles = np.arctan2(sobely[y, x], sobelx[y, x])
    angle_diff = np.diff(angles)
    avg_curvature = np.mean(np.abs(angle_diff))

    percentage_black_pixels, percentage_white_pixels = calculate_pixel_percentages(binary_img, m=100)
    line_irregularity = calculate_line_irregularity(binary_img)

    avg_canny = np.mean(edges)
    avg_sobel = np.mean(sobel)
    avg_hog = np.mean(hog_features)

    return (avg_canny, avg_sobel, avg_hog, num_endpoints, avg_thickness, avg_curvature,
            percentage_black_pixels, percentage_white_pixels, line_irregularity, embedding.mean())

In [4]:
def process_students_folders(base_dir):
    feature_data = []
    
    students = [f for f in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, f))]
    
    for student in students:
        student_dir = os.path.join(base_dir, student)
        images = [f for f in os.listdir(student_dir) if f.endswith('.png')]
        
        for image_name in images:
            image_path = os.path.join(student_dir, image_name)
            print(f"Processing {image_name} in {student}")
            preprocessed_image = preprocess_image(image_path)
            features = extract_features(preprocessed_image)

            feature_row = [student, image_name] + list(features)
            feature_data.append(feature_row)

    feature_columns = [
        'Student', 'Image Name', 'Canny Edge', 'Sobel Edge', 'HOG Features', 'Endpoints', 
        'Thickness', 'Curvature', 'Percentage Black Pixels', 'Percentage White Pixels', 
        'Line Irregularity', 'LBP Mean'
    ]
    
    df = pd.DataFrame(feature_data, columns=feature_columns)

    return df

base_dir = 'student_images'
features_df = process_students_folders(base_dir)

Processing w0011_s01_pLND_r01.png in student_11
Processing w0011_s01_pLND_r02.png in student_11
Processing w0011_s01_pLND_r03.png in student_11
Processing w0011_s01_pWOZ_r01.png in student_11
Processing w0011_s01_pWOZ_r02.png in student_11
Processing w0011_s01_pWOZ_r03.png in student_11
Processing w0011_s01_pPHR_r01.png in student_11
Processing w0011_s01_pPHR_r03.png in student_11
Processing w0011_s01_pPHR_r02.png in student_11
Processing w0029_s01_pPHR_r01.png in student_29
Processing w0029_s01_pPHR_r03.png in student_29
Processing w0029_s01_pPHR_r02.png in student_29
Processing w0029_s01_pWOZ_r01.png in student_29
Processing w0029_s01_pWOZ_r02.png in student_29
Processing w0029_s01_pWOZ_r03.png in student_29
Processing w0029_s01_pLND_r01.png in student_29
Processing w0029_s01_pLND_r02.png in student_29
Processing w0029_s01_pLND_r03.png in student_29
Processing w0016_s01_pPHR_r01.png in student_16
Processing w0016_s01_pPHR_r02.png in student_16
Processing w0016_s01_pPHR_r03.png in stu

In [5]:
features_df

Unnamed: 0,Student,Image Name,Canny Edge,Sobel Edge,HOG Features,Endpoints,Thickness,Curvature,Percentage Black Pixels,Percentage White Pixels,Line Irregularity,LBP Mean
0,student_11,w0011_s01_pLND_r01.png,1.954612,228.313829,0.018168,77422,683.093872,0.562531,0.0,100.0,362.653247,0.018171
1,student_11,w0011_s01_pLND_r02.png,1.785334,208.698919,0.017240,77259,469.437134,0.567101,0.0,100.0,347.330635,0.017242
2,student_11,w0011_s01_pLND_r03.png,1.870264,217.957042,0.017592,74389,737.896179,0.558024,0.0,100.0,412.611509,0.017594
3,student_11,w0011_s01_pWOZ_r01.png,1.513690,177.134953,0.013801,69528,386.094299,0.575596,0.0,100.0,427.714826,0.013804
4,student_11,w0011_s01_pWOZ_r02.png,1.462108,170.826273,0.013291,61386,920.609863,0.576085,0.0,100.0,487.912184,0.013293
...,...,...,...,...,...,...,...,...,...,...,...,...
337,student_30,w0030_s01_pWOZ_r03.png,1.318809,155.998330,0.012506,52648,662.307373,0.479814,0.0,100.0,466.910301,0.012509
338,student_30,w0030_s01_pWOZ_r02.png,1.330033,157.798340,0.012531,49873,897.073364,0.483180,0.0,100.0,598.032088,0.012534
339,student_30,w0030_s01_pPHR_r01.png,0.272372,32.587327,0.002557,15787,1304.768555,0.466026,0.0,100.0,980.102333,0.002561
340,student_30,w0030_s01_pPHR_r02.png,0.252497,30.021547,0.002405,17325,1307.375854,0.477515,0.0,100.0,866.288022,0.002409


In [6]:
features_df.to_csv('student_features.csv', index=False)
