Import Libraries

In [2]:
import os
import numpy as np
from PIL import Image
import pandas as pd
import matplotlib.pyplot as plt

Load every image and flatten its pixels into a 1-D feature vector

In [3]:
# Set the input directory path
input_directory = "../Dataset/Foto_Resize_50x50"

# Each sub-folder of the input directory is one class.
# Sort the names: os.listdir returns entries in arbitrary order, and the class
# index assigned to each folder must not change between runs or machines.
folders = sorted(
    f for f in os.listdir(input_directory)
    if os.path.isdir(os.path.join(input_directory, f))
)
print(f"Found folders: {folders}")

# Map each folder name to its integer class index (its position in the one-hot vector)
label_map = {folder: idx for idx, folder in enumerate(folders)}
num_classes = len(folders)

# Accepted image file extensions, compared case-insensitively so ".JPG" is not skipped
image_extensions = ('.png', '.jpg', '.jpeg')

# Accumulators: one flattened pixel vector and one class index per photo
X = []
label_indices = []

for folder in folders:
    folder_path = os.path.join(input_directory, folder)

    # Sort the list of photos to ensure consistent order
    photos = sorted(
        p for p in os.listdir(folder_path)
        if p.lower().endswith(image_extensions)
    )

    for photo in photos:
        photo_path = os.path.join(folder_path, photo)

        # The context manager closes the file handle as soon as the pixels are copied.
        # convert("L") forces a single 8-bit channel so every sample has the same
        # number of features; it leaves images that are already grayscale unchanged.
        with Image.open(photo_path) as image:
            image_array = np.array(image.convert("L")).flatten()

        X.append(image_array)
        label_indices.append(label_map[folder])

# Convert lists to numpy arrays
X = np.array(X)

# One-hot encode all labels at once: row i of the identity matrix is the
# one-hot vector for class i (float64, same dtype as np.zeros would give).
y = np.eye(num_classes)[np.asarray(label_indices, dtype=int)]

print(f"Input array shape: {X.shape}")
print(f"Output array shape: {y.shape}")

Found folders: ['Azmira', 'David', 'Dimas', 'Fadhli', 'Fadlin', 'Hafidz', 'Haidar', 'Hanna', 'Keiko', 'Khansa', 'Mikhael', 'Puti', 'Raesa', 'Satwika', 'Toni']
Input array shape: (1384, 2500)
Output array shape: (1384, 15)


Display the data to confirm that the arrays were built correctly

In [4]:
# Wrap the raw arrays in DataFrames so the notebook renders them as tables
X_df = pd.DataFrame(X)
y_df = pd.DataFrame(y)

# Preview the first rows of the inputs, then of the labels
for caption, frame in (
    ("First few entries of X:", X_df),
    ("First few entries of y:", y_df),
):
    print(caption)
    display(frame.head())

# Put features and one-hot labels side by side: one row per sample
combined_array = np.hstack([X, y])

# Tabular view of the combined matrix
combined_df = pd.DataFrame(combined_array)

# Preview the first rows of the combined table
print("First few entries of the combined array:")
display(combined_df.head())

First few entries of X:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
0,218,220,223,224,224,224,225,227,226,226,...,22,21,23,25,133,93,141,149,130,111
1,252,253,253,252,249,245,241,239,235,235,...,199,191,189,181,178,156,168,147,132,118
2,111,119,139,159,154,160,194,118,18,25,...,226,226,225,224,224,223,222,222,218,218
3,122,117,150,182,150,191,172,190,195,197,...,235,236,238,241,244,247,250,252,252,252
4,225,226,227,228,229,230,229,229,232,232,...,22,31,50,156,204,166,167,151,140,121


First few entries of y:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


First few entries of the combined array:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2505,2506,2507,2508,2509,2510,2511,2512,2513,2514
0,218.0,220.0,223.0,224.0,224.0,224.0,225.0,227.0,226.0,226.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,252.0,253.0,253.0,252.0,249.0,245.0,241.0,239.0,235.0,235.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,111.0,119.0,139.0,159.0,154.0,160.0,194.0,118.0,18.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,122.0,117.0,150.0,182.0,150.0,191.0,172.0,190.0,195.0,197.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,225.0,226.0,227.0,228.0,229.0,230.0,229.0,229.0,232.0,232.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Display a few images per label as a visual sanity check

In [5]:
# Function to plot up to `num_images_per_label` photos for each label
def plot_images_per_label(images, labels, num_images_per_label=10, image_shape=(50, 50)):
    """Draw a grid with one row per class and up to N sample images per row.

    Parameters
    ----------
    images : array-like, indexable by sample
        Flattened images; each ``images[i]`` is reshaped to ``image_shape``.
    labels : 2-D array
        One row per sample; the column holding the largest value is taken as
        the sample's class index.
    num_images_per_label : int, default 10
        Maximum number of images shown per class (also the grid column count).
    image_shape : tuple of int, default (50, 50)
        Shape each flattened image is reshaped to before display.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    num_classes = labels.shape[1]

    # Compute every sample's class once instead of calling argmax per sample per class.
    # Only rows that have a matching image are considered.
    class_of_sample = np.argmax(labels[:len(images)], axis=1)

    plt.figure(figsize=(15, num_classes * 3))  # Adjust the figure size to reduce gaps

    for label_idx in range(num_classes):
        # First N samples of this class, in dataset order
        sample_indices = np.flatnonzero(class_of_sample == label_idx)[:num_images_per_label]
        for column, sample_idx in enumerate(sample_indices):
            # Subplot positions are 1-based and numbered row by row
            plt.subplot(num_classes, num_images_per_label,
                        label_idx * num_images_per_label + column + 1)
            plt.imshow(images[sample_idx].reshape(image_shape), cmap='gray')
            plt.title(f"Label: {label_idx}")
            plt.axis('off')

    plt.tight_layout(pad=1.0)  # Adjust the padding between subplots
    plt.show()

# Plot up to 10 photos for each label -- uncomment below to run
# plot_images_per_label(X, y, num_images_per_label=10)

Normalize the input

In [6]:
class MinMaxScaler:
    """Feature-wise min-max scaler that maps each column to the range [0, 1].

    ``fit`` records the per-column minimum and maximum (along axis 0);
    ``transform`` rescales with ``(x - min) / (max - min)``. Values outside
    the fitted range map outside [0, 1].
    """

    def __init__(self):
        # Per-feature statistics; populated by fit()
        self.min = None
        self.max = None

    def fit(self, data):
        """Record the per-column minimum and maximum of ``data``.

        The statistics are stored as floats so that later arithmetic on
        unsigned-integer inputs (e.g. uint8 pixels) cannot wrap around.

        Returns
        -------
        MinMaxScaler
            ``self``, so that ``MinMaxScaler().fit(data)`` can be chained.
        """
        self.min = np.asarray(data.min(axis=0), dtype=float)
        self.max = np.asarray(data.max(axis=0), dtype=float)
        return self

    def _safe_range(self):
        """Return ``max - min`` with zero spans replaced by 1 to avoid division by zero."""
        span = self.max - self.min
        return np.where(span == 0, 1.0, span)

    def transform(self, data):
        """Scale ``data`` so that each fitted column spans [0, 1].

        A column that was constant during ``fit`` (max == min) maps to 0
        instead of producing NaN/inf.
        """
        return (data - self.min) / self._safe_range()

    def inverse_transform(self, data):
        """Map scaled values back to the original feature range.

        Uses the true span (not the zero-guarded one), so a constant column
        is restored to its constant value.
        """
        return data * (self.max - self.min) + self.min

# Learn the per-pixel min/max from X, then rescale X with those statistics
scalerinput = MinMaxScaler()
scalerinput.fit(X)
X_normalized = scalerinput.transform(X)

# Place the scaled features next to the one-hot labels, one row per sample
combined_array = np.hstack([X_normalized, y])

# Tabular view of the combined matrix for display
combined_df = pd.DataFrame(data=combined_array)

# Preview the first rows
print("First few entries of the combined array:")
display(combined_df.head())

First few entries of the combined array:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2505,2506,2507,2508,2509,2510,2511,2512,2513,2514
0,0.846473,0.854167,0.864407,0.869748,0.869748,0.870293,0.88,0.886179,0.882114,0.881633,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.987552,0.991667,0.991525,0.987395,0.97479,0.958159,0.944,0.934959,0.918699,0.918367,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.40249,0.433333,0.508475,0.596639,0.57563,0.60251,0.756,0.443089,0.036585,0.061224,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.448133,0.425,0.555085,0.693277,0.558824,0.732218,0.668,0.735772,0.756098,0.763265,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.875519,0.879167,0.881356,0.886555,0.890756,0.895397,0.896,0.894309,0.906504,0.906122,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Split the data into training, testing, and validation sets

In [7]:
# Fixed seed so the split is identical on every Restart & Run All
SPLIT_SEED = 42
rng = np.random.default_rng(SPLIT_SEED)

num_samples = combined_array.shape[0]

# Shuffle the rows into a NEW array rather than in place: combined_array (and
# any DataFrame built from it) stays untouched, and re-running this cell
# always yields the same split.
shuffled_array = combined_array[rng.permutation(num_samples)]

# Calculate the split indices: 70% train, 15% test, remaining ~15% validation
train_end = int(0.7 * num_samples)
test_end = int(0.85 * num_samples)

# Split the data into training, testing, and validation sets
train_data = shuffled_array[:train_end]
test_data = shuffled_array[train_end:test_end]
val_data = shuffled_array[test_end:]

# Convert the splits to DataFrames for better visualization
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)
val_df = pd.DataFrame(val_data)

# Display the first few entries of the training split
print("First few entries of the training data:")
display(train_df.head())

print(train_df.shape)
print(test_df.shape)
print(val_df.shape)

First few entries of the training data:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2505,2506,2507,2508,2509,2510,2511,2512,2513,2514
0,0.946058,0.95,0.949153,0.953782,0.953782,0.949791,0.948,0.947154,0.930894,0.926531,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.829876,0.829167,0.830508,0.836134,0.836134,0.841004,0.852,0.849593,0.849593,0.84898,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.029046,0.070833,0.067797,0.079832,0.088235,0.054393,0.056,0.056911,0.117886,0.044898,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.784232,0.741667,0.758475,0.722689,0.756303,0.707113,0.476,0.369919,0.369919,0.514286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.792531,0.791667,0.792373,0.798319,0.798319,0.803347,0.816,0.813008,0.821138,0.820408,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(968, 2515)
(208, 2515)
(208, 2515)


Split the input and output data

In [8]:
# Separate a combined matrix into its feature columns and its label columns
def split_input_output(data, num_input_features):
    """Split a 2-D sample matrix column-wise into inputs and outputs.

    Parameters
    ----------
    data : 2-D array
        One row per sample; feature columns first, label columns after.
    num_input_features : int
        Number of leading columns that belong to the input.

    Returns
    -------
    tuple
        ``(X_data, y_data)``: the first ``num_input_features`` columns and
        the remaining columns, respectively.
    """
    feature_columns = slice(None, num_input_features)
    label_columns = slice(num_input_features, None)
    return data[:, feature_columns], data[:, label_columns]

# Width of the feature part of each row (everything after it is the one-hot label)
num_input_features = X.shape[1]

# Separate features from labels for each of the three splits
X_train, y_train = split_input_output(train_data, num_input_features)
X_test, y_test = split_input_output(test_data, num_input_features)
X_val, y_val = split_input_output(val_data, num_input_features)

# Report the resulting shapes, one heading plus one summary line per split
shape_report = [
    ("Training data shapes:", f"X_train: {X_train.shape}, y_train: {y_train.shape}"),
    ("Testing data shapes:", f"X_test: {X_test.shape}, y_test: {y_test.shape}"),
    ("Validation data shapes:", f"X_val: {X_val.shape}, y_val: {y_val.shape}"),
]
for heading, shapes in shape_report:
    print(heading)
    print(shapes)

Training data shapes:
X_train: (968, 2500), y_train: (968, 15)
Testing data shapes:
X_test: (208, 2500), y_test: (208, 15)
Validation data shapes:
X_val: (208, 2500), y_val: (208, 15)


Preprocessing is done. Save the results to disk so this step does not have to be repeated.

In [9]:
# Write an array-like to a CSV file
def save_to_csv(data, filename):
    """Write ``data`` to ``filename`` as CSV.

    ``data`` is wrapped in a pandas DataFrame and written without the row
    index; the header row holds the DataFrame's column labels.
    """
    pd.DataFrame(data).to_csv(filename, index=False)

# Persist every split, in order: training, testing, validation.
# Each target file name is paired with the array written to it.
csv_outputs = {
    "X_train.csv": X_train,
    "y_train.csv": y_train,
    "X_test.csv": X_test,
    "y_test.csv": y_test,
    "X_val.csv": X_val,
    "y_val.csv": y_val,
}
for csv_name, split_array in csv_outputs.items():
    save_to_csv(split_array, csv_name)

print("Data saved to CSV files.")

Data saved to CSV files.
