In [1]:
import numpy as np
import pandas as pd
from skimage.feature import hog
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score
import joblib
from collections import Counter
# Image dimensions in pixels. They select the input CSV by name and give the
# shape each flattened row is reshaped to before feature extraction.
height = 10
width = 10

In [2]:
# Load the CSV that matches the configured image size and keep a 5% random
# sample of its rows. random_state pins the sample so that the label counts,
# the train/test splits and every metric further down are reproducible after
# a kernel restart (the splits and models are already seeded with 42).
data = pd.read_csv(
    f"../archive/ascii_character_classification_{height}_x_{width}.csv",
    header=0,
).sample(frac=.05, random_state=42)

# Column 0 holds the class label; show how many samples each class has.
label_counts = Counter(data.iloc[:, 0])
print(label_counts)

Counter({0: 2552, 51: 268, 32: 267, 71: 263, 65: 260, 30: 257, 23: 257, 9: 256, 54: 255, 62: 255, 13: 253, 1: 253, 80: 252, 60: 251, 8: 251, 38: 249, 72: 249, 26: 249, 50: 249, 53: 247, 94: 247, 34: 247, 59: 247, 14: 246, 81: 245, 28: 245, 31: 245, 83: 244, 77: 244, 40: 243, 22: 243, 6: 242, 20: 241, 21: 241, 92: 241, 69: 240, 66: 240, 85: 240, 76: 240, 2: 240, 79: 238, 55: 238, 90: 238, 36: 238, 93: 238, 57: 238, 33: 238, 35: 237, 45: 237, 75: 237, 73: 237, 82: 236, 44: 236, 70: 236, 49: 235, 16: 234, 24: 234, 64: 234, 74: 234, 19: 234, 43: 233, 88: 233, 12: 233, 91: 233, 18: 232, 87: 232, 68: 232, 47: 232, 67: 231, 89: 231, 11: 231, 29: 230, 7: 230, 58: 230, 86: 230, 41: 226, 4: 225, 48: 223, 15: 222, 5: 222, 52: 222, 37: 221, 63: 221, 17: 220, 56: 220, 25: 218, 3: 216, 46: 215, 78: 214, 10: 214, 42: 211, 95: 210, 27: 209, 61: 207, 84: 207, 39: 203})


In [3]:
# Column 0 is the target; every remaining column is a feature.
# Both are cast to float64.
y = data.iloc[:, 0].astype("float64")
X = data.iloc[:, 1:].astype("float64")

# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Baseline: a 100-tree random forest trained on the raw feature columns.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Build the artifact name from the configured image size, the same way the
# input CSV path is built, so that changing height/width cannot overwrite or
# mislabel a model trained at another resolution.
joblib.dump(clf, f'../artifacts/random_forest_model_{height}_x_{width}.pkl')

['../artifacts/random_forest_model_10_x_10.pkl']

In [5]:
# Accuracy on the rows the forest was fitted on.
train_accuracy = accuracy_score(y_train, clf.predict(X_train))

# All remaining metrics use predictions on the held-out rows.
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
# Support-weighted averages across classes; support-weighted recall is
# arithmetically equal to plain accuracy.
f1 = f1_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("test Shape:", X_test.shape)
print(f"Train Accuracy: {train_accuracy*100:.4f}%")
print(f"Test Accuracy: {test_accuracy*100:.4f}%")
print(f"F1 Score: {f1*100:.4f}%")
print(f"Recall: {recall*100:.4f}%")

test Shape: (5000, 100)
Train Accuracy: 98.1000%
Test Accuracy: 91.4400%
F1 Score: 90.7340%
Recall: 91.4400%


In [6]:
def extract_hog_features(images):
    """Compute one HOG descriptor per flattened image.

    Each element of ``images`` is reshaped to ``(height, width)`` (notebook-level
    globals) and passed to ``skimage.feature.hog`` with 2x2-pixel cells and
    1x1-cell blocks, returned as a flat feature vector.

    Args:
        images: iterable of 1-D arrays, each holding ``height * width`` values.

    Returns:
        ``np.ndarray`` with one row of HOG features per input image.
    """
    return np.array([
        hog(
            flat_pixels.reshape((height, width)),
            pixels_per_cell=(2, 2),
            cells_per_block=(1, 1),
            feature_vector=True,
        )
        for flat_pixels in images
    ])

# HOG features for every sampled image.
X_hog = extract_hog_features(X.to_numpy())

# Same 80/20 split and seed as the raw-pixel experiment.
# This rebinds X_train / X_test / y_train / y_test to the HOG split.
X_train, X_test, y_train, y_test = train_test_split(
    X_hog,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Same forest configuration as the baseline, trained on the HOG features.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Build the artifact name from the configured image size instead of a
# hard-coded "10_x_10", so the saved file always matches the data it was
# trained on.
joblib.dump(clf, f'../artifacts/random_forest_model_hog_{height}_x_{width}.pkl')

['../artifacts/random_forest_model_hog_10_x_10.pkl']

In [8]:
# Accuracy on the rows the forest was fitted on.
train_accuracy = accuracy_score(y_train, clf.predict(X_train))

# All remaining metrics use predictions on the held-out rows.
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
# Support-weighted averages across classes; support-weighted recall is
# arithmetically equal to plain accuracy.
f1 = f1_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("test Shape:", X_test.shape)
print(f"Train Accuracy: {train_accuracy*100:.4f}%")
print(f"Test Accuracy: {test_accuracy*100:.4f}%")
print(f"F1 Score: {f1*100:.4f}%")
print(f"Recall: {recall*100:.4f}%")

test Shape: (5000, 225)
Train Accuracy: 98.0400%
Test Accuracy: 94.9600%
F1 Score: 94.2236%
Recall: 94.9600%


In [9]:
import cv2

def extract_sift_features(images):
    """Compute flattened SIFT descriptors for every image, zero-padded to equal length.

    Each element of ``images`` is reshaped to ``(height, width)`` (notebook-level
    globals), cast to ``uint8`` and passed to OpenCV's SIFT. All descriptors
    found for an image are flattened into one vector, so the raw vector length
    is ``descriptorSize() * number_of_keypoints`` and differs between images.
    Images without any keypoint get a single all-zero descriptor.

    Building an array directly from such variable-length vectors raises
    ``ValueError`` on NumPy >= 1.24 (and silently yields an object array on
    older versions), so the vectors are right-padded with zeros to the longest
    one and a regular 2-D array is returned.

    Args:
        images: iterable of 1-D arrays, each holding ``height * width`` values.

    Returns:
        ``np.ndarray`` of dtype float32 and shape ``(n_images, max_len)``.
        For empty input the shape is ``(0, descriptorSize())``.
    """
    sift = cv2.SIFT_create()
    descriptor_len = sift.descriptorSize()
    per_image = []

    for image in images:
        # NOTE(review): astype(np.uint8) drops the fractional part of every
        # pixel. If the CSV stores intensities in [0, 1] this collapses the
        # image to 0/1 values -- confirm the pixel range of the data.
        image_reshaped = image.reshape((height, width)).astype(np.uint8)
        _, descriptors = sift.detectAndCompute(image_reshaped, None)

        # No keypoints detected: fall back to one all-zero descriptor.
        if descriptors is None or len(descriptors) == 0:
            descriptors = np.zeros((1, descriptor_len), dtype=np.float32)

        per_image.append(descriptors.flatten())

    if not per_image:
        return np.empty((0, descriptor_len), dtype=np.float32)

    # Right-pad every vector with zeros so the result is rectangular.
    max_len = max(len(vector) for vector in per_image)
    features = np.zeros((len(per_image), max_len), dtype=np.float32)
    for row, vector in enumerate(per_image):
        features[row, :len(vector)] = vector
    return features

# Run SIFT over every sampled image (one flattened pixel row per image).
X_sift = extract_sift_features(X.to_numpy())

In [10]:
# X_sift was already computed at the end of the previous cell; the expensive
# SIFT extraction is not repeated here.

# The number of descriptors can differ per image, so right-pad every feature
# vector with zeros to the longest one. This is a no-op when all vectors
# already have the same length, so re-running this cell is safe.
max_len = max(len(f) for f in X_sift)
X_sift = np.array([np.pad(f, (0, max_len - len(f)), 'constant') for f in X_sift])

# Sanity check: a row that is entirely zero carries no information for the
# classifier, so report how many rows have at least one non-zero value.
n_nonzero_rows = int(np.count_nonzero(X_sift.any(axis=1)))
print(f"Images with a non-zero SIFT feature vector: {n_nonzero_rows}/{len(X_sift)}")

# Split the data into train and test sets (same 80/20 split and seed as before)
X_train, X_test, y_train, y_test = train_test_split(X_sift, y, test_size=0.2, random_state=42)

In [11]:
# Same forest configuration as the other experiments, trained on the padded
# SIFT features.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Build the artifact name from the configured image size instead of a
# hard-coded "10_x_10", so the saved file always matches the data it was
# trained on.
joblib.dump(clf, f'../artifacts/random_forest_model_sift_{height}_x_{width}.pkl')

['../artifacts/random_forest_model_sift_10_x_10.pkl']

In [12]:
# Accuracy on the rows the forest was fitted on.
train_accuracy = accuracy_score(y_train, clf.predict(X_train))

# All remaining metrics use predictions on the held-out rows.
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
# Support-weighted averages across classes; support-weighted recall is
# arithmetically equal to plain accuracy.
f1 = f1_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("test Shape:", X_test.shape)
print(f"Train Accuracy: {train_accuracy*100:.4f}%")
print(f"Test Accuracy: {test_accuracy*100:.4f}%")
print(f"F1 Score: {f1*100:.4f}%")
print(f"Recall: {recall*100:.4f}%")

test Shape: (5000, 128)
Train Accuracy: 10.1350%
Test Accuracy: 10.5000%
F1 Score: 1.9955%
Recall: 10.5000%
