Load and prepare the dataset

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

def load_famous48_file(file_path):
    """Load one famous48 text file into a feature frame and a label series.

    The file starts with two header lines: the declared number of examples
    and the number of pixels ``N`` per example. Every following line is a
    whitespace-separated row of numbers. A row is kept only when it holds at
    least ``N + 8`` values; its first ``N`` values become the features and the
    sixth value from the end is taken as the class label (attribute a3).
    Shorter rows, including blank lines, are skipped.

    Parameters
    ----------
    file_path : str or path-like
        Location of the text file to read.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The pixel features (one row per kept example) and the integer class
        labels, in file order.

    Raises
    ------
    IndexError
        If the file ends before both header lines have been read.
    ValueError
        If a header line is not an integer or a data row contains a token
        that cannot be parsed as a float.

    Warns
    -----
    UserWarning
        If the number of kept rows differs from the count declared in the
        header, which would otherwise go unnoticed.
    """
    import warnings

    data, labels = [], []
    with open(file_path, "r") as f:
        # readline() returns "" only at end of file, so an empty string here
        # means a header line is missing altogether.
        count_line = f.readline()
        pixel_line = f.readline()
        if not count_line or not pixel_line:
            raise IndexError(
                f"{file_path}: expected two header lines (example count, pixel count)"
            )
        declared_count = int(count_line.strip())  # number of examples
        n_pixels = int(pixel_line.strip())  # number of pixels

        # Stream the remaining rows instead of holding the whole file in memory.
        for line in f:
            values = [float(token) for token in line.split()]
            if len(values) >= n_pixels + 8:
                data.append(values[:n_pixels])
                labels.append(int(values[-6]))  # class label (a3)

    if len(labels) != declared_count:
        warnings.warn(
            f"{file_path}: header declares {declared_count} examples "
            f"but {len(labels)} usable rows were read",
            stacklevel=2,
        )

    return pd.DataFrame(data), pd.Series(labels)

# Read the three famous48 parts with the shared loader
(X1, y1), (X2, y2), (X3, y3) = [
    load_famous48_file(part_path)
    for part_path in (
        "famous48/x24x24.txt",
        "famous48/y24x24.txt",
        "famous48/z24x24.txt",
    )
]

# Stack the parts row-wise and renumber the index from zero
X = pd.concat([X1, X2, X3], axis=0, ignore_index=True)
y = pd.concat([y1, y2, y3], axis=0, ignore_index=True)

# Hold out 25% of the examples, keeping class proportions equal in both halves
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

Train and Evaluate

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import time

def evaluate_models(X_train, X_test, y_train, y_test):
    """Fit three classifiers and report test accuracy, elapsed time and the model.

    A decision tree, a 100-tree random forest and a one-hidden-layer MLP are
    each fitted on the training data and scored on the test data.

    NOTE: a later cell in this notebook defines another ``evaluate_models``
    that returns ``(accuracy, duration)`` tuples; once that cell has run it
    replaces this definition.

    Parameters
    ----------
    X_train, X_test
        Training and test feature matrices.
    y_train, y_test
        Training and test labels.

    Returns
    -------
    dict
        Maps "Decision Tree", "Random Forest" and "Neural Network" (in that
        order) to a dict with keys ``"accuracy"`` (test accuracy),
        ``"time"`` (seconds spent fitting and predicting) and ``"model"``
        (the fitted estimator).
    """
    # Insertion order here fixes the key order of the returned dict.
    candidates = {
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "Neural Network": MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42),
    }

    results = {}
    for name, estimator in candidates.items():
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments the way time.time() can.
        start = time.perf_counter()
        estimator.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, estimator.predict(X_test))
        results[name] = {
            "accuracy": accuracy,
            "time": time.perf_counter() - start,  # covers fit + predict
            "model": estimator,
        }

    return results

Feature Importance: Top-30 Pixels and Handcrafted Features

In [5]:
from scipy.stats import skew, kurtosis
from scipy.ndimage import sobel
import numpy as np
def extract_custom_features(X_raw, image_shape=(24, 24), block_size=6):
    """Compute handcrafted summary features for each flattened image.

    Every row of ``X_raw`` is reshaped to ``image_shape`` and summarised by,
    in column order:

    0. mean pixel value
    1. standard deviation of the pixel values
    2. skewness of the flattened pixels
    3. kurtosis of the flattened pixels
    4. symmetry: negative mean absolute difference between the image and its
       left-right mirror (0 means perfectly mirror-symmetric)
    5. mean Sobel gradient magnitude
    6. standard deviation of the Sobel gradient magnitude
    7+. mean of each ``block_size`` x ``block_size`` tile, scanning tiles
       row by row (16 values for the default 24x24 image and block size 6)

    Parameters
    ----------
    X_raw : pandas.DataFrame
        One flattened image per row; each row must hold
        ``image_shape[0] * image_shape[1]`` values.
    image_shape : tuple of (int, int), optional
        Height and width to reshape each row to. Defaults to ``(24, 24)``.
    block_size : int, optional
        Side length of the tiles used for the grid means. Defaults to 6.

    Returns
    -------
    pandas.DataFrame
        One row per input row, with integer column labels and a fresh
        zero-based index.

    Raises
    ------
    ValueError
        If a row cannot be reshaped to ``image_shape``.
    """
    height, width = image_shape
    features = []
    # Cast to float first: sobel() and the mirror subtraction keep the input
    # dtype, so integer pixels (e.g. uint8) would silently wrap around.
    for row in X_raw.to_numpy(dtype=float):
        img = row.reshape(height, width)

        # Gradient magnitude from horizontal and vertical Sobel responses.
        gx = sobel(img, axis=1)
        gy = sobel(img, axis=0)
        mag = np.sqrt(gx**2 + gy**2)

        global_stats = [
            np.mean(img),
            np.std(img),
            skew(row),
            kurtosis(row),
            -np.mean(np.abs(img - np.fliplr(img))),
            np.mean(mag),
            np.std(mag),
        ]
        grid_means = [
            np.mean(img[i:i + block_size, j:j + block_size])
            for i in range(0, height, block_size)
            for j in range(0, width, block_size)
        ]
        features.append(global_stats + grid_means)
    return pd.DataFrame(features)

# Rank raw pixels by random-forest importance and keep the 30 strongest
rf_temp = RandomForestClassifier(n_estimators=100, random_state=42)
rf_temp.fit(X_train, y_train)
top_indices = np.argsort(rf_temp.feature_importances_)[::-1][:30]
X_train_top = X_train.iloc[:, top_indices]
X_test_top = X_test.iloc[:, top_indices]

# Handcrafted summary features for both splits
X_train_hand = extract_custom_features(X_train)
X_test_hand = extract_custom_features(X_test)

# Side-by-side join of selected pixels and handcrafted features;
# indexes are reset on both parts so rows pair up by position
X_train_combined = pd.concat(
    [
        X_train_top.reset_index(drop=True),
        X_train_hand.reset_index(drop=True),
    ],
    axis=1,
)
X_test_combined = pd.concat(
    [
        X_test_top.reset_index(drop=True),
        X_test_hand.reset_index(drop=True),
    ],
    axis=1,
)


Retrain

In [6]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
def evaluate_models(X_tr, X_te, y_tr, y_te):
    """Fit three classifiers and return their test accuracy and elapsed time.

    The models are a decision tree, a 100-tree random forest, and an MLP
    wrapped in a pipeline that standardises the features first.

    Parameters
    ----------
    X_tr, X_te
        Training and test feature matrices.
    y_tr, y_te
        Training and test labels.

    Returns
    -------
    dict
        Maps "Decision Tree", "Random Forest" and "Neural Network" (in that
        order) to a tuple ``(accuracy, duration)``, where ``duration`` is the
        number of seconds spent fitting and predicting.
    """
    models = {
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "Neural Network": make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42))
    }

    results = {}
    for name, model in models.items():
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments the way time.time() can.
        start = time.perf_counter()
        model.fit(X_tr, y_tr)
        acc = accuracy_score(y_te, model.predict(X_te))
        results[name] = (acc, time.perf_counter() - start)  # fit + predict
    return results


Full Run

In [7]:
# Score every feature representation against the same train/test labels
results = dict(
    [
        ("Raw Pixels", evaluate_models(X_train, X_test, y_train, y_test)),
        ("Handcrafted Features", evaluate_models(X_train_hand, X_test_hand, y_train, y_test)),
        ("Combined Features", evaluate_models(X_train_combined, X_test_combined, y_train, y_test)),
    ]
)

# One banner per feature set, followed by one line per model
for feature_set in results:
    res = results[feature_set]
    print(f"\n== {feature_set} ==")
    for model in res:
        acc, t = res[model]
        print(f"{model}: Accuracy = {acc:.4f}, Time = {t:.2f}s")

  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b



== Raw Pixels ==
Decision Tree: Accuracy = 0.2370, Time = 3.63s
Random Forest: Accuracy = 0.6524, Time = 9.59s
Neural Network: Accuracy = 0.8157, Time = 7.64s

== Handcrafted Features ==
Decision Tree: Accuracy = 0.1978, Time = 0.13s
Random Forest: Accuracy = 0.4231, Time = 1.58s
Neural Network: Accuracy = 0.4664, Time = 2.16s

== Combined Features ==
Decision Tree: Accuracy = 0.2300, Time = 0.31s
Random Forest: Accuracy = 0.5307, Time = 2.67s
Neural Network: Accuracy = 0.6144, Time = 2.21s


  ret = a @ b
  ret = a @ b
  ret = a @ b
