In [1]:
import cupy as cp
import cudf
from cuml.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, recall_score
import joblib
import numpy as np
import pandas as pd
from skimage.feature import hog
from sklearn.model_selection import train_test_split
# import cv2
from collections import Counter

In [10]:
# Load the dataset and keep a 5% random sample of its rows.
# The sample is seeded: without a fixed random_state every kernel restart
# draws different rows, so the class counts below and every downstream
# metric would change from run to run even though the later
# train/test split is itself seeded.
data = pd.read_csv(
    "../archive/ascii_character_classification.csv", header=0
).sample(frac=0.05, random_state=42)

# Distribution of the values in the first column of the sample.
label_counts = Counter(data.iloc[:, 0])
print(label_counts)

Counter({0: 2576, 70: 268, 28: 264, 23: 263, 13: 263, 72: 263, 84: 261, 92: 261, 18: 258, 14: 257, 48: 257, 34: 257, 15: 254, 19: 253, 6: 253, 46: 252, 78: 252, 45: 250, 11: 249, 59: 249, 26: 249, 10: 248, 31: 248, 44: 248, 27: 247, 69: 246, 33: 244, 86: 244, 20: 244, 36: 243, 32: 243, 3: 243, 76: 243, 2: 241, 60: 240, 74: 240, 73: 240, 83: 239, 43: 239, 82: 239, 81: 239, 17: 239, 57: 238, 63: 238, 5: 238, 80: 237, 50: 237, 95: 237, 22: 236, 64: 236, 7: 236, 61: 236, 1: 235, 30: 234, 88: 233, 55: 233, 9: 233, 29: 233, 94: 232, 79: 232, 68: 232, 25: 232, 38: 232, 40: 231, 89: 229, 39: 229, 42: 228, 75: 228, 8: 228, 16: 227, 77: 227, 65: 227, 67: 227, 47: 225, 54: 224, 90: 223, 4: 223, 37: 223, 91: 222, 41: 222, 21: 221, 56: 221, 12: 221, 66: 221, 87: 220, 93: 220, 35: 219, 53: 218, 58: 216, 24: 213, 71: 212, 52: 208, 85: 206, 62: 204, 51: 203, 49: 198})


In [11]:
# Features: every column after the first, cast to float64.
X = data.iloc[:, 1:].astype("float64")
# Labels: the first column, also stored as float64 (not an integer dtype).
y = data.iloc[:, 0].astype("float64")

# Hold out 20% of the rows for testing; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Only the training parts get a fresh 0..n-1 index; the test parts keep
# the row labels they had in `data`.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

# Convert all four splits to float64 CuPy arrays in one pass.
X_train_cupy, X_test_cupy, y_train_cupy, y_test_cupy = (
    cp.asarray(split, dtype=cp.float64)
    for split in (X_train, X_test, y_train, y_test)
)

In [14]:
# Fit a linear SVM on the current training split and persist it to disk.
# Every hyperparameter is spelled out explicitly; the notebook treats these
# as the estimator's defaults (NOTE(review): confirm against the installed
# cuML version).
default_parameters = dict(
    penalty='l2',
    loss='squared_hinge',
    fit_intercept=True,
    penalized_intercept=False,
    max_iter=1000,
    linesearch_max_iter=100,
    lbfgs_memory=5,
    class_weight=None,
    verbose=False,
    C=1.0,
    grad_tol=0.0001,
    change_tol=1e-05,
    tol=None,
    probability=False,
)

clf = LinearSVC(**default_parameters)
clf.fit(X_train_cupy, y_train_cupy)
# Kept as the cell's last expression so the value returned by joblib.dump
# is displayed as the cell output.
joblib.dump(clf, '../artifacts/svm_model_gpu.pkl')

['../artifacts/svm_model_gpu.pkl']

In [15]:
# Score the fitted model on the held-out split.
# Predictions are converted to a NumPy array before being handed to the
# scikit-learn metric functions; the type is printed as a quick check.
y_pred = cp.asnumpy(clf.predict(X_test_cupy))
print(type(y_pred))

accuracy = accuracy_score(y_test, y_pred)
# Both remaining metrics use the same support-weighted class average.
f1, recall = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (f1_score, recall_score)
)

# Report each metric as a percentage with four decimals.
print(f"Accuracy: {accuracy*100:.4f}%")
print(f"F1 Score: {f1*100:.4f}%")
print(f"Recall: {recall*100:.4f}%")

<class 'numpy.ndarray'>
Accuracy: 29.4200%
F1 Score: 26.0238%
Recall: 29.4200%


In [6]:
# HOG feature extraction
def extract_hog_features(images, image_shape=(10, 10), pixels_per_cell=(2, 2), cells_per_block=(1, 1)):
    """Compute one HOG descriptor per flattened image.

    Parameters
    ----------
    images : iterable of array-like
        One flattened image per item. Each item must hold exactly
        ``image_shape[0] * image_shape[1]`` values.
    image_shape : tuple of int, default (10, 10)
        2-D shape every item is reshaped to before the descriptor is
        computed. The default keeps the original hard-coded behaviour.
    pixels_per_cell : tuple of int, default (2, 2)
        Forwarded unchanged to ``hog``.
    cells_per_block : tuple of int, default (1, 1)
        Forwarded unchanged to ``hog``.

    Returns
    -------
    numpy.ndarray
        The descriptors collected with ``np.array``, one per input image
        and in input order. An empty ``images`` yields an empty array.

    Raises
    ------
    ValueError
        Raised by ``reshape`` when an item's size does not match
        ``image_shape``.
    """
    return np.array([
        hog(
            # np.asarray lets plain sequences be passed as well as ndarrays.
            np.asarray(image).reshape(image_shape),
            pixels_per_cell=pixels_per_cell,
            cells_per_block=cells_per_block,
            feature_vector=True,
        )
        for image in images
    ])

# Compute a HOG descriptor for every row of the feature table.
X_hog = extract_hog_features(X.to_numpy())

In [7]:
# Re-split with the HOG descriptors as features, using the same test
# fraction and seed as the raw-pixel split.
X_train, X_test, y_train, y_test = train_test_split(
    X_hog, y, test_size=0.2, random_state=42
)

# Convert all four splits to float64 CuPy arrays. X_hog was built with
# np.array, so the feature splits come from a NumPy array here, while the
# label splits still come from `y`.
X_train_cupy, X_test_cupy, y_train_cupy, y_test_cupy = [
    cp.asarray(split, dtype=cp.float64)
    for split in (X_train, X_test, y_train, y_test)
]


In [8]:
# Fit a linear SVM on the HOG training split and persist it to disk.
# The hyperparameters are listed explicitly and match the ones used for the
# raw-pixel model, so the two runs differ only in their input features.
default_parameters = dict(
    penalty='l2',
    loss='squared_hinge',
    fit_intercept=True,
    penalized_intercept=False,
    max_iter=1000,
    linesearch_max_iter=100,
    lbfgs_memory=5,
    class_weight=None,
    verbose=False,
    C=1.0,
    grad_tol=0.0001,
    change_tol=1e-05,
    tol=None,
    probability=False,
)

clf = LinearSVC(**default_parameters)
clf.fit(X_train_cupy, y_train_cupy)
# Kept as the cell's last expression so the value returned by joblib.dump
# is displayed as the cell output.
joblib.dump(clf, '../artifacts/svm_hog_model_gpu.pkl')

['../artifacts/svm_hog_model_gpu.pkl']

In [9]:
# Score the HOG-based model on the held-out split.
# Predictions are converted to a NumPy array before being handed to the
# scikit-learn metric functions; the type is printed as a quick check.
y_pred = cp.asnumpy(clf.predict(X_test_cupy))
print(type(y_pred))

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

# One print call with a newline separator emits the same three lines as
# three separate prints would.
print(
    f"Accuracy: {accuracy*100:.4f}%",
    f"F1 Score: {f1*100:.4f}%",
    f"Recall: {recall*100:.4f}%",
    sep="\n",
)

<class 'numpy.ndarray'>
Accuracy: 82.1000%
F1 Score: 81.1978%
Recall: 82.1000%
