In [12]:
import numpy as np
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from goodpoints import kt

In [13]:
# Fetch the MNIST train/test splits via the Keras dataset helper.
(X_train_full, y_train_full), (X_test, y_test) = (
    tf.keras.datasets.mnist.load_data()
)

In [14]:
# Flatten every image into a single row of 28 * 28 values and cast to
# float32 so the later division produces fractional values.
X_train_full = X_train_full.reshape((-1, 28 * 28)).astype(np.float32)
X_test = X_test.reshape((-1, 28 * 28)).astype(np.float32)


In [15]:
# Divide both feature matrices by 255.0 in place.
# NOTE(review): assumes the values are raw intensities in 0-255 -- confirm.
# The update is in place, so re-running this cell alone divides a second
# time; re-run the load and reshape cells first.
for _images in (X_train_full, X_test):
    _images /= 255.0

In [16]:
# Hold out 20% of the full training data for validation; the fixed
# random_state keeps the split the same from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    random_state=42,
    test_size=0.2,
)

In [18]:
def gaussian_kernel(x, X, bandwidth=1.0):
    """Evaluate a Gaussian (RBF) kernel between one point and every row of X.

    Computes ``exp(-||X[i] - x||**2 / (2 * bandwidth**2))`` for each row ``i``.

    Args:
        x: A single point that broadcasts against the rows of ``X``
            (e.g. shape ``(d,)`` or ``(1, d)``).
        X: 2-D array of points, one per row, shape ``(n, d)``.
        bandwidth: Kernel length scale; larger values decay more slowly.

    Returns:
        1-D array of length ``n`` holding the kernel value for each row of ``X``.
    """
    # Euclidean distance from x to each row (axis=1 reduces over features).
    distances = np.linalg.norm(X - x, axis=1)
    squared_distances = distances ** 2
    scale = 2 * bandwidth**2
    return np.exp(-squared_distances / scale)

In [19]:
# Kernel thinning: pick a subset of X_train rows (returned as row indices)
# using m=3 halving rounds, with the same Gaussian kernel for both the
# split and swap stages.
# The procedure is randomized, so a fixed seed (matching the 42 used for the
# data split and the forest) keeps the coreset reproducible across runs.
# NOTE(review): gaussian_kernel uses its default bandwidth of 1.0; for
# 784-dimensional inputs the kernel values may be close to zero -- confirm
# the bandwidth is appropriate. This call can also be slow on the full
# training split.
coreset_indices = kt.thin(
    X_train,
    m=3,
    split_kernel=gaussian_kernel,
    swap_kernel=gaussian_kernel,
    seed=42,
)

In [None]:
# Gather the features and labels of the rows chosen by kernel thinning.
X_coreset = X_train[coreset_indices]
y_coreset = y_train[coreset_indices]


In [None]:
# Train a random forest on the coreset only; random_state fixes the
# forest's own randomness.
coreset_model = RandomForestClassifier(random_state=42)
coreset_model.fit(X=X_coreset, y=y_coreset)

In [None]:
# Score the coreset-trained forest on the held-out validation split.
coreset_preds = coreset_model.predict(X=X_val)
print("Coreset Model Performance:")
print(classification_report(y_true=y_val, y_pred=coreset_preds))

In [None]:
# Final check of the coreset-trained forest on the MNIST test split.
final_coreset_preds = coreset_model.predict(X=X_test)
print("Final Coreset Model Performance on Test Data:")
print(classification_report(y_true=y_test, y_pred=final_coreset_preds))