# Samuel Thelin, samthe-1 & Albin Mårtensson, albmrt-1

In [None]:
import numpy as np
from sklearn.model_selection import KFold
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.datasets import load_wine
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Supervised model: ESN

In [4]:
class EchoStateNetwork:
    """Echo state network with a ridge-regression readout.

    Every sample is pushed through the reservoir for a single update step
    starting from an all-zero state; the resulting state, extended with a
    constant bias entry, is mapped to the targets by a linear readout that is
    fitted in closed form.

    Args:
        input_dim: Number of features per input sample.
        reservoir_dim: Number of reservoir units.
        output_dim: Number of readout outputs. Stored for reference only; the
            readout shape is determined by the targets passed to ``train``.
        spectral_radius: Largest absolute eigenvalue the recurrent weight
            matrix is rescaled to.
        input_scaling: Multiplier applied to the uniform(-1, 1) input weights.
        reg_param: Ridge regularisation strength used when solving for the
            readout weights.
        seed: Optional seed for a private random generator, making the
            weights reproducible. With ``None`` (the default) the global
            NumPy random state is used.
    """

    def __init__(self, input_dim, reservoir_dim, output_dim, spectral_radius=0.99,
                 input_scaling=0.25, reg_param=1e-8, seed=None):
        self.input_dim = input_dim
        self.reservoir_dim = reservoir_dim
        self.output_dim = output_dim
        self.spectral_radius = spectral_radius
        self.input_scaling = input_scaling
        self.reg_param = reg_param

        # ``np.random`` and ``RandomState`` both expose ``uniform``, so the
        # draw order is identical whichever source is used.
        rng = np.random if seed is None else np.random.RandomState(seed)
        self.W_in = rng.uniform(-1, 1, (reservoir_dim, input_dim)) * input_scaling
        self.W = rng.uniform(-1, 1, (reservoir_dim, reservoir_dim))
        # Rescale so the largest absolute eigenvalue equals ``spectral_radius``.
        max_eigenvalue = max(abs(np.linalg.eigvals(self.W)))
        self.W *= spectral_radius / max_eigenvalue
        self.W_out = None  # set by train()

    def _update_reservoir(self, x_t, r_prev):
        """Return the next reservoir state for input ``x_t`` and state ``r_prev``."""
        return np.tanh(np.dot(self.W_in, x_t) + np.dot(self.W, r_prev))

    def _extended_states(self, X):
        """Return one bias-extended reservoir state per sample in ``X``."""
        # Each sample starts from the same all-zero state; the update does not
        # mutate its arguments, so one zero vector can be shared.
        initial_state = np.zeros(self.reservoir_dim)
        states = np.array([self._update_reservoir(signal, initial_state) for signal in X])
        # Keeps a 2-D shape even when ``X`` is empty.
        states = states.reshape(-1, self.reservoir_dim)
        return np.hstack([states, np.ones((states.shape[0], 1))])

    def train(self, X_train, y_train_onehot):
        """Fit the readout weights by regularised least squares.

        Args:
            X_train: Iterable of input samples, each of length ``input_dim``.
            y_train_onehot: Target array with one row per training sample.
        """
        extended_states = self._extended_states(X_train)
        gram = (
            np.dot(extended_states.T, extended_states)
            + self.reg_param * np.eye(extended_states.shape[1])
        )
        self.W_out = np.linalg.solve(gram, np.dot(extended_states.T, y_train_onehot))

    def predict(self, X_test):
        """Return the readout output for every sample in ``X_test``.

        Args:
            X_test: Iterable of input samples, each of length ``input_dim``.

        Returns:
            Array with one row of readout values per sample.

        Raises:
            RuntimeError: If called before ``train``.
        """
        if self.W_out is None:
            raise RuntimeError("EchoStateNetwork.predict() called before train()")
        return np.dot(self._extended_states(X_test), self.W_out)

In [5]:
def test_ESN_datasets(dataset, name):
    """Train an ESN on an 80/20 split of ``dataset`` and print its test accuracy.

    Args:
        dataset: Mapping with a ``'data'`` feature array and a ``'target'``
            label array.
        name: Label printed alongside the result.
    """
    features, labels = dataset['data'], dataset['target']

    # One-hot encode the labels so the readout gets one output column per class.
    onehot_labels = OneHotEncoder(sparse_output=False).fit_transform(labels.reshape(-1, 1))

    train_x, test_x, train_targets, test_targets = train_test_split(
        features, onehot_labels, test_size=0.2, random_state=42
    )
    train_x = train_x.astype(np.float32)
    test_x = test_x.astype(np.float32)

    esn = EchoStateNetwork(
        input_dim=train_x.shape[1],
        reservoir_dim=1000,
        output_dim=train_targets.shape[1],
    )
    esn.train(train_x, train_targets)

    # The predicted / true class is the index of the largest entry per row.
    predicted_classes = np.argmax(esn.predict(test_x), axis=1)
    true_classes = np.argmax(test_targets, axis=1)
    accuracy = accuracy_score(true_classes, predicted_classes)

    print('Dataset Label: ',name)
    print(f"Test set accuracy: {accuracy:.4f}")
    print('\n===================================================')

def fit_MNIST_for_test_method():
    """Load MNIST and return it in the dict layout used by ``test_ESN_datasets``.

    The Keras train and test partitions are concatenated into a single set
    and every image is flattened into one feature row.

    Returns:
        dict with ``'data'`` (one flattened image per row) and ``'target'``
        (the matching labels).
    """
    # Imported locally so this function works regardless of whether another
    # cell/statement has already brought ``mnist`` into scope.
    from tensorflow.keras.datasets import mnist

    (X_train, y_train), (X_test, y_test) = mnist.load_data()

    X = np.vstack((X_train, X_test))
    y = np.hstack((y_train, y_test))

    # Collapse every per-image axis into a single feature axis.
    X_flat = X.reshape(X.shape[0], -1)

    return {'data': X_flat, 'target': y}

def fit_image_segmentation_for_test_method():
    """Fetch the OpenML "segment" dataset in the dict layout used by ``test_ESN_datasets``.

    Returns:
        dict with ``'data'`` (the feature array as returned by OpenML) and
        ``'target'`` (the class labels re-encoded as integers).
    """
    segment = fetch_openml(name="segment", version=1, as_frame=False, parser="auto")

    # Re-encode the raw target values as integer class ids.
    integer_labels = LabelEncoder().fit_transform(segment.target)

    return {'data': segment.data, 'target': integer_labels}


## Accuracy report of our supervised model (ESN):

In [None]:
# Run the ESN evaluation on each dataset in turn; every call prints its own
# label and test-set accuracy.
test_ESN_datasets(load_iris(),'Iris')
test_ESN_datasets(load_wine(),'Wine')
test_ESN_datasets(load_breast_cancer(),'Breast Cancer')
# The remaining two datasets are first converted to the {'data', 'target'}
# dict layout by their helper functions.
image_segmentation_dataset = fit_image_segmentation_for_test_method()
test_ESN_datasets(image_segmentation_dataset, "Image Segmentation")
mnist_dataset = fit_MNIST_for_test_method()
test_ESN_datasets(mnist_dataset, "MNIST")

Dataset Label:  Iris
Test set accuracy: 0.9667

Dataset Label:  Wine
Test set accuracy: 0.8889

Dataset Label:  Breast Cancer
Test set accuracy: 0.8684

Dataset Label:  Image Segmentation
Test set accuracy: 0.8745

Dataset Label:  MNIST
Test set accuracy: 0.9137



# Unsupervised model: K-means

In [15]:
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.datasets import load_iris, load_wine, load_breast_cancer, load_digits
import numpy as np

def kmeans_predict(train_set, train_labels, test_set, n_clusters):
    """Classify ``test_set`` by K-means cluster membership.

    K-means is fitted on ``train_set``; each cluster is then assigned the most
    frequent training label among its members, and every test sample receives
    the label of the cluster it is assigned to.

    Args:
        train_set: Training feature array.
        train_labels: Integer class labels for ``train_set``. They are counted
            with ``np.bincount``, which requires non-negative integers.
        test_set: Feature array to classify.
        n_clusters: Number of K-means clusters.

    Returns:
        Array with one predicted label per row of ``test_set``.
    """
    train_labels = np.asarray(train_labels)

    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(train_set)

    # A cluster that ends up with no training members has no majority label
    # of its own; fall back to the overall majority label so that a test
    # sample assigned to it still gets a prediction instead of a KeyError.
    fallback_label = np.bincount(train_labels).argmax()
    cluster_to_label = np.full(n_clusters, fallback_label)
    for cluster in range(n_clusters):
        member_labels = train_labels[kmeans.labels_ == cluster]
        if member_labels.size > 0:
            cluster_to_label[cluster] = np.bincount(member_labels).argmax()

    # Index the lookup table with the predicted cluster ids.
    return cluster_to_label[kmeans.predict(test_set)]

def find_optimal_k_kmeans(training_set, training_labels, k_values, num_folds=3):
    """Pick the cluster count with the best cross-validated accuracy.

    For every candidate ``k`` the training data is split with a shuffled
    ``KFold`` (fixed ``random_state=42``), ``kmeans_predict`` is run on each
    fold, and the fold accuracies are averaged.

    Args:
        training_set: Training feature array (indexed with fold index arrays).
        training_labels: Label array aligned with ``training_set``.
        k_values: Iterable of candidate cluster counts.
        num_folds: Number of cross-validation folds.

    Returns:
        Tuple ``(best_k, best_accuracy, accuracies_for_k)``.
        ``accuracies_for_k`` maps each ``k`` to a one-element list holding its
        mean accuracy. On ties the earliest ``k`` wins. If ``k_values`` is
        empty the result is ``(None, 0, {})``.
    """
    best_k = None
    best_accuracy = 0
    accuracies_for_k = {k: [] for k in k_values}

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    for k in k_values:
        fold_accuracies = []

        for train_index, val_index in kf.split(training_set):
            X_train_fold, X_val_fold = training_set[train_index], training_set[val_index]
            y_train_fold, y_val_fold = training_labels[train_index], training_labels[val_index]

            predictions = kmeans_predict(X_train_fold, y_train_fold, X_val_fold, k)
            fold_accuracies.append(np.mean(predictions == y_val_fold))

        avg_accuracy = np.mean(fold_accuracies)
        accuracies_for_k[k].append(avg_accuracy)

        # ``best_k is None`` makes sure the first evaluated k is always
        # recorded, so a valid k is returned even if every accuracy is 0.0.
        if best_k is None or avg_accuracy > best_accuracy:
            best_accuracy = avg_accuracy
            best_k = k

    return best_k, best_accuracy, accuracies_for_k

In [16]:
def test_datasets_kmeans_image_segmentation(dataset_label='Image Segmentation'):
    """Evaluate the K-means classifier on the OpenML "segment" dataset.

    Fetches the dataset, integer-encodes its labels, selects ``k`` from 1-10
    by 5-fold cross-validation on an 80% training split, and prints the
    per-``k`` accuracies together with the accuracy on the held-out 20%.

    Args:
        dataset_label: Label printed alongside the results.
    """
    dataset = fetch_openml(name="segment", version=1, as_frame=False, parser="auto")
    X = dataset.data
    y = dataset.target

    # Re-encode the raw target values as integer class ids.
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    y = le.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    k_values = list(range(1, 11))

    best_k, best_accuracy, accuracies_for_k = find_optimal_k_kmeans(X_train, y_train, k_values, num_folds=5)
    print('\nDataset Label: ', dataset_label, '\n')
    print(f"\nOptimal k: {best_k} with cross-validated accuracy: {best_accuracy:.4f}\n")
    for k, accuracy in accuracies_for_k.items():
        print(f"With k = {k}: {accuracy[0]:.4f}")

    # Refit on the full training split with the selected k and score the
    # held-out test split (uses ``kmeans_predict``, the helper defined in
    # this file).
    y_pred = kmeans_predict(X_train, y_train, X_test, best_k)
    test_accuracy = accuracy_score(y_test, y_pred)

    print(f"\nTest set accuracy with k={best_k}: {test_accuracy:.4f}\n===================================================")

def test_datasets_kmeans(dataset, dataset_label):
    """Evaluate the K-means classifier on ``dataset`` and print the results.

    ``k`` is chosen from 1-10 by 5-fold cross-validation on an 80% training
    split; the chosen ``k`` is then scored on the held-out 20%.

    Args:
        dataset: Object exposing ``.data`` (features) and ``.target`` (labels).
        dataset_label: Label printed alongside the results.
    """
    train_x, test_x, train_y, test_y = train_test_split(
        dataset.data, dataset.target, test_size=0.2, random_state=42
    )

    candidate_ks = list(range(1, 11))
    best_k, best_accuracy, accuracies_for_k = find_optimal_k_kmeans(
        train_x, train_y, candidate_ks, num_folds=5
    )

    print('\nDataset Label: ', dataset_label, '\n')
    print(f"\nOptimal k: {best_k} with cross-validated accuracy: {best_accuracy:.4f}\n")
    for k in accuracies_for_k:
        # Each entry is a one-element list holding the mean accuracy for k.
        accuracy = accuracies_for_k[k]
        print(f"With k = {k}: {accuracy[0]:.4f}")

    # Refit on the whole training split with the selected k.
    test_predictions = kmeans_predict(train_x, train_y, test_x, best_k)
    test_accuracy = accuracy_score(test_y, test_predictions)

    print(f"\nTest set accuracy with k={best_k}: {test_accuracy:.4f}\n===================================================")

## Accuracy report of our unsupervised model (K-means)

In [17]:
# Run the K-means evaluation on each dataset; every call prints its own
# cross-validation table and test-set accuracy.
test_datasets_kmeans(load_iris(), 'Iris')
test_datasets_kmeans(load_wine(), 'Wine')
test_datasets_kmeans(load_breast_cancer(), 'Breast Cancer')
test_datasets_kmeans_image_segmentation()

# MNIST is evaluated inline below on a small subset instead of through
# test_datasets_kmeans.
from tensorflow.keras.datasets import mnist

(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Use only the first `num_sample` images of each partition, flattened to one
# feature row per image.
num_sample = 500
Tr_set = X_train[:num_sample].reshape(num_sample, -1)
Ltr_set = y_train[:num_sample]

X_test_flat = X_test[:num_sample].reshape(num_sample, -1)
y_test_subset = y_test[:num_sample]

k_values = list(range(1, 11))

# num_folds is not passed here, so find_optimal_k_kmeans uses its default of
# 3 folds (the dataset helpers above pass num_folds=5).
best_k, best_accuracy, accuracies_for_k = find_optimal_k_kmeans(Tr_set, Ltr_set, k_values)
print('\nDataset Label: MNIST\n')
print(f"\nOptimal k: {best_k} with cross-validated accuracy: {best_accuracy:.4f}\n")
for k, accuracy in accuracies_for_k.items():
    # Each value is a one-element list holding the mean accuracy for k.
    print(f"With k = {k}: {accuracy[0]:.4f}")

# Refit on the training subset with the selected k and score the test subset.
y_pred = kmeans_predict(Tr_set, Ltr_set, X_test_flat, best_k)
test_accuracy = accuracy_score(y_test_subset, y_pred)

print(f"\nTest set accuracy with k={best_k}: {test_accuracy:.4f}")
print("=" * 50)


Dataset Label:  Iris 


Optimal k: 8 with cross-validated accuracy: 0.9667

With k = 1: 0.2417
With k = 2: 0.5917
With k = 3: 0.8667
With k = 4: 0.7917
With k = 5: 0.9250
With k = 6: 0.9250
With k = 7: 0.9250
With k = 8: 0.9667
With k = 9: 0.9667
With k = 10: 0.9667

Test set accuracy with k=8: 0.9667

Dataset Label:  Wine 


Optimal k: 7 with cross-validated accuracy: 0.7039

With k = 1: 0.4017
With k = 2: 0.6399
With k = 3: 0.6759
With k = 4: 0.6687
With k = 5: 0.6892
With k = 6: 0.6606
With k = 7: 0.7039
With k = 8: 0.6963
With k = 9: 0.6892
With k = 10: 0.6894

Test set accuracy with k=7: 0.7778

Dataset Label:  Breast Cancer 


Optimal k: 9 with cross-validated accuracy: 0.8945

With k = 1: 0.6286
With k = 2: 0.8396
With k = 3: 0.8615
With k = 4: 0.8132
With k = 5: 0.8659
With k = 6: 0.8747
With k = 7: 0.8681
With k = 8: 0.8681
With k = 9: 0.8945
With k = 10: 0.8923

Test set accuracy with k=9: 0.9035

Dataset Label:  Image Segmentation 


Optimal k: 10 with cross-validated accur