In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import time
import ray

In [10]:
ray.init()

RuntimeError: Maybe you called ray.init twice by accident? This error can be suppressed by passing in 'ignore_reinit_error=True' or by calling 'ray.shutdown()' prior to 'ray.init()'.

In [None]:
# Loading dataset from CSV
def load_dataset_from_csv(file_path):
    data = pd.read_csv(file_path)
    X = data.iloc[:, :-1].values  # all columns except the last one as features
    y = data.iloc[:, -1].values   # the last column as labels
    return X, y

In [None]:
@ray.remote
def knn_worker(X_train_subset, y_train_subset, X_test, n_neighbors):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train_subset, y_train_subset)  # Train on a subset of the data
    y_pred = knn.predict(X_test)  # Predict on the full test set
    return y_pred

In [None]:
# Parallel KNN using Ray
def train_and_time_knn_parallel(X_train, y_train, X_test, y_test, n_neighbors=3, num_actors=5):
    # Split the training data across multiple actors
    X_train_splits = np.array_split(X_train, num_actors)
    y_train_splits = np.array_split(y_train, num_actors)

    start_time = time.time()

    # Launch parallel KNN training on multiple actors
    futures = [knn_worker.remote(X_train_splits[i], y_train_splits[i], X_test, n_neighbors)
               for i in range(num_actors)]
    
    # Aggregate predictions from all actors
    all_predictions = ray.get(futures)

    # Combine predictions using majority voting 
    y_pred_combined = mode(all_predictions, axis=0)[0].flatten().astype(int)

    # End timing
    end_time = time.time()

    # Calculate accuracy and time taken
    accuracy = accuracy_score(y_test, y_pred_combined)
    time_taken = end_time - start_time

    return accuracy, time_taken

In [None]:
X, y = load_dataset_from_csv('classification_dataset1.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
accuracy, time_taken = train_and_time_knn_parallel(X_train, y_train, X_test, y_test)
print(f"KNN (with Ray) -> Accuracy: {accuracy * 100:.2f}%, Time taken: {time_taken:.4f} seconds")

  y_pred_combined = mode(all_predictions, axis=0)[0].flatten().astype(int)


KNN (with Ray) -> Accuracy: 85.63%, Time taken: 11.5982 seconds
