In [79]:
# Core Libraries
import numpy as np                      # For numerical computations
import pandas as pd                     # For data manipulation and analysis

# Scikit-learn Libraries
from sklearn.model_selection import train_test_split, GridSearchCV  # For data splitting and hyperparameter tuning
from sklearn.neighbors import KNeighborsClassifier                  # KNN model
from sklearn.metrics import classification_report, accuracy_score   # For model evaluation

# Saving and Loading Models
import pickle  # To save and load the trained model



In [80]:
# Load data
data = pd.read_csv('Data/data.csv', index_col='id').reset_index(drop=True)
data.drop('Unnamed: 32', axis=1, inplace=True)

# Preview data
print('Dataframe shape:', data.shape)
data.head(3)

Dataframe shape: (569, 31)


Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [81]:
# Features and labels
X = data.drop('diagnosis', axis=1)
y = data['diagnosis']

# Encode target to binary
y = (y=='M').astype('int')

In [82]:
# Split dataset into training and testing data (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

In [83]:
import optuna

# Define the KNN model
knn = KNeighborsClassifier()

# Define the Optuna objective function
def objective(trial):
    # Suggest values for the hyperparameters
    n_neighbors = trial.suggest_int('n_neighbors', 1, 20)  # Range for k
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'minkowski'])
    p = trial.suggest_int('p', 1, 3) if metric == 'minkowski' else 2  # Only used for Minkowski
    algorithm = trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
    leaf_size = trial.suggest_int('leaf_size', 10, 50, step=10)
    
    # Create and train the model
    model = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        weights=weights,
        metric=metric,
        p=p,
        algorithm=algorithm,
        leaf_size=leaf_size
    )
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Return the accuracy as the objective to maximize
    return accuracy_score(y_test, y_pred)

# Create an Optuna study
study = optuna.create_study(direction='maximize')  # We want to maximize accuracy
study.optimize(objective, n_trials=50)  # Adjust n_trials for exploration depth

# Print the best parameters and accuracy
print("Best Parameters:", study.best_params)
print("Best Accuracy:", study.best_value)

[I 2024-12-19 22:08:57,577] A new study created in memory with name: no-name-7d765f4d-4e1d-4a3e-863f-a5f8037a4e05
[I 2024-12-19 22:08:57,591] Trial 0 finished with value: 0.8859649122807017 and parameters: {'n_neighbors': 4, 'weights': 'distance', 'metric': 'euclidean', 'algorithm': 'kd_tree', 'leaf_size': 50}. Best is trial 0 with value: 0.8859649122807017.
[I 2024-12-19 22:08:57,643] Trial 1 finished with value: 0.8859649122807017 and parameters: {'n_neighbors': 17, 'weights': 'uniform', 'metric': 'minkowski', 'p': 3, 'algorithm': 'auto', 'leaf_size': 40}. Best is trial 0 with value: 0.8859649122807017.
[I 2024-12-19 22:08:57,660] Trial 2 finished with value: 0.9035087719298246 and parameters: {'n_neighbors': 3, 'weights': 'uniform', 'metric': 'minkowski', 'p': 1, 'algorithm': 'kd_tree', 'leaf_size': 30}. Best is trial 2 with value: 0.9035087719298246.
[I 2024-12-19 22:08:57,690] Trial 3 finished with value: 0.8859649122807017 and parameters: {'n_neighbors': 14, 'weights': 'uniform',

Best Parameters: {'n_neighbors': 4, 'weights': 'uniform', 'metric': 'minkowski', 'p': 2, 'algorithm': 'kd_tree', 'leaf_size': 20}
Best Accuracy: 0.9122807017543859


In [84]:
import numpy as np
from concurrent.futures import ThreadPoolExecutor

class KNN:
    def __init__(self, k=3, metric='euclidean', p=1):
        self.k = k
        self.metric = metric
        self.p = p

    def fit(self, X, y):
        self.X_train = np.array(X, dtype=np.float32)  # Use float32 for smaller memory
        self.y_train = np.array(y, dtype=np.int32)    # Integer labels for indexing

    def _compute_distance(self, x1, x2):
        if self.metric == 'euclidean':
            return np.sqrt(np.sum((x1 - x2) ** 2))
        elif self.metric == 'manhattan':
            return np.sum(np.abs(x1 - x2))
        elif self.metric == 'minkowski':
            return np.sum(np.abs(x1 - x2) ** self.p) ** (1 / self.p)
        else:
            raise ValueError("Unsupported metric. Use 'euclidean', 'manhattan', or 'minkowski'.")

    def _get_neighbors(self, x):
        distances = [self._compute_distance(x, x_train) for x_train in self.X_train]
        neighbors_idx = np.argsort(distances)[:self.k]
        neighbors_distances = np.array(distances)[neighbors_idx]
        return neighbors_idx, neighbors_distances

    def _predict_single(self, x):
        neighbors_idx, neighbors_distances = self._get_neighbors(x)
        neighbor_labels = self.y_train[neighbors_idx]

        # Weighted voting
        weights = 1 / (neighbors_distances + 1e-5)  # Avoid division by zero
        weighted_votes = {}
        for label, weight in zip(neighbor_labels, weights):
            weighted_votes[label] = weighted_votes.get(label, 0) + weight

        # Return the class with the highest weighted vote
        return max(weighted_votes, key=weighted_votes.get)

    def predict(self, X_test):
        X_test = np.array(X_test, dtype=np.float32)
        predictions = [self._predict_single(x) for x in X_test]
        return np.array(predictions)

    def predict_parallel(self, X_test, num_workers=4):
        """Parallelize predictions across multiple CPU cores."""
        X_test_split = np.array_split(X_test, num_workers)
        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            results = list(executor.map(self.predict, X_test_split))
        return np.concatenate(results)


# def predict(self, X_test):
    #     X_test = np.array(X_test, dtype=float)
    #     predictions = []

    #     for x in X_test:
    #         distances = self.dist_metric(x, self.X_train)

    #         nearest_indices = np.argpartition(distances, self.k)[:self.k]
    #         nearest_distances = distances[nearest_indices]
    #         nearest_labels = self.y_train[nearest_indices]

    #         # Weighted voting: Use inverse of distance as weight
    #         weights = 1 / (nearest_distances + 1e-5)
    #         weighted_votes = {}
    #         for label, weight in zip(nearest_labels, weights):
    #             weighted_votes[label] = weighted_votes.get(label, 0) + weight

    #         predictions.append(max(weighted_votes, key=weighted_votes.get))

    #     return predictions

In [85]:
# Define values of p to test
p_values = [1, 2, 3, 4, 5]

# Compare performance
results = []

# Using your KNN implementation
for p in p_values:
    knn = KNN(k=3, metric='minkowski', p=p)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results.append((p, 'from_scratch', accuracy))

# Using scikit-learn
for p in p_values:
    knn_sklearn = KNeighborsClassifier(n_neighbors=3, metric='minkowski', p=p)
    knn_sklearn.fit(X_train, y_train)
    y_pred = knn_sklearn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results.append((p, 'scikit_learn', accuracy))

# Display results
print("Results for different p values in Minkowski metric:")
for p, method, accuracy in results:
    print(f"p={p}, Method={method}, Accuracy={accuracy:.4f}")

Results for different p values in Minkowski metric:
p=1, Method=from_scratch, Accuracy=0.9035
p=2, Method=from_scratch, Accuracy=0.8772
p=3, Method=from_scratch, Accuracy=0.8596
p=4, Method=from_scratch, Accuracy=0.8596
p=5, Method=from_scratch, Accuracy=0.8596
p=1, Method=scikit_learn, Accuracy=0.9035
p=2, Method=scikit_learn, Accuracy=0.8772
p=3, Method=scikit_learn, Accuracy=0.8596
p=4, Method=scikit_learn, Accuracy=0.8596
p=5, Method=scikit_learn, Accuracy=0.8596


In [86]:


# Define metrics to test
distance_metrics = ['euclidean', 'manhattan', 'minkowski', 'cosine', 'chebyshev', 'hamming']

# Iterate through the distance metrics and test the KNN model
for metric in distance_metrics:
    print(f"Testing KNN with {metric} distance metric:")
    
    # Create an instance of the KNN class
    knn = KNN(k=5, metric=metric)  # k=5 is an example; you can tune it
    
    # Fit the model
    knn.fit(X_train, y_train)
    
    # Make predictions
    y_pred = knn.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("-" * 50)


Testing KNN with euclidean distance metric:
Accuracy: 0.88
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.89      0.91        80
           1       0.76      0.85      0.81        34

    accuracy                           0.88       114
   macro avg       0.85      0.87      0.86       114
weighted avg       0.88      0.88      0.88       114

--------------------------------------------------
Testing KNN with manhattan distance metric:
Accuracy: 0.91
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94        80
           1       0.85      0.85      0.85        34

    accuracy                           0.91       114
   macro avg       0.90      0.90      0.90       114
weighted avg       0.91      0.91      0.91       114

--------------------------------------------------
Testing KNN with minkowski distance metric:
Accuracy: 0.91
Classification Report