## KNN ABLATION STUDY


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 1. Load Data
df = pd.read_csv(r"C:\Users\rithy\OneDrive\Desktop\python\archive\data.csv") 

# 2. Drop the useless columns
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

# Clean strings and map diagnosis
df['diagnosis'] = df['diagnosis'].astype(str).str.strip()
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

# Drop rows where diagnosis couldn't be mapped
df = df.dropna(subset=['diagnosis'])

# 3. Prepare X and y
X = df.drop(columns=['diagnosis'])
y = df['diagnosis']

# 4. Scale features (Critical for KNN)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 5. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 6. Initial KNN Model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# 7. Record Original Metrics
original_metrics = {
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred),
    'Recall': recall_score(y_test, y_pred),
    'F1 Score': f1_score(y_test, y_pred)
}

print("ORIGINAL METRICS")
print(pd.DataFrame(original_metrics, index=['Baseline']).round(4))

ORIGINAL METRICS
          Accuracy  Precision  Recall  F1 Score
Baseline    0.9474     0.9302  0.9302    0.9302


In [2]:
ablation_results = []

for feature in X.columns:
    # 1. Drop feature and scale in one flow
    X_temp = X.drop(columns=[feature])
    X_temp_scaled = scaler.fit_transform(X_temp)
    
    # 2. Consistent Split
    X_tr, X_te, y_tr, y_te = train_test_split(X_temp_scaled, y, test_size=0.2, random_state=42)
    
    # 3. Fit & Predict
    knn_temp = KNeighborsClassifier(n_neighbors=5).fit(X_tr, y_tr)
    y_pr = knn_temp.predict(X_te)
    
    # 4. Store metrics
    ablation_results.append({
        'Removed Feature': feature,
        'Accuracy': accuracy_score(y_te, y_pr),
        'Precision': precision_score(y_te, y_pr),
        'Recall': recall_score(y_te, y_pr),
        'F1 Score': f1_score(y_te, y_pr)
    })

# Create DataFrame and calculate Drop
ablation_df = pd.DataFrame(ablation_results)
ablation_df['Acc Drop'] = original_metrics['Accuracy'] - ablation_df['Accuracy']

# Reorder and Sort
full_report = ablation_df[['Removed Feature', 'Accuracy', 'Acc Drop', 'Precision', 'Recall', 'F1 Score']]
full_report = full_report.sort_values(by='Acc Drop', ascending=False)

# Display as DataFrame
full_report

Unnamed: 0,Removed Feature,Accuracy,Acc Drop,Precision,Recall,F1 Score
0,radius_mean,0.947368,0.0,0.930233,0.930233,0.930233
2,perimeter_mean,0.947368,0.0,0.930233,0.930233,0.930233
3,area_mean,0.947368,0.0,0.930233,0.930233,0.930233
8,symmetry_mean,0.947368,0.0,0.930233,0.930233,0.930233
7,concave points_mean,0.947368,0.0,0.930233,0.930233,0.930233
6,concavity_mean,0.947368,0.0,0.930233,0.930233,0.930233
10,radius_se,0.947368,0.0,0.930233,0.930233,0.930233
11,texture_se,0.947368,0.0,0.930233,0.930233,0.930233
13,area_se,0.947368,0.0,0.930233,0.930233,0.930233
9,fractal_dimension_mean,0.947368,0.0,0.930233,0.930233,0.930233


### Observations :
No single feature removal caused a performance drop, revealing that the dataset is highly **redundant** because geometric features like radius and area provide **overlapping information**. Also, several features acted as **noise**, meaning their removal actually sharpened the model's focus and **boosted accuracy**. 
This demonstrates that for **KNN**, a smaller, cleaner set of features is often more effective than a large, redundant one.

In [3]:
# 1. Sort by Acc Drop ascending (most negative = best improvement)
top_10_noisy = ablation_df.sort_values(by='Acc Drop').head(10)

# 2. Select and organize all metrics for the report
noisy_report = top_10_noisy[[
    'Removed Feature', 
    'Accuracy', 
    'Acc Drop', 
    'Precision', 
    'Recall', 
    'F1 Score'
]]

# 3. Extract the list of names for your final model training
noisy_feature_list = top_10_noisy['Removed Feature'].tolist()

# 4. Display as a DataFrame
print("TOP 10 NOISY FEATURES (REMOVAL IMPROVES METRICS)")
noisy_report

TOP 10 NOISY FEATURES (REMOVAL IMPROVES METRICS)


Unnamed: 0,Removed Feature,Accuracy,Acc Drop,Precision,Recall,F1 Score
4,smoothness_mean,0.964912,-0.017544,0.953488,0.953488,0.953488
18,symmetry_se,0.964912,-0.017544,0.953488,0.953488,0.953488
28,symmetry_worst,0.964912,-0.017544,0.97561,0.930233,0.952381
1,texture_mean,0.95614,-0.008772,0.952381,0.930233,0.941176
19,fractal_dimension_se,0.95614,-0.008772,0.931818,0.953488,0.942529
24,smoothness_worst,0.95614,-0.008772,0.952381,0.930233,0.941176
5,compactness_mean,0.95614,-0.008772,0.952381,0.930233,0.941176
12,perimeter_se,0.95614,-0.008772,0.931818,0.953488,0.942529
27,concave points_worst,0.95614,-0.008772,0.952381,0.930233,0.941176
25,compactness_worst,0.95614,-0.008772,0.952381,0.930233,0.941176


In [4]:
# 1. Drop the 10 noisy features
X_optimized = X.drop(columns=noisy_feature_list)

# 2. Re-scale the reduced feature set
X_opt_scaled = scaler.fit_transform(X_optimized)

# 3. Split the data
X_train_o, X_test_o, y_train_o, y_test_o = train_test_split(
    X_opt_scaled, y, test_size=0.2, random_state=42
)

# 4. Train and Evaluate
knn_final = KNeighborsClassifier(n_neighbors=5)
knn_final.fit(X_train_o, y_train_o)
y_pred_o = knn_final.predict(X_test_o)

# 5. Final results in a DataFrame
print("METRICS WITHOUT NOISY FEATURES")
final_results_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'Value': [
        accuracy_score(y_test_o, y_pred_o),
        precision_score(y_test_o, y_pred_o),
        recall_score(y_test_o, y_pred_o),
        f1_score(y_test_o, y_pred_o)
    ]
})

final_results_df

METRICS WITHOUT NOISY FEATURES


Unnamed: 0,Metric,Value
0,Accuracy,0.973684
1,Precision,0.97619
2,Recall,0.953488
3,F1 Score,0.964706


### KNN FROM SCRATCH

In [9]:
import numpy as np
from collections import Counter

def euclidean_distance(x1, x2):
    distance = np.sqrt(np.sum((x1-x2)**2))
    return distance

class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = [self._predict(x) for x in X]
        return predictions

    def _predict(self, x):
        # 1. Compute distances between x and all examples in the training set
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
    
        # 2. Sort by distance and return indices of the first k neighbors
        k_indices = np.argsort(distances)[:self.k]
        
        # 3. Extract the labels of the k nearest neighbor training samples
        # Using iloc or just simple indexing if it's a numpy array
        k_nearest_labels = [self.y_train[i] for i in k_indices]

        # 4. Majority vote, most common class label
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

In [14]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import StandardScaler


X = df.drop(columns=['diagnosis']).values 
y = df['diagnosis'].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

clf = KNN(k=5)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Convert your scratch predictions to a standard NumPy array
y_pred_scratch = np.array(predictions)

# Calculate the four metrics
metrics_scratch = {
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'Value': [
        accuracy_score(y_test, y_pred_scratch),
        precision_score(y_test, y_pred_scratch),
        recall_score(y_test, y_pred_scratch),
        f1_score(y_test, y_pred_scratch)
    ]
}

# Display in a clean DataFrame
scratch_results_df = pd.DataFrame(metrics_scratch)

print("SCRATCH KNN MODEL PERFORMANCE")
scratch_results_df

SCRATCH KNN MODEL PERFORMANCE


Unnamed: 0,Metric,Value
0,Accuracy,0.929825
1,Precision,0.911111
2,Recall,0.911111
3,F1 Score,0.911111
