In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [3]:
# Load dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
columns = ['ID', 'Diagnosis'] + [f'feature_{i}' for i in range(1, 31)]  # There are 30 features
data = pd.read_csv(url, header=None, names=columns)

# Drop the 'ID' column
data.drop('ID', axis=1, inplace=True)

# Display the first few rows of the data
print(data.head())


  Diagnosis  feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0         M      17.99      10.38     122.80     1001.0    0.11840    0.27760   
1         M      20.57      17.77     132.90     1326.0    0.08474    0.07864   
2         M      19.69      21.25     130.00     1203.0    0.10960    0.15990   
3         M      11.42      20.38      77.58      386.1    0.14250    0.28390   
4         M      20.29      14.34     135.10     1297.0    0.10030    0.13280   

   feature_7  feature_8  feature_9  ...  feature_21  feature_22  feature_23  \
0     0.3001    0.14710     0.2419  ...       25.38       17.33      184.60   
1     0.0869    0.07017     0.1812  ...       24.99       23.41      158.80   
2     0.1974    0.12790     0.2069  ...       23.57       25.53      152.50   
3     0.2414    0.10520     0.2597  ...       14.91       26.50       98.87   
4     0.1980    0.10430     0.1809  ...       22.54       16.67      152.20   

   feature_24  feature_25  feature_26 

In [4]:
# Convert the diagnosis column to binary values
data['Diagnosis'] = data['Diagnosis'].map({'M': 1, 'B': 0})


In [5]:
# Separate features and target variable
X = data.drop('Diagnosis', axis=1)  # Features
y = data['Diagnosis']  # Target


In [6]:
# Separate features and target variable
X = data.drop('Diagnosis', axis=1)  # Features
y = data['Diagnosis']  # Target


In [7]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)


In [9]:
# Initialize KNN model with k=5 (you can experiment with different k values)
knn = KNeighborsClassifier(n_neighbors=5)

# Fit the model to the training data
knn.fit(X_train, y_train)


In [10]:
# Make predictions on the test set
y_pred = knn.predict(X_test)


In [11]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 95.91%
Confusion Matrix:
[[105   3]
 [  4  59]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97       108
           1       0.95      0.94      0.94        63

    accuracy                           0.96       171
   macro avg       0.96      0.95      0.96       171
weighted avg       0.96      0.96      0.96       171



In [12]:
# Trying different k values and checking accuracy
for k in range(1, 11):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'k={k}, Accuracy: {accuracy * 100:.2f}%')


k=1, Accuracy: 95.32%
k=2, Accuracy: 95.32%
k=3, Accuracy: 95.91%
k=4, Accuracy: 96.49%
k=5, Accuracy: 95.91%
k=6, Accuracy: 96.49%
k=7, Accuracy: 95.91%
k=8, Accuracy: 96.49%
k=9, Accuracy: 97.08%
k=10, Accuracy: 95.91%


In [13]:
new_instance = np.array([[15.0, 20.5, 90.1, 125.5, 0.1, 0.08, 0.05, 0.06, 0.18, 0.06,
                          0.5, 1.0, 3.2, 35.0, 0.002, 0.005, 0.003, 0.005, 0.02, 0.004,
                          18.0, 25.5, 100.5, 150.0, 0.15, 0.1, 0.07, 0.08, 0.3, 0.08]])


In [14]:
new_instance_scaled = scaler.transform(new_instance)




In [15]:
prediction = knn.predict(new_instance_scaled)


In [16]:
if prediction == 1:
    print("The tumor is predicted to be Malignant.")
else:
    print("The tumor is predicted to be Benign.")


The tumor is predicted to be Benign.
