<a href="https://colab.research.google.com/github/shifa441/D_M/blob/main/DATA_MINING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [109]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                            f1_score, confusion_matrix, classification_report
)


In [110]:
# Read the raw dataset from the working directory.
# Dataset.csv must be uploaded manually when running in Colab/Notebook.
data = pd.read_csv("Dataset.csv")

# Quick sanity check: dimensions and the list of available columns.
dataset_shape = data.shape
column_names = list(data.columns)
print("Dataset Shape:", dataset_shape)
print("Columns:", column_names)


Dataset Shape: (569, 33)
Columns: ['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32']


In [111]:
# Encode the target label as an integer: malignant ("M") -> 1, benign ("B") -> 0.
diagnosis_map = {"M": 1, "B": 0}
diagnosis_encoded = data["diagnosis"].map(diagnosis_map)

# Series.map yields NaN for any value missing from the mapping (unexpected
# spelling, stray whitespace, missing label). Fail here with a clear message
# instead of letting NaN targets reach the stratified split / classifier.
unmapped_rows = diagnosis_encoded.isna()
if unmapped_rows.any():
    bad_labels = data.loc[unmapped_rows, "diagnosis"].unique().tolist()
    raise ValueError(
        f"Unexpected diagnosis label(s) {bad_labels}; "
        f"expected only {sorted(diagnosis_map)}"
    )

# With no NaN present the column can safely be stored as integers.
data["diagnosis_encoded"] = diagnosis_encoded.astype("int64")
print("\nEncoded Diagnosis Counts:")
print(data["diagnosis_encoded"].value_counts())


Encoded Diagnosis Counts:
diagnosis_encoded
0    357
1    212
Name: count, dtype: int64


In [112]:
# Collect identifier columns ("id", any casing) and pandas' auto-generated
# "Unnamed: N" columns; neither carries information useful for modelling.
columns_to_drop = []
for column_name in data.columns:
    lowered_name = column_name.lower()
    if lowered_name == "id" or "unnamed" in lowered_name:
        columns_to_drop.append(column_name)

# errors="ignore" keeps this cell safe to re-run after the columns are gone.
data = data.drop(columns=columns_to_drop, errors="ignore")
print("\nShape After Dropping Columns:", data.shape)


Shape After Dropping Columns: (569, 32)


In [113]:
# Target vector: the integer-encoded diagnosis column.
y = data["diagnosis_encoded"]

# Feature matrix: everything except the raw and encoded label columns.
label_columns = ["diagnosis", "diagnosis_encoded"]
feature_frame = data.drop(columns=label_columns, errors="ignore")

# Discard any remaining non-numeric columns so that only numeric features
# are handed to the scaler and the models.
non_numeric = feature_frame.select_dtypes(exclude=[np.number]).columns
X = feature_frame.drop(columns=non_numeric, errors="ignore")

In [114]:
# Rescale every feature to the [0, 1] range. The scaler is fitted on all rows
# of X, and the result is wrapped back into a DataFrame (with a fresh default
# index) so the original column names are preserved.
scaler = MinMaxScaler()
scaler.fit(X)
scaled_values = scaler.transform(X)
X_scaled = pd.DataFrame(scaled_values, columns=X.columns)

In [115]:
# Hold out 20% of the rows for testing, stratified on the label so both
# partitions keep the same benign/malignant ratio.
# The split is taken on the *unscaled* features: fitting the scaler before
# splitting would let the test rows' min/max leak into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

# Fit the min-max scaler on the training rows only, then apply the same
# transformation to the held-out rows. Test values may therefore fall slightly
# outside [0, 1], which is expected for unseen data.
split_scaler = MinMaxScaler()
X_train = pd.DataFrame(
    split_scaler.fit_transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)


In [116]:
# Unsupervised check: partition the scaled features into two clusters and see
# how well the clusters line up with the known diagnosis labels.
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_scaled)

# Contingency table: rows are cluster ids, columns are the actual diagnosis.
cluster_table = pd.crosstab(cluster_labels, y)
print("\nK-Means Clustering vs Actual Diagnosis:")
print(cluster_table)

# Cluster purity: for each cluster take the count of its majority class,
# add those counts up and divide by the total number of samples in the table.
majority_total = cluster_table.max(axis=1).sum()
sample_total = cluster_table.to_numpy().sum()
purity = majority_total / sample_total
print("\nCluster Purity:", purity)


K-Means Clustering vs Actual Diagnosis:
diagnosis_encoded    0    1
row_0                      
0                    9  180
1                  348   32

Cluster Purity: 0.9279437609841827


In [120]:
# Train a 5-nearest-neighbours classifier on the training partition
# (fit returns the estimator itself, so the call can be chained).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predicted labels for the held-out test partition.
y_pred = knn.predict(X_test)

In [118]:
# Headline scores for the KNN predictions on the test partition.
print("\n KNN Model Performance:")
for score_label, score_fn in (
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1-score :", f1_score),
):
    print(score_label, score_fn(y_test, y_pred))

# Detailed breakdowns: raw confusion counts, then the per-class report.
for section_title, summary_fn in (
    ("\nConfusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
):
    print(section_title)
    print(summary_fn(y_test, y_pred))


 KNN Model Performance:
Accuracy : 0.9649122807017544
Precision: 1.0
Recall   : 0.9047619047619048
F1-score : 0.95

Confusion Matrix:
[[72  0]
 [ 4 38]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        72
           1       1.00      0.90      0.95        42

    accuracy                           0.96       114
   macro avg       0.97      0.95      0.96       114
weighted avg       0.97      0.96      0.96       114

