In [2]:
import pandas as pd

# Load the dataset and take a quick first look at its structure.
df = pd.read_csv("data_refined.csv")

print("Shape:")
print(df.shape)

print("\nColumns:")
print(df.columns)

print("\nFirst 5 rows:")
print(df.head())

print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
df.info()

Shape:
(569, 31)

Columns:
Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'diagnosis'],
      dtype='object')

First 5 rows:
   radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0     1.097064     -2.073335        1.269934   0.984375         1.568466   
1     1.829821     -0.353632        1.685955   1.908708        -0.826962   
2     1.579888      0.456187        1.566503   1.558884         0.942210   
3    -0.76

In [4]:
# ===============================
# Breast Cancer Campaign Mini Project
# Full-features vs Reduced-features comparison
# Dataset: data_refined.csv
# ===============================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

# -------------------------------
# 1) Load dataset
# -------------------------------
# Read the CSV and report its dimensions.
df = pd.read_csv("data_refined.csv")
print("Dataset shape:", df.shape)

# Fail fast when the target column is absent.
if "diagnosis" not in df.columns:
    raise ValueError("Column 'diagnosis' was not found. Please check your CSV file.")

# Target vector and the full feature table (every column except the target).
y = df["diagnosis"]
X_full = df.drop("diagnosis", axis=1)

# -------------------------------
# 2) Feature selection (Correlation)
# -------------------------------
# Rank features by the absolute value of their correlation with the target,
# strongest first, then keep those strictly above the threshold.
# NOTE(review): the ranking below is computed on every row, including rows
# that later land in the validation/test splits -- confirm this is acceptable.
threshold = 0.5
corr = X_full.corrwith(y).abs()
corr = corr.sort_values(ascending=False)
selected_features = corr.loc[corr.gt(threshold)].index.tolist()

print("\nSelected Important Features (corr > {:.2f}):".format(threshold))
print(selected_features)
print("Number of selected features:", len(selected_features))

# Reduced feature table: only the columns that passed the threshold.
X_reduced = X_full.loc[:, selected_features]

# -------------------------------
# 3) Split data (80 / 10 / 10) with stratify
# -------------------------------
def split_80_10_10(X, y, random_state=42):
    """Split ``X`` and ``y`` into train, validation and test parts (about 80/10/10).

    Two stratified ``train_test_split`` calls are chained: the first holds out
    10% of the data as the test part, the second takes 0.1111111111 of the
    remainder (10/90 of it, i.e. roughly 10% of the total) as the validation part.

    Parameters
    ----------
    X : features to split.
    y : labels; also used as the stratification target.
    random_state : seed forwarded to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    seed = {"random_state": random_state}

    # Stage 1: carve off the 10% test part.
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=0.10, stratify=y, **seed
    )

    # Stage 2: 10 points out of the remaining 90 -> 10/90 = 0.1111...
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=0.1111111111, stratify=y_rest, **seed
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

# -------------------------------
# 4) Choose best k using VALIDATION set (as asked)
# -------------------------------
def best_k_by_validation(X_train_scaled, y_train, X_val_scaled, y_val, k_values):
    """Pick the KNN neighbour count with the highest validation accuracy.

    For each candidate in ``k_values`` a ``KNeighborsClassifier`` is fitted on
    the training data and scored with ``accuracy_score`` on the validation data.
    A later candidate replaces the current best only when its accuracy is
    strictly higher, so on ties the earliest candidate wins.

    Parameters
    ----------
    X_train_scaled, y_train : training features and labels.
    X_val_scaled, y_val : validation features and labels.
    k_values : iterable of candidate ``n_neighbors`` values.

    Returns
    -------
    tuple
        ``(best_k, best_accuracy)``; ``(None, -1)`` when no candidate scored
        above the initial -1 (for example when ``k_values`` is empty).
    """
    def _validation_accuracy(n_neighbors):
        # Fit on the training part, score on the validation part.
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        model.fit(X_train_scaled, y_train)
        return accuracy_score(y_val, model.predict(X_val_scaled))

    winner = (None, -1)
    for candidate in k_values:
        score = _validation_accuracy(candidate)
        if score > winner[1]:
            winner = (candidate, score)

    return winner

# -------------------------------
# 5) Train + evaluate a set of models
# -------------------------------
def train_and_evaluate(X, y, title, random_state=42):
    """Fit KNN, Random Forest and SVC on one feature set and report test metrics.

    The data is split with ``split_80_10_10``, standardised with a scaler fitted
    on the training part only, and the KNN neighbour count is chosen on the
    validation part by ``best_k_by_validation`` from the odd values 1..31.
    All three models are then fitted on the scaled training part and scored on
    the scaled test part. Split sizes, the chosen ``k`` and each model's accuracy
    and confusion matrix are printed.

    Parameters
    ----------
    X : features (must expose ``.shape`` after splitting).
    y : labels.
    title : heading printed above the results.
    random_state : seed forwarded to the split and to the random forest.

    Returns
    -------
    dict
        Test accuracy keyed by ``"KNN"``, ``"Random Forest"`` and ``"SVC"``.
    """
    rule = "=" * 70
    print("\n" + rule)
    print(title)
    print(rule)

    X_train, X_val, X_test, y_train, y_val, y_test = split_80_10_10(X, y, random_state=random_state)
    print(f"Train size: {X_train.shape} | Val size: {X_val.shape} | Test size: {X_test.shape}")

    # Standardise every part with statistics learned from the training part only
    # (matters for KNN and SVC; makes no difference to the random forest).
    scaler = StandardScaler().fit(X_train)
    train_s, val_s, test_s = (scaler.transform(part) for part in (X_train, X_val, X_test))

    # Candidate neighbourhood sizes: the odd numbers 1, 3, ..., 31.
    k_values = [2 * i + 1 for i in range(16)]
    best_k, best_val_acc = best_k_by_validation(train_s, y_train, val_s, y_val, k_values)
    print(f"\nBest k (by validation): {best_k} | Best validation accuracy: {best_val_acc:.4f}")

    # Build the three estimators, then fit them in the listed order.
    models = {
        "KNN": KNeighborsClassifier(n_neighbors=best_k),
        "Random Forest": RandomForestClassifier(random_state=random_state),
        "SVC": SVC(),
    }
    for estimator in models.values():
        estimator.fit(train_s, y_train)

    print("\nFINAL TEST RESULTS")
    results = {}
    for name, estimator in models.items():
        predicted = estimator.predict(test_s)
        acc = accuracy_score(y_test, predicted)
        matrix = confusion_matrix(y_test, predicted)
        results[name] = acc

        print(f"\n{name} Accuracy: {acc:.4f}")
        print("Confusion Matrix:")
        print(matrix)

    return results

# -------------------------------
# Run BOTH: Full-features and Reduced-features (as asked)
# -------------------------------
# Evaluate the same model suite once per feature set.
results_full = train_and_evaluate(
    X_full, y, title="FULL FEATURES DATASET RESULTS", random_state=42
)
results_reduced = train_and_evaluate(
    X_reduced, y, title="REDUCED FEATURES DATASET RESULTS (Correlation-selected)", random_state=42
)

# Side-by-side summary of test accuracies, rounded to four decimals.
print("\n" + "=" * 70)
print("COMPARISON (Test Accuracy)")
print("=" * 70)
print("Full features:", dict((name, round(score, 4)) for name, score in results_full.items()))
print("Reduced features:", dict((name, round(score, 4)) for name, score in results_reduced.items()))

# Minimum required accuracy check (94%): compare the best model of each
# feature set against the bar.
min_required = 0.94
best_full, best_reduced = (max(scores.values()) for scores in (results_full, results_reduced))
print(f"\nMinimum required accuracy: {min_required:.2f}")
print(f"Best (Full): {best_full:.4f} | Pass? {best_full >= min_required}")
print(f"Best (Reduced): {best_reduced:.4f} | Pass? {best_reduced >= min_required}")

Dataset shape: (569, 31)

Selected Important Features (corr > 0.50):
['concave points_worst', 'perimeter_worst', 'concave points_mean', 'radius_worst', 'perimeter_mean', 'area_worst', 'radius_mean', 'area_mean', 'concavity_mean', 'concavity_worst', 'compactness_mean', 'compactness_worst', 'radius_se', 'perimeter_se', 'area_se']
Number of selected features: 15

FULL FEATURES DATASET RESULTS
Train size: (455, 30) | Val size: (57, 30) | Test size: (57, 30)

Best k (by validation): 3 | Best validation accuracy: 1.0000

FINAL TEST RESULTS

KNN Accuracy: 0.9474
Confusion Matrix:
[[35  1]
 [ 2 19]]

Random Forest Accuracy: 1.0000
Confusion Matrix:
[[36  0]
 [ 0 21]]

SVC Accuracy: 0.9825
Confusion Matrix:
[[36  0]
 [ 1 20]]

REDUCED FEATURES DATASET RESULTS (Correlation-selected)
Train size: (455, 15) | Val size: (57, 15) | Test size: (57, 15)

Best k (by validation): 3 | Best validation accuracy: 1.0000

FINAL TEST RESULTS

KNN Accuracy: 0.9123
Confusion Matrix:
[[33  3]
 [ 2 19]]

Random Fo