In [5]:
import numpy as np
import pandas as pd
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [4]:


# Pull the breast-cancer dataset as pandas objects and work on a private copy
# of the combined features+target frame.
data = load_breast_cancer(as_frame=True)
df = data.frame.copy()

print("Dataset shape:", df.shape)
print("Number of features:", len(data.feature_names))
print("Class distribution:\n", df["target"].value_counts())

# Separate the label column from the predictors.
y = df["target"]
X = df.drop(columns="target")

# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class ratio the same in both partitions, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print("\nTrain shape:", X_train.shape)
print("Test shape:", X_test.shape)


Dataset shape: (569, 31)
Number of features: 30
Class distribution:
 target
1    357
0    212
Name: count, dtype: int64

Train shape: (455, 30)
Test shape: (114, 30)


In [9]:




# XGBoost is an optional dependency: record whether it could be imported so
# later code can decide whether to include it.
try:
    from xgboost import XGBClassifier
except ImportError:
    xgb_available = False
    print("⚠️ XGBoost not installed. Install with: pip install xgboost")
else:
    xgb_available = True

def evaluate_model(model, X_test, y_test):
    """Score a fitted binary classifier on held-out data.

    Parameters
    ----------
    model
        A fitted estimator exposing ``predict`` and, optionally,
        ``predict_proba`` or ``decision_function``.
    X_test
        Feature matrix passed to the estimator's prediction methods.
    y_test
        True labels aligned with ``X_test``.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"AUC"``, ``"Precision"``, ``"Recall"``,
        ``"F1"`` and ``"MCC"`` mapped to the corresponding metric values.
    """
    y_pred = model.predict(X_test)

    # AUC is a ranking metric, so it should be fed a continuous score.
    # Preference order: class-1 probability, then the raw decision margin,
    # and only as a last resort the hard labels (which collapse the ROC curve
    # to a single operating point).
    if hasattr(model, "predict_proba"):
        # Column 1 is taken as the positive class — assumes labels are {0, 1}.
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = y_pred

    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_score),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }

# Preprocessing (scaling)
# `scaler` stays bound at module level and is the scaling step of the
# Logistic Regression pipeline.
scaler = StandardScaler()

# Registry of candidate classifiers keyed by display name.
# Pipeline does not clone its steps, so putting the same scaler object into
# two pipelines would make them share (and overwrite) one fitted state.
# KNN therefore gets its own StandardScaler instance.
models = {
    "Logistic Regression": Pipeline([
        ("scaler", scaler),
        ("model", LogisticRegression(max_iter=500))
    ]),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": Pipeline([
        ("scaler", StandardScaler()),
        ("model", KNeighborsClassifier(n_neighbors=5))
    ]),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42)
}

# XGBoost is only registered when its import succeeded.
if xgb_available:
    models["XGBoost"] = XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="logloss",
        random_state=42
    )

# Fit every registered model on the training split and record its held-out
# metrics, tagging each record with the model's display name.
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    metrics = {**evaluate_model(model, X_test, y_test), "Model": name}
    results.append(metrics)

# One row per model, indexed by display name.
results_df = pd.DataFrame(results)
results_df = results_df.set_index("Model")
print(results_df)
import joblib

# Map file stems to estimators. The stems are the display names with spaces
# replaced by underscores (e.g. "Logistic Regression" -> "Logistic_Regression").
# Building the map from `models` itself means XGBoost is included only when it
# was actually registered, instead of raising KeyError when it is absent.
model_files = {
    name.replace(" ", "_"): model for name, model in models.items()
}

# The estimators were already fitted by the training loop earlier in this
# cell, so they are serialized as-is rather than being trained a second time.
for file_name, model in model_files.items():
    joblib.dump(model, f"{file_name}.pkl")

print("✅ All models saved as .pkl files!")



                     Accuracy       AUC  Precision    Recall        F1  \
Model                                                                    
Logistic Regression  0.982456  0.995370   0.986111  0.986111  0.986111   
Decision Tree        0.912281  0.915675   0.955882  0.902778  0.928571   
KNN                  0.956140  0.978836   0.958904  0.972222  0.965517   
Naive Bayes          0.938596  0.987765   0.945205  0.958333  0.951724   
Random Forest        0.956140  0.993056   0.958904  0.972222  0.965517   
XGBoost              0.956140  0.995040   0.946667  0.986111  0.965986   

                          MCC  
Model                          
Logistic Regression  0.962302  
Decision Tree        0.817412  
KNN                  0.905447  
Naive Bayes          0.867553  
Random Forest        0.905447  
XGBoost              0.905824  
✅ All models saved as .pkl files!


In [10]:
import joblib

# Refit each registered estimator on the training split, then pickle it to
# "<display name with spaces turned into underscores>.pkl".
for name, model in models.items():
    model.fit(X_train, y_train)
    pkl_path = name.replace(' ', '_') + ".pkl"
    joblib.dump(model, pkl_path)

print("Models saved successfully!")


Models saved successfully!


In [11]:
# Attach the held-out labels to a copy of the held-out features (as a trailing
# "target" column) and write the result to CSV without the index.
test_df = X_test.assign(target=y_test.values)
test_df.to_csv("breast_cancer_test.csv", index=False)

print("✅ Saved breast_cancer_test.csv (use this to test the app)")


✅ Saved breast_cancer_test.csv (use this to test the app)
