In [17]:
# 1. Import Libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# To clean up warnings
warnings.filterwarnings('ignore')


In [19]:
# 2. Load and Inspect Data
# The CSV location can be overridden with the BREAST_CANCER_CSV environment
# variable so the notebook is runnable on other machines; when the variable is
# unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "BREAST_CANCER_CSV",
    "C:/Users/punit/OneDrive/Desktop/ACM-30Days/breast-cancer.csv",
)

df = pd.read_csv(DATA_PATH)
print("Shape:", df.shape)
print(df.head())

# Map target variable: 'M' -> 1, 'B' -> 0
diagnosis_codes = df['diagnosis'].map({'M': 1, 'B': 0})

# Series.map yields NaN for any value that is not a key of the mapping.
# Fail loudly here instead of letting NaN targets reach the train/test split
# and the classifiers in later cells.
unmapped_labels = df.loc[diagnosis_codes.isna(), 'diagnosis']
if not unmapped_labels.empty:
    raise ValueError(
        "Unexpected values in 'diagnosis' column (expected only 'M' or 'B'): "
        f"{sorted(unmapped_labels.astype(str).unique())}"
    )
df['diagnosis'] = diagnosis_codes

# Drop ID column if exists (errors='ignore' makes this a no-op when absent);
# reassigning instead of inplace=True keeps the cell free of in-place mutation.
df = df.drop(columns='id', errors='ignore')

print("\nTarget Distribution:")
print(df['diagnosis'].value_counts())


Shape: (569, 32)
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  radius_worst  texture_worst  perimete

In [21]:
# 3. Preprocess (Train/Test Split & Scaling)
# Separate the target column from the remaining feature columns.
y = df['diagnosis']
X = df.drop(columns=['diagnosis'])

# Hold out 20% of the rows for testing; the split is stratified on the target
# and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [23]:
# 4. Train & Evaluate Models
# Define Models: three ensemble classifiers, each seeded with random_state=42.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    # NOTE(review): the `algorithm` argument of AdaBoostClassifier is
    # version-sensitive in scikit-learn — confirm against the installed release.
    'AdaBoost': AdaBoostClassifier(algorithm='SAMME', random_state=42),
    # `use_label_encoder` was dropped from this call: it is a deprecated
    # XGBoost parameter that newer releases ignore and report as unused.
    'XGBoost': xgb.XGBClassifier(eval_metric='logloss', random_state=42),
}

# Dictionary to store accuracy, keyed by model display name
results = {}

# Train each model on the scaled training split and score it on the scaled
# test split.
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    results[name] = acc

    print(f"\n=== {name} ===")
    print(f" Accuracy: {acc:.4f}")
    print(" Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print(" Classification Report:\n", classification_report(y_test, y_pred))



=== Random Forest ===
 Accuracy: 0.9737
 Confusion Matrix:
 [[72  0]
 [ 3 39]]
 Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98        72
           1       1.00      0.93      0.96        42

    accuracy                           0.97       114
   macro avg       0.98      0.96      0.97       114
weighted avg       0.97      0.97      0.97       114


=== AdaBoost ===
 Accuracy: 0.9825
 Confusion Matrix:
 [[72  0]
 [ 2 40]]
 Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99        72
           1       1.00      0.95      0.98        42

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114


=== XGBoost ===
 Accuracy: 0.9737
 Confusion Matrix:
 [[72  0]
 [ 3 39]]
 Classification Report:
               precision    recall  f1-scor

In [25]:
# 5. Comparison Table
# Turn the {model name: accuracy} mapping into a two-column frame and order it
# from highest to lowest accuracy.
accuracy_rows = [(model_name, accuracy) for model_name, accuracy in results.items()]
comparison = pd.DataFrame(accuracy_rows, columns=["Model", "Accuracy"]).sort_values(
    by="Accuracy", ascending=False
)

print("Final Model Comparison:")

# Shade the table with a blue gradient and show accuracy to four decimals.
styled_comparison = comparison.style.background_gradient(cmap='Blues')
styled_comparison = styled_comparison.format({"Accuracy": "{:.4f}"})
display(styled_comparison)


Final Model Comparison:


Unnamed: 0,Model,Accuracy
1,AdaBoost,0.9825
0,Random Forest,0.9737
2,XGBoost,0.9737
