In [17]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

In [2]:
# Load the breast-cancer dataset bundled with scikit-learn.
data = load_breast_cancer()

# Feature matrix and class labels (0 = malignant, 1 = benign).
X, y = data.data, data.target


In [5]:

# Hold out 20% of the samples for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a seeded random forest (otherwise default settings) on the training portion.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [6]:
# Predictions on the held-out set: hard class labels, plus the probability
# assigned to class 1 (column 1 of predict_proba).
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Metrics: accuracy from the hard labels, ROC AUC from the class-1 probabilities.
accuracy = accuracy_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_proba)

# Report everything in one place (both scores are rounded to two decimals).
print(f"Model accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))
print(f'AUC-ROC Score: {auc_score:.2f}')

Model accuracy: 0.96
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

AUC-ROC Score: 1.00


In [9]:
# Per-feature importance scores exposed by the fitted forest.
importance = model.feature_importances_

# Pair each score with its feature name and rank from most to least important.
feature_importance_df = (
    pd.DataFrame({'Feature': data.feature_names, 'Importance': importance})
    .sort_values(by='Importance', ascending=False)
)

# Keep the five highest-ranked rows and show them.
top_5_features = feature_importance_df.head(5)
print("Top 5 Most Important Features:")
print(top_5_features)

Top 5 Most Important Features:
                 Feature  Importance
23            worst area    0.153892
27  worst concave points    0.144663
7    mean concave points    0.106210
20          worst radius    0.077987
6         mean concavity    0.068001


In [18]:
# Take the feature names straight from the importance ranking computed earlier
# (`top_5_features`) instead of a hand-copied list, so the selection cannot go
# stale if the model, seed, or split changes and the ranking shifts.
selected_features = top_5_features['Feature'].tolist()

# Reload the dataset; `data` is rebound to this fresh copy.
data = load_breast_cancer()

# NOTE: `X` is rebound here to only the selected columns. `X_train` / `X_test`
# from the earlier split still hold the full feature set until the split is redone.
# `copy=True` yields an independent array rather than a view of the temporary frame.
X = (
    pd.DataFrame(data.data, columns=data.feature_names)[selected_features]
    .to_numpy(copy=True)
)
y = data.target  # Labels (0 = malignant, 1 = benign)

In [21]:
# Preview the selected columns as a labelled table (rich display of the last expression).
pd.DataFrame(data.data, columns=data.feature_names).loc[:, selected_features]

Unnamed: 0,worst area,worst concave points,mean concave points,worst radius,mean concavity
0,2019.0,0.2654,0.14710,25.380,0.30010
1,1956.0,0.1860,0.07017,24.990,0.08690
2,1709.0,0.2430,0.12790,23.570,0.19740
3,567.7,0.2575,0.10520,14.910,0.24140
4,1575.0,0.1625,0.10430,22.540,0.19800
...,...,...,...,...,...
564,2027.0,0.2216,0.13890,25.450,0.24390
565,1731.0,0.1628,0.09791,23.690,0.14400
566,1124.0,0.1418,0.05302,18.980,0.09251
567,1821.0,0.2650,0.15200,25.740,0.35140
