In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib # For saving the model and scaler


In [5]:
cancer = load_breast_cancer()
X = pd.DataFrame(cancer.data, columns=cancer.feature_names)
y = cancer.target # 0 for malignant, 1 for benign

In [4]:
print(X.head())

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst radius  worst texture  worst perimeter  \
0           

In [3]:
print("--- Dataset Information ---")
print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print(f"Target names: {cancer.target_names}")
print(f"Feature names: {list(X.columns)}")
print("\nFirst 5 rows of features (X):")

print("\nTarget distribution:")

--- Dataset Information ---
Features (X) shape: (569, 30)
Target (y) shape: (569,)
Target names: ['malignant' 'benign']
Feature names: ['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean concave points', 'mean symmetry', 'mean fractal dimension', 'radius error', 'texture error', 'perimeter error', 'area error', 'smoothness error', 'compactness error', 'concavity error', 'concave points error', 'symmetry error', 'fractal dimension error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst smoothness', 'worst compactness', 'worst concavity', 'worst concave points', 'worst symmetry', 'worst fractal dimension']

First 5 rows of features (X):

Target distribution:


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (455, 30)
X_test shape: (114, 30)
y_train shape: (455,)
y_test shape: (114,)


In [8]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert scaled arrays back to DataFrames for easier inspection
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)


print(X_train_scaled_df.head())


   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0    -1.072001     -0.658425       -1.088080  -0.939274        -0.135940   
1     1.748743      0.066502        1.751157   1.745559         1.274468   
2    -0.974734     -0.931124       -0.997709  -0.867589        -0.613515   
3    -0.145103     -1.215186       -0.123013  -0.253192         0.664482   
4    -0.771617     -0.081211       -0.803700  -0.732927        -0.672282   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0         -1.008718       -0.968359            -1.102032       0.281062   
1          0.842288        1.519852             1.994664      -0.293045   
2         -1.138154       -1.092292            -1.243358       0.434395   
3          0.286762       -0.129729            -0.098605       0.555635   
4         -1.006099       -0.798502            -0.684484       0.737495   

   mean fractal dimension  ...  worst radius  worst texture  worst perimeter  \
0           

In [9]:

# use RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
# n_estimators: number of trees in the forest
# random_state: for reproducibility


print("\n--- Training Model ---")
model.fit(X_train_scaled, y_train)
print("Model training complete.")



--- Training Model ---
Model training complete.


In [10]:
y_pred_test = model.predict(X_test_scaled)
y_pred_train = model.predict(X_train_scaled)

In [11]:
# testing accurecy
test_accuracy = accuracy_score(y_test, y_pred_test)
train_accuracy = accuracy_score(y_train, y_pred_train)
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

Training Accuracy: 1.0000
Test Accuracy: 0.9474


In [12]:
cm = confusion_matrix(y_test, y_pred_test)
print(cm)
print(f"True Malignant: {cm[0,0]}")
print(f"False Positives (Predicted Malignant, Actually Benign): {cm[1,0]}")
print(f"False Negatives (Predicted Benign, Actually Malignant): {cm[0,1]}") # This is critical to minimize
print(f"True Benign: {cm[1,1]}")


[[39  3]
 [ 3 69]]
True Malignant: 39
False Positives (Predicted Malignant, Actually Benign): 3
False Negatives (Predicted Benign, Actually Malignant): 3
True Benign: 69


In [None]:
model_filename = 'breast_cancer_rfc_model.joblib'
scaler_filename = 'breast_cancer_scaler.joblib'

joblib.dump(model, model_filename)
joblib.dump(scaler, scaler_filename)

print(f"\nModel saved as: {model_filename}")
print(f"Scaler saved as: {scaler_filename}")
print("\n--- Script Finished ---")



Model saved as: breast_cancer_rfc_model.joblib
Scaler saved as: breast_cancer_scaler.joblib

--- Script Finished ---
