# Random Forest On Breast Cancer Dataset

In [1]:
# Step 1: Imports

import numpy as np 
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix

In [3]:
# Step 2: Load the dataset

# The loader's return value exposes the feature matrix (.data),
# the labels (.target) and the class names (.target_names).
data = load_breast_cancer()
X, y = data.data, data.target

# Quick look at what was loaded: array shapes and class names
print("Features Shape:", X.shape)
print("Labels shape:", y.shape)
print("Classes", data.target_names)

Features Shape: (569, 30)
Labels shape: (569,)
Classes ['malignant' 'benign']


In [4]:
# Step 3: Train-test split

# Hold out 20% of the samples for testing. The split is stratified on y
# and seeded (random_state=42) so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)



In [6]:
# Step 4: Train the random forest

# n_estimators=100 -> the forest is built from 100 trees
# max_depth=None   -> no explicit depth limit is placed on the trees
# random_state=42  -> fixed seed for repeatable training
rf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)

# fit() trains the forest on the training split; leaving the call as the
# last expression of the cell also displays the fitted estimator.
rf.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [7]:
# Step 5: Make predictions

# predict() asks the fitted forest for a class label for every row of the
# held-out test split, i.e. data that was not passed to fit().
y_pred = rf.predict(X_test)

In [9]:
# Step 6: Evaluate the model

# Overall accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class metrics, labelled with the dataset's class names
cls_report = classification_report(y_test, y_pred, target_names=data.target_names)
print("\n Classification Report: \n")
print(cls_report)

# Counts of true vs. predicted labels
conf_mat = confusion_matrix(y_test, y_pred)
print("\n Confusion Matrix: \n")
print(conf_mat)

Accuracy: 0.956140350877193

 Classification Report: 

              precision    recall  f1-score   support

   malignant       0.95      0.93      0.94        42
      benign       0.96      0.97      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114


 Confusion Matrix: 

[[39  3]
 [ 2 70]]


In [13]:
# Step 7: Hyperparameter tuning

# Search space: 3 forest sizes x 3 depth settings x 2 split criteria
# = 18 candidate combinations.
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10],
    "criterion": ["gini", "entropy"],
}

# Every candidate is scored with 5-fold cross-validation on the training
# split (18 candidates x 5 folds = 90 fits). The base forest is seeded so
# candidates are compared under the same randomness.
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    verbose=1,   # print a progress summary
    n_jobs=-1,   # run the fits in parallel on all available cores
)

# Run the search; as the last expression this also displays the fitted object.
grid.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'criterion': ['gini', 'entropy'], 'max_depth': [None, 5, ...], 'n_estimators': [50, 100, ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,50
,criterion,'entropy'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
# Step 8: Best model result

# Hyperparameter combination selected by the grid search
print("Best parameters:", grid.best_params_)

# Estimator the search kept for the winning combination
best_model = grid.best_estimator_

# Score the tuned forest on the same held-out test split as the baseline
y_pred_best = best_model.predict(X_test)
best_accuracy = accuracy_score(y_test, y_pred_best)
print("Best Model Accuracy:", best_accuracy)

Best parameters: {'criterion': 'entropy', 'max_depth': None, 'n_estimators': 50}
Best Model Accuracy: 0.956140350877193
