In [None]:
#Random Forest on Breast Cancer Dataset

In [None]:
#import

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Step 2: Load the dataset
# `data` is kept around because later cells read the class names from it;
# the feature matrix and label vector are pulled out as X and y.
data = load_breast_cancer()
X, y = data.data, data.target

# Quick summary of what was loaded.
for label, value in (
    ("Features Shape:", X.shape),
    ("Labels Shape:", y.shape),
    ("Classes:", data.target_names),
):
    print(label, value)

Features Shape: (569, 30)
Labels Shape: (569,)
Classes: ['malignant' 'benign']


In [None]:
# Step 3: Train/test split
# Hold out 20% of the samples for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Step 4: Train the Random Forest
# n_estimators=100 -> the forest is built from 100 trees
# max_depth=None   -> no depth cap, so trees expand fully
# random_state=42  -> fixed seed for repeatable results
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=42,
)

# fit() -> trains the forest on the training data
rf.fit(X_train, y_train)

In [None]:
# Step 5: Make predictions
# predict() -> the trained forest outputs one class prediction per row of
# X_test, i.e. for data it did not see during training.
y_pred = rf.predict(X_test)

In [None]:
# Step 6: Evaluate the model
# Compare the held-out labels (y_test) against the forest's predictions (y_pred).
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1, labelled with the dataset's class names.
print("\n Classification Report: \n")
print(classification_report(y_test, y_pred, target_names=data.target_names))

# Per scikit-learn's convention, rows are true classes and columns are
# predicted classes.
print("\n Confusion Matrix: \n")
print(confusion_matrix(y_test, y_pred))

# Remember it:
# Accuracy = overall correctness (the share of test samples whose predicted
# class matches the true class).
#
# NOTE: this reminder is written as comments on purpose. A bare triple-quoted
# string at the end of a cell is an expression, so Jupyter would echo its repr
# as a stray cell output.

Accuracy: 0.956140350877193

 Classification Report: 

              precision    recall  f1-score   support

   malignant       0.95      0.93      0.94        42
      benign       0.96      0.97      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114


 Confusion Matrix: 

[[39  3]
 [ 2 70]]


'\nAccuracy = overall correctness\n\n'

In [None]:
# Step 7: Hyperparameter tuning
# param_grid -> candidate values for each setting (3 * 3 * 2 = 18 combinations)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    criterion=['gini', 'entropy'],
)

# GridSearchCV -> checks all combinations in the grid
# cv=5         -> 5-fold cross-validation
# n_jobs=-1    -> uses all CPU cores for speed
# verbose=1    -> prints a short progress summary
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# fit() -> performs tuning & training
grid.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [None]:
# Step 8: Best model results
# GridSearchCV gives the best-performing model based on validation scores;
# that model is then scored on the held-out test set.
best_model = grid.best_estimator_
print("Best parameters:", grid.best_params_)

y_pred_best = best_model.predict(X_test)
best_accuracy = accuracy_score(y_test, y_pred_best)
print("Best Model Accuracy:", best_accuracy)

Best parameters: {'criterion': 'entropy', 'max_depth': None, 'n_estimators': 50}
Best Model Accuracy: 0.956140350877193
