Baseline Random Forest model (no hyperparameter tuning)

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import math
from sklearn.metrics import accuracy_score

# Baseline experiment: load the data, keep the pre-selected features, hold out
# 20% of the rows for testing, fit a small random forest and report accuracy.
data = pd.read_csv('breastcancer.csv')

# Features that were highly correlated when feature selection was done
features_selected = ['radius_mean', 'texture_mean', 'smoothness_mean', 'compactness_mean',
                     'concavity_mean', 'symmetry_mean', 'radius_se', 'concave points_se',
                     'smoothness_worst', 'compactness_worst', 'concavity_worst',
                     'symmetry_worst', 'fractal_dimension_worst']

# Drop incomplete rows once, on features and target together. Dropping NaNs
# from X and y separately and then re-indexing X by y's index fails with a
# KeyError whenever a row is missing a feature value but still has a label.
model_data = data[features_selected + ['diagnosis']].dropna()

X = model_data[features_selected]
y = model_data['diagnosis']

# Sanity check: every feature row must have its matching label
assert X.index.equals(y.index)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

num_features = len(features_selected) # Number of features to compute the square root of

# Random Forest Model Implementation
# max_features is the square root of the feature count, rounded up
random_forest = RandomForestClassifier(max_features = math.ceil(math.sqrt(num_features)), n_estimators = 5, random_state=23)
random_forest.fit(X_train, y_train)

# Predict and evaluate on the held-out test split
y_pred_rf = random_forest.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest Accuracy: {accuracy_rf:.2f}')

Random Forest Accuracy: 0.96


In [13]:
# Dump the complete hyperparameter configuration of the baseline forest
print(
    "Random Forest Model Parameters:",
    random_forest.get_params(),
)

Random Forest Model Parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 4, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 5, 'n_jobs': None, 'oob_score': False, 'random_state': 23, 'verbose': 0, 'warm_start': False}


Perform Hyperparameter Tuning using Grid Search 

In [28]:
from sklearn.model_selection import GridSearchCV

# Candidate values for every hyperparameter; the grid search tries each combination
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
    max_features=[None, 'sqrt', 'log2'],
    bootstrap=[True, False],
)

# Base estimator that the search clones for each candidate (seed fixed at 23)
rf = RandomForestClassifier(random_state=23)

# 5-fold cross-validated search on all cores; any failing fit raises immediately
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    error_score='raise',
)

# Run the search on the training split only
grid_result = grid.fit(X_train, y_train)

print("Best parameters: ", grid_result.best_params_)


Best parameters:  {'bootstrap': False, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}


Updated Random Forest Model using the best parameters from the Grid Search

In [29]:
# Random Forest Model Implementation with best parameters
# Take the hyperparameters straight from the grid search result rather than
# hand-copying its printed output, so this cell stays in sync if the grid,
# the data or the seed changes. best_params_ only holds keys from param_grid,
# so random_state is passed separately.
best_rf = RandomForestClassifier(**grid_result.best_params_, random_state=23)

# Fit the model on the training split
best_rf.fit(X_train, y_train)

# Predict and evaluate the model on the held-out test split
y_pred_best_rf = best_rf.predict(X_test)
accuracy_best_rf = accuracy_score(y_test, y_pred_best_rf)

# Print the accuracy of the best Random Forest model
print(f'Best Random Forest Accuracy: {accuracy_best_rf:.2f}')


Best Random Forest Accuracy: 0.97
