In [1]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
# Path (relative to this notebook) of the pre-processed mushroom dataset.
# NOTE(review): judging by the file name and the 0/1 columns shown in the
# preview, the data is already binned / indicator-encoded -- confirm upstream.
csv_file = "../02_models/mush_data_binned.csv"

# Load the full CSV into a DataFrame with pandas' default parsing options.
mushroom_df = pd.read_csv(filepath_or_buffer=csv_file)

# Preview the first five rows as a sanity check on the load.
mushroom_df.head(n=5)

Unnamed: 0,poisonous,bruises,ring_number,cap_shape_b,cap_shape_c,cap_shape_f,cap_shape_k,cap_shape_s,cap_shape_x,cap_surface_f,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,1,1,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
1,0,1,1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1,1,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [3]:
# Target vector: the `poisonous` label column.
y = mushroom_df["poisonous"]

# Feature matrix: every column except the target. `drop` returns a new frame,
# so `mushroom_df` itself is left unmodified.
X = mushroom_df.drop("poisonous", axis="columns")

In [4]:
# Hold out 20% of the rows as a test set; the remaining 80% is used for
# training and cross-validation. The fixed seed keeps the partition identical
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [5]:
# Base random forest classifier. A fixed seed is required here: the forest
# draws bootstrap samples and random feature subsets, so without it the
# grid-search winner and the reported metrics change on every run.
rf = RandomForestClassifier(random_state=42)

# Hyperparameter grid to search: 3 * 4 * 3 = 36 candidate combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 3, 5, 7],
    'min_samples_split': [2, 5, 10]
}

# Exhaustive grid search with 5-fold cross-validation on the training split
# only; the test split is never seen during model selection. With no explicit
# `scoring`, candidates are ranked by the estimator's own `score` method.
grid_search = GridSearchCV(rf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best random forest found by the search (GridSearchCV refits it on the whole
# training split by default).
best_rf = grid_search.best_estimator_

# Predict labels for the held-out test data using the best model
y_pred = best_rf.predict(X_test)

# Evaluate on the test split. F1 / precision / recall use scikit-learn's
# binary defaults, i.e. label 1 is the positive class.
# NOTE(review): presumably 1 == poisonous -- confirm against the encoding step.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Report the selected hyperparameters followed by the test-set metrics.
print(f"Best Estimators: {best_rf.n_estimators}")
print(f"Best Max Depth: {best_rf.max_depth}")
print(f"Best Min Samples Split: {best_rf.min_samples_split}")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Best Estimators: 50
Best Max Depth: None
Best Min Samples Split: 2
Accuracy: 1.00
F1: 1.00
Precision: 1.00
Recall: 1.00


In [7]:
# Column labels of the training features and the fitted forest's importance
# score for each of them (same order as the columns the model was fit on).
feature_names = X_train.columns
feature_importances = best_rf.feature_importances_

# One row per feature, ordered from most to least important.
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values('Importance', ascending=False)
)

# Show the ten highest-ranked features.
importance_df.head(n=10)

Unnamed: 0,Feature,Importance
22,odor_n,0.173905
27,gill_size_b,0.107113
19,odor_f,0.097188
39,stalk_surf_above_k,0.083291
28,gill_size_n,0.077035
0,bruises,0.066978
43,stalk_surf_below_k,0.051091
63,population_v,0.032107
40,stalk_surf_above_s,0.029891
46,stalk-color-above_b,0.017059
