In [3]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [16]:
csv_file = "../02_models/mushroom_data_clean.csv"
mushroom_df = pd.read_csv(csv_file)

mushroom_df.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above,stalk-color-below,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
0,x,s,n,t,p,f,c,n,k,e,...,w,w,p,w,o,p,k,s,u,p
1,x,s,y,t,a,f,c,b,k,e,...,w,w,p,w,o,p,n,n,g,e
2,b,s,w,t,l,f,c,b,n,e,...,w,w,p,w,o,p,n,n,m,e
3,x,y,w,t,p,f,c,n,n,e,...,w,w,p,w,o,p,k,s,u,p
4,x,s,g,f,n,f,w,b,k,t,...,w,w,p,w,o,e,n,a,g,e


In [17]:
poisonous = {"e": 0, "p": 1}

mushroom_df["class"].replace(poisonous, inplace=True)

mushroom_df.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above,stalk-color-below,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
0,x,s,n,t,p,f,c,n,k,e,...,w,w,p,w,o,p,k,s,u,1
1,x,s,y,t,a,f,c,b,k,e,...,w,w,p,w,o,p,n,n,g,0
2,b,s,w,t,l,f,c,b,n,e,...,w,w,p,w,o,p,n,n,m,0
3,x,y,w,t,p,f,c,n,n,e,...,w,w,p,w,o,p,k,s,u,1
4,x,s,g,f,n,f,w,b,k,t,...,w,w,p,w,o,e,n,a,g,0


In [18]:
# Dropping the Spore Print Color feature because it is not something that a hiker could reasonably asses when finding a mushroom in the wild.
# Also dropping Stalk Root feature for two reasons: 1. It would require a hiker to physically uproot the mushroom to inspect 2. It is the only feature with missing values in the dataset.

mushroom_df.drop(columns=["spore-print-color", "stalk-root"], inplace=True)
mushroom_df.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below,stalk-color-above,stalk-color-below,veil-type,veil-color,ring-number,ring-type,population,habitat,class
0,x,s,n,t,p,f,c,n,k,e,...,s,w,w,p,w,o,p,s,u,1
1,x,s,y,t,a,f,c,b,k,e,...,s,w,w,p,w,o,p,n,g,0
2,b,s,w,t,l,f,c,b,n,e,...,s,w,w,p,w,o,p,n,m,0
3,x,y,w,t,p,f,c,n,n,e,...,s,w,w,p,w,o,p,s,u,1
4,x,s,g,f,n,f,w,b,k,t,...,s,w,w,p,w,o,e,a,g,0


In [19]:
# Assigning the target, then dropping it from the dataframe.

y = mushroom_df["class"]

mushroom_df.drop(columns="class", inplace=True)
mushroom_df.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-surface-above,stalk-surface-below,stalk-color-above,stalk-color-below,veil-type,veil-color,ring-number,ring-type,population,habitat
0,x,s,n,t,p,f,c,n,k,e,s,s,w,w,p,w,o,p,s,u
1,x,s,y,t,a,f,c,b,k,e,s,s,w,w,p,w,o,p,n,g
2,b,s,w,t,l,f,c,b,n,e,s,s,w,w,p,w,o,p,n,m
3,x,y,w,t,p,f,c,n,n,e,s,s,w,w,p,w,o,p,s,u
4,x,s,g,f,n,f,w,b,k,t,s,s,w,w,p,w,o,e,a,g


In [20]:
categories = mushroom_df.columns.tolist()

print(categories)

['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-surface-above', 'stalk-surface-below', 'stalk-color-above', 'stalk-color-below', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'population', 'habitat']


In [23]:
dummy_dfs = [pd.get_dummies(mushroom_df[col], prefix=col) for col in categories]

X = pd.concat(dummy_dfs, axis=1)

In [24]:
X.head()

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [39]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [40]:
# Create a Decision Tree classifier
rf = RandomForestClassifier()

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 3, 5, 7],
    'min_samples_split': [2, 5, 10]
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(rf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best Decision Tree model from grid search
best_rf = grid_search.best_estimator_

# Make predictions on the test data using the best model
y_pred = best_rf.predict(X_test)

# Calculate the accuracy of the model
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Best Estimators: {best_rf.n_estimators}")
print(f"Best Max Depth: {best_rf.max_depth}")
print(f"Best Min Samples Split: {best_rf.min_samples_split}")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Best Estimators: 50
Best Max Depth: None
Best Min Samples Split: 2
Accuracy: 1.00
F1: 1.00
Precision: 1.00
Recall: 1.00


In [41]:
feature_importances = best_rf.feature_importances_

feature_names = X_train.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print(importance_df)

         Feature    Importance
24        odor_f  1.502587e-01
27        odor_n  1.103058e-01
35   gill-size_b  6.483936e-02
36   gill-size_n  5.202969e-02
37  gill-color_b  5.049710e-02
..           ...           ...
12   cap-color_e  1.042582e-05
38  gill-color_e  1.451601e-06
79  veil-color_o  6.481419e-07
43  gill-color_o  0.000000e+00
77   veil-type_p  0.000000e+00

[103 rows x 2 columns]


In [28]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.datasets import make_regression

In [42]:
num_features_to_select = 5
rfe = RFE(estimator=best_rf, n_features_to_select=num_features_to_select)


In [43]:
rfe.fit(X_train, y_train)

In [44]:
feature_ranking = rfe.ranking_

In [45]:
selected_features = [f"Feature {i}" for i, rank in enumerate(feature_ranking, start=1) if rank == 1]
print("Selected features:", selected_features)

Selected features: ['Feature 25', 'Feature 28', 'Feature 36', 'Feature 37', 'Feature 53']


In [53]:
X_train.loc[:, rfe.support_]

Unnamed: 0,odor_f,odor_n,gill-size_b,gill-size_n,stalk-surface-above_k
1397,0,1,1,0,0
243,0,0,0,1,0
7639,0,0,0,1,0
6530,0,1,1,0,0
4757,1,0,1,0,1
...,...,...,...,...,...
487,0,1,0,1,0
1494,0,1,1,0,0
5946,1,0,1,0,0
1078,0,1,1,0,0


In [52]:
rfe.support_

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
        True, False, False, False, False, False, False, False,  True,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [54]:
best_rf.fit(X_train.loc[:, rfe.support_], y_train)

In [55]:
score = best_rf.score(X_test.loc[:, rfe.support_], y_test)
print("Model score:", score)


Model score: 0.9773510585918267
