I had to first run:
pip install mysql-connector-python

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
csv_file = "../02_models/mush_data_binned.csv"
mushroom_df = pd.read_csv(csv_file)

mushroom_df.head()

Unnamed: 0,poisonous,bruises,ring_number,cap_shape_b,cap_shape_c,cap_shape_f,cap_shape_k,cap_shape_s,cap_shape_x,cap_surface_f,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,1,1,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
1,0,1,1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1,1,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [4]:
X = mushroom_df.drop(columns="poisonous")
y = mushroom_df["poisonous"]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
xgb = XGBClassifier()

In [17]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

In [18]:
grid_search = GridSearchCV(xgb, param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [19]:
best_xgb = grid_search.best_estimator_

In [20]:
y_pred = best_xgb.predict(X_test)

In [21]:
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Best Estimators: {best_xgb.n_estimators}")
print(f"Best Learning Rate: {best_xgb.learning_rate}")
print(f"Best Max Depth: {best_xgb.max_depth}")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Best Estimators: 100
Best Learning Rate: 0.1
Best Max Depth: 3
Accuracy: 1.00
F1: 1.00
Precision: 1.00
Recall: 1.00


In [22]:
feature_importances = best_xgb.feature_importances_

feature_names = X_train.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
importance_df.head(10)

Unnamed: 0,Feature,Importance
22,odor_n,0.307695
20,odor_l,0.12703
0,bruises,0.110464
17,odor_a,0.107707
23,odor_p,0.080141
16,cap_color_y,0.030528
19,odor_f,0.030275
45,stalk_surf_below_y,0.029064
55,stalk_color_below_y,0.023489
33,gill_color_r,0.022711
