In [1]:
# Import the modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import OrdinalEncoder

In [2]:
## Read the CSV file from the Resources folder into a Pandas DataFrame
mushroom_df = pd.read_csv('mushroom_data_clean.csv')

# Review the DataFrame
mushroom_df.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above,stalk-color-below,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
0,x,s,n,t,p,f,c,n,k,e,...,w,w,p,w,o,p,k,s,u,p
1,x,s,y,t,a,f,c,b,k,e,...,w,w,p,w,o,p,n,n,g,e
2,b,s,w,t,l,f,c,b,n,e,...,w,w,p,w,o,p,n,n,m,e
3,x,y,w,t,p,f,c,n,n,e,...,w,w,p,w,o,p,k,s,u,p
4,x,s,g,f,n,f,w,b,k,t,...,w,w,p,w,o,e,n,a,g,e


In [3]:
encoder = OrdinalEncoder()
encoded_data = encoder.fit_transform(mushroom_df)

In [4]:
mushroom_df = pd.DataFrame(encoded_data, columns = mushroom_df.columns)

In [5]:
mushroom_df.rename({
    "cap-shape": "cap_shape",
    "cap-surface": "cap_surface",
    "cap-color":"cap_color", 
    "gill-attachment": "gill_attach",
    "gill-spacing": "gill_space",
    "gill-size": "gill_size",
    "gill-color": "gill_color",
    "stalk-shape": "stalk_shape",
    "stalk-root": "stalk_root",
    "stalk-surface-below": "stalk_surf_below",
    "stalk-surface-above": "stalk_surf_above",
    "stalk-color-below": "stalk_color_below",
    "veil-type": "veil_type",
    "veil-color": "veil_color",
    "ring-number": "ring_number", 
    "ring-type": "ring_type",
    "spore-print-color": "spore_color",
    "class": "poisonous", 
})

mushroom_df.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above,stalk-color-below,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
0,5.0,2.0,4.0,1.0,6.0,1.0,0.0,1.0,4.0,0.0,...,7.0,7.0,0.0,2.0,1.0,4.0,2.0,3.0,5.0,1.0
1,5.0,2.0,9.0,1.0,0.0,1.0,0.0,0.0,4.0,0.0,...,7.0,7.0,0.0,2.0,1.0,4.0,3.0,2.0,1.0,0.0
2,0.0,2.0,8.0,1.0,3.0,1.0,0.0,0.0,5.0,0.0,...,7.0,7.0,0.0,2.0,1.0,4.0,3.0,2.0,3.0,0.0
3,5.0,3.0,8.0,1.0,6.0,1.0,0.0,1.0,5.0,0.0,...,7.0,7.0,0.0,2.0,1.0,4.0,2.0,3.0,5.0,1.0
4,5.0,2.0,3.0,0.0,5.0,1.0,1.0,0.0,4.0,1.0,...,7.0,7.0,0.0,2.0,1.0,0.0,3.0,0.0,1.0,0.0


In [7]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = mushroom_df['class']

# Separate the X variable, the features
X = mushroom_df.copy()
X.drop(["class", "stalk-root"], axis=1, inplace=True)
X.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below,stalk-color-above,stalk-color-below,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,5.0,2.0,4.0,1.0,6.0,1.0,0.0,1.0,4.0,0.0,...,2.0,7.0,7.0,0.0,2.0,1.0,4.0,2.0,3.0,5.0
1,5.0,2.0,9.0,1.0,0.0,1.0,0.0,0.0,4.0,0.0,...,2.0,7.0,7.0,0.0,2.0,1.0,4.0,3.0,2.0,1.0
2,0.0,2.0,8.0,1.0,3.0,1.0,0.0,0.0,5.0,0.0,...,2.0,7.0,7.0,0.0,2.0,1.0,4.0,3.0,2.0,3.0
3,5.0,3.0,8.0,1.0,6.0,1.0,0.0,1.0,5.0,0.0,...,2.0,7.0,7.0,0.0,2.0,1.0,4.0,2.0,3.0,5.0
4,5.0,2.0,3.0,0.0,5.0,1.0,1.0,0.0,4.0,1.0,...,2.0,7.0,7.0,0.0,2.0,1.0,0.0,3.0,0.0,1.0


In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
X_train

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below,stalk-color-above,stalk-color-below,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
7873,3.0,2.0,2.0,0.0,7.0,1.0,0.0,1.0,0.0,1.0,...,1.0,6.0,7.0,0.0,2.0,1.0,0.0,7.0,4.0,0.0
6515,5.0,2.0,4.0,0.0,2.0,1.0,0.0,1.0,0.0,1.0,...,2.0,7.0,7.0,0.0,2.0,1.0,0.0,7.0,4.0,4.0
6141,2.0,3.0,2.0,0.0,8.0,1.0,0.0,1.0,0.0,1.0,...,2.0,6.0,7.0,0.0,2.0,1.0,0.0,7.0,4.0,2.0
2764,2.0,0.0,4.0,1.0,5.0,1.0,0.0,0.0,9.0,1.0,...,2.0,3.0,6.0,0.0,2.0,1.0,4.0,3.0,4.0,0.0
438,0.0,3.0,9.0,1.0,3.0,1.0,0.0,0.0,4.0,0.0,...,2.0,7.0,7.0,0.0,2.0,1.0,4.0,3.0,2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,5.0,3.0,4.0,0.0,2.0,1.0,0.0,1.0,0.0,1.0,...,2.0,6.0,7.0,0.0,2.0,1.0,0.0,7.0,4.0,4.0
5390,3.0,3.0,2.0,1.0,5.0,1.0,0.0,0.0,10.0,0.0,...,2.0,7.0,2.0,0.0,2.0,2.0,0.0,7.0,1.0,6.0
860,2.0,3.0,4.0,1.0,3.0,1.0,0.0,0.0,10.0,0.0,...,3.0,7.0,7.0,0.0,2.0,1.0,4.0,3.0,5.0,4.0
7603,3.0,2.0,2.0,0.0,2.0,1.0,0.0,1.0,0.0,1.0,...,2.0,6.0,6.0,0.0,2.0,1.0,0.0,7.0,4.0,4.0


In [10]:
from warnings import filterwarnings
filterwarnings('ignore')

# Create a Gradient Boosting Regressor
model = GradientBoostingClassifier()

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best model from grid search
best_gb = grid_search.best_estimator_

# Make predictions on the test data using the best model
y_pred = best_gb.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Best Estimators: {best_gb.n_estimators}")
print(f"Best Learning Rate: {best_gb.learning_rate}")
print(f"Best Max Depth: {best_gb.max_depth}")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Best Estimators: 50
Best Learning Rate: 0.01
Best Max Depth: 7
Accuracy: 1.00
F1: 1.00
Precision: 1.00
Recall: 1.00
