# CapClassifier: A Poisonous Mushroom Classification Project

In [2]:
import requests
import kaggle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [17]:
# Downloading the Mushroom Classification Dataset
kaggle.api.dataset_download_files('vishalpnaik/mushroom-classification-edible-or-poisonous', path='data', unzip=True)

Dataset URL: https://www.kaggle.com/datasets/vishalpnaik/mushroom-classification-edible-or-poisonous


In [111]:
# Replace 'path_to_file' with the actual path to your file
file_path = r'data\mushroom.csv'

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Display the DataFrame
print(df)

      class  cap-diameter cap-shape cap-surface cap-color  \
0         p         15.26         x           g         o   
1         p         16.60         x           g         o   
2         p         14.07         x           g         o   
3         p         14.17         f           h         e   
4         p         14.64         x           h         o   
...     ...           ...       ...         ...       ...   
61064     p          1.18         s           s         y   
61065     p          1.27         f           s         y   
61066     p          1.27         s           s         y   
61067     p          1.24         f           s         y   
61068     p          1.17         s           s         y   

      does-bruise-or-bleed gill-attachment gill-spacing gill-color  \
0                        f               e          NaN          w   
1                        f               e          NaN          w   
2                        f               e          NaN  

In [112]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61069 entries, 0 to 61068
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   class                 61069 non-null  object 
 1   cap-diameter          61069 non-null  float64
 2   cap-shape             61069 non-null  object 
 3   cap-surface           46949 non-null  object 
 4   cap-color             61069 non-null  object 
 5   does-bruise-or-bleed  61069 non-null  object 
 6   gill-attachment       51185 non-null  object 
 7   gill-spacing          36006 non-null  object 
 8   gill-color            61069 non-null  object 
 9   stem-height           61069 non-null  float64
 10  stem-width            61069 non-null  float64
 11  stem-root             9531 non-null   object 
 12  stem-surface          22945 non-null  object 
 13  stem-color            61069 non-null  object 
 14  veil-type             3177 non-null   object 
 15  veil-color         

# Data Preparation

In [113]:
# Apply value_counts from Pandas to extract unique the counts for unique values
for column in df.columns:
    print(f"Counts for {column}:")
    print(df[column].astype(str).value_counts(dropna=False)) # Ensuring each column data type is consistent (string)
    print("\n")

Counts for class:
class
p    33888
e    27181
Name: count, dtype: int64


Counts for cap-diameter:
cap-diameter
3.18     103
3.14      98
3.13      96
3.85      95
3.25      95
        ... 
51.39      1
47.85      1
50.18      1
45.67      1
20.42      1
Name: count, Length: 2571, dtype: int64


Counts for cap-shape:
cap-shape
x    26934
f    13404
s     7164
b     5694
o     3460
p     2598
c     1815
Name: count, dtype: int64


Counts for cap-surface:
cap-surface
nan    14120
t       8196
s       7608
y       6341
h       4974
g       4724
d       4432
e       2584
k       2303
i       2225
w       2150
l       1412
Name: count, dtype: int64


Counts for cap-color:
cap-color
n    24218
y     8543
w     7666
g     4420
e     4035
o     3656
r     1782
u     1709
p     1703
k     1279
b     1230
l      828
Name: count, dtype: int64


Counts for does-bruise-or-bleed:
does-bruise-or-bleed
f    50479
t    10590
Name: count, dtype: int64


Counts for gill-attachment:
gill-attachment
a     

In [114]:
# Compute the ratios of missing values for each column
missing_ratios = df.isnull().mean()
print(missing_ratios)

class                   0.000000
cap-diameter            0.000000
cap-shape               0.000000
cap-surface             0.231214
cap-color               0.000000
does-bruise-or-bleed    0.000000
gill-attachment         0.161850
gill-spacing            0.410405
gill-color              0.000000
stem-height             0.000000
stem-width              0.000000
stem-root               0.843931
stem-surface            0.624277
stem-color              0.000000
veil-type               0.947977
veil-color              0.878613
has-ring                0.000000
ring-type               0.040462
spore-print-color       0.895954
habitat                 0.000000
season                  0.000000
dtype: float64


In [115]:
# Set a threshold for filtering
threshold = 0.30

# Filter columns based on the threshold and create a new DataFrame from the result
filtered_columns = missing_ratios[missing_ratios >= threshold]

# Print the result
print(filtered_columns)

gill-spacing         0.410405
stem-root            0.843931
stem-surface         0.624277
veil-type            0.947977
veil-color           0.878613
spore-print-color    0.895954
dtype: float64


In [116]:
# Identify columns to drop where missing data is above the threshold
columns_to_drop = missing_ratios[missing_ratios > 0.40].index # Threshold here is 40%

# Drop these columns from the DataFrame
df = df.drop(columns=columns_to_drop)

# For the cap-surface column, remove rows where cap-surface is NaN
df = df.dropna(subset=['cap-surface'])

# Check the resulting DataFrame
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 46949 entries, 0 to 61068
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   class                 46949 non-null  object 
 1   cap-diameter          46949 non-null  float64
 2   cap-shape             46949 non-null  object 
 3   cap-surface           46949 non-null  object 
 4   cap-color             46949 non-null  object 
 5   does-bruise-or-bleed  46949 non-null  object 
 6   gill-attachment       38830 non-null  object 
 7   gill-color            46949 non-null  object 
 8   stem-height           46949 non-null  float64
 9   stem-width            46949 non-null  float64
 10  stem-color            46949 non-null  object 
 11  has-ring              46949 non-null  object 
 12  ring-type             44831 non-null  object 
 13  habitat               46949 non-null  object 
 14  season                46949 non-null  object 
dtypes: float64(3), object(12

In [117]:
df.isna().sum()

class                      0
cap-diameter               0
cap-shape                  0
cap-surface                0
cap-color                  0
does-bruise-or-bleed       0
gill-attachment         8119
gill-color                 0
stem-height                0
stem-width                 0
stem-color                 0
has-ring                   0
ring-type               2118
habitat                    0
season                     0
dtype: int64

In [118]:
# Drop rows where 'gill-attachment' or 'ring-type' are NaN
df = df.dropna(subset=['gill-attachment', 'ring-type'])

# Verify changes
df.isna().sum()

class                   0
cap-diameter            0
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-color              0
stem-height             0
stem-width              0
stem-color              0
has-ring                0
ring-type               0
habitat                 0
season                  0
dtype: int64

In [119]:
# Define the target column and feature columns
target_col = 'class'
feature_cols = df.columns.drop(target_col)  # Exclude the target column from feature columns

# Filter only 'object' data type columns from feature columns
object_cols = df[feature_cols].select_dtypes(include=['object']).columns

# One-hot encode categorical features and drop original columns
df = pd.get_dummies(df, columns=object_cols, drop_first=True)

# Map the class labels to numeric values
df[target_col] = df[target_col].map({'p': 0, 'e': 1})

# Verify the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37065 entries, 0 to 61068
Data columns (total 76 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   class                   37065 non-null  int64  
 1   cap-diameter            37065 non-null  float64
 2   stem-height             37065 non-null  float64
 3   stem-width              37065 non-null  float64
 4   cap-shape_c             37065 non-null  bool   
 5   cap-shape_f             37065 non-null  bool   
 6   cap-shape_o             37065 non-null  bool   
 7   cap-shape_p             37065 non-null  bool   
 8   cap-shape_s             37065 non-null  bool   
 9   cap-shape_x             37065 non-null  bool   
 10  cap-surface_e           37065 non-null  bool   
 11  cap-surface_g           37065 non-null  bool   
 12  cap-surface_h           37065 non-null  bool   
 13  cap-surface_i           37065 non-null  bool   
 14  cap-surface_k           37065 non-null  boo

# Modeling

In [120]:
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import xgboost as xgb
import lightgbm as lgb
import warnings
import os

In [121]:
# Separate features and target
X = df.drop('class', axis=1)
y = df['class']

# Split data into training, validation, and test sets (60-20-20 split)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Transform the data via standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'XGBoost': xgb.XGBClassifier(),
    'LightGBM': lgb.LGBMClassifier(),
}

# Evaluation

In [124]:
# Create the data directory if it doesn't exist already
results_dir = 'data'
os.makedirs(results_dir, exist_ok=True)

for name, model in models.items():
    # Create a directory named after the classifier
    model_dir = os.path.join(results_dir, name.lower())
    os.makedirs(model_dir, exist_ok=True)
    
    # Perform cross-validation with 5 folds
    scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
    cv_mean_accuracy = scores.mean()
    
    # Fit the model to the training data
    model.fit(X_train_scaled, y_train)
    
    # Evaluate on the validation set
    val_predictions = model.predict(X_val_scaled)
    val_accuracy = accuracy_score(y_val, val_predictions)
    val_precision = precision_score(y_val, val_predictions)
    val_recall = recall_score(y_val, val_predictions)
    val_f1 = f1_score(y_val, val_predictions)
    
    # Evaluate on the test set
    test_predictions = model.predict(X_test_scaled)
    test_accuracy = accuracy_score(y_test, test_predictions)
    test_precision = precision_score(y_test, test_predictions)
    test_recall = recall_score(y_test, test_predictions)
    test_f1 = f1_score(y_test, test_predictions)
    
    if hasattr(model, "predict_proba"):
        # Probabilities for validation set
        val_probabilities = model.predict_proba(X_val_scaled)[:, 1]
        val_roc_auc = roc_auc_score(y_val, val_probabilities)
        val_fpr, val_tpr, _ = roc_curve(y_val, val_probabilities)

        # Probabilities for test set
        test_probabilities = model.predict_proba(X_test_scaled)[:, 1]
        test_roc_auc = roc_auc_score(y_test, test_probabilities)
        test_fpr, test_tpr, _ = roc_curve(y_test, test_probabilities)
        
        # Plot both ROC curves
        plt.figure(figsize=(10, 8))
        plt.plot(val_fpr, val_tpr, label=f'Validation ROC Curve (area = {val_roc_auc:.2f})')
        plt.plot(test_fpr, test_tpr, label=f'Test ROC Curve (area = {test_roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'{name} ROC Curves')
        plt.legend(loc="lower right")
        roc_curve_file = os.path.join(model_dir, f'{name}_ROC_curves.png')
        plt.savefig(roc_curve_file)
        plt.close()
    else:
        val_roc_auc = None
        test_roc_auc = None
    
    # Create a DataFrame to store aggregated results
    results_df = pd.DataFrame({
        'Model': [name],
        'CV Accuracy': [cv_mean_accuracy],
        'Validation Accuracy': [val_accuracy],
        'Validation Precision': [val_precision],
        'Validation Recall': [val_recall],
        'Validation F1-score': [val_f1],
        'Validation ROC AUC': [val_roc_auc],
        'Test Accuracy': [test_accuracy],
        'Test Precision': [test_precision],
        'Test Recall': [test_recall],
        'Test F1-score': [test_f1],
        'Test ROC AUC': [test_roc_auc]
    })
    
    # Save results to CSV
    results_csv_file = os.path.join(model_dir, f'{name}_results.csv')
    results_df.to_csv(results_csv_file, index=False)

[LightGBM] [Info] Number of positive: 8151, number of negative: 9640
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002564 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 981
[LightGBM] [Info] Number of data points in the train set: 17791, number of used features: 75
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.458153 -> initscore=-0.167780
[LightGBM] [Info] Start training from score -0.167780
[LightGBM] [Info] Number of positive: 8150, number of negative: 9641
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003389 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 981
[LightGBM] [Info] Number of data points in the train set: 17791, number of used features: 75
[LightGBM] [Info] [binar

In [101]:
# Empty list to store DataFrames
dfs = []

# Iterate over directories in data/
for model_dir in os.listdir('data'):
    if os.path.isdir(os.path.join('data', model_dir)):
        # Load CSV file into a DataFrame
        csv_file = os.path.join('data', model_dir, f'{model_dir}_results.csv')
        df = pd.read_csv(csv_file)
        
        # Add model name column
        df['Model'] = model_dir
        
        # Append DataFrame to list
        dfs.append(df)

# Concatenate DataFrames
combined_df = pd.concat(dfs, ignore_index=True)

# Display the sorted DataFrame
sorted_df = combined_df.sort_values(by=['Accuracy', 'Precision', 'Recall', 'F1-score', 'ROC AUC'], ascending=False)
print(sorted_df)

                 Model  Accuracy  Precision    Recall  F1-score   ROC AUC
4        random forest  0.999910   1.000000  0.999802  0.999901  1.000000
6              xgboost  0.999640   0.999405  0.999802  0.999603  0.999999
1                  knn  0.999550   1.000000  0.999009  0.999504  0.999999
5                  svm  0.998291   0.997820  0.998414  0.998117       NaN
0        decision tree  0.996313   0.995640  0.996232  0.995936  0.996306
2             lightgbm  0.993525   0.999397  0.986318  0.992814  0.999968
3  logistic regression  0.790198   0.756726  0.791989  0.773956  0.885854
