In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Load the UCI mushroom data set. The file has no header row, so the column
# names are supplied explicitly, in file order.
# NOTE(review): the UCI description reports missing attribute values coded as
# '?'; they are read here as ordinary category strings - confirm that is intended.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
column_names = [
    'class',
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]
data = pd.read_csv(
    url,
    header=None,
    names=column_names,
)

In [3]:
# Build the inputs for a model that predicts 'odor' from every other column.
# NOTE(review): the predictors include 'class', the target of the later
# edible/poisonous models - confirm that using it here is intended.
y = data['odor']
feature_columns = data.columns.drop('odor')
X = data.loc[:, feature_columns]

# One indicator column per (feature, category) pair.
X_encoded = pd.get_dummies(X)

# Integer codes for the odor categories; the fitted encoder is kept so that
# predictions can be mapped back to the original labels.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [4]:
# Impute missing 'odor' values with a KNN classifier trained on the rows whose
# odor is known.
#
# Missingness is detected on the raw 'odor' column, not on X_encoded:
# pd.get_dummies output never contains NaN (with the default dummy_na=False a
# missing entry simply gets no indicator set), so testing X_encoded.isnull()
# could never select a row and the imputation branch was unreachable. It also
# tested the predictors, whereas the column being filled in is the target.
# Both NaN and the '?' placeholder are treated as missing.
odor_is_missing = (data['odor'].isna() | data['odor'].eq('?')).to_numpy()
missing_indices = np.flatnonzero(odor_is_missing)        # row positions to impute
non_missing_indices = np.flatnonzero(~odor_is_missing)   # row positions with known odor

missing_values = []

if len(missing_indices) > 0:
    knn_model = KNeighborsClassifier()
    knn_model.fit(X_encoded.iloc[non_missing_indices], y_encoded[non_missing_indices])
    # Predict on a DataFrame slice (not .values) so the feature names match the
    # ones seen during fit.
    imputed_values = knn_model.predict(X_encoded.iloc[missing_indices])
    imputed_values_original = label_encoder.inverse_transform(imputed_values)
    imputed_value_counts = Counter(imputed_values_original)

    for value, count in imputed_value_counts.items():
        missing_values.extend([value] * count)
        print(f"Imputed Value: {value}, Count: {count}")

    # missing_indices are positions, while .loc expects labels: translate them
    # through the index so this stays correct for a non-default index.
    data.loc[data.index[missing_indices], 'odor'] = imputed_values_original
else:
    # Only the 'odor' column is inspected here.
    print("No missing values in the dataset.")

unique_missing_values = Counter(missing_values)

No missing values in the dataset.


In [5]:
# Inputs for the edible/poisonous models: every column except 'class' is a
# predictor and 'class' is the target.
feature_columns_full = data.columns.drop('class')
X_full, y_full = data[feature_columns_full], data['class']

# One-hot encode all predictors, dropping the first category of each feature.
# Every column is routed to the encoder, so the passthrough remainder is empty.
ct = ColumnTransformer(
    [('encoder', OneHotEncoder(drop='first'), feature_columns_full)],
    remainder='passthrough',
)
X_encoded_full = ct.fit_transform(X_full)  # later cells call .toarray() on this

# Rebinds label_encoder (previously fitted on 'odor') to an encoder for 'class'.
# NOTE(review): presumably 'e' -> 0 and 'p' -> 1, since classes are sorted - confirm.
label_encoder = LabelEncoder()
y_encoded_full = label_encoder.fit_transform(y_full)

In [6]:
# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X_encoded_full,
    y_encoded_full,
    test_size=0.2,
    random_state=42,
)

In [7]:
%%time
# Random forest with default settings on the full one-hot feature matrix.
rf_model = RandomForestClassifier(
    random_state=42,
)
rf_model.fit(X_train_full, y_train)

CPU times: total: 609 ms
Wall time: 672 ms


In [8]:
%%time
# Logistic regression with default hyperparameters on the same training data
# as the random forest.
lr_model = LogisticRegression()
lr_model.fit(
    X_train_full,
    y_train,
)

CPU times: total: 15.6 ms
Wall time: 94.9 ms


In [9]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X_test : array-like
        Feature matrix passed to ``model.predict``.
    y_test : array-like
        True labels the predictions are compared against.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall)``. Precision and recall are computed
        with the metric functions' default settings.
    """
    predictions = model.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions),
        recall_score(y_test, predictions),
    )

In [10]:
# Score both full-data models on the held-out test split.
rf_accuracy, rf_precision, rf_recall = evaluate_model(rf_model, X_test_full, y_test)
lr_accuracy, lr_precision, lr_recall = evaluate_model(lr_model, X_test_full, y_test)

# Report one section per model, each followed by a blank line.
for section_title, section_scores in (
    ("RandomForestClassifier:", (rf_accuracy, rf_precision, rf_recall)),
    ("LogisticRegression:", (lr_accuracy, lr_precision, lr_recall)),
):
    section_accuracy, section_precision, section_recall = section_scores
    print(section_title)
    print(f"Accuracy: {section_accuracy:.4f}")
    print(f"Precision: {section_precision:.4f}")
    print(f"Recall: {section_recall:.4f}")
    print()

RandomForestClassifier:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000

LogisticRegression:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000



In [11]:
# Fit a PCA that keeps as many components as there are encoded features, so
# the full explained-variance spectrum is available. The feature matrix is
# converted with .toarray() before being handed to PCA.
pca = PCA(n_components=X_train_full.shape[1]).fit(X_train_full.toarray())

# Smallest number of leading components whose cumulative explained variance
# reaches 95%: argmax returns the first position where the condition holds,
# and +1 turns that position into a count.
explained_variance_ratio_ = pca.explained_variance_ratio_
cumulative_variance = explained_variance_ratio_.cumsum()
num_components_for_variance = (cumulative_variance >= 0.95).argmax() + 1

In [12]:
# Refit PCA with only the number of components selected above. The projection
# is learned from the training split and then applied unchanged to the test split.
pca = PCA(
    n_components=num_components_for_variance,
)
X_train_reduced = pca.fit_transform(X_train_full.toarray())
X_test_reduced = pca.transform(X_test_full.toarray())

In [13]:
%%time
# Random forest on the PCA-reduced features, with the same seed as the
# full-data forest.
rf_reduced_model = RandomForestClassifier(
    random_state=42,
)
rf_reduced_model.fit(X_train_reduced, y_train)

CPU times: total: 8.14 s
Wall time: 9.42 s


In [14]:
%%time
# Logistic regression with default hyperparameters on the PCA-reduced features.
lr_reduced_model = LogisticRegression()
lr_reduced_model.fit(
    X_train_reduced,
    y_train,
)

CPU times: total: 62.5 ms
Wall time: 30.2 ms


In [15]:
# Score both PCA-reduced models on the reduced test split.
accuracy_rf_pca, precision_rf_pca, recall_rf_pca = evaluate_model(rf_reduced_model, X_test_reduced, y_test)
accuracy_lr_pca, precision_lr_pca, recall_lr_pca = evaluate_model(lr_reduced_model, X_test_reduced, y_test)

# The second heading carries a leading newline so the sections are separated
# by a blank line.
for section_title, section_scores in (
    ("Random Forest (Reduced):", (accuracy_rf_pca, precision_rf_pca, recall_rf_pca)),
    ("\nLogistic Regression (Reduced):", (accuracy_lr_pca, precision_lr_pca, recall_lr_pca)),
):
    section_accuracy, section_precision, section_recall = section_scores
    print(section_title)
    print(f"Accuracy: {section_accuracy:.4f}")
    print(f"Precision: {section_precision:.4f}")
    print(f"Recall: {section_recall:.4f}")

Random Forest (Reduced):
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000

Logistic Regression (Reduced):
Accuracy: 0.9914
Precision: 0.9873
Recall: 0.9949


In [16]:
# Side-by-side summary of the test metrics with and without PCA. The model
# name appears only on the first row of each group.
columns = ['Model', 'Metric', 'Full Data', 'PCA Reduced']
rows = [
    ['Random Forest', 'Accuracy', rf_accuracy, accuracy_rf_pca],
    ['', 'Precision', rf_precision, precision_rf_pca],
    ['', 'Recall', rf_recall, recall_rf_pca],
    ['Logistic Regression', 'Accuracy', lr_accuracy, accuracy_lr_pca],
    ['', 'Precision', lr_precision, precision_lr_pca],
    ['', 'Recall', lr_recall, recall_lr_pca],
]

comparison_table = pd.DataFrame(rows, columns=columns)
print(comparison_table.to_string(index=False))

              Model    Metric  Full Data  PCA Reduced
      Random Forest  Accuracy        1.0     1.000000
                    Precision        1.0     1.000000
                       Recall        1.0     1.000000
Logistic Regression  Accuracy        1.0     0.991385
                    Precision        1.0     0.987310
                       Recall        1.0     0.994885


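Looking at all of our models and the PCA process: on the full data, both the Random Forest and Logistic Regression reach perfect accuracy, precision, and recall on the test set. After PCA reduction the Random Forest stays perfect, while Logistic Regression shows a slight dip (accuracy 0.9914, precision 0.9873, recall 0.9949). Perfect scores are automatically somewhat suspicious to me, and with these numbers I worry about overfitting; however, because they are measured on the held-out 20% test split, they may instead mean that the two classes are simply easy to separate with these features, and cross-validation would help tell the two explanations apart. In terms of training time, Logistic Regression is the quicker model in both scenarios (wall time of roughly 95 ms versus 672 ms on the full data, and roughly 30 ms versus 9.4 s after PCA), and PCA made the Random Forest slower to train rather than faster. These insights underscore the models' capabilities and the trade-offs we might need to consider between accuracy, overfitting, and speed, especially when dealing with larger data sets.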