In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
import sklearn.metrics
from sklearn.preprocessing import OneHotEncoder

# Handling Missing Values
There are multiple strategies for handling missing values in a dataset. In our case (categorical features) we have the following options:
- removing samples that have missing feature values
    - the problem with this approach is that it reduces the size of our dataset. The whole sample may get removed just because of one missing feature 
- impute the most frequent value
    - this replaces each missing value with the most frequent value of that feature in the dataset, so the feature's distribution becomes biased towards that value
- treat the missing values as a separate feature value
    - this may add some false patterns and spurious correlations because the original dataset didn't have this value

For the general case, if the missing values are from numeric columns we have the following options:
- removing samples that have missing feature values
- replacing the missing values with the mean of the column
- replacing the missing values with the median of the column
- using regression to predict the missing values of each column based on the filled values
- ...


**Note:** we have to be careful not to apply most-frequent imputation to the whole dataset before splitting. We want the train, validation and test sets to be isolated as much as possible, so that the most frequent values of the validation and test sets do not affect the training set.

In [2]:
# Disabled draft, kept for reference only: it splits and imputes the whole frame,
# "class" column included, instead of separate feature/target objects.
# NOTE(review): the draft calls fit_transform on the validation and test frames
# too, so each split would be imputed with its own most frequent values.
# data = pd.read_csv("datasets/mushrooms.csv")
# train_data, val_test_data = train_test_split(data, train_size=0.7, random_state=42)
# val_data, test_data = train_test_split(val_test_data, test_size=0.33, random_state=42)

# imputer = SimpleImputer(strategy="most_frequent", missing_values="?")
# train_data = pd.DataFrame(imputer.fit_transform(train_data), columns=train_data.columns)
# val_data = pd.DataFrame(imputer.fit_transform(val_data), columns=val_data.columns)
# test_data = pd.DataFrame(imputer.fit_transform(test_data), columns=test_data.columns)

In [3]:
# Load the dataset and separate the feature columns from the "class" target.
data = pd.read_csv("datasets/mushrooms.csv")

# 70% of the rows go to training; the remaining 30% are split again below.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    data.loc[:, data.columns != "class"], data["class"], train_size=0.7, random_state=42
)
# Of the held-out rows, 33% become the test set and the rest the validation set.
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.33, random_state=42
)

# Missing values are marked with "?". The per-column most frequent value is
# learned from the training split ONLY and then reused for validation and test,
# so held-out data never influences the imputation and every split is filled
# with the same replacement values.
imputer = SimpleImputer(strategy="most_frequent", missing_values="?")
imputer.fit(X_train)
X_train = pd.DataFrame(imputer.transform(X_train), columns=X_train.columns)
X_val = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

In [9]:
# Stacked copy of the three imputed splits (no longer used to fit `enc`).
concatenated = pd.concat([X_train, X_val, X_test])

# Learn the category vocabulary from the training split only, so the encoding
# does not depend on validation/test data. handle_unknown="ignore" makes a
# category that was never seen during fit encode as all zeros for that feature
# instead of raising at transform time.
enc = OneHotEncoder(handle_unknown="ignore")
enc.fit(X_train)


# one_hot_encoded = pd.get_dummies(concatenated)

# X_train_encoded = one_hot_encoded.iloc[:len(X_train)]
# X_val_encoded = one_hot_encoded.iloc[len(X_train):len(X_train)+len(X_val)]
# X_test_encoded = one_hot_encoded.iloc[len(X_train)+len(X_val):]

<5686x116 sparse matrix of type '<class 'numpy.float64'>'
	with 125092 stored elements in Compressed Sparse Row format>

# Decision Tree

In [5]:
# Search over the maximum tree depth; the depth with the best validation
# accuracy is evaluated once on the test split at the end.
max_depths = (4, 8, 16, 24, 32)

# The one-hot encodings do not depend on the depth, so compute them once
# instead of re-transforming the same frames in every iteration.
X_train_enc = enc.transform(X_train)
X_val_enc = enc.transform(X_val)

# Start below any reachable accuracy so the first model is always recorded and
# the best_* names are guaranteed to be bound after the loop.
best_acc = -1.0
best_dt_cls_model, best_max_depth = None, None
for md in max_depths:
    # Fixed seed (same value as the data splits) so reruns give the same trees.
    dt_cls_model = DecisionTreeClassifier(
        criterion="gini", max_depth=md, random_state=42
    )
    dt_cls_model = dt_cls_model.fit(X_train_enc, y_train)
    y_val_hat = dt_cls_model.predict(X_val_enc)
    y_train_hat = dt_cls_model.predict(X_train_enc)

    acc_val = sklearn.metrics.accuracy_score(y_val, y_val_hat)
    acc_train = sklearn.metrics.accuracy_score(y_train, y_train_hat)

    # Strict ">" keeps the shallowest depth when several depths tie.
    if acc_val > best_acc:
        best_acc = acc_val
        best_dt_cls_model = dt_cls_model
        best_max_depth = md

    print(f'acc of validation with max depth of {md} = {acc_val}')
    print(f'acc of train with max depth of {md} = {acc_train}')

# Final, one-off evaluation of the selected model on the untouched test split.
y_test_hat = best_dt_cls_model.predict(enc.transform(X_test))
acc_test = sklearn.metrics.accuracy_score(y_test, y_test_hat)
print(f'acc of test with max depth of {best_max_depth} = {acc_test}')


acc of validation with max depth of 4 = 0.9993876301285977
acc of train with max depth of 4 = 0.9994723883221949
acc of validation with max depth of 8 = 1.0
acc of train with max depth of 8 = 1.0
acc of validation with max depth of 16 = 1.0
acc of train with max depth of 16 = 1.0
acc of validation with max depth of 24 = 1.0
acc of train with max depth of 24 = 1.0
acc of validation with max depth of 32 = 1.0
acc of train with max depth of 32 = 1.0
acc of test with max depth of 8 = 1.0


# Random Forest

In [8]:
def select_random_features(X, feature_count, random_state=None):
    """Return the names of ``feature_count`` randomly chosen columns of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose column names are sampled; it is not modified.
    feature_count : int
        Number of column names to return (applied as a slice on the shuffled
        list, so a value larger than the number of columns returns all of them).
    random_state : None, int, or numpy.random.Generator, optional
        ``None`` (default) shuffles with NumPy's global random state, exactly
        as before. Any other value is passed to ``np.random.default_rng`` so
        the selection is reproducible independently of the global state.

    Returns
    -------
    list
        The selected column names, in shuffled order.
    """
    # Work on a copy so the shuffle never touches X.columns itself.
    columns = list(X.columns)
    if random_state is None:
        np.random.shuffle(columns)
    else:
        np.random.default_rng(random_state).shuffle(columns)
    return columns[:feature_count]


# Grid search over tree depth and number of randomly selected input features;
# the combination with the best validation accuracy is evaluated on the test
# split at the end.
max_depths = (4, 8, 12, 16)
feature_counts = (3, 5, 7)

# select_random_features shuffles with NumPy's global random state; seeding it
# here makes the drawn feature subsets identical on every rerun of this cell.
np.random.seed(42)

# Start below any reachable accuracy so the first combination is always
# recorded and the best_* names are guaranteed to be bound after the loops.
best_acc = -1.0
best_rf_cls_model, best_enc2 = None, None
best_md, best_fc, best_random_features = None, None, None
for md in max_depths:
    for fc in feature_counts:
        # NOTE(review): every (depth, feature count) pair draws its own feature
        # subset, so accuracy differences between depths also reflect which
        # features happened to be drawn, not only the depth.
        random_features = select_random_features(X_train, fc)

        # Fit the encoder on the training split only; categories that appear
        # only in validation/test rows are encoded as all zeros.
        enc2 = OneHotEncoder(handle_unknown="ignore")
        enc2.fit(X_train[random_features])

        # Encode each split once and reuse it for fitting and prediction.
        X_train_sub = enc2.transform(X_train[random_features])
        X_val_sub = enc2.transform(X_val[random_features])

        # Fixed seed (same value as the data splits) for reproducible forests.
        rf_cls_model = RandomForestClassifier(
            n_estimators=7, criterion="gini", max_depth=md, random_state=42
        )
        rf_cls_model = rf_cls_model.fit(X_train_sub, y_train)
        y_val_hat = rf_cls_model.predict(X_val_sub)
        y_train_hat = rf_cls_model.predict(X_train_sub)

        acc_val = sklearn.metrics.accuracy_score(y_val, y_val_hat)
        acc_train = sklearn.metrics.accuracy_score(y_train, y_train_hat)

        # Strict ">" keeps the earliest combination when several tie.
        if acc_val > best_acc:
            best_acc = acc_val
            best_rf_cls_model = rf_cls_model
            best_md, best_fc = md, fc
            best_random_features = random_features
            best_enc2 = enc2

        print(f"acc of val with max depth of {md} and feature count {fc} = {acc_val}")
        print(f"acc of train with max depth of {md} and feature count {fc} = {acc_train}")

# Final evaluation uses the winning model together with the feature subset and
# encoder it was trained with.
y_test_hat = best_rf_cls_model.predict(best_enc2.transform(X_test[best_random_features]))
acc_test = sklearn.metrics.accuracy_score(y_test, y_test_hat)
print(f'acc of test with max depth of {best_md}, feature count {best_fc} = {acc_test}')


acc of val with max depth of 4 and feature count 3 = 0.7281077770973668
acc of train with max depth of 4 and feature count 3 = 0.7360182905381639
acc of val with max depth of 4 and feature count 5 = 0.7924066135946112
acc of train with max depth of 4 and feature count 5 = 0.8149841716496659
acc of val with max depth of 4 and feature count 7 = 0.9632578077158603
acc of train with max depth of 4 and feature count 7 = 0.9671122054168132
acc of val with max depth of 8 and feature count 3 = 0.6472749540722597
acc of train with max depth of 8 and feature count 3 = 0.645444952514949
acc of val with max depth of 8 and feature count 5 = 0.9902020820575628
acc of train with max depth of 8 and feature count 5 = 0.992085824832923
acc of val with max depth of 8 and feature count 7 = 0.9571341090018372
acc of train with max depth of 8 and feature count 7 = 0.9613084769609568
acc of val with max depth of 12 and feature count 3 = 0.7262706674831598
acc of train with max depth of 12 and feature count 3