In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
import sklearn.metrics

# Handling Missing Values
There are multiple strategies for handling missing values in a dataset. In our case (categorical features) we have the following options:
- removing samples that have missing feature values
    - the problem with this approach is that it reduces the size of our dataset. The whole sample may get removed just because of one missing feature 
- impute the most frequent value
    - this replaces each missing value with the most frequent value of that feature in the dataset, so the feature's distribution becomes biased towards that value
- treat the missing values as a separate feature value
    - this may add some false patterns and spurious correlations because the original dataset didn't have this value

For the general case, if the missing values are from numeric columns we have the following options:
- removing samples that have missing feature values
- replacing the missing values with the mean of the column
- replacing the missing values with the median of the column
- using regression to predict the missing values of each column based on the filled values
- ...


**Note:** we have to be careful not to apply most-frequent imputation to the whole dataset before splitting. The train, validation and test sets should stay as isolated as possible, so that the most frequent values in the test set cannot influence the training set.

In [6]:
# NOTE(review): disabled draft of the split/impute step; the live cell below
# does the same work on separate feature (X) and target (y) frames.
# As written, this draft splits the full frame, so the imputer would also be
# applied to the `class` column, and it re-fits the imputer on every split.
# data = pd.read_csv("datasets/mushrooms.csv")
# train_data, val_test_data = train_test_split(data, train_size=0.7, random_state=42)
# val_data, test_data = train_test_split(val_test_data, test_size=0.33, random_state=42)

# imputer = SimpleImputer(strategy="most_frequent", missing_values="?")
# train_data = pd.DataFrame(imputer.fit_transform(train_data), columns=train_data.columns)
# val_data = pd.DataFrame(imputer.fit_transform(val_data), columns=val_data.columns)
# test_data = pd.DataFrame(imputer.fit_transform(test_data), columns=test_data.columns)

In [7]:
# Load the dataset and separate the feature columns from the `class` target.
data = pd.read_csv("datasets/mushrooms.csv")

# First split: 70% of the rows go to training, the remaining 30% are held out.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    data.loc[:, data.columns != "class"], data["class"], train_size=0.7, random_state=42
)
# Second split: 33% of the held-out rows become the test set, the rest validation.
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.33, random_state=42
)

# "?" is the missing-value marker handed to the imputer.
# The most frequent value of each column is learned from the training split
# ONLY (`fit_transform`) and then re-used for validation and test
# (`transform`). Re-fitting on the held-out splits would impute them with
# their own statistics, so the model would be evaluated on data prepared
# differently from what it was trained on.
imputer = SimpleImputer(strategy="most_frequent", missing_values="?")

# `index=` keeps the original row labels so every X_* frame stays aligned with
# its y_* series (the imputer returns a bare array without an index).
X_train = pd.DataFrame(
    imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_val = pd.DataFrame(
    imputer.transform(X_val), columns=X_val.columns, index=X_val.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test), columns=X_test.columns, index=X_test.index
)

# print(X_train.shape, y_train.shape)
# print(X_val.shape, y_val.shape)
# print(X_test.shape, y_test.shape)

# Decision Tree

In [8]:
# One-hot encode once, outside the loop: the encoding does not depend on the
# tree depth, so recomputing it per iteration is wasted work.
X_train_enc = pd.get_dummies(X_train)
# Align the validation matrix to the training columns. Encoding each split
# independently can yield different column sets (a category missing from one
# split), which would mis-align or break prediction. Categories unseen in
# training are dropped; categories absent from validation become all-zero.
X_val_enc = pd.get_dummies(X_val).reindex(columns=X_train_enc.columns, fill_value=0)

# Compare validation accuracy of a Gini decision tree across maximum depths.
max_depths = (4, 8, 16, 24, 32)
for md in max_depths:
    # Fixed seed so the reported accuracies are reproducible between runs.
    dt_cls_model = DecisionTreeClassifier(criterion="gini", max_depth=md, random_state=42)
    dt_cls_model = dt_cls_model.fit(X_train_enc, y_train)
    y_val_hat = dt_cls_model.predict(X_val_enc)

    print(f'acc with max depth of {md} = {sklearn.metrics.accuracy_score(y_val, y_val_hat)}')

acc with max depth of 4 = 0.9993876301285977
acc with max depth of 8 = 1.0
acc with max depth of 16 = 1.0
acc with max depth of 24 = 1.0
acc with max depth of 32 = 1.0


# Random Forest

In [33]:
def select_random_features(X, feature_count, random_state=None):
    """Return a random subset of the columns of ``X``.

    The column labels are shuffled and the first ``feature_count`` of them are
    used to index ``X``; the input frame itself is not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to draw columns from.
    feature_count : int
        Number of columns to keep. It is applied as a slice bound on the
        shuffled labels, so a value larger than the number of columns returns
        every column.
    random_state : int, numpy.random.Generator or None, default None
        Seed (or generator) for a reproducible selection. When ``None`` the
        global ``np.random`` state is used, exactly as before this parameter
        existed, so ``np.random.seed`` still controls the result.

    Returns
    -------
    pandas.DataFrame
        The selected columns of ``X``, in shuffled order.
    """
    columns = list(X.columns)
    if random_state is None:
        # Legacy path: in-place shuffle driven by the global NumPy RNG.
        np.random.shuffle(columns)
    else:
        # Dedicated generator: repeatable and independent of global state.
        np.random.default_rng(random_state).shuffle(columns)
    return X[columns[:feature_count]]

# One-hot encode once, outside the loop: the encoding does not depend on the
# forest depth, so recomputing it per iteration is wasted work.
X_train_enc = pd.get_dummies(X_train)
# Align the validation matrix to the training columns. Encoding each split
# independently can yield different column sets (a category missing from one
# split), which would mis-align or break prediction. Categories unseen in
# training are dropped; categories absent from validation become all-zero.
X_val_enc = pd.get_dummies(X_val).reindex(columns=X_train_enc.columns, fill_value=0)

# Compare validation accuracy of a 7-tree Gini random forest across maximum depths.
max_depths = (4, 8, 12, 16)
for md in max_depths:
    # Fixed seed: bootstrap sampling and feature sub-sampling are random, so
    # an unseeded forest reports different accuracies on every run.
    rf_cls_model = RandomForestClassifier(
        n_estimators=7, criterion="gini", max_depth=md, random_state=42
    )
    rf_cls_model = rf_cls_model.fit(X_train_enc, y_train)
    y_val_hat = rf_cls_model.predict(X_val_enc)

    print(f'acc with max depth of {md} = {sklearn.metrics.accuracy_score(y_val, y_val_hat)}')
    


acc with max depth of 4 = 0.9185548071034905
acc with max depth of 8 = 0.9987752602571953
acc with max depth of 12 = 1.0
acc with max depth of 16 = 1.0
