In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import mean_squared_error, accuracy_score
from ucimlrepo import fetch_ucirepo 

In [3]:
# Fetch the UCI ML Repository dataset with id 73 (network call via ucimlrepo)
mushroom = fetch_ucirepo(id=73)

# Save the predictors as X and the flattened targets as y.
# X is an explicit copy: the encoding step further down assigns into X with
# .loc, and without the copy those writes would land in the frame held by
# `mushroom` as well, leaving no untouched version of the raw features.
X = mushroom.data.features.copy()
y = np.ravel(mushroom.data.targets)

# Show every column when a dataframe is displayed, then preview the first rows
pd.set_option('display.max_columns', None)
X.head(3)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m


## Encoding Data

In [7]:
# Integer-encode every column in place: each value is replaced by the position
# of that value among the column's sorted unique values (sort=True).
# NOTE: pd.factorize encodes missing values as -1 by default, so any NaN that
# exists before this step is no longer reported by isna() afterwards.
for col in X.columns:
    codes, uniques = pd.factorize(X[col], sort=True)
    X.loc[:, col] = codes

# Preview the first five rows and first five columns of the encoded data
X.iloc[:5, :5]

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor
0,5,2,4,1,6
1,5,2,9,1,0
2,0,2,8,1,3
3,5,3,8,1,6
4,5,2,3,0,5


## Time to impute!

Does our dataset have any missing values? Let's check!

In [8]:
# Count the missing entries in each column of the encoded data
X.isna().sum(axis=0)

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

We are going to randomly assign observations in our dataset to be missing, and then see which imputation methods perform best by comparing their results to our actual dataset. Let's randomly assign NA observations throughout our data. We will create a copy of our df and call it `X_Na`.

In [9]:
# Work on a separate deep copy of the encoded features so that X itself
# stays available as the ground truth for later comparisons
X_Na = X.copy(deep=True)

In [11]:
# Assign 10% of each column of the new dataframe to NA.
# - A seeded RandomState is shared across the loop, so every column gets its
#   own set of rows but the whole mask is reproducible on Restart & Run All.
# - Each column is rebuilt from X rather than edited in place, so re-running
#   this cell yields the same 10% instead of stacking extra NAs on top.
NA_FRACTION = 0.1
NA_SEED = 42
na_rng = np.random.RandomState(NA_SEED)

for col in X_Na.columns:
    na_rows = X.sample(frac=NA_FRACTION, random_state=na_rng).index
    # .where keeps values where the condition is True and puts NaN elsewhere
    X_Na[col] = X[col].where(~X.index.isin(na_rows))

In [13]:
# Confirm the injection worked: count the missing entries per column
X_Na.isna().sum(axis=0)

cap-shape                   812
cap-surface                 812
cap-color                   812
bruises                     812
odor                        812
gill-attachment             812
gill-spacing                812
gill-size                   812
gill-color                  812
stalk-shape                 812
stalk-root                  812
stalk-surface-above-ring    812
stalk-surface-below-ring    812
stalk-color-above-ring      812
stalk-color-below-ring      812
veil-type                   812
veil-color                  812
ring-number                 812
ring-type                   812
spore-print-color           812
population                  812
habitat                     812
dtype: int64

Imputation method #1: Filling NA values with the mode

In [15]:
# Impute with the mode: take the first row of .mode() as the per-column fill
# value (when several values tie, .mode() lists them all and row 0 is used)
column_modes = X_Na.mode().iloc[0]
X_mode_impute = X_Na.fillna(value=column_modes)

# Verify that no missing values remain
X_mode_impute.isna().sum()

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

Imputation method #2: Filling NA values with the median using `SimpleImputer`

In [17]:
# Impute with the per-column median (using SimpleImputer).
# fit_transform returns a plain array, so it is wrapped back into a dataframe
# that carries the original column names.
median_impute = SimpleImputer(strategy='median')
X_median_impute = pd.DataFrame(
    median_impute.fit_transform(X_Na),
    columns=X.columns,
)

# Check to make sure there are no NAs
X_median_impute.isna().sum()

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

Imputation method #3: Filling NA values with KNN Imputer

In [18]:
# Impute with KNNImputer: each gap is filled from the 5 nearest rows
# (neighbour values are averaged, so imputed codes may be non-integer).
# fit_transform returns a plain array, so it is wrapped back into a dataframe
# that carries the original column names.
knn_impute = KNNImputer(n_neighbors=5)
X_knn_impute = pd.DataFrame(
    knn_impute.fit_transform(X_Na),
    columns=X.columns,
)

# Verify that no missing values remain
X_knn_impute.isna().sum()

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

Now that we have three different dataframes built with three different imputation methods, let's see which one best recovered our real data! We can do this using the mean squared error (lower is better).

In [19]:
# Score each imputation by its mean squared error against the true encoded X.
# Lower is better. The error is averaged over every cell of the frame, which
# includes the cells that were never set to NA.
mse_mode, mse_median, mse_knn = (
    mean_squared_error(X, imputed)
    for imputed in (X_mode_impute, X_median_impute, X_knn_impute)
)

# Report results
for label, score in (("Mode", mse_mode), ("Median", mse_median), ("KNN", mse_knn)):
    print(f"{label} Imputation Performance: {score}")

Mode Imputation Performance: 0.4429300389418558
Median Imputation Performance: 0.2580345553019113
KNN Imputation Performance: 0.12402220133387046


It looks like our KNN imputation worked best (it has the lowest mean squared error)! Let's go ahead and use our imputed data and our real data to fit a Random Forest classifier.

## Random Forest Classifier with original data

In [20]:
# Hold out 30% of the original (encoded, non-imputed) data for testing;
# the fixed random_state keeps the partition the same on every run
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.3, random_state=42)

In [28]:
# Candidate values for max_features (features considered at each split):
# 1, 4, 7, ..., 22
num_features = list(range(1, 23, 3))
accuracy = []

for max_feats in num_features:
    # Same shallow, seeded 50-tree forest every time; only max_features varies
    rf_classifier = RandomForestClassifier(
        max_features=max_feats,
        n_estimators=50,
        max_depth=3,
        random_state=42,
    ).fit(X_train, y_train)

    # Score on the held-out rows of the current split and keep the result
    y_pred = rf_classifier.predict(X_test)
    rf_accuracy = accuracy_score(y_test, y_pred)
    accuracy.append(rf_accuracy)

    print(f"Number of features:{max_feats}; Random Forest accuracy: {rf_accuracy}")

Number of features:1; Random Forest accuracy: 0.916735028712059
Number of features:4; Random Forest accuracy: 0.9848236259228876
Number of features:7; Random Forest accuracy: 0.9868744872846595
Number of features:10; Random Forest accuracy: 0.9835931091058244
Number of features:13; Random Forest accuracy: 0.9823625922887613
Number of features:16; Random Forest accuracy: 0.9860541427399507
Number of features:19; Random Forest accuracy: 0.9819524200164069
Number of features:22; Random Forest accuracy: 0.9577522559474979


## Random Forest Classifier with KNN-imputed data:

In [29]:
# Hold out 30% of the KNN-imputed data for testing, with the same test size
# and random_state as the split of the original data
(X_train, X_test,
 y_train, y_test) = train_test_split(X_knn_impute, y, test_size=0.3, random_state=42)

In [30]:
# Candidate values for max_features (features considered at each split):
# 1, 4, 7, ..., 22
num_features = list(range(1, 23, 3))
accuracy = []

for max_feats in num_features:
    # Same shallow, seeded 50-tree forest every time; only max_features varies
    rf_classifier = RandomForestClassifier(
        max_features=max_feats,
        n_estimators=50,
        max_depth=3,
        random_state=42,
    ).fit(X_train, y_train)

    # Score on the held-out rows of the current split and keep the result
    y_pred = rf_classifier.predict(X_test)
    rf_accuracy = accuracy_score(y_test, y_pred)
    accuracy.append(rf_accuracy)

    print(f"Number of features:{max_feats}; Random Forest accuracy: {rf_accuracy}")

Number of features:1; Random Forest accuracy: 0.9175553732567678
Number of features:4; Random Forest accuracy: 0.9827727645611156
Number of features:7; Random Forest accuracy: 0.9864643150123051
Number of features:10; Random Forest accuracy: 0.9856439704675964
Number of features:13; Random Forest accuracy: 0.9881050041017228
Number of features:16; Random Forest accuracy: 0.9708777686628384
Number of features:19; Random Forest accuracy: 0.9667760459392944
Number of features:22; Random Forest accuracy: 0.9573420836751435


## Random Forest Classifier with mode-imputed data:

In [34]:
# Hold out 30% of the mode-imputed data for testing, with the same test size
# and random_state as the split of the original data
(X_train, X_test,
 y_train, y_test) = train_test_split(X_mode_impute, y, test_size=0.3, random_state=42)

In [35]:
# Candidate values for max_features (features considered at each split):
# 1, 4, 7, ..., 22
num_features = list(range(1, 23, 3))
accuracy = []

for max_feats in num_features:
    # Same shallow, seeded 50-tree forest every time; only max_features varies
    rf_classifier = RandomForestClassifier(
        max_features=max_feats,
        n_estimators=50,
        max_depth=3,
        random_state=42,
    ).fit(X_train, y_train)

    # Score on the held-out rows of the current split and keep the result
    y_pred = rf_classifier.predict(X_test)
    rf_accuracy = accuracy_score(y_test, y_pred)
    accuracy.append(rf_accuracy)

    print(f"Number of features:{max_feats}; Random Forest accuracy: {rf_accuracy}")

Number of features:1; Random Forest accuracy: 0.9142739950779327
Number of features:4; Random Forest accuracy: 0.9663658736669402
Number of features:7; Random Forest accuracy: 0.9659557013945858
Number of features:10; Random Forest accuracy: 0.9663658736669402
Number of features:13; Random Forest accuracy: 0.9680065627563577
Number of features:16; Random Forest accuracy: 0.9602132895816243
Number of features:19; Random Forest accuracy: 0.9479081214109926
Number of features:22; Random Forest accuracy: 0.9302707136997539
