In [147]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

# Hana, I think you forget to drop id column
mush = pd.read_csv('data/train.csv').drop(columns=['Id']).drop_duplicates()
mush_test= pd.read_csv('data/test.csv')
columns_to_keep = mush_test.columns.to_list()
mush.drop(mush.columns.difference(columns_to_keep+['poisonous']), axis=1, inplace=True)
X = mush.drop(columns=['poisonous'])
y = mush['poisonous']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=8)
X_train.head()
cat_col= list(X_train.select_dtypes(include=["object"]))
categorical_pipeline = Pipeline(steps=[
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])

full_processor = ColumnTransformer(transformers=[
    ('category', categorical_pipeline, cat_col)
])
rf_pipeline = Pipeline(steps=[
    ('processor', full_processor), 
    ('model', RandomForestClassifier(max_depth=5, min_samples_leaf=15))
])
_ = rf_pipeline.fit(X_train, y_train)

In [148]:

def get_miss_predicted_poisonous_mushrooms(model, X_test, y_test):
    return (
    X_test
        .assign(
            preds =  rf_pipeline.predict(X_test),
            poisonous = y_test
        )
        .query('poisonous == 1 & preds == 0')
        .drop(columns=['preds'])
        )
new_mush = mush

In [149]:
# add miss predicted poisonous mushrooms to the train dataframe
iteration = 1
while get_miss_predicted_poisonous_mushrooms(rf_pipeline, X_test, y_test).shape[0] > 0:
    new_mush = pd.concat([new_mush, get_miss_predicted_poisonous_mushrooms(rf_pipeline, X_test, y_test)], axis=0)
    X = new_mush.drop(columns=['poisonous'])
    y = new_mush['poisonous']
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=8)
    _ = rf_pipeline.fit(X_train, y_train)
    print(f'\nIteration {iteration}')
    iteration += 1
    print(pd.crosstab(y_test, rf_pipeline.predict(X_test), rownames=['Actual'], colnames=['Predicted']))


Iteration 1
Predicted   0   1
Actual           
0          41   0
1          29  15

Iteration 2
Predicted   0   1
Actual           
0          43   6
1          15  26

Iteration 3
Predicted   0   1
Actual           
0          39  13
1           5  36

Iteration 4
Predicted   0   1
Actual           
0          32  15
1          11  36

Iteration 5
Predicted   0   1
Actual           
0          32   9
1          19  37

Iteration 6
Predicted   0   1
Actual           
0          38   6
1          11  45

Iteration 7
Predicted   0   1
Actual           
0          31  11
1           9  52

Iteration 8
Predicted   0   1
Actual           
0          29  20
1           1  54

Iteration 9
Predicted   0   1
Actual           
0          34  17
1           3  51

Iteration 10
Predicted   0   1
Actual           
0          32  16
1           4  53

Iteration 11
Predicted   0   1
Actual           
0          28  10
1           5  63

Iteration 12
Predicted   0   1
Actual           
0          29

In [150]:
# apply model to test data
preds = rf_pipeline.predict(mush_test.drop(columns=['Id']).filter(X.columns))

In [151]:
# test my submission
results = pd.read_csv('data/results.csv')

In [152]:
pd.crosstab(results['poisonous'], preds, rownames=['Actual'], colnames=['Predicted'])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,389,477
1,0,759
