In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Load the training data, dropping the Id column and duplicated rows before modelling.
mush = pd.read_csv('data/train.csv').drop(columns=['Id']).drop_duplicates()
mush_test = pd.read_csv('data/test.csv')

# Keep only the columns that also exist in the test file, plus the target.
# A new frame is bound instead of using inplace=True so the cell stays idempotent.
columns_to_keep = mush_test.columns.to_list()
mush = mush.drop(columns=mush.columns.difference(columns_to_keep + ['poisonous']))
X = mush.drop(columns=['poisonous'])
y = mush['poisonous']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=8)

# Every object-dtype column is treated as categorical and one-hot encoded.
# Categories unseen during fit are encoded as all zeros (handle_unknown='ignore').
cat_col = list(X_train.select_dtypes(include=["object"]))
categorical_pipeline = Pipeline(steps=[
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])

# Only the categorical columns are listed; ColumnTransformer's default
# remainder='drop' discards any other column.
full_processor = ColumnTransformer(transformers=[
    ('category', categorical_pipeline, cat_col)
])
model_pipeline = Pipeline(steps=[
    ('processor', full_processor),
    # Limit depth and leaf size to be sure the tree does not overfit.
    # random_state is fixed because scikit-learn permutes features at every
    # split, so tied splits could otherwise differ between runs.
    ('model', DecisionTreeClassifier(max_depth=5, min_samples_leaf=15, random_state=8))
])
_ = model_pipeline.fit(X_train, y_train)

In [31]:
def get_miss_predicted_poisonous_mushrooms(model, X_test, y_test):
    """
    Return the mushrooms that were predicted NOT to be poisonous, but actually were.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)``
        The model used to score ``X_test``.
    X_test : pandas.DataFrame
        Feature rows to score.
    y_test : pandas.Series
        True labels (1 = poisonous), aligned with ``X_test`` by index.

    Returns
    -------
    pandas.DataFrame
        The rows of ``X_test`` whose true label is 1 and predicted label is 0,
        with the true label attached as a ``poisonous`` column.
    """
    return (
        X_test
        .assign(
            # Score with the estimator that was passed in, not a notebook global.
            preds=model.predict(X_test),
            poisonous=y_test,
        )
        .query('poisonous == 1 & preds == 0')
        .drop(columns=['preds'])
    )
# Start the growing training pool from the de-duplicated training frame.
# This binds the same object as `mush` (not a copy); the retraining loop
# rebinds `new_mush` to the result of pd.concat.
new_mush = mush

In [32]:
# Add the mis-predicted poisonous mushrooms (false negatives) to the data pool and
# retrain, so that poisonous examples carry more weight in the model.
# The model is retrained until the current hold-out split has no false negatives,
# or until MAX_RETRAIN_ITERATIONS is reached (guards against a never-ending loop).
#
# NOTE(review): the appended rows come from the hold-out split and the pool is
# re-split on every pass, so hold-out rows leak into training. The confusion
# matrices printed here are therefore not an unbiased estimate of performance.
MAX_RETRAIN_ITERATIONS = 50
iteration = 1
# Compute the false negatives once per pass and reuse them for both the loop
# condition and the concat, instead of predicting twice.
missed = get_miss_predicted_poisonous_mushrooms(model_pipeline, X_test, y_test)
while len(missed) > 0 and iteration <= MAX_RETRAIN_ITERATIONS:
    # ignore_index avoids accumulating duplicate index labels in the pool;
    # train_test_split works positionally, so the split itself is unaffected.
    new_mush = pd.concat([new_mush, missed], axis=0, ignore_index=True)
    X = new_mush.drop(columns=['poisonous'])
    y = new_mush['poisonous']
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=8)
    _ = model_pipeline.fit(X_train, y_train)
    print(f'\nIteration {iteration}')
    iteration += 1
    print(pd.crosstab(y_test, model_pipeline.predict(X_test), rownames=['Actual'], colnames=['Predicted']))
    missed = get_miss_predicted_poisonous_mushrooms(model_pipeline, X_test, y_test)

if len(missed) > 0:
    print(f'\nStopped after {MAX_RETRAIN_ITERATIONS} iterations with {len(missed)} miss predicted poisonous mushrooms remaining')


Iteration 1
Predicted   0   1
Actual           
0          25  18
1          20  19

Iteration 2
Predicted   0   1
Actual           
0          36  11
1          19  20

Iteration 3
Predicted   0   1
Actual           
0          35  10
1          11  34

Iteration 4
Predicted   0   1
Actual           
0          45   7
1          12  28

Iteration 5
Predicted   0   1
Actual           
0          27  20
1          19  28

Iteration 6
Predicted   0   1
Actual           
0          30  19
1          12  37

Iteration 7
Predicted   0   1
Actual           
0          31  14
1          14  42

Iteration 8
Predicted   0   1
Actual           
0          27  19
1           0  57


In [33]:
# Score the competition test file: drop its Id column and keep only the
# feature columns the model was trained on.
preds = model_pipeline.predict(
    mush_test
    .drop(columns=['Id'])
    .filter(items=X.columns)
)

In [34]:
# Load the reference labels for the test file so the submission can be checked;
# its 'poisonous' column is used as the actual labels in the crosstab below.
results = pd.read_csv('data/results.csv')

In [35]:
# Confusion matrix on the test data: actual labels (rows) vs. model predictions (columns).
pd.crosstab(
    index=results['poisonous'],
    columns=preds,
    rownames=['Actual'],
    colnames=['Predicted'],
)

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,281,585
1,0,759
