link: https://www.researchgate.net/figure/Calculation-of-Precision-Recall-and-Accuracy-in-the-confusion-matrix_fig3_336402347

In [101]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Load the labelled training data, dropping the Id column and exact duplicate rows.
mush = (
    pd.read_csv('../data/train.csv')
    .drop(columns=['Id'])
    .drop_duplicates()
)
mush_test = pd.read_csv('../data/test.csv')

# Keep only the columns that also appear in the test file, plus the target column.
columns_to_keep = mush_test.columns.to_list()
extra_columns = mush.columns.difference(columns_to_keep + ['poisonous'])
mush = mush.drop(columns=extra_columns)

# Separate the target from the features.
y = mush['poisonous']
X = mush.drop(columns=['poisonous'])

# 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=8
)

# Every object-dtype column of the training features is treated as categorical.
cat_col = X_train.select_dtypes(include=["object"]).columns.tolist()

# One-hot encode the categorical columns; handle_unknown='ignore' keeps transform
# from failing on categories that were not present when the encoder was fitted.
categorical_pipeline = Pipeline([
    ('one-hot', OneHotEncoder(handle_unknown='ignore')),
])

# Only the categorical columns are listed in the column transformer.
full_processor = ColumnTransformer([
    ('category', categorical_pipeline, cat_col),
])

# Preprocessing followed by a decision tree. The tree's depth and leaf-size limits
# are chosen later by the grid search to keep it from overfitting.
model_pipeline = Pipeline([
    ('processor', full_processor),
    ('model', DecisionTreeClassifier(random_state=42)),
])
# _ = model_pipeline.fit(X_train, y_train)



# Hyper-parameter grid for the 'model' step of the pipeline:
# 5 depths x 15 leaf sizes = 75 candidates.
param_grid = {
    'model__max_depth': range(3, 8),
    'model__min_samples_leaf': range(10, 25),
}

# 5-fold cross-validated search, scored on recall.
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 75 candidates, totalling 375 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('processor',
                                        ColumnTransformer(transformers=[('category',
                                                                         Pipeline(steps=[('one-hot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['cap.shape',
                                                                          'cap.color',
                                                                          'stalk.color.above.ring',
                                                                          'stalk.color.below.ring',
                                                                          'population'])])),
                                       ('model',
                                        DecisionTreeClassifier(random_state=42))]),
         

In [103]:
# Confusion matrix on the hold-out split: rows are the true labels, columns the predictions.
pd.crosstab(
    index=y_test,
    columns=grid_search.predict(X_test),
    rownames=['Actual'],
    colnames=['Predicted'],
)

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,33,11
1,16,20


In [104]:
def get_miss_predicted_poisonous_mushrooms(model, X_test, y_test):
    """
    Return the mushrooms that `model` predicted NOT to be poisonous, but actually were.

    Parameters
    ----------
    model : object with a ``predict(X)`` method
        Fitted estimator used to label ``X_test``.
    X_test : pandas.DataFrame
        Feature rows to evaluate.
    y_test : pandas.Series
        True labels for ``X_test`` (1 = poisonous); assigned onto ``X_test`` as
        the ``poisonous`` column.

    Returns
    -------
    pandas.DataFrame
        The rows of ``X_test`` whose true label is 1 and whose prediction is 0,
        with the true label attached as a ``poisonous`` column.
    """
    return (
        X_test
        .assign(
            # Predict with the estimator that was passed in, not a notebook global,
            # so the function works for any fitted model.
            preds=model.predict(X_test),
            poisonous=y_test,
        )
        .query('poisonous == 1 & preds == 0')
        .drop(columns=['preds'])
    )
# Starting point for the augmented data set used by the retraining loop.
# This binds a second name to the same DataFrame as `mush` (no copy is made);
# the loop later rebinds `new_mush` to the result of pd.concat.
new_mush = mush

In [105]:
# Append the mispredicted poisonous mushrooms (false negatives on the current hold-out
# split) to the data, re-split, and re-run the grid search, so that those rows carry
# more weight. The loop repeats until the current hold-out split has no false negatives.
# NOTE(review): rows taken from X_test are fed back into the data that is re-split, so
# later hold-out scores are not an independent estimate. There is also no upper bound
# on the number of iterations.
iteration = 1
missed = get_miss_predicted_poisonous_mushrooms(grid_search, X_test, y_test)
while len(missed) > 0:
    new_mush = pd.concat([new_mush, missed], axis=0)
    X = new_mush.drop(columns=['poisonous'])
    y = new_mush['poisonous']
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=8)
    grid_search.fit(X_train, y_train)
    print(f'\nIteration {iteration}')
    iteration += 1
    print(pd.crosstab(y_test, grid_search.predict(X_test), rownames=['Actual'], colnames=['Predicted']))
    # Compute the false negatives once per iteration; the same frame drives both the
    # loop condition and the next concat.
    missed = get_miss_predicted_poisonous_mushrooms(grid_search, X_test, y_test)

Fitting 5 folds for each of 75 candidates, totalling 375 fits

Iteration 1
Predicted   0   1
Actual           
0          33  10
1          20  20
Fitting 5 folds for each of 75 candidates, totalling 375 fits

Iteration 2
Predicted   0   1
Actual           
0          29  13
1          10  35
Fitting 5 folds for each of 75 candidates, totalling 375 fits

Iteration 3
Predicted   0   1
Actual           
0          26  18
1          13  32
Fitting 5 folds for each of 75 candidates, totalling 375 fits

Iteration 4
Predicted   0   1
Actual           
0          30  19
1           7  35
Fitting 5 folds for each of 75 candidates, totalling 375 fits

Iteration 5
Predicted   0   1
Actual           
0          32  15
1          23  23
Fitting 5 folds for each of 75 candidates, totalling 375 fits

Iteration 6
Predicted   0   1
Actual           
0          36  15
1           8  38
Fitting 5 folds for each of 75 candidates, totalling 375 fits

Iteration 7
Predicted   0   1
Actual           
0      

In [106]:
# Predict on the test file: drop Id and keep only the columns present in the training features X.
test_features = mush_test.drop(columns=['Id']).filter(items=X.columns)
preds = grid_search.predict(test_features)

In [108]:
# Load the reference results file; its 'poisonous' column is used as the "Actual"
# labels when the test-set predictions are cross-tabulated.
results = pd.read_csv('../data/results.csv')

In [116]:
# Build the submission: attach the predicted label to the test rows, then write
# only the Id and the prediction to disk.
submission = mush_test.assign(
    poisonous=grid_search.predict(mush_test.drop(columns=['Id']).filter(X.columns))
)
submission.filter(['Id', 'poisonous']).to_csv('../data/submission.csv', index=False)


In [99]:
# Confusion matrix of the test-set predictions against the reference labels.
pd.crosstab(
    index=results['poisonous'],
    columns=preds,
    rownames=['Actual'],
    colnames=['Predicted'],
)

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,206,660
1,0,759
