<pre>
                       .-'~~~-.
                     .'o  oOOOo`.
                    :~~~-.oOo   o`.
                     `. \ ~-.  oOOo.
                       `.; / ~.  OO:
                       .'  ;-- `.o.'
                      ,'  ; ~~--'~
                      ;  ;
_______\|/__________\\;_\\//___\|/________

Mushrooms - Out-Of-Fold Prediction 100% accuracy
</pre>

In [None]:
import numpy as np
import pandas as pd
from numpy import array, hstack

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Prepare Data

In [None]:
# Location of the mushroom CSV, relative to the notebook's working directory.
path = "../input/mushroom-classification-updated-dataset/mushroomsupdated.csv"
# Preview the first rows of the raw file, before any category encoding.
pd.read_csv(path).head()

In [None]:
class MushroomsDataset():
    '''
        Mushrooms Dataset

        Reads the CSV at `path` into `self.df`, builds a per-column
        category <-> index lookup and then replaces every value in the
        frame with its integer index.

        Attributes set on the instance:
            df                - the loaded frame (integer-encoded after __init__)
            categories_to_idx - {column: {category: index}}
            idx_to_categories - {column: {index: category}}
    '''
    def __init__(self, path):
        self.df = pd.read_csv(path)
        # The lookups must exist before the frame can be encoded.
        self.categories_idx()
        self.one_hot_encoding()

    def one_hot_encoding(self):
        '''
            Encode every column of `self.df` in place with the integer
            index of each category.

            NOTE: the name is kept for existing callers, but this is an
            ordinal (one integer per category) encoding, not a one-hot
            expansion into indicator columns.
        '''
        for column, mapping in self.categories_to_idx.items():
            # Map column by column instead of a nested-dict DataFrame.replace:
            # the result gets a numeric dtype inferred from the mapped values,
            # and values missing from the mapping are left untouched, so a
            # repeated call does not wipe the column.
            self.df[column] = self.df[column].map(
                lambda value, lookup=mapping: lookup.get(value, value)
            )

    def get_features_labels(self):
        '''
            Split the frame into model inputs and targets.

            Returns:
                (features, labels) - `labels` is the 'class' column,
                `features` is every other column, both as numpy arrays.
        '''
        labels = self.df['class'].values
        # Select by name rather than by position so the label can never end
        # up inside the features if 'class' is not the first column.
        features = self.df.drop(columns=['class']).values
        return features, labels

    def categories_idx(self):
        '''
            Build the category <-> index lookups for every column.

            Indices follow the sorted order of the unique values returned
            by `np.unique` for that column.
        '''
        self.categories_to_idx = {}
        self.idx_to_categories = {}

        for column in self.df.columns:
            uniques = np.unique(self.df[column].values)
            self.idx_to_categories[column] = {k: v for k, v in enumerate(uniques)}
            self.categories_to_idx[column] = {v: k for k, v in enumerate(uniques)}

In [None]:
# Build the dataset: the constructor reads the CSV and replaces every
# category with its integer index.
mdset = MushroomsDataset(path)
# Preview the encoded frame.
mdset.df.head()

In [None]:
# Separate the 'class' labels from the remaining (feature) columns.
features, labels = mdset.get_features_labels()
# Sanity check: both arrays should have the same number of rows.
print(features.shape, labels.shape)

In [None]:
# Hold out 33% of the rows as the test set. No random_state is passed,
# so the split is presumably different on every run - pass one to reproduce results.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33)

# Out of Fold

In [None]:
class OutOfFold():
    '''
        Out-of-Fold

        model1 - DecisionTreeClassifier
        model2 - KNeighborsClassifier
        model3 - LogisticRegression

        The two base models produce out-of-fold class-0 probabilities;
        those are appended to the original features and model3 is
        trained on the result (stacking).
    '''
    def __init__(self, n_split):
        # Shuffled K-fold used to produce the out-of-fold predictions.
        # No random_state is passed here.
        self.kfold = KFold(n_splits=n_split, shuffle=True)

    def create_meta_dataset(self, data_x, yhat1, yhat2):
        '''
            Append two prediction vectors to the feature rows.

            Returns an array with the columns of `data_x` followed by
            `yhat1` and then `yhat2`, each as a single column.
        '''
        yhat1 = array(yhat1).reshape((len(yhat1), 1))
        yhat2 = array(yhat2).reshape((len(yhat2), 1))
        # stack as separate columns
        return hstack((data_x, yhat1, yhat2))

    def stack_prediction(self, model1, model2, model3, X):
        '''
            Predict with the stacked model.

            The meta-features are built as [X, model1 proba, model2 proba],
            so model3 must have been fitted on columns in that same order.
        '''
        yhat1 = model1.predict_proba(X)[:, 0]
        yhat2 = model2.predict_proba(X)[:, 0]
        # create input dataset
        meta_X = self.create_meta_dataset(X, yhat1, yhat2)
        return model3.predict(meta_X)

    def get_accuracy(self, model1, model2, X_test, y_test):
        '''
            Print the accuracy of both base models on (X_test, y_test).
        '''
        acc1 = accuracy_score(y_test, model1.predict(X_test))
        acc2 = accuracy_score(y_test, model2.predict(X_test))
        print('Decision_Tree Accuracy: %.3f, k-NN Accuracy: %.3f' % (acc1, acc2))

    def train_metadata(self, model1, model2, meta_X, data_y, X_test, y_test=None):
        '''
            Fit the LogisticRegression meta-model and score it.

            model1, model2 - fitted base models used to build the test meta-features
            meta_X, data_y - out-of-fold meta-features and their labels
            X_test         - test features to predict on
            y_test         - true test labels used for the accuracy; when
                             omitted, the module-level `y_test` is used
                             (the behaviour older callers relied on)

            Returns (model3, yhat, acc).
            Raises NameError if `y_test` is neither passed nor defined
            at module level.
        '''
        if y_test is None:
            # Legacy fallback: this method used to read the notebook global.
            try:
                y_test = globals()['y_test']
            except KeyError:
                raise NameError(
                    "y_test was not passed and no module-level 'y_test' is defined"
                ) from None

        model3 = LogisticRegression(solver='liblinear')
        model3.fit(meta_X, data_y)
        yhat = self.stack_prediction(model1, model2, model3, X_test)
        acc = accuracy_score(y_test, yhat)
        return model3, yhat, acc

    def train(self, X, y):
        '''
            Collect out-of-fold predictions of both base models.

            Returns, in this order:
                model1, model2 - the models fitted on the LAST fold only
                data_x, data_y - held-out rows and labels, in fold order
                knn_yhat       - out-of-fold k-NN (model2) probabilities
                dtree_yhat     - out-of-fold Decision Tree (model1) probabilities

            NOTE: the k-NN predictions come BEFORE the Decision Tree ones
            in the returned tuple, which is the opposite of the
            (model1, model2) order used by `stack_prediction`.
        '''
        data_x, data_y = list(), list()
        dtree_yhat, knn_yhat = list(), list()

        for ix_train, ix_test in self.kfold.split(X):
            # Keep the held-out rows in the same order as their predictions.
            data_x.extend(X[ix_test])
            data_y.extend(y[ix_test])
            model1, model2, yhat1, yhat2 = self.fit(X[ix_train],
                                                    X[ix_test],
                                                    y[ix_train])
            dtree_yhat.extend(yhat1)
            knn_yhat.extend(yhat2)

        return model1, model2, \
               data_x, data_y, \
               knn_yhat, dtree_yhat

    def fit(self, X_train, X_test, y_train):
        '''
            Fit fresh base models on (X_train, y_train).

            Returns (model1, model2, yhat1, yhat2) where yhat1 / yhat2 are
            the first `predict_proba` column of the Decision Tree / k-NN
            on X_test.
        '''
        model1 = DecisionTreeClassifier()
        model1.fit(X_train, y_train)
        yhat1 = model1.predict_proba(X_test)[:, 0]

        model2 = KNeighborsClassifier()
        model2.fit(X_train, y_train)
        yhat2 = model2.predict_proba(X_test)[:, 0]

        return model1, model2, yhat1, yhat2

In [None]:
# 3-fold out-of-fold helper.
oof = OutOfFold(3)

# Train
# model1 - Decision Tree
# model2 - k-NN
# train() returns the k-NN predictions before the Decision Tree ones, and
# model1/model2 are the models fitted during the last fold only.
model1, model2, data_x, data_y, knn_yhat, dtree_yhat = oof.train(X_train, y_train)

# get accuracy for both models
# (these are the last-fold models, not models fitted on all of X_train)
oof.get_accuracy(model1, model2, X_test, y_test)

In [None]:
# Stack the out-of-fold predictions onto the features.
# The column order must match what OutOfFold.stack_prediction builds at
# prediction time: [features, Decision Tree proba, k-NN proba]. Passing the
# two vectors the other way round would train the meta-model on swapped columns.
meta_X = oof.create_meta_dataset(data_x, dtree_yhat, knn_yhat)

# Train
# Refit both base models on the whole training split, then fit the
# LogisticRegression meta-model on the out-of-fold meta-features.
model1, model2, _, _ = oof.fit(X_train, X_test, y_train)
meta_model, yhat, acc = oof.train_metadata(model1, model2, meta_X, data_y, X_test)
print("Metadata Accuracy:{:1.3f}".format(acc))