# Stacking ensemble

Dataset: https://www.kaggle.com/uciml/mushroom-classification

Note: Stacking doesn't actually improve prediction accuracy in this case, but I keep this notebook as a reference.

In [1]:
import pandas as pd
import numpy as np
# using sklearn for the classifiers for efficiency
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

In [2]:
# Load the mushroom data from a CSV file expected in the working directory,
# then preview the first rows as the cell output.
df = pd.read_csv('mushrooms.csv')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
# Replace every categorical label with an integer code, column by column.
# Codes are assigned in order of first appearance within each column, so the
# resulting order is arbitrary; treating it as ordinal isn't strictly correct,
# but it is good enough for the purpose of this demonstration.
for column in df.columns:
    labels = df[column].unique()
    label_to_code = dict(zip(labels, range(len(labels))))
    df[column] = df[column].map(label_to_code)
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,1,1
2,1,1,0,2,0,2,0,0,1,1,...,0,0,0,0,0,0,0,1,1,2
3,0,0,1,2,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,3,1,3,0,1,1,0,...,0,0,0,0,0,0,1,1,2,1


In [4]:
# Classification is too easy with every feature available, so keep only every
# fifth column starting at position 1 (positions 1, 6, 11, ...). Position 0 is
# skipped; per the df.head() preview that is the 'class' target column.
X = df.iloc[:, 1::5].to_numpy()
X.shape

(8124, 5)

In [5]:
# Target vector: the integer-encoded 'class' column as a NumPy array.
y = df['class'].to_numpy()
y.shape

(8124,)

In [6]:
# Classification is too easy, so most of the data is held out for testing.
# Stage 1: 10% of the rows train the base models; the other 90% is set aside.
X_train, X_left, y_train, y_left = train_test_split(
    X,
    y,
    test_size=0.9,
    random_state=42,
)
# Stage 2: of the set-aside rows, 11% (about 10% of all rows) train the
# aggregator and the remaining 89% (about 80% of all rows) form the test set.
X_train2, X_test, y_train2, y_test = train_test_split(
    X_left,
    y_left,
    test_size=0.89,
    random_state=38,
)

In [7]:
# Base learners for the ensemble, in the order their scores and prediction
# columns appear later in the notebook. All use default hyperparameters.
# The decision tree gets a fixed random_state: scikit-learn permutes the
# candidate features at every split, so ties between equally good splits are
# otherwise broken differently from run to run and the scores would not be
# reproducible even though the train/test splits are seeded.
models = [
    LogisticRegression(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=42),
    GaussianNB(),
]

In [8]:
# Fit every base model on the first training split and print its accuracy
# (rounded to 3 decimals) on that split and on the held-out test set.
# Output lines come in pairs, in the same order as the `models` list.
for clf in models:
    clf.fit(X_train, y_train)
    train_acc = round(clf.score(X_train, y_train), 3)
    test_acc = round(clf.score(X_test, y_test), 3)
    print('Train set:\t', train_acc)
    print('Test set:\t', test_acc)

Train set:	 0.648
Test set:	 0.65
Train set:	 0.802
Test set:	 0.774
Train set:	 0.805
Test set:	 0.782
Train set:	 0.592
Test set:	 0.622




In [9]:
# Best single-model test accuracy is 0.782 (the decision tree, third in the list)

In [10]:
# Aggregator (meta-model) that will be trained on the base models' predictions.
agg = LogisticRegression()

In [11]:
# Build the aggregator's inputs: one integer column per base model holding that
# model's predicted class labels (hard predictions, not probabilities).
# a_train is predicted on X_train2, a split separate from the X_train rows the
# base models were fitted on; a_test is predicted on the test set.
a_train = np.zeros((len(X_train2), len(models)), dtype=int)
a_test = np.zeros((len(X_test), len(models)), dtype=int)
# enumerate pairs each model with its column index, replacing a hand-maintained counter
for col, model in enumerate(models):
    a_train[:, col] = model.predict(X_train2)
    a_test[:, col] = model.predict(X_test)

In [12]:
# 'Blind' aggregator: it sees only the base models' predictions, never the
# original features. fit() returns the fitted estimator, so the test accuracy
# can be read off the same expression and shown as the cell output.
agg.fit(a_train, y_train2).score(a_test, y_test)



0.7822679778733866

In [13]:
# Aggregator with access to the data: its inputs are the original feature
# columns followed by the base models' prediction columns, joined side by side.
agg2 = LogisticRegression()
agg2.fit(np.concatenate((X_train2, a_train), axis=1), y_train2)
agg2.score(np.concatenate((X_test, a_test), axis=1), y_test)



0.7567609096496619