In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the mushroom dataset directly from its hosted CSV file.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/dsrscientist/dataset1/master/mushrooms.csv"
)

In [3]:
df.head(10)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
6,e,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,n,m
7,e,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
8,p,x,y,w,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,v,g
9,e,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m


In [5]:
df.shape

(8124, 23)

In [6]:
# Name of the column we want to predict.
target = 'class'

# Separate the label column from the predictor columns.
labels = df[target]
features = df.drop(columns=target)

In [7]:
# Every feature is categorical, so all predictor columns are one-hot encoded.
# Both `features` and `labels` are rebuilt from the raw `df` instead of from
# their own previous values, so this cell can be re-run safely (the previous
# version failed on a second run because `labels` no longer contained 'p').
categorical = df.columns.drop(target)

# dtype=int pins the encoded columns to 0/1 integers regardless of the pandas
# version's default dtype for get_dummies.
features = pd.get_dummies(df[categorical], dtype=int)

# Encode the target class: 1 is poisonous ('p'), 0 is safe to eat.
labels = (df[target] == 'p').astype(int).rename('p')

In [8]:
# Hold out 20% of the records for testing; the remaining 80% form the training set.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=0,
)

In [9]:
X_train.head()

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
7434,0,0,0,1,0,0,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0
7725,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
783,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
1928,0,0,1,0,0,0,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0
7466,0,0,0,1,0,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0


In [10]:
X_test.head()

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
380,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
3641,0,0,1,0,0,0,0,0,0,1,...,0,0,1,1,0,0,0,0,0,0
273,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1029,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
684,0,0,0,0,0,1,1,0,0,0,...,0,1,0,1,0,0,0,0,0,0


In [11]:
y_train.shape

(6499,)

In [12]:
y_test.shape

(1625,)

In [13]:
X_train.shape

(6499, 117)

In [14]:
X_test.shape

(1625, 117)

In [15]:
X_train.head()

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
7434,0,0,0,1,0,0,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0
7725,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
783,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
1928,0,0,1,0,0,0,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0
7466,0,0,0,1,0,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0


In [16]:
from __future__ import absolute_import
from __future__ import print_function
import importlib
import itertools

In [18]:
# Train-and-predict pipeline: fit a learner on part of the training data and score it


from sklearn.metrics import fbeta_score, f1_score,accuracy_score

def train_predict(learner, sample_size, X_train, y_train, X_test, y_test, train_eval_size=300):
    '''
    Fit `learner` on the first `sample_size` training rows, then time and
    score its predictions on the test set and on a leading slice of the
    training set.

    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: labels training set
       - X_test: features testing set
       - y_test: labels testing set
       - train_eval_size: number of leading training rows used to compute the
         training-set scores (defaults to 300, the value that used to be
         hard-coded)

    returns:
       dict with the keys
       - 'train_time': seconds spent in fit
       - 'pred_time': seconds spent predicting on the test set and on the
         training slice together
       - 'acc_train' / 'acc_test': accuracy on the training slice / test set
       - 'f_train' / 'f_test': F-beta score (beta=0.5) on the training slice /
         test set
    '''
    # Imported here so the function does not depend on a later cell having
    # put a timer into the notebook namespace; perf_counter is the clock
    # intended for measuring short durations.
    from time import perf_counter

    results = {}

    # Time the fit on the requested number of leading training rows.
    start = perf_counter()
    learner = learner.fit(X_train[:sample_size], y_train[:sample_size])
    results['train_time'] = perf_counter() - start

    # Rows used to score the model on data it may have seen during training.
    X_train_eval = X_train[:train_eval_size]
    y_train_eval = y_train[:train_eval_size]

    # Time both prediction passes together.
    start = perf_counter()
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train_eval)
    results['pred_time'] = perf_counter() - start

    results['acc_train'] = accuracy_score(y_train_eval, predictions_train)
    results['acc_test'] = accuracy_score(y_test, predictions_test)

    results['f_train'] = fbeta_score(y_train_eval, predictions_train, beta=0.5)
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5)

    return results

In [19]:
#Model Training
from time import time
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Candidate learners. The random forest is seeded so that repeated runs of the
# notebook give the same scores (the train/test split is already seeded with 0).
clf_A = GaussianNB()
clf_B = RandomForestClassifier(random_state=0)
clf_C = KNeighborsClassifier()

# Training-subset sizes: 1%, 10% and 100% of the training set.
training_length = len(X_train)
samples_1 = int(training_length * 0.01)
samples_10 = int(training_length * 0.1)
samples_100 = training_length

# results[<classifier class name>][i] holds the metrics dict returned by
# train_predict for the i-th subset size (0 -> 1%, 1 -> 10%, 2 -> 100%).
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = train_predict(
            clf, samples, X_train, y_train, X_test, y_test
        )

In [20]:
results

{'GaussianNB': {0: {'train_time': 0.002992391586303711,
   'pred_time': 0.008975982666015625,
   'acc_train': 0.8433333333333334,
   'acc_test': 0.88,
   'f_train': 0.8062645011600929,
   'f_test': 0.8390830180280435},
  1: {'train_time': 0.0029921531677246094,
   'pred_time': 0.007978200912475586,
   'acc_train': 0.93,
   'acc_test': 0.9396923076923077,
   'f_train': 0.8968058968058968,
   'f_test': 0.907916373032652},
  2: {'train_time': 0.013962984085083008,
   'pred_time': 0.007978439331054688,
   'acc_train': 0.9566666666666667,
   'acc_test': 0.9667692307692307,
   'f_train': 0.9335038363171355,
   'f_test': 0.9470717961283998}},
 'RandomForestClassifier': {0: {'train_time': 0.12066292762756348,
   'pred_time': 0.0419306755065918,
   'acc_train': 0.96,
   'acc_test': 0.9446153846153846,
   'f_train': 0.9824046920821113,
   'f_test': 0.9743223965763196},
  1: {'train_time': 0.12462306022644043,
   'pred_time': 0.03091716766357422,
   'acc_train': 1.0,
   'acc_test': 1.0,
   'f_tra

In [21]:
from sklearn.ensemble import RandomForestClassifier

# Final model: a random forest fitted on the full training set. The seed makes
# the fitted model (and anything derived from it, such as the feature
# importances) reproducible across runs.
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier()

In [22]:
# Bonus: rank the encoded features by how much the fitted forest relies on
# them when deciding whether a mushroom is edible or not.

z = list(zip(clf.feature_importances_, X_train.columns))
z.sort(reverse=True)  # largest importance first

z[:6]

[(0.14620959966070385, 'odor_n'),
 (0.05815137354346382, 'stalk-surface-above-ring_k'),
 (0.05769307253198029, 'odor_f'),
 (0.05550351358721569, 'gill-color_b'),
 (0.05292494931000404, 'spore-print-color_h'),
 (0.052013252146364604, 'gill-size_n')]

In [23]:
# Persist the fitted classifier to disk.

from joblib import dump, load
dump(value=clf, filename='mushrooms.joblib')

['mushrooms.joblib']

In [24]:
# Persist the per-classifier benchmark results to disk.

from joblib import dump, load
dump(value=results, filename='mushrooms_result.joblib')

['mushrooms_result.joblib']

In [25]:
# Persist the sorted (importance, feature name) pairs to disk.

from joblib import dump, load
dump(value=z, filename='mushrooms_f_importances.joblib')

['mushrooms_f_importances.joblib']