#Training an MLP to identify whether mushrooms are poisonous or not

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Shell out to `ls` on the input directory and print its decoded output.
_listing = check_output(["ls", "../input"])
print(_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

##Read input data files


In [None]:
# Load the raw mushroom table from the Kaggle input directory.
mushrooms = pd.read_csv("../input/mushrooms.csv")
# Preview the first rows (displayed as the cell's output in a notebook).
mushrooms.head()

In the dataset, we have a class column which has "e" for edible or "p" for poisonous, and in the other columns a letter representing the category it belongs to:

    cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s

    cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s

    cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y

    bruises: bruises=t,no=f

    odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s

    gill-attachment: attached=a,descending=d,free=f,notched=n

    gill-spacing: close=c,crowded=w,distant=d

    gill-size: broad=b,narrow=n

    gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y

    stalk-shape: enlarging=e,tapering=t

    stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?

    stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s

    stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s

    stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

    stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

    veil-type: partial=p,universal=u

    veil-color: brown=n,orange=o,white=w,yellow=y

    ring-number: none=n,one=o,two=t

    ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z

    spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y

    population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y

    habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d

###Because every column is categorical, we will one-hot encode the strings into 0/1 indicator columns (named like "class=e" and "class=p") with scikit-learn's DictVectorizer

In [None]:
from sklearn.feature_extraction import DictVectorizer

def encode_onehot(df):
    """One-hot encode the columns of *df* with scikit-learn's DictVectorizer.

    Every row is turned into a ``{column: value}`` dict and vectorized, so a
    string value ``v`` found in column ``c`` becomes an indicator column
    named ``"c=v"``.

    Args:
        df: DataFrame to encode.

    Returns:
        A dense DataFrame with one column per feature reported by the
        vectorizer, carrying the same index as *df*.
    """
    # sparse=False makes fit_transform return a dense ndarray directly.
    vec = DictVectorizer(sparse=False)
    encoded = vec.fit_transform(df.to_dict(orient='records'))

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(vec, "get_feature_names_out"):
        feature_names = vec.get_feature_names_out()
    else:
        feature_names = vec.get_feature_names()

    return pd.DataFrame(encoded, columns=list(feature_names), index=df.index)

# Replace the raw table with its one-hot encoded version.
mushrooms = encode_onehot(mushrooms)
# Preview the encoded columns.
mushrooms.head()

##Having prepared our dataset, we can now split it into training set and testing set

In [None]:
from sklearn.model_selection import train_test_split # helper method to split dataset
# Hold out 20% of the rows for testing. No random_state is given, so the
# split (and therefore the reported accuracy) differs from run to run.
train, test = train_test_split(mushrooms, test_size=0.2)

##And split our training and testing sets into features (x) and labels (y)

In [None]:
# Labels: the two one-hot class columns (edible, poisonous), in that order.
train_y = train[["class=e", "class=p"]]
# Features: every remaining column. The axis is passed by keyword because
# the positional form drop(labels, 1) was removed in pandas 2.0.
train_x = train.drop(["class=e", "class=p"], axis=1)

In [None]:
# Labels: the two one-hot class columns (edible, poisonous), in that order.
test_y = test[["class=e", "class=p"]]
# Features: every remaining column. The axis is passed by keyword because
# the positional form drop(labels, 1) was removed in pandas 2.0.
test_x = test.drop(["class=e", "class=p"], axis=1)

#Now we can go ahead and implement a TensorFlow neural network with an input layer fully connected to an output layer (note that the code below has no hidden layer)

In [None]:
import tensorflow as tf
sess = tf.InteractiveSession()

# Take the layer sizes from the data instead of hard-coding them, so the graph
# still matches if the one-hot encoding yields a different number of columns.
n_features = train_x.shape[1]
n_classes = train_y.shape[1]

# Placeholders are filled through feed_dict; None leaves the row count open.
x = tf.placeholder(tf.float32, shape=[None, n_features])  # features
y_ = tf.placeholder(tf.float32, shape=[None, n_classes])  # one-hot labels

W = tf.Variable(tf.zeros([n_features, n_classes]))  # weights
b = tf.Variable(tf.zeros([n_classes]))  # biases

# Raw, un-normalised class scores. The loss below applies softmax itself, so
# it must be given these logits and not already-squashed activations.
logits = tf.matmul(x, W) + b
y = tf.nn.softmax(logits)  # predictions as class probabilities

# Cross entropy as the loss function. The arguments are passed by keyword:
# TensorFlow 1.x rejects positional calls to this op.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits))
# Minimise the loss with plain gradient descent (learning rate 0.5).
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

# Fraction of rows whose highest-scoring output matches the one-hot label.
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Initialise variables once the whole graph has been built.
sess.run(tf.global_variables_initializer())

# Full-batch training: every step sees the entire training set.
train_feed = {x: train_x, y_: train_y}
for i in range(1000):
    train_step.run(feed_dict=train_feed)
    print("Training iteration " + str(i) + ": " + str(accuracy.eval(feed_dict=train_feed)))

print("Accuracy in test set: ", accuracy.eval(feed_dict={x: test_x, y_: test_y}))

#And that's it! We can now test whether an individual or a group of mushrooms is poisonous with the helper method below

In [None]:
def are_poisonous(mushrooms):
    """Predict, row by row, whether the given mushrooms are poisonous.

    Args:
        mushrooms: feature rows to feed to the ``x`` placeholder, i.e. the
            encoded table without the two class columns.

    Returns:
        A list with one truth value per row: true when the model's second
        output (the "class=p" column of the labels) scores higher than its
        first (the "class=e" column).
    """
    scores = sess.run(y, feed_dict={x: mushrooms})
    return [poisonous > edible for edible, poisonous in scores]

In [None]:
# Draw 5 random rows to try the model on. NOTE: this rebinds `mushrooms`,
# so the full encoded table is no longer reachable under that name.
mushrooms = mushrooms.sample(5)

In [None]:
# Show the true one-hot labels of the sampled rows for comparison.
mushrooms[["class=e", "class=p"]]

In [None]:
# Predict on the sampled rows with the label columns removed. The axis is
# passed by keyword because drop(labels, 1) was removed in pandas 2.0.
are_poisonous(mushrooms.drop(["class=e", "class=p"], axis=1))