# Notes #
1. Data preparation
    + Removed 30% instances due to missing values.  
    Is it significant?


**Rough notes**
1. 

---

**Data preparation**

In [1]:
# Data preparation #

# Load CSV using Pandas
import pandas as pd
import numpy as np

names = ['classes', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment',
        'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
        'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring',
        'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color',
        'population', 'habitat']
data = pd.read_csv("mushroom.data.csv", delimiter=",", header=None, names=names, index_col=False)

# Check for columns that contain missing values #

print("Fields with missing values")
col_names = data.columns
num_data = data.shape[0]
for c in col_names:
    num_non = data[c].isin(["?"]).sum()
    if num_non > 0:
        print (c)
        print (num_non)
        print ("{0:.2f}%".format(float(num_non) / num_data * 100))
        print ("\n")

data = data[data["stalk-root"] != "?"]

# Convert categorical fields #

for col in names:
    b, c = np.unique(data[col], return_inverse=True)
    data[col] = c

print("Class distribution")    
print("Class 1 ", data['classes'].value_counts()[0] / data.shape[0] * 100)
print("Class 2 ", data['classes'].value_counts()[1] / data.shape[0] * 100)

# Split the dataset into test and train datasets #
feature_list = names[1:23]
X = data.loc[:, feature_list]
Y = data[['classes']]

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.60, random_state=0)

data.head()


Fields with missing values
stalk-root
2480
30.53%


Class distribution
Class 1  61.80014174344437
Class 2  38.199858256555636


Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,2,...,2,5,5,0,0,1,3,1,3,5
1,0,5,2,7,1,0,1,0,0,2,...,2,5,5,0,0,1,3,2,2,1
2,0,0,2,6,1,3,1,0,0,3,...,2,5,5,0,0,1,3,2,2,3
3,1,5,3,6,1,6,1,0,1,3,...,2,5,5,0,0,1,3,1,3,5
4,0,5,2,3,0,5,1,1,0,2,...,2,5,5,0,0,1,0,2,0,1


---

**Keras Model**

In [2]:
# Keras #

### Logistic regression using DNN ###
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.utils import np_utils

np.random.seed(7)

# Defining model #

model = Sequential()
model.add(Dense(1,input_dim=22,activation='sigmoid'))

# Compile the model #

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model #

model.fit(x_train, y_train, epochs=2500, batch_size=100, verbose=0)

# Evaluate the model #

scores = model.evaluate(x_test, y_test)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

Using TensorFlow backend.



acc: 96.72%


---

**Scikit model**

In [3]:
# Scikit learn #

# Import and create an instance of your model(Logistic regression)

from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression()

# Train your model using the training dataset

logisticRegr.fit(x_train,y_train)

# Predict the output 

predictions = logisticRegr.predict(x_test)
print(predictions)

score = logisticRegr.score(x_test,y_test)
print(score)


[1 0 0 ... 0 0 0]
0.9521700620017715


  y = column_or_1d(y, warn=True)


---

**Kfold validation**

In [4]:
# Validation #
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import StratifiedKFold
import numpy

seed = 7
numpy.random.seed(seed)

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
cvscores = []

for train, test in kfold.split(X, Y):
    # create model
    model = Sequential()
    model.add(Dense(1,input_dim=22,activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Fit the model
    model.fit(X.iloc[train], Y.iloc[train], epochs=500, batch_size=100, verbose=0)
    # evaluate the model
    scores = model.evaluate(X.iloc[test], Y.iloc[test], verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores)))


acc: 93.81%
acc: 96.11%
acc: 95.75%
acc: 96.99%
acc: 96.81%
acc: 94.51%
acc: 96.81%
acc: 96.45%
acc: 96.45%
acc: 96.80%
96.05% (+/- 1.02%)


---

**ROC Curve and Confusion matrix**

In [5]:
# Prediction #
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

y_pred = model.predict(x_test)
y_pred = (y_pred>0.5)
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred)
roc_auc = metrics.auc(fpr, tpr)


plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


# Creating the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

<Figure size 640x480 with 1 Axes>

[[2096   36]
 [  79 1176]]


---

**Classification Report**

In [6]:
from sklearn.metrics import classification_report

target_names = ['Class 1', 'Class 2']
print(classification_report(y_test, y_pred, target_names=target_names))

             precision    recall  f1-score   support

    Class 1       0.96      0.98      0.97      2132
    Class 2       0.97      0.94      0.95      1255

avg / total       0.97      0.97      0.97      3387



---

# Pending #
1. 