In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found beneath it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**I will use the data to find the best model for predicting whether a mushroom is poisonous or edible.**

In [None]:
# Read the mushroom CSV from the Kaggle input directory into a DataFrame,
# then show its first five rows as the cell output.
shrooms = pd.read_csv(filepath_or_buffer='../input/mushroom-classification/mushrooms.csv')
shrooms.head(n=5)

**Look into the data**

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes).
shrooms.info()

* All columns have the `object` dtype; none are numeric.
* The target in this data is 'class', which has two values: 'e' for edible and 'p' for poisonous.

We can inspect each categorical column through its value frequencies, shown as bar charts.

In [None]:
from matplotlib import pyplot as plt  # pyplot is only needed here to flush each figure

# For every object-dtype column: print a header line with the column name,
# then draw a bar chart of how often each distinct value occurs.
for column in shrooms.select_dtypes(include=object).columns:
    print(f'------{column}------')
    shrooms[column].value_counts().plot(kind='bar')
    plt.show()  # render now so each column gets its own figure

We will convert 'e' and 'p' to 1 and 0, respectively, so that the target works better with sklearn.

In [None]:
# Encode the target as floats: edible ('e') -> 1.0, everything else -> 0.0.
#
# This cell overwrites shrooms['class'] in place. Testing only for 'e' would
# therefore turn every label into 0 if the cell were run a second time, so an
# already-encoded 1 is also treated as edible. That keeps the cell idempotent.
is_edible = (shrooms['class'] == 'e') | (shrooms['class'] == 1)
Y = np.where(is_edible, 1.0, 0.0)             # float vector, one entry per row
shrooms['class'] = Y                          # assign the new labels back to the data

In [None]:
# Preview the first rows again to check the 'class' column after the label conversion.
shrooms.head()

<h3> Train/Test Splitting </h3>
* Use stratified split to preserve the distribution of the target

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified shuffle split: 25% of the rows go to the test set, and the
# fixed seed makes the partition reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)

# Stratify on the target column 'class' so both subsets keep its distribution.
# To reuse this code, change only the data and the stratification column.
# With n_splits=1 the generator yields exactly one (train, test) pair, so take
# it directly instead of looping.
train_index, test_index = next(split.split(shrooms, shrooms['class']))

# split() yields positional row numbers, so select with .iloc; .loc would only
# pick the same rows while the frame still has its default 0..n-1 index.
strat_train_set = shrooms.iloc[train_index]
strat_test_set = shrooms.iloc[test_index]

# Separate the features from the target by column name rather than position.
trainX = strat_train_set.drop(columns='class')
trainY = strat_train_set['class']
trainX.shape, trainY.shape

Check the distributions

In [None]:
# Bar chart of the label counts in the training split (the stratification check).
ax1 = strat_train_set['class'].value_counts().plot(kind='bar')

In [None]:
# Bar chart of the label counts in the test split, for comparison with the training split.
ax1 = strat_test_set['class'].value_counts().plot(kind='bar')

One-hot encode the categorical columns

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Build an encoder that returns a dense ndarray. The keyword controlling this
# was renamed from `sparse` to `sparse_output` in scikit-learn 1.2 and the old
# name was removed in 1.4, so try the new spelling first and fall back to the
# old one on releases that do not know it yet.
try:
    onehot = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot = OneHotEncoder(sparse=False)

# Fit the encoder on the object-dtype training features and encode them.
trainX_Encoded = onehot.fit_transform(trainX.select_dtypes(object))

print(trainX_Encoded)

<h3>Modeling</h3>

<h4>Logistic Regression</h4>

Accuracy:

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Default logistic-regression classifier, scored by accuracy with 3-fold
# cross-validation on the one-hot encoded training features.
logistic = LogisticRegression()
accuracy_3cv = cross_val_score(
    estimator=logistic,
    X=trainX_Encoded,
    y=trainY,
    scoring="accuracy",
    cv=3,
)

# Per-fold scores on the first line, their mean on the second.
print(accuracy_3cv, accuracy_3cv.mean(), sep="\n")

Confusion Matrix:

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions for every training row (3-fold CV), compared with
# the true labels in a confusion matrix.
y_train_pred = cross_val_predict(estimator=logistic, X=trainX_Encoded, y=trainY, cv=3)
conf_matrx = confusion_matrix(y_true=trainY, y_pred=y_train_pred)

print(conf_matrx)

F1-Score:

In [None]:
from sklearn.metrics import f1_score

# F1 score of the cross-validated logistic-regression predictions.
f1_score(y_true=trainY, y_pred=y_train_pred)

Let's also try <b>Random Forest</b>

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are randomized, so fix the seed (the same value used for the
# train/test split) to make the scores below identical on every run.
rf = RandomForestClassifier(random_state=42)

# 3-fold cross-validated accuracy on the one-hot encoded training features.
# NOTE: this rebinds accuracy_3cv, replacing the logistic-regression scores.
accuracy_3cv = cross_val_score(rf, trainX_Encoded, trainY, cv=3, scoring="accuracy")
print(accuracy_3cv)
print(accuracy_3cv.mean())

Confusion Matrix:

In [None]:
# Out-of-fold random-forest predictions for every training row (3-fold CV),
# compared with the true labels in a confusion matrix.
y_train_pred_rf = cross_val_predict(estimator=rf, X=trainX_Encoded, y=trainY, cv=3)
conf_matrx = confusion_matrix(y_true=trainY, y_pred=y_train_pred_rf)

print(conf_matrx)

F1-Score:

In [None]:
# F1 score of the cross-validated random-forest predictions.
f1_score(y_true=trainY, y_pred=y_train_pred_rf)

Random Forest predicts the target with 100% accuracy (3-fold cross-validation on the training set). The problem is solved; no further testing is needed.