In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Safe to eat or deadly poison**
#### A mushroom or toadstool is the fleshy, spore-bearing fruiting body of a fungus, typically produced above ground, on soil, or on its food source. There are many species of mushrooms, and some of them can be deadly. Knowing whether a mushroom is poisonous is very important, as it can be a matter of life or death. In this notebook I will use data science and machine learning to build a model capable of telling whether a mushroom is edible.
![ ]( https://images.wallpaperscraft.com/image/mushroom_close_up_autumn_foliage_112652_2560x1024.jpg )

## **Goal**
### Generating a model using machine learning to evaluate the safety of a species of mushroom

# **Potential Steps to reach the goal**
#### 1. Finding appropriate data for training.
#### 2. Data Analysis.
#### 3. Feature Engineering.
#### 4. Data Preprocessing.
#### 5. Select the best and most useful features.
#### 6. Training an ML model.
#### 7. Looking at feature importance and remove the least useful features.
#### 8. Hyperparameter Tuning.
#### 9. Re-training the model using best features and best hyperparameters
#### 10. Model Validation.
#### 11. Evaluating the model.
#### 12. Saving the model for later use or deployment.

*Note : It is important to roughly know the steps you are going to follow, these steps may change as we dig deeper into the dataset*

# **Setup and Imports**

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("dark")

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import pickle

import warnings
warnings.filterwarnings("ignore")

# **Reading data and data description**

In [None]:
# Read the mushroom table from the Kaggle dataset mount, then print a summary
# of its columns (dtype and non-null count per column).
csv_path = '../input/mushroom-classification/mushrooms.csv'
data = pd.read_csv(csv_path)
data.info()

In [None]:
# Preview the first rows of the raw table as loaded from the CSV.
data.head()

## Mapping the feature values to their real values
*Note: I don't recommend hard-coding these values; I used a Python script to generate the mapping dictionary.*

In [None]:
# For every column, show the set of distinct codes that occur in it.
for column in data.columns:
    unique_values = data[column].unique()
    print(f"{column} : {unique_values}")

In [None]:
# Work on a copy so the raw, letter-coded frame in `data` stays untouched.
mapped_data = data.copy()

# Per-column lookup tables: single-character code -> human-readable label.
feature_name_mappings = {
    "class":{'e':'edible','p':'poisonous'},
    "cap-shape": {'b':"bell","c":"conical","x":"convex","f":"flat", "k":"knobbed","s":"sunken"},
    'cap-surface': {'f':"fibrous","g":"grooves","y":"scaly","s":"smooth"},
    "cap-color": {'n':"brown",'b':'buff','c':'cinnamon','g':'gray','r':"green",'p':'pink','u':'purple','e':'red','w':'white','y':'yellow'},
    "bruises":{'t': 'bruises', 'f': 'no'},
    "odor":{'a': 'almond', 'l': 'anise', 'c': 'creosote', 'y': 'fishy', 'f': 'foul', 'm': 'musty', 'n': 'none', 'p': 'pungent', 's': 'spicy'},
    "gill-attachment":{'a': 'attached', 'd': 'descending', 'f': 'free', 'n': 'notched'},
    "gill-spacing":{'c': 'close', 'w': 'crowded', 'd': 'distant'},
    "gill-size":{'b': 'broad', 'n': 'narrow'},
    # 'r' previously mapped to ' green' (stray leading space); normalised so it
    # matches the 'green' label used by the other colour columns.
    "gill-color":{'k': 'black', 'n': 'brown', 'b': 'buff', 'h': 'chocolate', 'g': 'gray', 'r': 'green', 'o': 'orange', 'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow'},
    "stalk-shape":{'e': 'enlarging', 't': 'tapering'},
    "stalk-root":{'b': 'bulbous', 'c': 'club', 'u': 'cup', 'e': 'equal', 'z': 'rhizomorphs', 'r': 'rooted', '?': 'missing'},
    "stalk-surface-above-ring":{'f': 'fibrous', 'y': 'scaly', 'k': 'silky', 's': 'smooth'},
    "stalk-surface-below-ring":{'f': 'fibrous', 'y': 'scaly', 'k': 'silky', 's': 'smooth'},
    "stalk-color-above-ring":{'n': 'brown', 'b': 'buff', 'c': 'cinnamon', 'g': 'gray', 'o': 'orange', 'p': 'pink', 'e': 'red', 'w': 'white', 'y': 'yellow'},
    "stalk-color-below-ring":{'n': 'brown', 'b': 'buff', 'c': 'cinnamon', 'g': 'gray', 'o': 'orange', 'p': 'pink', 'e': 'red', 'w': 'white', 'y': 'yellow'},
    "veil-type":{'p': 'partial', 'u': 'universal'},
    "veil-color":{'n': 'brown', 'o': 'orange', 'w': 'white', 'y': 'yellow'},
    "ring-number":{'n': 'none', 'o': 'one', 't': 'two'},
    "ring-type":{'c': 'cobwebby', 'e': 'evanescent', 'f': 'flaring', 'l': 'large', 'n': 'none', 'p': 'pendant', 's': 'sheathing', 'z': 'zone'},
    "spore-print-color":{'k': 'black', 'n': 'brown', 'b': 'buff', 'h': 'chocolate', 'r': 'green', 'o': 'orange', 'u': 'purple', 'w': 'white', 'y': 'yellow'},
    "population":{'a': 'abundant', 'c': 'clustered', 'n': 'numerous', 's': 'scattered', 'v': 'several', 'y': 'solitary'},
    "habitat":{'g': 'grasses', 'l': 'leaves', 'm': 'meadows', 'p': 'paths', 'u': 'urban', 'w': 'waste', 'd': 'woods'}
}

# Translate each column in one vectorised pass. The previous cell-by-cell
# chained assignment (`mapped_data[column][index] = ...`) was slow, raised
# SettingWithCopy warnings, and is a silent no-op under pandas copy-on-write.
for column in mapped_data.columns:
    # A column with no lookup table still raises KeyError here, as before.
    mapping_dict = feature_name_mappings[column]

    # Series.map would turn unknown codes into NaN; keep the original
    # fail-fast behaviour by raising KeyError for any code without a label.
    unknown_codes = set(mapped_data[column].unique()) - set(mapping_dict)
    if unknown_codes:
        raise KeyError(f"Unmapped value(s) in column '{column}': {sorted(map(str, unknown_codes))}")

    mapped_data[column] = mapped_data[column].map(mapping_dict)

In [None]:
# Preview the first rows of the frame after the codes were replaced by labels.
mapped_data.head()

### There seem to be no null values in the data, and all 23 columns have values represented by letters of the English alphabet; we will see what these letters mean later. As far as the data description goes, this is enough for us to start exploring the data.

# **Exploratory Data Analysis**

### Value Counts

In [None]:
# Class balance: number of rows per value of the 'class' column.
plt.figure(figsize=(8, 6))
sns.countplot(data=mapped_data, x='class', palette='hls')

In [None]:
# Show the column index of the mapped frame (names used by the plots below).
mapped_data.columns

In [None]:
# Grid of count plots (5 rows x 2 columns), one panel per listed feature.
fig = plt.figure(figsize=(14, 22))
columns = 2
rows = 5
column_names = [
    'cap-shape', 'cap-surface', 'cap-color', 'veil-color', 'odor',
    'ring-type', 'spore-print-color', 'stalk-color-above-ring', 'gill-color',
    'stalk-color-below-ring',
]
# Subplot positions are 1-based, hence start=1; the list has exactly
# rows * columns entries, so every grid cell is filled.
for i, column_name in enumerate(column_names, start=1):
    ax = fig.add_subplot(rows, columns, i)
    sns.countplot(x=column_name, data=mapped_data, palette='hls', ax=ax)
fig.tight_layout()
plt.show()

In [None]:
# Grid of pie charts (6 rows x 2 columns) showing the share of each category
# for the listed features.
fig = plt.figure(figsize=(20, 20))
columns = 2
rows = 6
column_names = ['stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'gill-size',
       'stalk-shape', 'veil-type', 'bruises', 'ring-number',
       'gill-attachment', 'gill-spacing', 'population', 'habitat']
# Subplot positions are 1-based, hence start=1.
for i, column_name in enumerate(column_names, start=1):
    # Compute the frequency table once per feature. The previous code called
    # value_counts() again for every category and read it with an integer key
    # (`value_counts()[idx]`) on a string-labelled Series, which relies on the
    # deprecated positional fallback of Series.__getitem__.
    category_counts = mapped_data[column_name].value_counts()
    names = category_counts.index.tolist()
    counts = category_counts.tolist()

    ax = fig.add_subplot(rows, columns, i)
    colors = sns.color_palette('pastel')[0:len(names)]
    ax.set_title(column_name)
    ax.pie(x=counts, labels=names, colors=colors, autopct='%.0f%%')
fig.tight_layout()
plt.show()

# **Feature Engineering and Data Preprocessing**

In [None]:
# Keep the fitted encoder of every column in `encoders`: new data must be
# encoded with the same encoders before it can be passed to the model.
encoded_data = mapped_data.copy()
encoders = {}
for feature_name in encoded_data.columns:
    # LabelEncoder.fit returns the fitted encoder itself.
    encoder = LabelEncoder().fit(encoded_data[feature_name])
    encoded_data[feature_name] = encoder.transform(encoded_data[feature_name])
    encoders[feature_name] = encoder

In [None]:
# Sanity check: list the distinct integer codes each column now contains.
for column in encoded_data.columns:
    unique_codes = encoded_data[column].unique()
    print(f"{column} : {unique_codes}")

In [None]:
# Target vector: the encoded 'class' column.
Y = encoded_data['class']
# Feature matrix: every remaining column.
X = encoded_data.drop(columns='class')

In [None]:
# Display the encoded feature matrix (every column except 'class').
X

#### Splitting the data is very important, you don't want to test the model on the data which it has already seen.

In [None]:
# Hold out 30% of the rows for testing. The fixed seed makes the split (and
# every score derived from it) reproducible across kernel restarts, and
# stratifying on Y keeps the class ratio the same in both partitions.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, Y, test_size=0.3, random_state=42, stratify=Y
)

# **Training different ML Models**

In [None]:
# Candidate classifiers, both with library-default hyperparameters.
svm = SVC()
# Seed the forest: its bootstrap sampling and feature sub-sampling are random,
# so without a seed the reported scores change on every run.
random_forest = RandomForestClassifier(random_state=42)

In [None]:
def trainModel(model):
    """Fit `model` on the training split and return its score on the test split.

    Reads the notebook-level `xtrain`, `ytrain`, `xtest` and `ytest` frames.
    The estimator is fitted in place, so the object passed in is trained
    after this call returns.
    """
    model.fit(xtrain, ytrain)
    test_score = model.score(xtest, ytest)
    return test_score


# Display name -> estimator instance to evaluate.
models = {
    "SVM": svm,
    "Random Forest": random_forest,
}

print('Accuracy')
for model, estimator in models.items():
    # Convert the score to a percentage with two decimals.
    accuracy_pct = round(trainModel(estimator) * 100, 2)
    print(f"{model} : {accuracy_pct}%")

### We are getting an accuracy of 100% with these models, which shows how good the data is. We don't need to tune any hyperparameters and retrain the model.
### Let's validate our model even more

In [None]:
# 5-fold cross-validation of the SVM on the full data set; one score per fold.
scores = cross_val_score(svm, X, Y, cv=5)

plt.plot(scores)
plt.title('Validation Scores for SVM')
# Derive the tick positions from the scores so they always match the number
# of folds, and label the axis so the figure can be read on its own.
plt.xticks(range(len(scores)))
plt.xlabel('Fold')
plt.ylabel('Score')
plt.show()

In [None]:
# 5-fold cross-validation of the random forest on the full data set; one
# score per fold.
scores = cross_val_score(random_forest, X, Y, cv=5)

plt.plot(scores)
plt.title('Validation Scores for Random Forest')
# Derive the tick positions from the scores so they always match the number
# of folds, and label the axis so the figure can be read on its own.
plt.xticks(range(len(scores)))
plt.xlabel('Fold')
plt.ylabel('Score')
plt.show()

### We have poor accuracy for the first and last folds. Let's see if we can solve that using hyperparameter tuning.

In [None]:
# Hyperparameter grid for the SVM.
# The original literal defined 'gamma' twice; in a dict literal the later key
# silently replaces the earlier one, so the numeric gamma list was never
# searched. Only the effective entry is kept, so the searched grid is
# unchanged.
# NOTE(review): per the scikit-learn SVC docs, gamma is only used by the
# 'rbf', 'poly' and 'sigmoid' kernels, so with kernel='linear' it should not
# affect the fit - consider dropping it or adding a non-linear kernel.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear'],
}

# Exhaustive search on the training split; refit=True retrains the best
# configuration on the whole training split so `grid` can predict directly.
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3, n_jobs=-1)
grid.fit(xtrain, ytrain)
print(grid.best_params_)
grid_predictions = grid.predict(xtest)

# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(ytest, grid_predictions))

#### All the best params are default params so we don't need to tune anything

In [None]:
# Final model: linear-kernel SVM with fixed hyperparameters, trained on the
# training split.
final_params = {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
svm = SVC(**final_params)
svm.fit(xtrain, ytrain)

# Confusion matrix on the held-out test split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions this call and its import need to be replaced with
# ConfusionMatrixDisplay.from_estimator.
blues = plt.get_cmap('Blues')
plot_confusion_matrix(svm, xtest, ytest, cmap=blues)

# Saving the model

In [None]:
# Persist the trained SVM. The context manager guarantees the file handle is
# flushed and closed; the previous bare open() left it to the garbage collector.
# NOTE: only the model is written here - the `encoders` dict built during
# preprocessing is not saved by this cell.
with open('MushroomSafetySVM.sav', 'wb') as model_file:
    pickle.dump(svm, model_file)