In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline

Let's import the data into a pandas DataFrame for analysis. After importing, let's review the columns of the data to decide which features and label to use in our model.

In [None]:
# Read the mushroom CSV from the relative input directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../input/mushrooms.csv')

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

In [None]:
# Distinct values present in the label column.
pd.unique(df['class'])

For the mushroom `class` column it appears we only have two classes, P and E. Let's take a look and see if there are any NAs in the data. From the result below, it appears that the data does not have any missing values, so we can use the data as is and prepare it for our model.

In [None]:
# Count missing values per column.
df.isna().sum()

Let's review the total observations and features of the data. From the output below we can see there are a total of 8124 observations and 23 variables, and that the data type of every column is object. For classification we are going to convert the features to integer values so that we can run them through the various models.

In [None]:
# (rows, columns) of the loaded DataFrame.
df.shape

In [None]:
# Print the column names, non-null counts and dtypes.
df.info()

In [None]:
# Bar chart of the number of observations in each mushroom class.
plt.figure(figsize=(7,7))
# Name the column via the `x` keyword: recent seaborn releases no longer accept
# the plotted variable as a positional argument alongside `data=`.
sns.countplot(x='class', data=df)
plt.show()

As we saw above, all the observations contain string values, which cannot be fitted directly by an algorithm. We will therefore use the LabelEncoder from sklearn.preprocessing to encode the values as numbers.

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder instance is re-fitted for every column, so after the loop it only
# remembers the mapping of the last column processed.
labelencoder = LabelEncoder()
for col in df.columns:
    # Learn this column's categories, then replace the strings by their integer codes.
    df[col] = labelencoder.fit(df[col]).transform(df[col])

df.head()

Now that we have encoded the string values as integers, we can begin preparing the data to see which algorithm fits it best. Let's do some data visualization to see how the data is distributed.

In [None]:
# Box plot of every encoded column (wide-form: one box per DataFrame column).
plt.figure(figsize=(15,7))
# Pass the frame explicitly as `data`; older seaborn versions read a positional
# first argument as `x`, so the call otherwise depends on the installed version.
sns.boxplot(data=df)
plt.show()

We can now start separating the data for the model. We know the entire dataset has 23 columns. We will use the class column as the label, so we will have one label and 22 features.

In [None]:
# The first column is the label; every remaining column is a feature.
y = df[df.columns[0]]
X = df[df.columns[1:]]

In [None]:
# First five rows of the feature matrix.
X.iloc[:5]

In [None]:
# First five entries of the label vector.
y.iloc[:5]

In [None]:
# Summary statistics of the integer-encoded feature columns.
X.describe()

In [None]:
# Annotated correlation heatmap over all encoded columns (label included).
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(df.corr(), annot=True, cmap='seismic_r', linewidths=.5, ax=ax)
plt.show()

Let's standardize the data by scaling each feature to zero mean and unit variance.

In [None]:
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fitted on every row, before the train/test split
# further down, so test rows contribute to the learned mean and variance.
scaler = StandardScaler()
scaler.fit(X)
# X is rebound from a DataFrame to the scaled NumPy array.
X = scaler.transform(X)
X

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Work on the encoded feature columns only. Column 0 of df is the class label
# (the notebook uses it as y), and including it would let the target shape both
# the 2-D projection and the clusters.
N = df.iloc[:, 1:].values

# Two principal components, used purely as plotting coordinates.
pca = PCA(n_components=2)
x = pca.fit_transform(N)

# n_init is given explicitly because its default differs between scikit-learn
# releases; random_state keeps the cluster assignment reproducible.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=5)
X_clustered = kmeans.fit_predict(N)

# One matplotlib colour code per cluster id.
LABEL_COLOR_MAP = {0 : 'g',
                   1 : 'y',
                   2 : 'r',
                   3 : 'b'
                  }

label_color = [LABEL_COLOR_MAP[l] for l in X_clustered]
plt.figure(figsize = (15,7))
plt.scatter(x[:,0],x[:,1], c= label_color)
plt.show()

In [None]:
# Cluster id assigned by KMeans to each observation.
X_clustered

In [None]:
# Re-bind `pca` to a 20-component model (replacing the 2-component one used for plotting).
pca=PCA(n_components=20)

# Overwrite X with its 20-component projection.
# NOTE(review): fitted on all rows before the train/test split below, and re-running
# this cell applies PCA to the already-projected X.
X = pca.fit_transform(X)



Let's split the data into train and test sets so we can fit the models. We already have our features in variable X and our label in variable y.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=5,
    test_size=0.2,
)

We are now ready to apply machine learning to the data. We will find out which algorithms perform well on this data by using the scoring method built into each estimator. The scores are compiled at the end to see which models performed best. Note that any accuracy score at or close to 100% will be considered accurate.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Logistic regression.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
log_pred = logreg.predict(X_test)
# Score on the held-out test split: accuracy measured on the rows the model was
# fitted on overstates how well it generalizes.
acc_log = round(logreg.score(X_test, y_test) * 100, 2)
acc_log

In [None]:
# Support vector classifier with default settings.
svc = SVC()
svc.fit(X_train, y_train)
svc_pred = svc.predict(X_test)
# Score on the held-out test split: accuracy measured on the rows the model was
# fitted on overstates how well it generalizes.
acc_svc = round(svc.score(X_test, y_test) * 100, 2)
acc_svc

In [None]:
# k-nearest neighbours with k=3.
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
# Score on the held-out test split: with training rows each point is its own
# nearest neighbour, so training accuracy says little about generalization.
acc_knn = round(knn.score(X_test, y_test) * 100, 2)
acc_knn

In [None]:
# Gaussian naive Bayes.
gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
gnb_pred = gaussian.predict(X_test)
# Score on the held-out test split: accuracy measured on the rows the model was
# fitted on overstates how well it generalizes.
acc_gaussian = round(gaussian.score(X_test, y_test) * 100, 2)
acc_gaussian

In [None]:
# Perceptron.
perceptron = Perceptron()
perceptron.fit(X_train, y_train)
per_pred = perceptron.predict(X_test)
# Score on the held-out test split: accuracy measured on the rows the model was
# fitted on overstates how well it generalizes.
acc_perceptron = round(perceptron.score(X_test, y_test) * 100, 2)
acc_perceptron

In [None]:
# Linear support vector classifier.
linear_svc = LinearSVC()
linear_svc.fit(X_train, y_train)
lin_pred = linear_svc.predict(X_test)
# Score on the held-out test split: accuracy measured on the rows the model was
# fitted on overstates how well it generalizes.
acc_linear_svc = round(linear_svc.score(X_test, y_test) * 100, 2)
acc_linear_svc

In [None]:
# Linear classifier trained with stochastic gradient descent.
# The seed (same value as the train/test split) fixes the shuffling so the score
# is the same on every run.
sgd = SGDClassifier(random_state=5)
sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_test)
# Score on the held-out test split: accuracy measured on the rows the model was
# fitted on overstates how well it generalizes.
acc_sgd = round(sgd.score(X_test, y_test) * 100, 2)
acc_sgd

In [None]:
# Single decision tree; seeded so the fitted tree is reproducible.
decision_tree = DecisionTreeClassifier(random_state=5)
decision_tree.fit(X_train, y_train)
dec_pred = decision_tree.predict(X_test)
# Score on the held-out test split: an unpruned tree can memorize its training
# rows, so training accuracy is not evidence of predictive power.
acc_decision_tree = round(decision_tree.score(X_test, y_test) * 100, 2)
acc_decision_tree

In [None]:
# Random forest of 100 trees; seeded so the ensemble is reproducible.
random_forest = RandomForestClassifier(n_estimators=100, random_state=5)
random_forest.fit(X_train, y_train)
ran_pred = random_forest.predict(X_test)
# Score on the held-out test split: a forest of unpruned trees fits its training
# rows almost perfectly, so training accuracy is not evidence of predictive power.
acc_random_forest = round(random_forest.score(X_test, y_test) * 100, 2)
acc_random_forest

In [None]:
# Pair each model's display name with its accuracy, then rank best-first.
model_scores = [
    ('Support Vector Machines', acc_svc),
    ('KNN', acc_knn),
    ('Logistic Regression', acc_log),
    ('Random Forest', acc_random_forest),
    ('Naive Bayes', acc_gaussian),
    ('Perceptron', acc_perceptron),
    ('Stochastic Gradient Decent', acc_sgd),
    ('Linear SVC', acc_linear_svc),
    ('Decision Tree', acc_decision_tree),
]
models = pd.DataFrame(model_scores, columns=['Model', 'Score'])
models.sort_values(by='Score', ascending=False)

The following models performed well with this dataset: Support Vector Machines, KNN, Random Forest, and Decision Trees all have an accuracy score of 100%. This shows that these models have a high prediction rate.