In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_files=(
    os.path.join(folder,name)
    for folder,_,names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Default size (in inches) for every figure that does not set its own figsize.
plt.rcParams.update({'figure.figsize':(8,6)})

In [None]:
# Read the mushroom CSV from the Kaggle input directory and preview the first rows.
mushrooms_csv='/kaggle/input/mushroom-classification/mushrooms.csv'
df=pd.read_csv(mushrooms_csv)
df.head()

In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Count how often each distinct value occurs in the 'veil-type' column.
df['veil-type'].value_counts()

In [None]:
# Count how often each distinct value occurs in the 'class' column (used as the target below).
df['class'].value_counts()

In [None]:
# Bar chart of row counts per 'cap-shape' value, split by 'class'.
sns.countplot(data=df,x='cap-shape',hue='class')

**The 'veil-type' column contains only a single constant value, so it is removed from the dataset.**

In [None]:
# Separate the dependent variable (y) from the independent variables (X).
# 'veil-type' is left out of X together with the target column 'class'.
y=df['class']
X=df.drop(columns=['class','veil-type'])

In [None]:
# Draw one class-split count plot per feature column of X on a shared grid.
# The number of rows is derived from the number of features so that no column
# is silently skipped (a fixed 5x4 grid stops after the first 20 features).
n_cols=4
features=X.columns.tolist()
n_rows=max(1,-(-len(features)//n_cols))  # ceiling division, at least one row
fig,axis=plt.subplots(n_rows,n_cols,figsize=(25,5*n_rows),squeeze=False)
flat_axes=axis.ravel()
for ax,feature in zip(flat_axes,features):
    sns.countplot(x=feature,hue='class',data=df,ax=ax)
# Hide the panels of the last row that received no feature.
for ax in flat_axes[len(features):]:
    ax.set_visible(False)

In [None]:
# All feature columns are of object dtype, so each one is replaced by integer codes.
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
for column in list(X.columns):
    encoded=le.fit_transform(X[column])
    X[column]=encoded

In [None]:
# Preview the first rows of the feature frame.
X.head()

In [None]:
# Annotated heatmap of the pairwise correlation between the feature columns.
feature_corr=X.corr()
plt.figure(figsize=(15,15))
sns.heatmap(feature_corr,annot=True,fmt='.2f',cmap='Blues')

In [None]:
# Report every pair of feature columns whose correlation is strong in either
# direction. The absolute value is compared so that strongly negative pairs are
# reported as well as strongly positive ones.
CORR_THRESHOLD=0.7
value=X.corr()
# Take the labels from the correlation matrix itself so that positions and
# names always refer to the same column.
columns=value.columns.tolist()
variables=set()
for i in range(len(columns)):
    # Start at i+1: the matrix is symmetric and the diagonal is trivially 1.
    for j in range(i+1,len(columns)):
        if abs(value.iloc[i,j])>CORR_THRESHOLD:
            print(value.iloc[i,j])
            print(columns[i],columns[j])
            # Only the first column of each pair is collected.
            variables.add(columns[i])
print(variables)

In [None]:
# Remove the highly correlated 'gill-attachment' column from the features.
# X is rebound to a new frame and errors='ignore' is passed so that re-running
# this cell after the column is already gone does not raise a KeyError.
X=X.drop(columns=['gill-attachment'],errors='ignore')
X.head()

In [None]:
# Show the (rows, columns) dimensions of the feature frame.
X.shape

In [None]:
# Fit an extra-trees ensemble on all rows and collect its feature importances
# in a Series labelled with the feature column names.
from sklearn.ensemble import ExtraTreesClassifier
etc=ExtraTreesClassifier(random_state=42,n_estimators=100)
etc.fit(X,y)
feature_values=pd.Series(etc.feature_importances_,index=X.columns)
feature_values

In [None]:
# Horizontal bar chart of the feature importances, sorted in descending order.
plt.figure(figsize=(12,10))
ranked_importances=feature_values.sort_values(ascending=False)
ranked_importances.plot.barh()

In [None]:
# Standardise the feature columns. The scaler is fitted on all rows of X.
from sklearn.preprocessing import StandardScaler
ss=StandardScaler()
ss.fit(X)
X_norm=ss.transform(X)

In [None]:
# Show the first five scaled rows.
X_norm[:5]

In [None]:
# Separate the train and test datasets (15% of the rows are held out for testing).
# The split is made on the unscaled features and the scaler is fitted on the
# training rows only, so no statistics from the test rows leak into training.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train_raw,X_test_raw,y_train,y_test=train_test_split(X,y,test_size=0.15,random_state=42)
split_scaler=StandardScaler().fit(X_train_raw)
X_train=split_scaler.transform(X_train_raw)
X_test=split_scaler.transform(X_test_raw)
X_train.shape,y_train.shape,X_test.shape,y_test.shape

# Applying different models and evaluating their accuracy

**Random Forest Classifier**

In [None]:
# Train a 250-tree random forest on the training split and predict the test split.
# random_state is fixed so that the fitted forest, and every metric computed
# from yhat, is the same on each run.
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier(n_estimators=250,random_state=42)
rfc.fit(X_train,y_train)
yhat=rfc.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report,f1_score,accuracy_score,precision_score,recall_score

In [None]:
# Fraction of test rows whose predicted class matches the true class.
rf_accuracy=accuracy_score(y_test,yhat)
print('Accuracy: ',rf_accuracy)

In [None]:
# Confusion matrix of the current predictions, drawn as an annotated heatmap.
rf_confusion=confusion_matrix(y_test,yhat)
sns.heatmap(rf_confusion,annot=True,fmt='.0f')

In [None]:
# Text report of the per-class metrics for the current predictions.
rf_report=classification_report(y_test,yhat)
print(rf_report)

In [None]:
# F1 score of the current predictions, averaged with average='weighted'.
rf_f1=f1_score(y_test,yhat,average='weighted')
print(rf_f1)

**Logistic Regression**

In [None]:
# Fit a logistic regression (SAG solver, fixed seed) and predict the test split.
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(random_state=42,solver='sag')
yhat=lr.fit(X_train,y_train).predict(X_test)

In [None]:
# Accuracy of the current predictions on the test split.
accuracy_score(y_true=y_test,y_pred=yhat)

In [None]:
# Confusion matrix of the current predictions, drawn as an annotated heatmap.
lr_confusion=confusion_matrix(y_test,yhat)
sns.heatmap(lr_confusion,annot=True,fmt='.0f')

In [None]:
# Text report of the per-class metrics for the current predictions.
lr_report=classification_report(y_test,yhat)
print(lr_report)

In [None]:
# F1 score of the current predictions, averaged with average='weighted'.
f1_score(y_true=y_test,y_pred=yhat,average='weighted')

**Support Vector Classifier**

In [None]:
# Fit a support vector classifier with default settings and predict the test split.
from sklearn.svm import SVC
sc=SVC()
yhat=sc.fit(X_train,y_train).predict(X_test)

In [None]:
# Accuracy of the current predictions on the test split.
accuracy_score(y_true=y_test,y_pred=yhat)

In [None]:
# Confusion matrix of the current predictions, drawn as an annotated heatmap.
svc_confusion=confusion_matrix(y_test,yhat)
sns.heatmap(svc_confusion,annot=True,fmt='.0f')

In [None]:
# K-Nearest Neighbors classifier: fit with 5 neighbours and predict the test split.
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=5)
yhat=knn.fit(X_train,y_train).predict(X_test)

In [None]:
# Accuracy of the current predictions on the test split.
accuracy_score(y_true=y_test,y_pred=yhat)

In [None]:
# Confusion matrix of the current predictions, drawn as an annotated heatmap.
knn_confusion=confusion_matrix(y_test,yhat)
sns.heatmap(knn_confusion,annot=True,fmt='.0f')

In [None]:
# Precision of the current predictions, averaged with average='weighted'.
precision_score(y_true=y_test,y_pred=yhat,average='weighted')

In [None]:
# Recall of the current predictions, averaged with average='weighted'.
recall_score(y_true=y_test,y_pred=yhat,average='weighted')