# <center> Mushroom Classification </center>

# Importing libraries


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# Importing the data

In [None]:
# Location of the mushrooms CSV, relative to the notebook's working directory.
MUSHROOMS_CSV = '../input/mushroom-classification/mushrooms.csv'
data = pd.read_csv(MUSHROOMS_CSV)

In [None]:
# Preview the first rows of the raw dataset.
data.head()

In [None]:
# Summarise the columns of the raw dataset (dtypes and non-null counts).
data.info()

In [None]:
# List the distinct categories found in every column.
for column_name, column_values in data.items():
    print('Unique Values in', column_name, 'are', column_values.unique())

# Label Encoding


In [None]:
# Remove the 'veil-type' column before encoding
# (presumably uninformative -- confirm against the unique-values listing above).
data = data.drop(columns=['veil-type'])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode every column (the target included) as integer codes.
# One encoder is fitted per column and kept in `label_encoders`, so the
# category <-> code mapping of any column can still be inspected afterwards via
# `label_encoders[column].classes_` or `.inverse_transform(...)`. Re-fitting a
# single shared encoder would keep only the mapping of the last column.
label_encoders = {}
for column in data.columns:
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder
# Print sample of dataset
data.head()

In [None]:
# Re-check the column summary now that every column has been label-encoded.
data.info()

### Note: A value of 1 in `class` represents a poisonous mushroom (p)

# Feature Splitting

Splitting into targets and features

In [None]:
# Feature matrix: every encoded column except the target.
X = data.drop(columns=['class'])

# Target vector: the encoded 'class' column.
Y = data.loc[:, 'class']

# Feature Evaluation

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Score every feature against the target with the chi-squared statistic;
# k='all' keeps all features so that each one receives a score.
# NOTE(review): chi2 is computed here on label-encoded integer codes, so the
# scores depend on which code each category was assigned -- confirm this is
# acceptable for the ranking.
fs = SelectKBest(score_func=chi2, k='all')
fs.fit(X, Y)

# Share of the total chi-squared score held by each feature, in percent
# (vectorised: the total is computed once instead of once per feature).
per = (fs.scores_ / fs.scores_.sum() * 100).round(3).tolist()

features_data = pd.DataFrame(
    {'Feature': X.columns, 'Scores': fs.scores_, 'Importance (%)': per}
).sort_values(by=['Scores'], ascending=False)

plt.figure(figsize=(15, 12))
# x and y are passed by keyword: recent seaborn releases no longer accept them
# as positional arguments.
sns.barplot(x='Importance (%)', y='Feature', orient='h', data=features_data)
print(features_data, '\n')

# Features whose rounded share is below 0.005 % are treated as insignificant
# and are dropped from X in the following cell.
low_importance = features_data['Importance (%)'] < 0.005
insignificant = features_data.loc[low_importance, 'Feature'].unique()

In [None]:
# Discard the features flagged as insignificant by the chi-squared ranking.
X = X.drop(columns=insignificant)

# EDA

In [None]:
# Class balance of the encoded target.
# The Series is passed as `x=`: in recent seaborn releases the first positional
# parameter of countplot is `data`, not `x`.
sns.countplot(x=data['class'])

In [None]:
# One count plot per feature, split by class, in descending chi-squared order
# (the order of `features_data`).
plot_features = list(features_data['Feature'])
n_cols = 3
# Ceiling division gives enough rows for every feature; never fewer than the
# original 7 rows, so the layout is unchanged for up to 21 features.
n_rows = max(7, (len(plot_features) + n_cols - 1) // n_cols)

plt.figure(figsize=(17, 35))
for position, feature in enumerate(plot_features, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.countplot(x=data[feature], hue=data['class'])

# Test Train Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30 % of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=100,
)

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Modelling

## Creation

In [None]:
from sklearn.metrics import accuracy_score,classification_report

# Candidate classifiers. They are only constructed here; fitting and
# comparison happen in the "Fitting" cell.

# XGBoost
# `xgb` names the estimator instance, so the xgboost module itself is not
# imported under that alias (it would be overwritten immediately).
from xgboost import XGBClassifier
xgb = XGBClassifier()

# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

# Random Forest
# Seeded so repeated runs build the same forest (same seed value as the
# train/test split).
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=100)

# SVM
from sklearn.svm import SVC
svc = SVC()

#KNN
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Pick n_neighbors by 5-fold cross-validation on the training split only.
# Selecting k by its accuracy on X_test would leak the test set into model
# selection and inflate the accuracy reported for KNN later on.
accuracy = []  # [mean cross-validated accuracy, k] for every candidate k
for k in range(1, 40):
    cv_scores = cross_val_score(
        KNeighborsClassifier(n_neighbors=k),
        X_train,
        Y_train,
        cv=5,
        scoring='accuracy',
    )
    accuracy.append([cv_scores.mean(), k])

# max() returns the first maximal entry, so ties resolve to the smallest k.
temp = max(accuracy, key=lambda entry: entry[0])
knn = KNeighborsClassifier(n_neighbors=temp[1])

## Fitting

In [None]:
# Fit each candidate on the training split and record its test accuracy.
models = [xgb, lr, rfc, knn, svc]
model_acc = []
for estimator in models:
    estimator.fit(X_train, Y_train)
    test_predictions = estimator.predict(X_test)
    model_acc.append(accuracy_score(Y_test, test_predictions))

# Pair every fitted estimator with its score; `models` is a DataFrame from here on.
models = pd.DataFrame({'Models':models,'Accuracy':model_acc})

## Evaluation

In [None]:
# Rank the candidates from most to least accurate and renumber the rows.
models = models.sort_values(by='Accuracy', ascending=False).reset_index(drop=True)

# Keep the fitted top-ranked estimator before the column is turned into text.
best = models['Models'].iloc[0]

# Replace each estimator by its class name: the part of its string form that
# precedes the first "(".
models['Models'] = models['Models'].astype(str).str.split("(").str[0]
models

In [None]:
# Announce the top-ranked model, then show its per-class metrics on the test split.
best_name = models['Models'][0]
best_accuracy_pct = round(models['Accuracy'][0] * 100, 2)
print('Hence the best model is', best_name, 'with an accuracy of', best_accuracy_pct, '%')
print('\nThe classification report is:')
best_report = classification_report(Y_test, best.predict(X_test))
print(best_report)