In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
input_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


This Notebook covers the following aspects:

~1. Overview

~2. Data Preprocessing

    ~2.1 Encoding categorical variables
    
    
~3. Classification Model

~4. Tuning Model

~5. Building the Final Model

# **1. Overview**

Input Variables:

-**classes**: edible=e, poisonous=p

-**cap-shape**: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s

-**cap-surface**: fibrous=f,grooves=g,scaly=y,smooth=s

-**cap-color**: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y

-**bruises**: bruises=t,no=f

-**odor**: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s

-**gill-attachment**: attached=a,descending=d,free=f,notched=n

-**gill-spacing**: close=c,crowded=w,distant=d

-**gill-size**: broad=b,narrow=n

-**gill-color**: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y

-**stalk-shape**: enlarging=e,tapering=t

-**stalk-root**: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?

-**stalk-surface-above-ring**: fibrous=f,scaly=y,silky=k,smooth=s

-**stalk-surface-below-ring**: fibrous=f,scaly=y,silky=k,smooth=s

-**stalk-color-above-ring**: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

-**stalk-color-below-ring**: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

-**veil-type**: partial=p,universal=u

-**veil-color**: brown=n,orange=o,white=w,yellow=y

-**ring-number**: none=n,one=o,two=t

-**ring-type**: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z

-**spore-print-color**: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y

-**population**: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y

-**habitat**: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d

# **2. Data Preprocessing**

In [None]:
import numpy as np
import pandas as pd

# Read the mushroom dataset from the Kaggle input directory and preview its first rows.
DATA_PATH = "../input/mushroom-classification/mushrooms.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Understanding the data 

In [None]:
# Show the dtype pandas inferred for each column of the loaded frame.
df.dtypes

In [None]:
# Print pandas' summary of the frame (column names, non-null counts, dtypes, memory usage).
df.info()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

Checking for null values

In [None]:
# Count the null (NaN/None) entries in each column.
df.isnull().sum()

In [None]:
# NOTE: this tests whether the column *labels* themselves are NaN (one boolean per
# column name); it does not inspect the cell values of the frame.
df.columns.isna()

In [None]:
# Count, per column, the cells holding the dataset's missing-value placeholder.
# The attribute list in the overview documents it as a bare '?' (stalk-root: missing=?);
# the previous ' ?' (with a leading space) could never match a cell, so the check
# always reported zero.
df.isin(['?']).sum()

No need to run this code, as `isnull()` found no null (NaN) values in this dataset.

In [None]:
#df = df.dropna()

# 2.1 Encoding categorical variables

Encoding categorical variables numerically for classification

In [None]:

from sklearn.preprocessing import LabelEncoder

# Pick out every string-typed (object) column; these must become integer codes
# before the scikit-learn estimators can consume them.
categorical_df = df.select_dtypes(include=['object'])
object_columns = categorical_df.columns

# The same LabelEncoder instance is re-fitted on each column in turn, so after this
# call it only remembers the classes of the last column it processed.
enc = LabelEncoder()
categorical_df = categorical_df.apply(enc.fit_transform)

# Swap the original string columns for their encoded counterparts.
df = pd.concat([df.drop(object_columns, axis=1), categorical_df], axis=1)
df.head()


Define X and y

In [None]:
# Target: the 'class' column; predictors: every remaining column.
y = df['class']
X = df.drop(columns='class')


# 3. Classification Model

Split Data 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=1)
X_train, X_test, y_train, y_test = split_parts


Using default model

1. Logistic Regression

2. SVM

3. Decision Tree

4. Random Forest

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline: logistic regression with scikit-learn's default hyper-parameters.
# fit() returns the estimator itself, so the chained call leaves def_lr fitted.
def_lr = LogisticRegression().fit(X_train, y_train)
lr_pred = def_lr.predict(X_test)

# Score the predictions on the held-out test rows.
lr_accuracy = accuracy_score(y_test, lr_pred)
print("Logistic Regression accuracy: ", lr_accuracy)


SVM

In [None]:
from sklearn import svm
from sklearn.metrics import accuracy_score

# Baseline: support-vector classifier with scikit-learn's default hyper-parameters.
# fit() returns the estimator itself, so the chained call leaves def_svm fitted.
def_svm = svm.SVC().fit(X_train, y_train)
svm_pred = def_svm.predict(X_test)

# Score the predictions on the held-out test rows.
svm_accuracy = accuracy_score(y_test, svm_pred)
print("SVM accuracy: ", svm_accuracy)


Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Baseline: decision tree with default hyper-parameters.
# random_state is pinned (same seed as the train/test split) because tree construction
# involves randomness; without a seed the fitted tree, and therefore the reported
# accuracy, can differ from one run of the notebook to the next.
def_dt = DecisionTreeClassifier(random_state=1)
def_dt.fit(X_train, y_train)

dt_pred = def_dt.predict(X_test)

# Score the predictions on the held-out test rows.
print("Decision Tree accuracy", accuracy_score(y_test, dt_pred))
  

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline: random forest with default hyper-parameters.
# random_state is pinned (same seed as the train/test split) because the forest
# draws random bootstrap samples and feature subsets; without a seed the reported
# accuracy is not reproducible across runs of the notebook.
def_rf = RandomForestClassifier(random_state=1)
def_rf.fit(X_train, y_train)

rf_pred = def_rf.predict(X_test)

# Score the predictions on the held-out test rows.
print("Random Forests accuracy", accuracy_score(y_test, rf_pred))



# 4. Tuning Model

Logistic Regression

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV , KFold

# Estimator whose hyper-parameters are being tuned.
lr = LogisticRegression()

# class_weight must be the Python object None, not the string 'None'
# (the string is rejected by scikit-learn and every fit using it fails).
class_weights = ['balanced', None]

# A single cartesian grid pairs every penalty with every solver, but not every solver
# supports every penalty. Those fits fail and are scored as NaN. param_grid may be a
# list of grids, so each penalty is paired only with solvers that implement it.
gs_grid = [
    {
        'penalty': ['l2'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'class_weight': class_weights,
    },
    {
        'penalty': ['l1'],
        'solver': ['liblinear', 'saga'],
        'class_weight': class_weights,
    },
    {
        # elasticnet is only implemented by saga and requires an explicit l1_ratio.
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'l1_ratio': [0.5],
        'class_weight': class_weights,
    },
]

# 5-fold cross-validated exhaustive search over the valid combinations.
lr_CV = GridSearchCV(estimator=lr, param_grid=gs_grid, cv=5)

result = lr_CV.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated score.
print(result.best_params_)
print(result.best_score_)


# 5. Building the Final Model

Plug the best parameters found by the grid search into the Logistic Regression model

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Refit logistic regression with the chosen hyper-parameters and score it on the
# held-out test rows. fit() returns the estimator itself, so final_lr ends up fitted.
final_params = {'penalty': 'l1', 'solver': 'liblinear', 'class_weight': 'balanced'}
final_lr = LogisticRegression(**final_params).fit(X_train, y_train)
final_lr_pred = final_lr.predict(X_test)

final_lr_accuracy = accuracy_score(y_test, final_lr_pred)
print("Logistic Regression accuracy: ", final_lr_accuracy)



Comparing the four models' accuracy scores:

I will choose ***Random Forest*** as the final model.