In [None]:
import numpy as np
import pandas as pd
import sklearn 
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

Accessing the CSV file and printing the first 5 lines to know what type of data I am dealing with

In [None]:
# Location of the mushroom CSV (the /kaggle/input prefix suggests a Kaggle-mounted dataset — confirm before running elsewhere).
path='/kaggle/input/mushroom-classification/mushrooms.csv'
# Read the whole file into a DataFrame, then preview its first five rows.
data=pd.read_csv(path)
data.head()

Shuffling the entire dataset using .**sample**() method and setting the fraction of data(**frac**) to 1

In [None]:
# Shuffle every row (frac=1 keeps the full dataset). The fixed seed makes the
# resulting row order identical on every Restart & Run All.
data = data.sample(frac=1, random_state=42)
data.head()

Looking at the description of the dataset to see if there are any numerical values at all. There aren't: if there were, the **.describe()** method would have shown the minimum, maximum, average, etc. for those columns. This means we are dealing with text-labelled (categorical) data

In [None]:
# Summary statistics of the dataset, used here to see what kind of columns it contains.
data.describe()

I have made a function to encode the categorical text data into numerical data using **LabelEncoder**. The fitted LabelEncoder for each column is stored in a dictionary so that new data can be encoded with the same encoder, and therefore the same mapping; initialising a fresh encoder from scratch could lead to different encodings

In [None]:
def map_data(data):
    """Label-encode every column of ``data`` in place.

    A fresh ``LabelEncoder`` is fitted per column, the column is overwritten
    with its integer codes, and the fitted encoder is stored in the
    module-level ``val_dict`` under the column name so the mapping can be
    reused or reversed later.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are all encoded; it is modified in place.

    Returns
    -------
    None
    """
    # Snapshot the column names up front; the columns themselves are
    # reassigned inside the loop.
    for column in data.columns.tolist():
        encoder = LabelEncoder()
        raw_values = np.array(data[column]).reshape(-1,)
        encoder.fit(raw_values)
        data[column] = encoder.transform(raw_values)
        # Keep the fitted encoder so the same mapping is available afterwards.
        val_dict[column] = encoder
def unmap_data(data):
    """Reverse the label encoding of every column of ``data`` in place.

    For each column, the encoder stored under that column's name in the
    module-level ``val_dict`` is used to turn the integer codes back into
    the original labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame of encoded columns; it is modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a column name has no entry in ``val_dict``.
    """
    for column in data.columns.tolist():
        encoder = val_dict[column]
        codes = np.array(data[column]).reshape(-1,)
        data[column] = encoder.inverse_transform(codes)

In [None]:
# Module-level store of fitted encoders: map_data writes one entry per column name, unmap_data reads them back.
val_dict={}

In [None]:
# Encodes every column of `data` in place and records the fitted encoders in val_dict.
# NOTE(review): running this cell a second time would fit fresh encoders on the
# already-encoded values and overwrite the originals in val_dict, so the text
# labels could no longer be restored.
map_data(data)
data.head()

Separating the classes from the remaining data

In [None]:
# pop() removes the 'class' column from `data` and returns it, leaving only the remaining columns behind.
# NOTE(review): the column is gone once this has run, so re-running the cell on its own raises KeyError.
labels=data.pop('class')
data.head()

Making my X and y variables

In [None]:
# Materialise the encoded feature columns and the target column as NumPy arrays.
X, y = np.array(data), np.array(labels)

Converting **data** back to categorical form since I have X and y. You can cross check that the conversion has worked out perfectly

In [None]:
# Restore the original text labels in `data` using the encoders saved in val_dict.
# X and y were built with np.array(...) before this point, so they keep the encoded values.
unmap_data(data)
data.head()

Getting my training and testing data

In [None]:
# Hold out 25% of the rows for testing. The fixed seed keeps the partition
# identical between runs so reported scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

Defining the hyperparameter grid that **GridSearchCV** will search over

In [None]:
# Hyperparameter grid: 4 depth limits x 6 forest sizes = 24 combinations.
# NOTE(review): depth limits this large may never actually constrain the
# trees — confirm against the fitted tree depths before keeping this axis.
parameters = {
    'max_depth': [200, 250, 300, 350],
    'n_estimators': [10, 30, 50, 70, 90, 110],
}

Making the model for which the above parameters will be found 

In [None]:
# Base estimator handed to the grid search. Seeding it makes every fit
# (cross-validation folds and the final refit) deterministic.
model = RandomForestClassifier(random_state=42)

Making my **GridSearch** object

In [None]:
# Exhaustive search over `parameters`; refit=True retrains the best
# configuration once the search is done, verbose=4 logs each fit.
grid_clf = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    refit=True,
    verbose=4,
)

Fitting the grid search object on my training data

In [None]:
# Runs the cross-validated search over every parameter combination; because grid_clf
# was built with refit=True, the best combination is then refit on the training split.
grid_clf.fit(X_train,y_train)

Getting the best estimator from the grid_clf object. This requires **refit=True** (the default) when initialising grid_clf

In [None]:
# best_estimator_ is the estimator that the search refit with its best-scoring parameters.
clf=grid_clf.best_estimator_
print(clf)

Making a function to train the model and return the accuracy

In [None]:
def train_test(clf):
    """Fit ``clf`` on the training split and return its test accuracy in percent.

    Relies on the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. The estimator is fitted in place, so any previous fit is
    replaced.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    float
        Share of correctly classified test samples, scaled to 0-100.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    matrix = confusion_matrix(y_true=y_test, y_pred=predictions)
    # The diagonal of the confusion matrix counts the correct predictions.
    correct = matrix.trace()
    total = matrix.sum()
    return correct / total * 100

Printing the accuracy(100 %)

In [None]:
# train_test(clf) fits clf on the training split again before scoring, so this line retrains the model.
print(f'Accuracy on test set of length {X_test.shape[0]} is : {train_test(clf)}')