### Importing Needed Module

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

### Reading Data

In [2]:
#Loading <data andrenaming columns
mushroom = pd.read_csv("agaricus-lepiota.data")
                      
mushroom.head()

Unnamed: 0,p,x,s,n,t,p.1,f,c,n.1,k,...,s.2,w,w.1,p.2,w.2,o,p.3,k.1,s.3,u
0,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
1,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
2,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
3,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
4,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g


In [3]:
mushroom.shape

(8123, 23)

### Cleaning Data

In [4]:
 #Renaming columns
names = ["classes", "cap-shape", "cap-surface",
         "cap-color", "bruises", "odor",
         "gill-attachment", "gill-spacing", "gill-size",
         "gill-color", "stalk-shape", "stalk-root",
         "stalk-surface-above-ring", "stalk-surface-below-ring", "stalk-color-above-ring",
         "stalk-color-below-ring", "veil-type", "veil-color",
         "ring-number", "ring-type", "spore-print-color",
         "population", "habitat"]
mushroom.columns = names
mushroom.head(1)

Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g


In [5]:
#Checking <data information
mushroom.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8123 entries, 0 to 8122
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   classes                   8123 non-null   object
 1   cap-shape                 8123 non-null   object
 2   cap-surface               8123 non-null   object
 3   cap-color                 8123 non-null   object
 4   bruises                   8123 non-null   object
 5   odor                      8123 non-null   object
 6   gill-attachment           8123 non-null   object
 7   gill-spacing              8123 non-null   object
 8   gill-size                 8123 non-null   object
 9   gill-color                8123 non-null   object
 10  stalk-shape               8123 non-null   object
 11  stalk-root                8123 non-null   object
 12  stalk-surface-above-ring  8123 non-null   object
 13  stalk-surface-below-ring  8123 non-null   object
 14  stalk-color-above-ring  

* There 8123 entries.
* And there are no missing values.
* Our data is in type string(Object),we may have to convert it to integer or float type to be able to work with it.  
* We have 23 columns altogether.

In [6]:
#Checking the value counts of both edible and poisonous mushrooms in classes
edible = mushroom[mushroom.classes == "e"]
poisonous = mushroom[mushroom.classes == "p"]
print("Number of edible mushrooms =",len(edible))
print("Number of poisonous mushrooms =",len(poisonous))


Number of edible mushrooms = 4208
Number of poisonous mushrooms = 3915


### EDA

In [7]:
#Checking the statistical summary of our data
mushroom.describe().T

Unnamed: 0,count,unique,top,freq
classes,8123,2,e,4208
cap-shape,8123,6,x,3655
cap-surface,8123,4,y,3244
cap-color,8123,10,n,2283
bruises,8123,2,f,4748
odor,8123,9,n,3528
gill-attachment,8123,2,f,7913
gill-spacing,8123,2,c,6811
gill-size,8123,2,b,5612
gill-color,8123,12,b,1728


In [8]:
mushroom.nunique()

classes                      2
cap-shape                    6
cap-surface                  4
cap-color                   10
bruises                      2
odor                         9
gill-attachment              2
gill-spacing                 2
gill-size                    2
gill-color                  12
stalk-shape                  2
stalk-root                   5
stalk-surface-above-ring     4
stalk-surface-below-ring     4
stalk-color-above-ring       9
stalk-color-below-ring       9
veil-type                    1
veil-color                   4
ring-number                  3
ring-type                    5
spore-print-color            9
population                   6
habitat                      7
dtype: int64

* Looking at my findings I have duplicated values in each column

In [9]:
mushroom["stalk-root"].value_counts()

b    3776
?    2480
e    1119
c     556
r     192
Name: stalk-root, dtype: int64

* Looking at our data stalk-root has 2480 missing values because "?" represents missing data.
* I will replace the missing value with the first index because it has more values/it has the frequent value.

In [10]:
#Replacing missing value with frequency
freq = mushroom["stalk-root"].value_counts().index[0]
mushroom["stalk-root"] = mushroom["stalk-root"].replace("?", freq)
mushroom["stalk-root"].unique()

array(['c', 'e', 'b', 'r'], dtype=object)

In [11]:
mushroom["stalk-root"].value_counts()

b    6256
e    1119
c     556
r     192
Name: stalk-root, dtype: int64

* Looking at our value count we are certain that we no longer have missing values because our first index has increased values from 3776 to 6256.

### Data Preprocessing

In [12]:
#Label Encoding the values
le = LabelEncoder()
for col in mushroom.columns:
    mushroom[col] = le.fit_transform(mushroom[col])

mushroom.head()

Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
1,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
2,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
3,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
4,0,5,3,9,1,0,1,0,0,5,...,2,7,7,0,2,1,4,2,2,1


* Machine learning algorithms use numerical values,so I converted the categorical data in the columns from being strings(Object) to numerical values(Integers).

### Splitting Data

In [13]:
X = mushroom.drop(["classes"], axis = 1)
y = mushroom["classes"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

#Printing lenght of train and test
print("Training dataset length =", X_train.shape, y_train.shape)
print("Testing dataset length =", X_test.shape, y_test.shape)


Training dataset length = (5686, 22) (5686,)
Testing dataset length = (2437, 22) (2437,)


In [14]:
mushroom.corr()["classes"]


classes                     1.000000
cap-shape                   0.052826
cap-surface                 0.178440
cap-color                  -0.031361
bruises                    -0.501758
odor                       -0.093675
gill-attachment             0.129188
gill-spacing               -0.348358
gill-size                   0.539944
gill-color                 -0.530574
stalk-shape                -0.101888
stalk-root                 -0.324541
stalk-surface-above-ring   -0.334712
stalk-surface-below-ring   -0.298901
stalk-color-above-ring     -0.154096
stalk-color-below-ring     -0.146824
veil-type                        NaN
veil-color                  0.145133
ring-number                -0.214349
ring-type                  -0.411942
spore-print-color           0.172063
population                  0.298776
habitat                     0.216990
Name: classes, dtype: float64