In [1]:
#importing the necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#reading the dataset
df = pd.read_csv('mushrooms.csv')
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [3]:
#to get the information about the data, their values and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [4]:
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [5]:
#since the dataset has only categorical values, we can use the label encoder to convert them into numerical values
from sklearn.preprocessing import LabelEncoder
import pandas as pd

try:
    df = pd.read_csv('mushrooms.csv')
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print("Error: 'mushrooms.csv' not found. Please place the file in the correct directory.")
    exit()

print("\n--- Original DataFrame Head ---")
print(df.head())
print("\nOriginal DataFrame shape:", df.shape)



Dataset loaded successfully!

--- Original DataFrame Head ---
  class cap-shape cap-surface cap-color bruises odor gill-attachment  \
0     p         x           s         n       t    p               f   
1     e         x           s         y       t    a               f   
2     e         b           s         w       t    l               f   
3     p         x           y         w       t    p               f   
4     e         x           s         g       f    n               f   

  gill-spacing gill-size gill-color  ... stalk-surface-below-ring  \
0            c         n          k  ...                        s   
1            c         b          k  ...                        s   
2            c         b          n  ...                        s   
3            c         n          n  ...                        s   
4            w         b          k  ...                        s   

  stalk-color-above-ring stalk-color-below-ring veil-type veil-color  \
0                 

In [6]:
# --- 1. Separate Features (x) and Target (y) ---
# 'class' is our target variable (what we want to predict).
# All other columns are our features (what we use to make the prediction).
x = df.drop('class', axis=1)
y = df['class']

In [7]:
# --- 2. Encode the Target Variable (y) ---
# We will use LabelEncoder to convert 'p' (poisonous) and 'e' (edible) into 1 and 0.
print("\n--- Encoding Target Variable 'class' ---")
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


--- Encoding Target Variable 'class' ---


In [8]:
# You can see what 'p' and 'e' were mapped to.
# It's alphabetical, so 'e' (edible) -> 0 and 'p' (poisonous) -> 1.
print("Class mapping:", list(label_encoder.classes_), "->", label_encoder.transform(label_encoder.classes_))
print("First 5 encoded target values:", y_encoded[:5])

Class mapping: ['e', 'p'] -> [0 1]
First 5 encoded target values: [1 0 0 1 0]


In [9]:
# --- 3. Encode the Feature Variables (X) ---
# We will use One-Hot Encoding for all the feature columns.
# The pandas get_dummies() function is perfect for this. It automatically
# finds all categorical columns and converts them.
print("\n--- Applying One-Hot Encoding to Features ---")
x_encoded = pd.get_dummies(x)


--- Applying One-Hot Encoding to Features ---


In [10]:
print("\n--- Encoded DataFrame Head ---")
print(x_encoded.head())
print("\nNew DataFrame shape after One-Hot Encoding:", x_encoded.shape)

# Note: The number of columns has increased significantly because each category
# in the original 22 feature columns has become its own new column.
# For example, 'cap-shape_b', 'cap-shape_c', etc.

# The data is now ready for a machine learning model!
# You would use X_encoded as your features and y_encoded as your target.


--- Encoded DataFrame Head ---
   cap-shape_b  cap-shape_c  cap-shape_f  cap-shape_k  cap-shape_s  \
0        False        False        False        False        False   
1        False        False        False        False        False   
2         True        False        False        False        False   
3        False        False        False        False        False   
4        False        False        False        False        False   

   cap-shape_x  cap-surface_f  cap-surface_g  cap-surface_s  cap-surface_y  \
0         True          False          False           True          False   
1         True          False          False           True          False   
2        False          False          False           True          False   
3         True          False          False          False           True   
4         True          False          False           True          False   

   ...  population_s  population_v  population_y  habitat_d  habitat_g  \
0  .

In [11]:
corr_matrix = df.corr(numeric_only=True)
print(corr_matrix)

Empty DataFrame
Columns: []
Index: []
