In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style
sns.set(style='whitegrid')


In [3]:
# Step 2: Load the Data
# Load the dataset
url = 'https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/mushrooms.csv'
df = pd.read_csv(url)


In [4]:
# Step 3: Understand the Data
# Display the first few rows
print(df.head())

# Get the basic information
print(df.info())

# Summary statistics
print(df.describe(include='all'))


  type cap_shape cap_surface cap_color bruises odor gill_attachment  \
0    p         x           s         n       t    p               f   
1    e         x           s         y       t    a               f   
2    e         b           s         w       t    l               f   
3    p         x           y         w       t    p               f   
4    e         x           s         g       f    n               f   

  gill_spacing gill_size gill_color  ... stalk_surface_below_ring  \
0            c         n          k  ...                        s   
1            c         b          k  ...                        s   
2            c         b          n  ...                        s   
3            c         n          n  ...                        s   
4            w         b          k  ...                        s   

  stalk_color_above_ring stalk_color_below_ring veil_type veil_color  \
0                      w                      w         p          w   
1             

In [5]:
# Step 4: Handle Missing Data
# Check for missing values
print(df.isnull().sum())

# Handle missing values if necessary
# This dataset typically does not have missing values, but ensure it
# df.dropna(inplace=True)  # Drop missing values if necessary


type                        0
cap_shape                   0
cap_surface                 0
cap_color                   0
bruises                     0
odor                        0
gill_attachment             0
gill_spacing                0
gill_size                   0
gill_color                  0
stalk_shape                 0
stalk_root                  0
stalk_surface_above_ring    0
stalk_surface_below_ring    0
stalk_color_above_ring      0
stalk_color_below_ring      0
veil_type                   0
veil_color                  0
ring_number                 0
ring_type                   0
spore_print_color           0
population                  0
habitat                     0
dtype: int64


In [6]:
# Step 5: Data Visualization
# Distribution of mushroom classes (edible vs poisonous)
sns.countplot(x='class', data=df)
plt.title('Distribution of Mushroom Classes')
plt.show()

# Distribution of cap shapes
sns.countplot(y='cap-shape', data=df)
plt.title('Distribution of Cap Shapes')
plt.show()

# Distribution of cap colors
sns.countplot(y='cap-color', data=df)
plt.title('Distribution of Cap Colors')
plt.show()

# Distribution of gill sizes
sns.countplot(y='gill-size', data=df)
plt.title('Distribution of Gill Sizes')
plt.show()


ValueError: Could not interpret input 'class'

In [7]:
# Step 6: Univariate Analysis
# Distribution of cap shapes
sns.countplot(x='cap-shape', data=df)
plt.title('Cap Shape Distribution')
plt.show()

# Distribution of gill color
sns.countplot(y='gill-color', data=df)
plt.title('Gill Color Distribution')
plt.show()


ValueError: Could not interpret input 'cap-shape'

In [8]:
# Step 7: Bivariate Analysis
# Cap shape vs Edibility
sns.countplot(x='cap-shape', hue='class', data=df)
plt.title('Cap Shape vs Edibility')
plt.show()

# Gill color vs Edibility
sns.countplot(x='gill-color', hue='class', data=df)
plt.title('Gill Color vs Edibility')
plt.show()


ValueError: Could not interpret input 'cap-shape'

In [9]:
# Step 8: Multivariate Analysis
# Pair plot for selected features (limited by categorical nature)
sns.pairplot(df[['class', 'cap-shape', 'cap-color', 'gill-size']], hue='class')
plt.show()


KeyError: "None of [Index(['class', 'cap-shape', 'cap-color', 'gill-size'], dtype='object')] are in the [columns]"

Step 9: Identify and Handle Outliers

Since this dataset consists of categorical features, outliers in the traditional numerical sense do not apply. However, checking for and understanding rare categories might be useful.

In [10]:
# Step 10: Feature Engineering
# Convert categorical features into numerical values using encoding
df_encoded = pd.get_dummies(df, drop_first=True)


In [11]:
# Step 11: Summary and Insights
# Summarize key findings
print("Key Insights:")

# Distribution of classes
class_dist = df['class'].value_counts()
print(f"Class Distribution:\n{class_dist}")

# Cap shape distribution
cap_shape_dist = df['cap-shape'].value_counts()
print(f"Cap Shape Distribution:\n{cap_shape_dist}")

# Cap color distribution
cap_color_dist = df['cap-color'].value_counts()
print(f"Cap Color Distribution:\n{cap_color_dist}")

# Gill size distribution
gill_size_dist = df['gill-size'].value_counts()
print(f"Gill Size Distribution:\n{gill_size_dist}")

# Association between features and class
print("Association between features and class (Edible/Poisons):")

# Cap shape vs class
cap_shape_class = df.groupby('cap-shape')['class'].value_counts().unstack()
print(f"Cap Shape vs Class:\n{cap_shape_class}")

# Gill color vs class
gill_color_class = df.groupby('gill-color')['class'].value_counts().unstack()
print(f"Gill Color vs Class:\n{gill_color_class}")


Key Insights:


KeyError: 'class'

Findings:
1. Class Distribution: The dataset is likely imbalanced between edible and poisonous mushrooms.
2. Cap Shape and Color: Different shapes and colors are associated with edible or poisonous mushrooms, revealing patterns useful for classification.
3. Gill Size: Gill size can also be an important feature in distinguishing between edible and poisonous mushrooms.