In [7]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Note: you may need to restart the kernel to use updated packages.


In [8]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
mushroom = fetch_ucirepo(id=73) 
  
# data (as pandas dataframes) 
X = mushroom.data.features 
y = mushroom.data.targets 
  
# metadata 
print(mushroom.metadata) 
  
# variable information 
print(mushroom.variables) 

{'uci_id': 73, 'name': 'Mushroom', 'repository_url': 'https://archive.ics.uci.edu/dataset/73/mushroom', 'data_url': 'https://archive.ics.uci.edu/static/public/73/data.csv', 'abstract': 'From Audobon Society Field Guide; mushrooms described in terms of physical characteristics; classification: poisonous or edible', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 8124, 'num_features': 22, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['poisonous'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1981, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5959T', 'creators': [], 'intro_paper': None, 'additional_info': {'summary': "This data set includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota Family (pp. 500-525).  Each species is identified as definitely edible, definitely po

In [9]:
from sklearn.datasets import fetch_openml

# Fetch dataset
mushroom = fetch_openml(name='mushroom', version=1)

# Data (as pandas DataFrame)
X = mushroom.data
y = mushroom.target

# Metadata
print(mushroom.DESCR)

# Variable information
print(mushroom.feature_names)

**Author**: [Jeff Schlimmer](Jeffrey.Schlimmer@a.gp.cs.cmu.edu)  
**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/mushroom) - 1981     
**Please cite**:  The Audubon Society Field Guide to North American Mushrooms (1981). G. H. Lincoff (Pres.), New York: Alfred A. Knopf 


### Description

This dataset describes mushrooms in terms of their physical characteristics. They are classified into: poisonous or edible.

### Source
```
(a) Origin: 
Mushroom records are drawn from The Audubon Society Field Guide to North American Mushrooms (1981). G. H. Lincoff (Pres.), New York: Alfred A. Knopf 

(b) Donor: 
Jeff Schlimmer (Jeffrey.Schlimmer '@' a.gp.cs.cmu.edu)
```

### Dataset description

This dataset includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota Family. Each species is identified as definitely edible, definitely poisonous, or of unknown edibility and not recommended. This latter class was combined with the

In [10]:
num_mushrooms = X.shape[0]
print("Number of mushrooms in the dataset:", num_mushrooms)

Number of mushrooms in the dataset: 8124


In [44]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder

In [12]:
# Convert feature matrix X and target vector y into a pandas DataFrame
mushroom_df = pd.DataFrame(data=X, columns=mushroom.feature_names)
mushroom_df['class'] = y  # Add target variable 'class' to the DataFrame

# Display the DataFrame
print(mushroom_df.head())

  cap-shape cap-surface cap-color bruises%3F odor gill-attachment  \
0         x           s         n          t    p               f   
1         x           s         y          t    a               f   
2         b           s         w          t    l               f   
3         x           y         w          t    p               f   
4         x           s         g          f    n               f   

  gill-spacing gill-size gill-color stalk-shape  ... stalk-color-above-ring  \
0            c         n          k           e  ...                      w   
1            c         b          k           e  ...                      w   
2            c         b          n           e  ...                      w   
3            c         n          n           e  ...                      w   
4            w         b          k           t  ...                      w   

  stalk-color-below-ring veil-type veil-color ring-number ring-type  \
0                      w         p     

In [62]:
cleaned_mushroom_df =mushroom_df.dropna()

In [64]:
#X_test.isna()
nan_values = cleaned_mushroom_df.isna()
nan_columns = nan_values.sum()
print(nan_columns)

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises%3F                  0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
class                       0
dtype: int64


In [79]:
y = cleaned_mushroom_df['class']
x = cleaned_mushroom_df.drop(columns=['class'])

In [80]:
y.head()

0    p
1    e
2    e
3    p
4    e
Name: class, dtype: category
Categories (2, object): ['e', 'p']

In [81]:
x.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises%3F,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,...,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,...,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,...,s,w,w,p,w,o,e,n,a,g


In [98]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function

X_train, X_test, y_train, y_test = train_test_split(x, y,test_size=0.3, random_state=7)

In [99]:
categorical_columns = x.columns.tolist()

In [100]:
transformer = OneHotEncoder(handle_unknown='ignore')


In [101]:
from sklearn.pipeline import Pipeline

In [102]:
from sklearn.compose import ColumnTransformer

In [114]:
from sklearn.svm import SVC

In [115]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', transformer, categorical_columns)
    ])

# Create a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(kernel = 'linear'))
])

# Train the model
pipeline.fit(X_train, y_train)

In [116]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
#model = LogisticRegression(random_state=1)

# Fit the model using training data
pipeline.fit(X_train, y_train)

In [105]:
import sklearn.metrics as metrics 

In [117]:
y_pred = pipeline.predict(X_test)

#results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
#print(results_df)

In [118]:
conf_matrix = metrics.confusion_matrix(y_test,y_pred)
print (conf_matrix)


[[1025    0]
 [   0  669]]


In [119]:
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(results_df)

     Actual Predicted
3816      e         e
5352      p         p
5307      p         p
4606      p         p
2993      e         e
...     ...       ...
4909      p         p
2026      e         e
2885      e         e
1727      e         e
4896      p         p

[1694 rows x 2 columns]


In [96]:
from sklearn.metrics import accuracy_score 

In [120]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 1.0


In [None]:
y_pred = model.predict(X_test)

#results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
#print(results_df)

ValueError: Input contains NaN

60914       0          0
36843       0          0
1966        0          0
70137       0          0
27237       0          0
...       ...        ...
45639       0          0
11301       0          0
51614       0          0
4598        0          0
2793        0          0

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')