In [22]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [23]:
shrooms = pd.read_csv("data/mushrooms.csv")

In [24]:
display(shrooms.columns)
shrooms.head()

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [25]:
X = shrooms.drop("class", axis=1)
X = pd.get_dummies(X)
y = shrooms["class"]
y = y.map({"p": 1, "e": 0})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

In [26]:
# get best logistic regression metrics
logreg = LogisticRegression(max_iter=5000)
param_grid = {"C": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
              "solver": ["liblinear", "lbfgs", "newton-cg", "newton-cholesky", "sag", "saga"]}

grid = GridSearchCV(logreg, param_grid, cv=10, scoring="f1", n_jobs=-1)
grid.fit(X_train, y_train)



In [27]:
grid.best_params_

{'C': 1, 'solver': 'liblinear'}

In [28]:
grid.best_score_

1.0

In [29]:
# get classification report for the best logistic regression model
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       840
           1       1.00      1.00      1.00       785

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



In [30]:
# doing all the same stuff for SVC
svc = SVC()
param_grid_svc = {'C': [0.1, 1, 10, 100], 
                  'gamma': [1, 0.1, 0.01, 0.001],
                  'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
                  'degree': [2, 3, 4, 5]}

grid_svc = GridSearchCV(svc, param_grid_svc, cv=10, scoring="f1", n_jobs=-1)
grid_svc.fit(X_train, y_train)


In [31]:
print(f"Best params: {grid_svc.best_params_}")
print(f"Best f1 score: {grid_svc.best_score_}")

Best params: {'C': 0.1, 'degree': 2, 'gamma': 1, 'kernel': 'poly'}
Best f1 score: 1.0


In [32]:
y_pred_svc = grid_svc.predict(X_test)
print(classification_report(y_test, y_pred_svc))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       840
           1       1.00      1.00      1.00       785

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

