In [1]:
%matplotlib inline

# python 3.6.8
import os, itertools, csv

from IPython.display import Image
from IPython.display import display

# numpy  1.19.5
import numpy as np

# pandas  0.25.3
import pandas as pd

# scikit-learn  0.22
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_moons
from sklearn.impute import SimpleImputer 
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import SVC

# matplotlib  3.1.2
import matplotlib
matplotlib.rcParams.update({'font.size': 22})
# Import the pyplot submodule explicitly. A bare `import matplotlib` does not
# load `matplotlib.pyplot`, so `matplotlib.pyplot` as an attribute only exists
# if some other code (e.g. the `%matplotlib inline` magic) imported it first.
import matplotlib.pyplot as plt

# load utility classes/functions that have been taught in previous labs
# e.g., plot_decision_regions()
from lib import *

In [12]:
# Column labels for the data file, in file order; the first column ('classes')
# is the target and the remaining 22 are the categorical features.
column_name = [
    'classes',
    'cap-shape', 'cap-surface', 'cap-color', 'bruises?', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color', 'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]

# The file is read without a header row (header=None), so the labels above are
# supplied directly to the reader.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'mushroom/agaricus-lepiota.data',
    header=None,
    names=column_name,
    engine='python',
)

# Show the first 10 samples, one sample per column, so all features are visible.
df.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
classes,p,e,e,p,e,e,e,e,p,e
cap-shape,x,x,b,x,x,x,b,b,x,b
cap-surface,s,s,s,y,s,y,s,y,y,s
cap-color,n,y,w,w,g,y,w,w,w,y
bruises?,t,t,t,t,f,t,t,t,t,t
odor,p,a,l,p,n,a,a,l,p,a
gill-attachment,f,f,f,f,f,f,f,f,f,f
gill-spacing,c,c,c,c,w,c,c,c,c,c
gill-size,n,b,b,n,b,b,b,b,n,b
gill-color,k,k,n,n,k,n,g,n,p,g


In [13]:
# Encode the target column as integers; `label_le` keeps the mapping.
label_le = LabelEncoder()
df['classes'] = label_le.fit_transform(df['classes'].values)

# Every non-target column is treated as a categorical feature.
catego_features = [
    'cap-shape', 'cap-surface', 'cap-color', 'bruises?', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color', 'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]

# A single encoder is re-fitted for each feature in turn; only the per-feature
# list of integer codes is kept (in `categories`), not the fitted encoders.
catego_le = LabelEncoder()
categories = []
for feature in catego_features:
    df[feature] = catego_le.fit_transform(df[feature].values)
    seen_values = catego_le.classes_.tolist()

    # '?' marks a missing value in the raw data: turn its integer code into NaN
    # so that it can be imputed later.
    if '?' in seen_values:
        df[feature] = df[feature].replace(seen_values.index('?'), np.nan)

    # Record every integer code of this feature (0 .. n_values - 1). The count
    # still includes the code that stood for '?', if there was one.
    categories.append(np.arange(len(seen_values)))

display(df.head(15).T)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
classes,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
cap-shape,5.0,5.0,0.0,5.0,5.0,5.0,0.0,0.0,5.0,0.0,5.0,5.0,0.0,5.0,5.0
cap-surface,2.0,2.0,2.0,3.0,2.0,3.0,2.0,3.0,3.0,2.0,3.0,3.0,2.0,3.0,0.0
cap-color,4.0,9.0,8.0,8.0,3.0,9.0,8.0,8.0,8.0,9.0,9.0,9.0,9.0,8.0,4.0
bruises?,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
odor,6.0,0.0,3.0,6.0,5.0,0.0,0.0,3.0,6.0,0.0,3.0,0.0,0.0,6.0,5.0
gill-attachment,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
gill-spacing,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
gill-size,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
gill-color,4.0,4.0,5.0,5.0,4.0,5.0,2.0,5.0,7.0,2.0,2.0,5.0,10.0,4.0,5.0


In [14]:
# Split the encoded frame into the target vector and the feature matrix,
# both as plain NumPy arrays.
y = df['classes'].values
X = df.drop(columns='classes').values

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2)

使用 Pipeline 串接缺失值填補、One-Hot 編碼、標準化與分類器：

In [18]:
# Positions of the categorical features inside the feature matrix (the frame
# without the 'classes' column). The pipelines work on NumPy arrays, so the
# ColumnTransformer has to address columns by position rather than by name.
# The column list is computed once instead of once per feature.
feature_columns = df.drop('classes', axis=1).columns.tolist()
catego_features_idx = [feature_columns.index(fea) for fea in catego_features]


def make_preprocessing_steps():
    """Return fresh, unfitted preprocessing steps shared by every model.

    A new list with new estimator objects is built on every call.
    Pipeline.fit() fits the estimators it is given in place, so handing the
    same transformer instance to two pipelines would make fitting the second
    pipeline silently re-fit a step of the first one.

    Returns
    -------
    list of (str, estimator)
        "imr": fills NaN with the most frequent value of each column.
        "ohe": one-hot encodes the categorical columns using the integer
               codes stored in `categories`; other columns pass through.
        "scl": standardizes every resulting column.
    """
    one_hot = ColumnTransformer(
        [
            # NOTE(review): newer scikit-learn releases rename `sparse` to
            # `sparse_output`; `sparse` matches the version this notebook pins.
            ('ohe', OneHotEncoder(categories=categories, sparse=False),
             catego_features_idx),
        ],
        remainder="passthrough"
    )
    return [
        # np.nan is the canonical spelling (the np.NaN alias is gone in NumPy 2).
        ("imr", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
        ("ohe", one_hot),
        ("scl", StandardScaler()),
    ]


def fit_and_report(title, pipe):
    """Fit `pipe` on the training split and print its test-set results.

    Parameters
    ----------
    title : str
        Heading printed before the metrics.
    pipe : Pipeline
        Unfitted pipeline; it is fitted in place on (X_train, y_train).

    Returns
    -------
    numpy.ndarray
        Predictions of the fitted pipeline for X_test.
    """
    pipe.fit(X_train, y_train)
    predictions = pipe.predict(X_test)
    print(title)
    print('Misclassified samples: %d' % (y_test != predictions).sum())
    print('Accuracy: %.4f' % accuracy_score(y_test, predictions))
    return predictions


pipe_knn = Pipeline(make_preprocessing_steps() + [
    ("clf", KNeighborsClassifier(n_neighbors=10, p=2, metric="minkowski"))
])

pipe_svm = Pipeline(make_preprocessing_steps() + [
    ('clf', SVC(kernel="rbf", random_state=0, gamma=0.001, C=100.0))
])

# Keep the module-level name `ohe` available for any code that refers to it;
# it is the encoder step of the KNN pipeline and gets fitted together with it.
ohe = pipe_knn.named_steps["ohe"]

y_pred = fit_and_report('[KNN]', pipe_knn)
y_pred = fit_and_report('\n[SVC]', pipe_svm)

[KNN]
Misclassified samples: 0
Accuracy: 1.0000

[SVC]
Misclassified samples: 0
Accuracy: 1.0000


KNN 與 SVC 在測試集上皆沒有任何誤判，準確率達到 100%。