In [59]:
import pandas as pd
from bokeh.charts import Scatter, output_file, show
from bokeh.io import output_notebook
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [60]:
# Column labels for the headerless data file: the class label ("target")
# comes first, followed by the 22 descriptive attributes in file order.
names = (
    "target "
    "cap-shape cap-surface cap-color bruises odor "
    "gill-attachment gill-spacing gill-size gill-color "
    "stalk-shape stalk-root "
    "stalk-surface-above-ring stalk-surface-below-ring "
    "stalk-color-above-ring stalk-color-below-ring "
    "veil-type veil-color ring-number ring-type "
    "spore-print-color population habitat"
).split()

In [61]:
# Load the full dataset (it is split into train/test later on). The file
# has no header row (header=None), so column labels come from `names`.
df = pd.read_csv(
    '../Resources/agaricus-lepiotadata.txt',
    header=None,
    names=names,
)

In [62]:
# Labels of every column that pandas stored with dtype `object`.
cat_columns = df.select_dtypes(include=['object']).columns

In [63]:
def _as_category(column):
    """Return `column` cast to the pandas `category` dtype."""
    return column.astype('category')


# Re-type each object column as a categorical, one column at a time.
df[cat_columns] = df[cat_columns].apply(_as_category)

In [64]:
def _category_codes(column):
    """Return the integer category codes of a categorical `column`."""
    return column.cat.codes


# Replace each categorical column with its integer codes so the frame is
# fully numeric. This needs the columns to be categorical when it runs:
# `.cat` is only available on categorical data.
df[cat_columns] = df[cat_columns].apply(_category_codes)

In [65]:
# Build a Bokeh scatter chart from `df`: cap-shape on the x axis,
# cap-color on the y axis, points coloured by the target column.
p = Scatter(
    df,
    x='cap-shape',
    y='cap-color',
    color='target',
)

In [66]:
# Direct Bokeh output to the notebook so charts render inline under the cell.
output_notebook()

In [67]:
# Render the scatter chart held in `p`.
show(p)

In [68]:
# Feature matrix: every column after the leading `target` column.
# `.iloc[:, 1:]` is used instead of `.ix[:, 1:22]` for two reasons:
#   * on string-labelled columns `.ix` slices by position, so the end bound
#     22 was exclusive and the last feature ("habitat", position 22) was
#     silently left out of the model;
#   * `.ix` is deprecated in pandas.
X = np.array(df.iloc[:, 1:])

In [69]:
# Label vector: the `target` column copied into a NumPy array.
y = np.array(df.loc[:, 'target'])

In [70]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [71]:
# k-nearest-neighbours classifier that considers the 3 closest training rows.
knn = KNeighborsClassifier(n_neighbors=3)

In [72]:
# Fit on the training split only; the test split is kept for evaluation.
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [73]:
# Predicted labels for the held-out test rows.
pred = knn.predict(X_test)

In [74]:
# Test-set accuracy: the share of held-out rows whose prediction matches y_test.
print(accuracy_score(y_test, pred))

1.0


In [79]:
# Refit the same classifier on the training split and keep the value `fit`
# returns. scikit-learn's `fit` returns the estimator itself, so
# `trained_model` is another name for `knn`, not an independent model.
trained_model = knn.fit(X_train, y_train)

In [80]:
# Mean accuracy on the *training* split. The model is scored on the same rows
# it was fitted on, so this is not an estimate of generalisation; compare
# with the test-set accuracy computed from `X_test` / `y_test`.
trained_model.score(X_train, y_train)

0.99950248756218907