# Using data from UC Irvine: the Breast Cancer Wisconsin (Original) dataset, http://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Original%29
## Each dataset has a metadata description listing the fields in the data, the missing-data indicators, etc.

In [85]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, neighbors, cross_validation

In [86]:
# Load the raw CSV into a DataFrame.
df = pd.read_csv("/Users/scottlynn73/Documents/Python/breast_cancer.txt")

# Per the dataset's metadata document, '?' marks missing data. Swap it for the
# sentinel -99999 so those entries become extreme outliers, which the author
# expects the algorithm to effectively overlook, rather than dropping the rows.
# Reassigning (instead of inplace=True) keeps this cell safe to re-run.
# NOTE(review): any column that held '?' was presumably parsed as strings, so it
# may now mix digit strings with the integer sentinel -- confirm, and consider
# pd.to_numeric if a strictly numeric frame is wanted.
df = df.replace('?', -99999)

# Get rid of the id column as it has no predictive power.
# `axis` is passed by keyword: the positional form drop(labels, 1) was
# deprecated in pandas 1.x and raises TypeError from pandas 2.0.
df = df.drop(['id'], axis=1)

In [87]:
# We're trying to predict 'class', so X holds every other column (the features)
# and y holds the 'class' labels being predicted.
# `axis` is passed by keyword: the positional form drop(labels, 1) raises
# TypeError from pandas 2.0.
X = np.array(df.drop(['class'], axis=1))
y = np.array(df['class'])

# `sklearn.cross_validation` was removed in scikit-learn 0.20. Prefer its
# replacement, `sklearn.model_selection`, and fall back to the legacy module
# (imported at the top of the notebook) on installs that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    train_test_split = cross_validation.train_test_split

# Fixed seed so the train/test split -- and therefore the reported accuracy --
# is the same on every Restart & Run All.
SEED = 42

# Set up the training and testing sets, retaining 20% of the data for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Build the nearest-neighbours classifier with its default settings.
clf = neighbors.KNeighborsClassifier()

# Fit the classifier to the training data.
clf.fit(X_train, y_train)

# How accurate is it? score() is evaluated on the held-out test set.
accuracy = clf.score(X_test, y_test)
print(accuracy)

0.978571428571


In [84]:
# Hand-written samples to classify; these should not appear in the original
# dataset. Each inner list holds one sample's measurements, without the ID and
# without the class (the value we're trying to predict).
example_measures = np.array([
    [9, 2, 5, 3, 2, 6, 3, 10, 2],
    [9, 4, 6, 5, 1, 5, 2, 11, 3],
    [10, 1, 1, 1, 2, 1, 3, 7, 1],
])

# Reshape to (number of samples, everything else) to keep sklearn happy: one
# row per sample in example_measures.
n_samples = example_measures.shape[0]
example_measures = example_measures.reshape(n_samples, -1)

# Make the predictions with the classifier fitted in the earlier cell.
prediction = clf.predict(example_measures)
print(prediction)

[4 4 2]
