#### importing the relevant packages

In [48]:
import pandas as pd
from sklearn import neighbors, datasets

#### Loading the data

In [49]:
iris = datasets.load_iris()

X, y = iris.data, iris.target

X = pd.DataFrame(X, columns=iris.feature_names)
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [50]:
#iris.target

In [51]:
X.shape

(150, 4)

So we have 150 rows of data, which corresponds to 150 different data points (in this case flowers), each data point has 4 features.

In [52]:
y.shape

(150,)

for each of the 150 data points, we have a label stored in the y variable.

In [53]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

The y variable values - 0,1 and 2 correspond to the three different species the flower belongs to.

In [54]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

#### Modeling the data

In [55]:
# create the model, consider this a black box
knn = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform')

# training the model on our data
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

#### using the model to make a prediction

In [56]:
# What kind of iris has 3cm x 5cm sepal and 4cm x 2cm petal?
X_pred = [3, 5, 4, 2]

result = knn.predict([X_pred, ])
print(iris.target_names[result])

['versicolor']


In [57]:
# We can also predict the probability of belonging to differnt classes
print(iris.target_names)
print(knn.predict_proba([X_pred, ]))

['setosa' 'versicolor' 'virginica']
[[0.  0.8 0.2]]


#### evaluating the model
We need to split our data into train and test sets

In [58]:
from sklearn.model_selection import train_test_split

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [60]:
X_train.shape

(112, 4)

In [61]:
# create the model
knn = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform')

# We train just on the X_train variable containing 112 examples
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [62]:
X_test.shape

(38, 4)

In [63]:
# Now we use the trained model to predict on the unseen X_test data
predictions = knn.predict(X_test)

In [64]:
# The model returns it's prediction of flower species
# based on what it had learned from the training data
predictions

array([1, 2, 1, 2, 0, 2, 2, 1, 2, 2, 1, 2, 1, 0, 2, 1, 2, 0, 0, 0, 0, 1,
       0, 0, 2, 2, 2, 1, 1, 2, 1, 2, 1, 0, 1, 1, 1, 0])

In [65]:
# The original flower labels are stored in the variable y_test
y_test

array([1, 2, 1, 2, 0, 2, 1, 1, 2, 2, 1, 2, 1, 0, 2, 1, 2, 0, 0, 0, 0, 2,
       0, 0, 2, 2, 2, 1, 1, 2, 1, 2, 1, 0, 1, 1, 1, 0])

In [66]:
# Comparing the predictions with original values
from sklearn.metrics import accuracy_score
accuracy_score(predictions, y_test)

0.9473684210526315

Our model classifies more than 90% of unseen data correctly!