In [1]:
import pandas as pd
import numpy as np

# For scaling data
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [9]:
# Load the Iris dataset and assemble it into a single DataFrame:
# one column per feature plus a 'target' column holding the class label.
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
# A bare expression is only rendered when it is the last statement of a cell,
# so intermediate results are shown explicitly with display() (a builtin inside
# IPython/Jupyter kernels).
display(df.head())


# Arrange data into a features matrix and target vector
X = df.drop(columns='target')
y = df['target']
display(X.shape, y.shape)


# Train Test Split
# random_state is fixed so the split (and therefore the reported score) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
# Instantiate Standard Scaler
# Standardization of a dataset is a common requirement for many machine learning estimators
# ("estimator" is scikit-learn's name for a model).
# The scaler is only created here; it is not fitted in this cell. It is passed to the pipeline
# later, and fitting that pipeline is what fits the scaler.

scaler = StandardScaler()

In [11]:
# K-Nearest Neighbors

# Step 1: Make an instance of the model.
# This is where the model's hyperparameters are tuned. n_neighbors is set to 1 here
# (scikit-learn's default is 5); try other values and compare the resulting test accuracy.

knn = KNeighborsClassifier(n_neighbors=1)
# Look at the hyperparameters that seem tunable. display() is needed because a bare
# expression in the middle of a cell is evaluated and then silently discarded
# (display is a builtin inside IPython/Jupyter kernels).
display(knn.get_params())

# Step 2: Put the scaler and the model into a pipeline and fit the pipeline on the training data.
# Only X_train / y_train are passed to fit, so the scaler never sees the test rows.
knn_pipe = make_pipeline(scaler, knn)
knn_pipe.fit(X_train, y_train)

# Step 3: Predict the values for the test set
predictions = knn_pipe.predict(X_test)
display(predictions)

# Step 4: Evaluate the model's performance on the test set
# For a classifier, .score returns the classification accuracy on the given data.
acc_score = knn_pipe.score(X_test, y_test)
acc_score

1.0

In [None]:
# If KNN can get to 100% accuracy, why isn't it used more often?

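# The accuracy was 1.0 (100%). Note that this was measured on a single small held-out split of Iris, an easy dataset, so it does not
# show that KNN is this accurate in general.
# KNN is an instance-based (lazy learning) model: it stores all of the data it was fit on, which is very memory intensive.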

# There is no explicit training phase before classification. In other words, .fit does much less work for this model than it does for
# models such as linear regression, decision trees, or neural networks.

# Keeping the entire dataset in memory and performing classification/regression can be computationally expensive, because at prediction
# time the algorithm has to search through the stored data points to find the nearest neighbors. For this reason, KNN tends to work
# best on smaller datasets that do not have many features.