In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn import datasets
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

In [None]:
# Load the built-in iris dataset that ships with scikit-learn.
# It holds measurements of iris flowers; the task is to learn to recognise
# the species of a flower from its dimensions.
iris = datasets.load_iris()
columns = iris.feature_names

# One row per flower: the feature columns plus the numeric class label.
iris_df = pd.DataFrame(data=iris.data, columns=columns)
iris_df['class'] = iris.target

In [None]:
# Fixed seed so the random split (and every metric computed downstream) is
# reproducible under Restart Kernel -> Run All.
RANDOM_STATE = 42

# Split the data randomly into train and test sets (60% / 40%).
# X holds the features, y holds the classes / labels (types of flowers).
# stratify keeps the class proportions the same in both subsets.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    iris_df[columns],
    iris_df['class'],
    test_size=0.4,
    stratify=iris_df['class'],
    random_state=RANDOM_STATE,
)

# Preprocessing: min-max scaling of every feature to the [0, 1] range.
# The scaler is fitted on the training data only, so no information from the
# test set leaks into the preprocessing step.
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1), copy=True)
X_train = scaler.fit_transform(X_train_raw)
# X_test is scaled with the parameters learned from X_train; its values may
# therefore fall slightly outside [0, 1].
X_test = scaler.transform(X_test_raw)

In [None]:
# Build a k-nearest-neighbours classifier and fit it on the training data.
# weights='distance' makes closer neighbours count more than distant ones.
clf = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    metric='minkowski',
).fit(X_train, y_train)

# Last expression of the cell: display the fitted estimator.
clf

In [None]:
# Predict the class of every sample in the test set.
y_pred = clf.predict(X_test)

# A confusion matrix is one way to judge a classifier: it shows how the
# actual labels (y_test) overlap with the predicted labels (y_pred),
# in the class order given by `labels`.
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1, 2])