In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn import datasets
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ParameterGrid, GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
# Load the iris dataset that ships with scikit-learn.
# It describes iris flowers; the task is to learn to recognise the type (species)
# of a flower from its dimensions.
iris = datasets.load_iris()

# Names of the feature columns, reused below to select the features.
columns = iris.feature_names

# One DataFrame holding a column per feature plus the class label under 'class'.
iris_df = pd.DataFrame(data=iris.data, columns=columns)
iris_df['class'] = pd.Series(data=iris.target)

In [None]:
# Randomly split the data into a training set (60%) and a held-out test set (40%).
# X_* hold the feature columns, y_* hold the classes / labels (types of flowers).
# stratify= keeps the class proportions the same in both subsets.
# random_state= fixes the shuffle, so the split (and every result computed from it
# in the cells below) is identical on each "Restart & Run All".
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    iris_df[columns],
    iris_df['class'],
    test_size=0.4,
    stratify=iris_df['class'],
    random_state=RANDOM_STATE,
)

In [None]:
# Build a pipe: an ordered sequence of named steps (preprocessors and/or models).
# Step 'Scaler' is a MinMaxScaler and step 'clf' is a KNN classifier. The step
# names are what the hyperparameter keys below use to address each object.
estimators = [
    ('Scaler', preprocessing.MinMaxScaler()),
    ('clf', KNeighborsClassifier()),
]
pipe = Pipeline(steps=estimators)

# Hyperparameters to try. Each key is '<step name>__<keyword argument>' and each
# value is the list of alternatives for that argument:
#   - Scaler__feature_range: a single candidate, (0, 1)
#   - clf__n_neighbors:      4 candidates (5, 10, 25, 50)
#   - clf__weights:          2 candidates
#   - clf__metric:           3 candidates
# i.e. 1 * 4 * 2 * 3 = 24 parameter sets in total.
param_grid = {
    'Scaler__feature_range': [(0, 1)],
    'clf__n_neighbors': [5, 10, 25, 50],
    'clf__weights': ['uniform', 'distance'],
    'clf__metric': ['minkowski', 'manhattan', 'chebyshev'],
}

# GridSearchCV (grid search cross-validator) receives the pipe, the grid and the
# name of the scoring method used to rank the parameter sets.
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='f1_micro')

# Fitting generates every combination from param_grid, applies it to the pipe
# (scaler and classifier) and cross-validates it to find the set that scores best.
#
# Besides being convenient, combining a pipe with grid search avoids a pitfall of
# scaling manually and then calling cross_validate(): the scaler must not be fitted
# on the whole cross-validation dataset, only on the part used for training in a
# given round, with the validation part left out.
# Because the scaler is a step of the pipe, it is fitted anew together with the
# classifier on the training part of every round, which is the correct behaviour.
grid_search.fit(X_train, y_train)

# Show the best parameter set.
grid_search.best_params_

In [None]:
def _step_kwargs(best_params, step_name):
    """Return the entries of ``best_params`` that belong to one pipeline step.

    Keys in ``best_params`` have the form '<step name>__<keyword>'. Only the
    leading '<step name>__' is removed, so a keyword that itself contains '__'
    is kept whole instead of being cut at the second separator.

    Parameters
    ----------
    best_params : dict
        Mapping such as ``grid_search.best_params_``.
    step_name : str
        Name of the pipeline step ('Scaler' or 'clf' in this notebook).

    Returns
    -------
    dict
        Keyword arguments for the object used in that step.
    """
    prefix = step_name + '__'
    return {
        key[len(prefix):]: val
        for key, val in best_params.items()
        if key.startswith(prefix)
    }


# Split the best hyperparameters by pipeline step, with the step names removed,
# so that they can be passed directly as keyword arguments.
kwargs_scaler = _step_kwargs(grid_search.best_params_, 'Scaler')
kwargs_clf = _step_kwargs(grid_search.best_params_, 'clf')

# Create a new scaler with the selected settings and fit it on all training data.
# The test data is only transformed, never used for fitting.
# NOTE: from here on X_train / X_test hold the scaled values returned by the scaler.
scaler = preprocessing.MinMaxScaler(**kwargs_scaler)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create a classifier using the best hyperparameters, train it and apply it to the
# test data.
# (GridSearchCV was created without refit=..., so with its documented default it has
# already refitted this configuration as grid_search.best_estimator_; the steps are
# repeated by hand here to make them explicit.)
clf = KNeighborsClassifier(**kwargs_clf)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Check results: rows are true classes, columns are predicted classes.
metrics.confusion_matrix(y_test, y_pred, labels=[0, 1, 2])