# Pipelines
*Sequentially apply a list of transforms and a final estimator.*
- Scaling or imputation are examples of *transforms*
- a classifier is an *estimator*

In [4]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

### The Mamography Mass dataset from UCI

In [5]:
# Note: Given what 'Shape' and 'Margin' actually mean it is not really valid to treat 
# them as numeric/ordinal.
# Info on dataset available at https://archive.ics.uci.edu/ml/datasets/Mammographic+Mass

mam_mass = pd.read_csv('data/MamMass.csv',na_values='?')
mam_mass.pop('BI-RADS')  # we're not using this variable in this example
y = mam_mass.pop('Severity').values
X = mam_mass.values

In [6]:
mam_mass.head()

Unnamed: 0,Age,Shape,Margin,Density
0,67.0,3.0,5.0,3.0
1,43.0,1.0,1.0,
2,58.0,4.0,5.0,3.0
3,28.0,1.0,1.0,3.0
4,74.0,1.0,5.0,


### Two sample missing value imputers from `sklearn`
- `SimpleImputer` replace missing values with the mean for that column
- `KNNImputer` use similar instances to estimate missing values

In [7]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean') # Not used
imp_kNN = KNNImputer(missing_values = np.nan)
imp_kNN.fit(X)
Xi = imp_kNN.transform(X)

Also scale the data (otherwise `Age` attribute will dominate)

In [8]:
bScal = StandardScaler().fit(Xi)
XiS = bScal.transform(Xi)

Making the train-test-split after Imputation and Scaling is not the right way to do things.

In [9]:
X_train, X_test, y_train, y_test = train_test_split(XiS, y, 
                                                    test_size=0.2,
                                                    random_state=42)
X_train.shape, X_test.shape

((768, 4), (193, 4))

In [10]:
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)
print("Accuracy: {0:4.2f}".format(accuracy_score(y_test,y_pred)))
confusion_matrix(y_test, y_pred)


Accuracy: 0.84


array([[82, 19],
       [12, 80]])

## Fit Impute and Scale transforms on Train data only
The right way to do it. 

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                    random_state=42)
X_train.shape, X_test.shape

((768, 4), (193, 4))

In [12]:
imp_kNN = KNNImputer(missing_values = np.nan)
imp_kNN.fit(X_train)
Xi_train = imp_kNN.transform(X_train)
Xi_test = imp_kNN.transform(X_test)

In [13]:
bScal = StandardScaler().fit(Xi_train)
XiS_train = bScal.transform(Xi_train)
XiS_test = bScal.transform(Xi_test)

In [14]:
knn = KNeighborsClassifier()  #default hyperparameters 
knn.fit(XiS_train,y_train)
y_pred = knn.predict(XiS_test)
print("Accuracy: {0:4.2f}".format(accuracy_score(y_test,y_pred)))
confusion_matrix(y_test, y_pred)


Accuracy: 0.82


array([[78, 23],
       [12, 80]])

In [24]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

## With Pipelines

In [21]:
kNNpipe  = Pipeline(steps=[
    ('imputer', KNNImputer(missing_values = np.nan)),
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier())])


In [22]:
kNNpipe.fit(X_train, y_train)
y_pred = kNNpipe.predict(X_test)
print("Accuracy: {0:4.2f}".format(accuracy_score(y_test,y_pred)))
confusion_matrix(y_test, y_pred)


Accuracy: 0.82


array([[78, 23],
       [12, 80]])

## Pipelines & Cross Validation

In [26]:
kNNpipe  = Pipeline(steps=[
    ('imputer', KNNImputer(missing_values = np.nan)),
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier())])


acc_arr = cross_val_score(kNNpipe, X, y, cv=5, n_jobs = -1)
print("Accuracy: {0:4.2f}".format(sum(acc_arr)/len(acc_arr)))
confusion_matrix(y_test, y_pred)


Accuracy: 0.78


array([[78, 23],
       [12, 80]])

Accuracy estimate with pipeline and cross-validation is worse than with hold-out - why?  
Hold-out split is a *lucky* split - change `random_state` and repeat. 

## Pipelines & Grid Search

The pipeline

In [15]:
kNNpipe  = Pipeline(steps=[
    ('imputer', KNNImputer(missing_values = np.nan)),
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier())])

*k*-NN hyperparameters to be set

In [16]:
param_grid = {'classifier__n_neighbors':[1,3,5,10], 
              'classifier__metric':['manhattan','euclidean'],
              'classifier__weights':['uniform','distance']}

In [17]:
pipe_gs = GridSearchCV(kNNpipe,param_grid,cv=10, 
                      verbose = 1, n_jobs = -1)

In [18]:
pipe_gs = pipe_gs.fit(X_train, y_train)

Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    2.6s finished


In [19]:
pipe_gs.best_params_

{'classifier__metric': 'manhattan',
 'classifier__n_neighbors': 10,
 'classifier__weights': 'uniform'}

In [20]:
y_pred_gs = pipe_gs.predict(X_test)
print("Accuracy: {0:4.2f}".format(accuracy_score(y_test,y_pred_gs)))
confusion_matrix(y_test, y_pred_gs)

Accuracy: 0.81


array([[82, 19],
       [17, 75]])

How do the best parameters compare with the default parameters above?