## Advanced tuning of parameters

In this tutorial, we will apply the skills from previous tutorials and build a classifier using the `Pipelines` and `FeatureUnion` functions from sklearn.

In [13]:
# IMPORT PACKAGES
import os.path
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge

### Data

We will be building a binary classifier that predicts wheter a person has diabetes or not, using information on the patient's health. 

The data can be found [here](https://drive.google.com/file/d/1TvCKlmH3Z32XAKk-VUcZyYu95Ccyw3PO/view?usp=sharing). 


In [45]:
data = '/Users/patrickokwir/Desktop/Lighthouse-data-notes/Unit_8/Day_1/pipelines_and_persistence_exercise-master/pima-indians-diabetes.csv'

In [15]:
col_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

In [46]:
# Load the data
df = pd.read_csv(data, delimiter=';')

In [47]:
df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### Task

Build classifier which predicts the target variable `class` using rest of the attributes. The model should be fitted using pipeline that contains:
- PCA method
- SelectKBest method
- FeatureUnion
- Random Forest

Choose the best set of parameters using `Pipeline` and grid_search.

> #### Note
> **In this exercise, we are focusing on the implementation of pipeline. Since we have only 9 columns in our dataset the PCA is probably not the best technique to use during the data preparation from a methodology point of view.**

In [51]:
# import pipeline
from sklearn.pipeline import Pipeline
# import random forest classifier
from sklearn.ensemble import RandomForestClassifier
# import train test split
from sklearn.model_selection import train_test_split
# import accuracy score
from sklearn.metrics import accuracy_score
# import gridsearchcv
from sklearn.model_selection import GridSearchCV
# import train test split
from sklearn.model_selection import train_test_split
# import accuracy score


In [53]:
# split the data into training and test set
X_train, X_test, y_train, y_test = train_test_split(df[col_names[:-1]], df['class'], test_size=0.2, random_state=0)

In [55]:
# build pipeline for feature selection, parameter tuning, and model training
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(f_regression)),
    ('classifier', RandomForestClassifier())
])

# set parameters for feature selection
k = [2, 3, 4, 5, 6, 7, 8]
n_estimators = [10, 50, 100, 200, 300, 400, 500]
max_depth = [2, 4, 6, 8, 10, 12, 14]
cv = 5

# create a dictionary of all the parameter options
# Note has you can access the parameters of steps of a pipeline by using '__’
parameters = dict(selector__k=k,
                    classifier__n_estimators=n_estimators,
                    classifier__max_depth=max_depth)

# conduct parameter tuning using grid search
# create a grid search object
clf = GridSearchCV(pipe, parameters)

# fit the grid search
clf.fit(X_train, y_train)

# print the best parameters
print(clf.best_params_)

# print the best estimator
print(clf.best_estimator_)

# print the best model
print(clf.best_score_)

# print the accuracy score
print(accuracy_score(y_test, clf.predict(X_test)))





{'classifier__max_depth': 4, 'classifier__n_estimators': 200, 'selector__k': 6}
Pipeline(steps=[('scaler', StandardScaler()),
                ('selector',
                 SelectKBest(k=6,
                             score_func=<function f_regression at 0x14dad4790>)),
                ('classifier',
                 RandomForestClassifier(max_depth=4, n_estimators=200))])
0.7720245235239238
0.7922077922077922


In [56]:
from sklearn.pipeline import Pipeline, FeatureUnion

In [57]:
# build pipeline for feature union(k-best and PCA), parameter tuning, and model training
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_union', FeatureUnion([
        ('pca', PCA()),
        ('kbest', SelectKBest(f_regression))
    ])),
    ('classifier', RandomForestClassifier())
])

# set parameters for feature union
k = [2, 3, 4, 5, 6, 7, 8]
n_components = [2, 3, 4, 5, 6, 7, 8]
n_estimators = [10, 50, 100, 200, 300, 400, 500]
max_depth = [2, 4, 6, 8, 10, 12, 14]
cv = 5

# create a dictionary of all the parameter options
# Note has you can access the parameters of steps of a pipeline by using '__’
parameters = dict(feature_union__pca__n_components=n_components,
                    feature_union__kbest__k=k,
                    classifier__n_estimators=n_estimators,
                    classifier__max_depth=max_depth)

# conduct parameter tuning using grid search
# create a grid search object
clf = GridSearchCV(pipe, parameters)

# fit the grid search
clf.fit(X_train, y_train)

# print the best parameters
print(clf.best_params_)
# print the best estimator
print(clf.best_estimator_)
# print the best model
print(clf.best_score_)
# print the accuracy score
print(accuracy_score(y_test, clf.predict(X_test)))



{'classifier__max_depth': 4, 'classifier__n_estimators': 50, 'feature_union__kbest__k': 7, 'feature_union__pca__n_components': 4}
Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_union',
                 FeatureUnion(transformer_list=[('pca', PCA(n_components=4)),
                                                ('kbest',
                                                 SelectKBest(k=7,
                                                             score_func=<function f_regression at 0x14dad4790>))])),
                ('classifier',
                 RandomForestClassifier(max_depth=4, n_estimators=50))])
0.7801012928162068
0.7792207792207793
