In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [2]:
iris = datasets.load_iris()
X, y = iris.data[50:, [1, 2]], iris.target[50:]

In [3]:
X[:5]

array([[3.2, 4.7],
       [3.2, 4.5],
       [3.1, 4.9],
       [2.3, 4. ],
       [2.8, 4.6]])

In [4]:
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [5]:
le = LabelEncoder()
y = le.fit_transform(y)

In [6]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [7]:
X_train, X_test, y_train, y_test =train_test_split(X, y,test_size=0.5,random_state=1,stratify=y)

### Using the training dataset, we now will train three different classifiers:

1. `Logistic regression classifier`
2. `Decision tree classifier`
3. `k-nearest neighbors classifier`


#### We will then evaluate the model performance of each classifier via 10-fold cross-validation on the training dataset before we combine them into an ensemble classifier

In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import numpy as np

In [9]:
clf1 = LogisticRegression(penalty='l2',C=0.001,solver='lbfgs',random_state=1)
clf2 = DecisionTreeClassifier(max_depth=1,criterion='entropy',random_state=0)
clf3 = KNeighborsClassifier(n_neighbors=1,p=2,metric='minkowski')

In [10]:
pipe1 = Pipeline([['sc', StandardScaler()],['clf', clf1]]) # lr
clf2 # decision tree don't need scaling, as they are scale invariant
pipe3 = Pipeline([['sc', StandardScaler()],['clf', clf3]]) # knn 

In [11]:
clf_labels = ['Logistic regression', 'Decision tree', 'KNN']
print('10-fold cross validation:\n')

for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
    scores = cross_val_score(estimator=clf,
    X=X_train,
    y=y_train,
    cv=10
    # ,
    # scoring='accuracy'
    )
    print(f'Accuracy: {scores.mean():.2f} '
    f'(+/- {scores.std():.2f}) [{label}]')

10-fold cross validation:

Accuracy: 0.44 (+/- 0.08) [Logistic regression]
Accuracy: 0.86 (+/- 0.20) [Decision tree]
Accuracy: 0.84 (+/- 0.15) [KNN]


### using voting classfier

In [12]:
from sklearn.ensemble import  VotingClassifier
mv_clf = VotingClassifier(estimators=[('lr',pipe1), ('dt',clf2), ('knn',pipe3)],voting='hard')

clf_labels += ['Majority voting']

all_clf = [pipe1, clf2, pipe3, mv_clf]

for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf,
    X=X_train,
    y=y_train,
    cv=10
    # ,scoring='accuracy'
    )
    print(f'Accuracy: {scores.mean():.2f} 'f'(+/- {scores.std():.2f}) [{label}]')

Accuracy: 0.44 (+/- 0.08) [Logistic regression]
Accuracy: 0.86 (+/- 0.20) [Decision tree]
Accuracy: 0.84 (+/- 0.15) [KNN]
Accuracy: 0.84 (+/- 0.20) [Majority voting]


### Scoring on test set

In [13]:
for clf, label in zip(all_clf, clf_labels):
    # assuming the label of the positive class is 1
    y_pred = clf.fit(X_train,y_train)
    print(f'Score: {clf.score(X_test,y_test):.2f} [{label}]')

Score: 0.84 [Logistic regression]
Score: 0.90 [Decision tree]
Score: 0.86 [KNN]
Score: 0.90 [Majority voting]


### doing grid search and hyperparameter tuning

In [14]:
mv_clf.get_params()

{'estimators': [('lr',
   Pipeline(steps=[('sc', StandardScaler()),
                   ['clf', LogisticRegression(C=0.001, random_state=1)]])),
  ('dt',
   DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0)),
  ('knn',
   Pipeline(steps=[('sc', StandardScaler()),
                   ['clf', KNeighborsClassifier(n_neighbors=1)]]))],
 'flatten_transform': True,
 'n_jobs': None,
 'verbose': False,
 'voting': 'hard',
 'weights': None,
 'lr': Pipeline(steps=[('sc', StandardScaler()),
                 ['clf', LogisticRegression(C=0.001, random_state=1)]]),
 'dt': DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0),
 'knn': Pipeline(steps=[('sc', StandardScaler()),
                 ['clf', KNeighborsClassifier(n_neighbors=1)]]),
 'lr__memory': None,
 'lr__steps': [('sc', StandardScaler()),
  ['clf', LogisticRegression(C=0.001, random_state=1)]],
 'lr__verbose': False,
 'lr__sc': StandardScaler(),
 'lr__clf': LogisticRegression(C=0.001, random_state=1)

#### Let’s now tune the inverse regularization parameter, C , of the logistic regression classifier and the decision tree depth via a grid search for demonstration purposes

In [15]:
from sklearn.model_selection import GridSearchCV
params = {'dt__max_depth': [1, 2],'lr__clf__C': [0.001, 0.1, 100.0]}

In [16]:
grid = GridSearchCV(estimator=mv_clf,param_grid=params,cv=10)
grid.fit(X_train, y_train)

#### After the grid search has completed, we can print the different hyperparameter value combinations and the average scores computed via 10-fold cross-validation as follows:

In [17]:
for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    mean_score = grid.cv_results_['mean_test_score'][r]
    std_dev = grid.cv_results_['std_test_score'][r]
    params = grid.cv_results_['params'][r]
    print(f'{mean_score:.3f} +/- {std_dev:.2f} {params}')

0.840 +/- 0.20 {'dt__max_depth': 1, 'lr__clf__C': 0.001}
0.860 +/- 0.20 {'dt__max_depth': 1, 'lr__clf__C': 0.1}
0.880 +/- 0.20 {'dt__max_depth': 1, 'lr__clf__C': 100.0}
0.840 +/- 0.20 {'dt__max_depth': 2, 'lr__clf__C': 0.001}
0.860 +/- 0.20 {'dt__max_depth': 2, 'lr__clf__C': 0.1}
0.880 +/- 0.20 {'dt__max_depth': 2, 'lr__clf__C': 100.0}


In [18]:
print(f'Best parameters: {grid.best_params_}')

Best parameters: {'dt__max_depth': 1, 'lr__clf__C': 100.0}


In [19]:
print(f'Best parameters: {grid.best_params_}')

Best parameters: {'dt__max_depth': 1, 'lr__clf__C': 100.0}


#### As you can see, we get the best cross-validation results when we choose a higher regularization strength ( C=100), whereas the tree depth does not seem to affect the performance at all, suggesting that a decision stump is sufficient to separate the data