## 1. Supervised Learning

We begin our ML journey with the supervised learning paradigm, studying some classification and regression algorithms. We are going to use typical, industry-standard, off-the-shelf model implementations. The datasets used are classic sets for an ML introduction: diabetes (regression) and breast cancer (binary classification).

In [1]:
# Import datasets, data splitter and metrics
from sklearn.datasets import load_diabetes, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error # Regression
from sklearn.metrics import accuracy_score 

In [2]:
import math 

In [3]:
# Shared seed: passed as random_state to the train_test_split calls so the train/test
# partitions are reproducible. It is not passed to the estimators themselves.
SEED = 999 # for random state in train test splitting

### Regression Problem: Diabetes dataset

In [4]:
# With return_X_y=False, data is a dict-like object; its 'data' entry holds the feature
# matrix and its 'target' entry the regression target.
# If return_X_y were set to True, the loader would return a (data, target) tuple instead.
data = load_diabetes(return_X_y=False)
X = data['data']
y = data['target']

In [5]:
# Inspect the loaded object: the 'data' and 'target' arrays extracted above.
data

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990842, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06832974, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286377, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04687948,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452837, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00421986,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59

In [6]:
# Use train_test_split to get the train and test sets; the shuffle is seeded with SEED
# so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=SEED)

### Linear Regression

In [7]:
from sklearn.linear_model import LinearRegression

In [8]:
# Initialize a linear regression model; no arguments are passed, so library defaults apply.
lr = LinearRegression()

In [9]:
# No test_size was passed to train_test_split, so the default of 0.25 of the rows applies.
# Compare the total number of samples with the size of the test split.
len(X), len(X_test)

(442, 111)

In [10]:
# The fit method adjusts the model parameters by minimizing an error function
# on the training split.
lr.fit(X_train, y_train)

LinearRegression()

In [11]:
# Train error: RMSE of the fitted model on the data it was trained on.
y_pred_tr = lr.predict(X_train)
# scikit-learn metrics take (y_true, y_pred) in that order.
train_error = math.sqrt(mean_squared_error(y_train, y_pred_tr))
train_error

55.69112244790681

In [12]:
# Test error:

In [13]:
y_pred = lr.predict(X_test)

In [14]:
mse = mean_squared_error(y_pred, y_test)

In [15]:
test_error = math.sqrt(mse)
test_error

47.5372373968945

### Multilayer Perceptron

In [16]:
from sklearn.neural_network import MLPRegressor

In [17]:
# Multilayer perceptron regressor with library defaults. No random_state is passed,
# so results may differ between runs (TODO confirm; seed it if reproducibility matters).
NN = MLPRegressor()

In [18]:
# Check some of its attributes: the activation function and the hidden-layer layout.
NN.activation, NN.hidden_layer_sizes

('relu', (100,))

In [19]:
# Fit on the training split.
# NOTE(review): the train RMSE reported for this model (~156) is far above the linear
# model's (~56), which suggests the default settings have not converged on this data.
NN.fit(X_train, y_train)



MLPRegressor()

In [20]:
# Train error: RMSE of the fitted network on the training split.
y_pred_tr = NN.predict(X_train)
# scikit-learn metrics take (y_true, y_pred) in that order.
train_error = math.sqrt(mean_squared_error(y_train, y_pred_tr))
train_error

156.4650692974529

In [21]:
# Test error:

In [22]:
y_pred = NN.predict(X_test)

In [23]:
mse = mean_squared_error(y_pred, y_test)

In [24]:
test_error = math.sqrt(mse)
test_error

136.81009084124267

### Decision Tree

In [25]:
from sklearn.tree import DecisionTreeRegressor

In [26]:
# Regression tree with library defaults (no depth or leaf-size limit is passed).
dt = DecisionTreeRegressor()

In [27]:
# Fit on the training split only; the test split stays held out for evaluation.
# The errors reported for this model (train RMSE 0.0, test RMSE ~71) show that the
# tree memorises the training set.
dt.fit(X_train, y_train)

DecisionTreeRegressor()

In [28]:
# Train error: RMSE of the fitted tree on the training split.
y_pred_tr = dt.predict(X_train)
# scikit-learn metrics take (y_true, y_pred) in that order.
train_error = math.sqrt(mean_squared_error(y_train, y_pred_tr))
train_error

0.0

In [29]:
# Test error:

In [30]:
y_pred = dt.predict(X_test)

In [31]:
mse = mean_squared_error(y_pred, y_test)

In [32]:
test_error = math.sqrt(mse)
test_error

71.292949754409

### Random Forest

In [33]:
from sklearn.ensemble import RandomForestRegressor

In [34]:
# Random forest regressor with library defaults (no random_state is passed).
rf = RandomForestRegressor()

In [35]:
# Fit on the training split only; the test split stays held out for evaluation.
rf.fit(X_train, y_train)

RandomForestRegressor()

In [36]:
# Train error: RMSE of the fitted forest on the training split.
y_pred_tr = rf.predict(X_train)
# scikit-learn metrics take (y_true, y_pred) in that order.
train_error = math.sqrt(mean_squared_error(y_train, y_pred_tr))
train_error

22.991615390131557

In [37]:
# Test error:

In [38]:
y_pred = rf.predict(X_test)

In [39]:
mse = mean_squared_error(y_pred, y_test)

In [40]:
test_error = math.sqrt(mse)
test_error

52.719078849831725

### AdaBoost

In [41]:
from sklearn.ensemble import AdaBoostRegressor

In [42]:
# AdaBoost regressor with library defaults (no random_state is passed).
ada = AdaBoostRegressor()

In [43]:
# Fit on the training split only; the test split stays held out for evaluation.
ada.fit(X_train, y_train)

AdaBoostRegressor()

In [44]:
# Train error: RMSE of the fitted ensemble on the training split.
y_pred_tr = ada.predict(X_train)
# scikit-learn metrics take (y_true, y_pred) in that order.
train_error = math.sqrt(mean_squared_error(y_train, y_pred_tr))
train_error

47.48624503489752

In [45]:
# Test error:

In [46]:
y_pred = ada.predict(X_test)

In [47]:
mse = mean_squared_error(y_pred, y_test)

In [48]:
test_error = math.sqrt(mse)
test_error

52.04700846092931

### XGBoost

In [49]:
import xgboost as xgb

In [50]:
# XGBoost regressor with library defaults.
xg_reg = xgb.XGBRegressor()

In [51]:
# Fit on the training split only; the test split stays held out for evaluation.
xg_reg.fit(X_train, y_train)



XGBRegressor()

In [52]:
# Train error: RMSE of the fitted booster on the training split.
y_pred_tr = xg_reg.predict(X_train)
# scikit-learn metrics take (y_true, y_pred) in that order.
train_error = math.sqrt(mean_squared_error(y_train, y_pred_tr))
train_error

32.615154929729975

In [53]:
# Test error:

In [54]:
y_pred = xg_reg.predict(X_test)

In [55]:
mse = mean_squared_error(y_pred, y_test)

In [56]:
test_error = math.sqrt(mse)
test_error

51.81883653672616

### Binary Classification Problem: Cancer dataset

In [57]:
# With return_X_y=False, data is a dict-like object; its 'data' entry holds the feature
# matrix and its 'target' entry the binary class labels (0/1).
# If return_X_y were set to True, the loader would return a (data, target) tuple instead.
data = load_breast_cancer(return_X_y=False)
X = data['data']
y = data['target']

In [58]:
# Inspect the loaded object: the 'data' and 'target' arrays extracted above.
data

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [59]:
# Same seeded split as for the regression problem; no test_size is passed.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=SEED)

### Logistic Regression

In [60]:
from sklearn.linear_model import LogisticRegression

In [61]:
# Logistic regression with library defaults.
log_reg = LogisticRegression()

In [62]:
# NOTE(review): this fit emits "TOTAL NO. of ITERATIONS REACHED LIMIT", i.e. the solver
# stops before converging on the unscaled features. As the warning itself suggests,
# scale the data or raise max_iter.
log_reg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [63]:
# Train accuracy: fraction of training samples classified correctly.
y_pred_tr = log_reg.predict(X_train)
# scikit-learn metrics take (y_true, y_pred) in that order.
train_acc = accuracy_score(y_train, y_pred_tr)
train_acc

0.9530516431924883

In [64]:
# Test accuracy:

In [65]:
y_pred = log_reg.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
test_acc

0.9370629370629371

### MLP

In [66]:
from sklearn.neural_network import MLPClassifier

In [67]:
# Multilayer perceptron classifier with library defaults (no random_state is passed).
# Note that this rebinds NN, which previously held the MLP regressor.
NN = MLPClassifier()

In [68]:
# Fit on the training split only; the test split stays held out for evaluation.
NN.fit(X_train, y_train)

MLPClassifier()

In [69]:
# Train accuracy: fraction of training samples classified correctly.
y_pred_tr = NN.predict(X_train)
# scikit-learn metrics take (y_true, y_pred) in that order.
train_acc = accuracy_score(y_train, y_pred_tr)
train_acc

0.9178403755868545

In [70]:
# Test accuracy:

In [71]:
y_pred = NN.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
test_acc

0.8601398601398601

### Decision Tree

In [72]:
from sklearn.tree import DecisionTreeClassifier

In [73]:
# Classification tree with library defaults (no depth or leaf-size limit is passed).
# Note that this rebinds dt, which previously held the regression tree.
dt = DecisionTreeClassifier()

In [74]:
# Fit on the training split only; the test split stays held out for evaluation.
dt.fit(X_train, y_train)

DecisionTreeClassifier()

In [75]:
# Train accuracy: fraction of training samples classified correctly.
y_pred_tr = dt.predict(X_train)
# scikit-learn metrics take (y_true, y_pred) in that order.
train_acc = accuracy_score(y_train, y_pred_tr)
train_acc

1.0

In [76]:
# Test accuracy:

In [77]:
y_pred = dt.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
test_acc

0.9090909090909091

### Random Forest

In [78]:
from sklearn.ensemble import RandomForestClassifier

In [79]:
# Random forest classifier with library defaults (no random_state is passed).
# Note that this rebinds rf, which previously held the random forest regressor.
rf = RandomForestClassifier()

In [80]:
# Fit on the training split only; the test split stays held out for evaluation.
rf.fit(X_train, y_train)

RandomForestClassifier()

In [81]:
# Train accuracy: fraction of training samples classified correctly.
y_pred_tr = rf.predict(X_train)
# scikit-learn metrics take (y_true, y_pred) in that order.
train_acc = accuracy_score(y_train, y_pred_tr)
train_acc

1.0

In [82]:
# Test accuracy:

In [83]:
y_pred = rf.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
test_acc

0.9370629370629371

### AdaBoost

In [84]:
from sklearn.ensemble import AdaBoostClassifier

In [85]:
# AdaBoost classifier with library defaults (no random_state is passed).
# Note that this rebinds ada, which previously held the AdaBoost regressor.
ada = AdaBoostClassifier()

In [86]:
# Fit on the training split only; the test split stays held out for evaluation.
ada.fit(X_train, y_train)

AdaBoostClassifier()

In [87]:
# Train accuracy: fraction of training samples classified correctly.
y_pred_tr = ada.predict(X_train)
# scikit-learn metrics take (y_true, y_pred) in that order.
train_acc = accuracy_score(y_train, y_pred_tr)
train_acc

1.0

In [88]:
# Test accuracy:

In [89]:
y_pred = ada.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
test_acc

0.9370629370629371

### XGBoost

In [90]:
# XGBoost classifier with library defaults (xgb was imported in the regression section).
xgb_clf = xgb.XGBClassifier()

In [91]:
# Fit on the training split only; the test split stays held out for evaluation.
xgb_clf.fit(X_train, y_train)

XGBClassifier()

In [92]:
# Train accuracy: fraction of training samples classified correctly.
y_pred_tr = xgb_clf.predict(X_train)
# scikit-learn metrics take (y_true, y_pred) in that order.
train_acc = accuracy_score(y_train, y_pred_tr)
train_acc

1.0

In [93]:
# Test accuracy:

In [94]:
y_pred = xgb_clf.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
test_acc

0.9300699300699301

## 2. Model selection and evaluation

The default hyperparameters may not give the best model performance. One should try different combinations and validate the resulting model.

#### Model selection with Cross-Validation Regression
We are going to use grid search with 5-fold cross-validation to select hyperparameters. We begin with the regression problem.

In [95]:
# Reload the diabetes data: X, y and the split variables currently hold the breast cancer set.
data = load_diabetes(return_X_y=False)
X = data['data']
y = data['target']

In [96]:
# Same seeded split as in section 1, so the train/test partition is identical.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=SEED)

In [97]:
from sklearn.model_selection import GridSearchCV

In [98]:
# The parameters for the GridSearchCV object are given as a dictionary that maps
# each hyperparameter name to its candidate values.
# For example, for logistic regression with the hyperparameters C and penalty, the grid is:
# log_reg_grid = {'C': [0.1, 0.05, 1, 5, 10], 'penalty': ['l1', 'l2']}
# Grid search means that the estimator is evaluated on every combination of the listed values;
# for this log_reg_grid the total number of combinations is 5 * 2 = 10.


In [99]:
# We compare Linear regression with penalization, MLP, and RandomForest

#### In order to regularize a linear regression model, two estimators will be used: Lasso and Ridge 
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge
The grid search will then be performed over the alpha parameter

In [100]:
from sklearn.linear_model import Lasso, Ridge

In [101]:
# Base estimators for the grid searches; alpha is supplied by the grid, not set here.
lasso = Lasso()
ridge = Ridge()

In [102]:
# Candidate regularisation strengths, shared by both linear models.
alpha_grid = (0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0)
lasso_params = {'alpha': list(alpha_grid)}
ridge_params = {'alpha': list(alpha_grid)}

In [103]:
# 5-fold cross-validated grid search over alpha. The scorer is the negated RMSE
# (higher is better), so the sign is flipped back when the best score is reported.
lasso_grid = GridSearchCV(estimator=lasso, param_grid=lasso_params, cv=5, scoring='neg_root_mean_squared_error')

In [104]:
%%timeit
# NOTE(review): %%timeit repeats the whole search many times (the output reports 7 runs
# of 10 loops each); %%time would fit once and still report the duration.
lasso_grid.fit(X_train, y_train) # Only X_train is split into the CV folds; the test set is kept for the final evaluation

142 ms ± 10.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [105]:
# Full cross-validation log: fit/score timings and per-split test scores for every alpha.
lasso_grid.cv_results_

{'mean_fit_time': array([0.00286274, 0.00212264, 0.00144577, 0.00101366, 0.00120802,
        0.00243497, 0.00108371]),
 'std_fit_time': array([5.33418857e-04, 7.46656359e-04, 8.02414608e-04, 4.27488169e-05,
        4.56204077e-04, 2.99920299e-03, 1.26825472e-04]),
 'mean_score_time': array([0.00058174, 0.00081654, 0.00056143, 0.00055342, 0.00053978,
        0.0005085 , 0.00055041]),
 'std_score_time': array([3.76499098e-05, 5.32068854e-04, 4.72801295e-05, 3.36673374e-05,
        7.73968550e-05, 4.61295037e-05, 3.42731497e-05]),
 'param_alpha': masked_array(data=[0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0],
              mask=[False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'alpha': 0.001},
  {'alpha': 0.01},
  {'alpha': 0.1},
  {'alpha': 0.5},
  {'alpha': 1.0},
  {'alpha': 5.0},
  {'alpha': 10.0}],
 'split0_test_score': array([-59.08361052, -59.19897473, -59.28041167, -62.33718381,
        -68.31259136, -83.68676327, -83.686

In [106]:
# The scorer is the negated RMSE, so flip the sign to report a plain RMSE.
best_rmse = -lasso_grid.best_score_
print('Best hyperparameters: ', lasso_grid.best_params_)
print('Best score: ', best_rmse)

Best hyperparameters:  {'alpha': 0.01}
Best score:  57.58744869608974


In [107]:
# 5-fold cross-validated grid search over alpha, scored by negated RMSE (higher is better).
ridge_grid = GridSearchCV(estimator=ridge, param_grid=ridge_params, cv=5,  scoring='neg_root_mean_squared_error')

In [108]:
%%timeit
# NOTE(review): %%timeit refits the whole search on every loop (7 runs reported in the
# output); %%time would fit once and still report the duration.
ridge_grid.fit(X_train, y_train)

159 ms ± 34.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [109]:
# The scorer is the negated RMSE, so flip the sign to report a plain RMSE.
best_rmse = -ridge_grid.best_score_
print('Best hyperparameters: ', ridge_grid.best_params_)
print('Best score: ', best_rmse)

Best hyperparameters:  {'alpha': 0.1}
Best score:  57.53623023152693


#### MLP

In [110]:
# Base estimator for the grid search; the tuned hyperparameters come from mlp_params.
mlp = MLPRegressor()

In [119]:
# alpha controls the regularisation; more info in the docs https://scikit-learn.org/stable/modules/neural_networks_supervised.html
mlp_params = dict(
    alpha=[0.1, 0.01],
    hidden_layer_sizes=[(20,), (50,)],
    learning_rate_init=[0.01, 0.001],
)  # total combinations: 2 * 2 * 2 = 8

In [120]:
# The grid object is initialized with the regressor, the grid dictionary, the number of
# folds and the scorer (negated RMSE, so higher is better).
mlp_grid = GridSearchCV(estimator=mlp, param_grid=mlp_params, cv=5, scoring='neg_root_mean_squared_error')

In [121]:
%%timeit
# NOTE(review): %%timeit refits the whole search on every loop (7 runs of ~12 s each are
# reported in the output); %%time would fit once and still report the duration.
mlp_grid.fit(X_train, y_train) # Only X_train is split into the CV folds; the test set is kept for the final evaluation



12.3 s ± 2.18 s per loop (mean ± std. dev. of 7 runs, 1 loop each)




In [122]:
# The scorer is the negated RMSE, so flip the sign to report a plain RMSE.
best_rmse = -mlp_grid.best_score_
print('Best hyperparameters: ', mlp_grid.best_params_)
print('Best score: ', best_rmse)

Best hyperparameters:  {'alpha': 0.1, 'hidden_layer_sizes': (50,), 'learning_rate_init': 0.01}
Best score:  59.174632906239495


#### Random Forest

In [123]:
# Base estimator for the grid search; the tuned hyperparameters come from rf_params.
rf = RandomForestRegressor()

In [124]:
# Search space for the random forest: ensemble size, tree depth and bootstrap sample fraction.
rf_params = dict(
    n_estimators=[100, 150],
    max_depth=[8, 10],
    max_samples=[0.9, 1.0],
)  # total combinations: 2 * 2 * 2 = 8

#### We select just a few params for both MLP and RF; in practice the search space covers more hyperparameters and values.


In [125]:
# 5-fold cross-validated grid search over rf_params, scored by negated RMSE (higher is better).
rf_grid = GridSearchCV(estimator=rf, param_grid=rf_params, cv=5, scoring='neg_root_mean_squared_error')

In [126]:
%%timeit
# NOTE(review): %%timeit refits the whole search on every loop (7 runs of ~12 s each are
# reported in the output); %%time would fit once and still report the duration.
rf_grid.fit(X_train, y_train)

12.2 s ± 666 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [127]:
# The scorer is the negated RMSE, so flip the sign to report a plain RMSE.
best_rmse = -rf_grid.best_score_
print('Best hyperparameters: ', rf_grid.best_params_)
print('Best score: ', best_rmse)

Best hyperparameters:  {'max_depth': 8, 'max_samples': 0.9, 'n_estimators': 100}
Best score:  59.80812108751703


We can now select a model, train it on the whole training set, and evaluate it on the test set.
We choose the simplest model, since the cross-validated scores do not differ significantly, at least over the hyperparameter values searched. That is, we are going to train the ridge regressor with an alpha of 0.1.

In [128]:
# Ridge regressor with the alpha selected by the grid search (0.1).
model = Ridge(alpha=0.1)

In [129]:
# Train on the whole training split (no cross-validation folds this time).
model.fit(X_train, y_train)

Ridge(alpha=0.1)

In [130]:
# Final check of the selected model: RMSE on the held-out test split.
y_pred = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order.
val_score = math.sqrt(mean_squared_error(y_test, y_pred))
val_score # score on the test split, which was not used during the grid search

47.88982706257487

#### Model selection with Cross-Validation Classification


In [131]:
# Switch back to the breast cancer data for the classification model selection.
data = load_breast_cancer(return_X_y=False)
X = data['data']
y = data['target']

In [132]:
# Same seeded split as in section 1, so the train/test partition is identical.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=SEED)

#### Logistic Regression

In [133]:
# Base estimator for the grid search; only C is tuned.
log_reg = LogisticRegression()

In [143]:
# Candidate values for C (5 grid points).
log_params = {'C': [0.001, 0.01, 0.1, 1.0, 10.0]}

In [144]:
from sklearn.metrics import f1_score

In [148]:
# 5-fold cross-validated grid search over C, scored by accuracy.
log_grid = GridSearchCV(estimator=log_reg, param_grid=log_params, cv=5, scoring='accuracy')

In [149]:
# NOTE(review): the fits emit repeated "TOTAL NO. of ITERATIONS REACHED LIMIT" warnings,
# i.e. the solver stops before converging on the unscaled features. As the warning itself
# suggests, scale the data or raise max_iter before trusting the comparison across C.
log_grid.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [0.001, 0.01, 0.1, 1.0, 10.0]},
             scoring='accuracy')

In [150]:
# Report the winning grid point and its mean cross-validated accuracy.
for label, value in (('Best hyperparameters: ', log_grid.best_params_),
                     ('Best score: ', log_grid.best_score_)):
    print(label, value)

Best hyperparameters:  {'C': 10.0}
Best score:  0.9577564979480163


#### MLP

In [151]:
# Base estimator for the grid search; the tuned hyperparameters come from mlp_params.
mlp = MLPClassifier()

In [152]:
# Same search space as for the MLP regressor: 2 * 2 * 2 = 8 combinations.
mlp_params = dict(
    alpha=[0.1, 0.01],
    hidden_layer_sizes=[(20,), (50,)],
    learning_rate_init=[0.01, 0.001],
)

In [155]:
# 5-fold cross-validated grid search over mlp_params, scored by accuracy.
mlp_grid = GridSearchCV(estimator=mlp, param_grid=mlp_params, cv=5, scoring='accuracy')

In [156]:
%%timeit
# NOTE(review): %%timeit refits the whole search on every loop (7 runs of ~11 s each are
# reported in the output); %%time would fit once and still report the duration.
mlp_grid.fit(X_train, y_train)



10.8 s ± 840 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [157]:
# Report the winning grid point and its mean cross-validated accuracy.
for label, value in (('Best hyperparameters: ', mlp_grid.best_params_),
                     ('Best score: ', mlp_grid.best_score_)):
    print(label, value)

Best hyperparameters:  {'alpha': 0.1, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.001}
Best score:  0.9225991792065663


#### Random Forest

In [158]:
# Base estimator for the grid search; the tuned hyperparameters come from rf_params.
rf = RandomForestClassifier()

In [159]:
# Search space for the random forest: ensemble size, tree depth and bootstrap sample fraction.
rf_params = dict(
    n_estimators=[100, 150],
    max_depth=[8, 10],
    max_samples=[0.9, 1.0],
)  # total combinations: 2 * 2 * 2 = 8

In [160]:
# 5-fold cross-validated grid search over rf_params, scored by accuracy.
rf_grid = GridSearchCV(estimator=rf, param_grid=rf_params, cv=5, scoring='accuracy')

In [161]:
%%timeit
# NOTE(review): %%timeit refits the whole search on every loop (7 runs of ~10 s each are
# reported in the output); %%time would fit once and still report the duration.
rf_grid.fit(X_train, y_train)

10.1 s ± 316 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [163]:
# Report the winning grid point and its mean cross-validated accuracy.
for label, value in (('Best hyperparameters: ', rf_grid.best_params_),
                     ('Best score: ', rf_grid.best_score_)):
    print(label, value)

Best hyperparameters:  {'max_depth': 8, 'max_samples': 0.9, 'n_estimators': 150}
Best score:  0.9670861833105336


The selected model is the random forest. Special care should be taken, however, when accuracy is used to evaluate a model on a highly imbalanced dataset. Here we measure the class balance as: number of positives / total samples.

In [164]:
# Fraction (between 0 and 1, not a percentage) of positive samples (label 1)
# in the training split:
len(y_train[y_train == 1]) / len(y_train)

0.6502347417840375

In highly imbalanced sets, other metrics are more appropriate, such as ROC AUC and the F1-score.

In [168]:
from sklearn.metrics import f1_score

In [166]:
# Rebuild the random forest with the best hyperparameters reported by the grid search
# (max_depth=8, max_samples=0.9, n_estimators=150).
model = RandomForestClassifier(max_depth=8, max_samples=0.9, n_estimators=150)

In [167]:
# Train on the whole training split (no cross-validation folds this time).
model.fit(X_train, y_train)

RandomForestClassifier(max_depth=8, max_samples=0.9, n_estimators=150)

In [170]:
# Evaluate the selected random forest on the held-out test split.
y_pred = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order.
val_acc = accuracy_score(y_test, y_pred)
val_f1_score = f1_score(y_test, y_pred) # F1-score on the test split

In [171]:
# Report both scores obtained on the held-out test split.
for label, value in (('Validation acc: ', val_acc),
                     ('Validation f1-score: ', val_f1_score)):
    print(label, value)

Validation acc:  0.9370629370629371
Validation f1-score:  0.9447852760736196


### Extra Reading:
#### Bayes Optimization: https://neptune.ai/blog/hyperparameter-tuning-in-python-complete-guide
#### AutoML: https://www.automl.org/automl/
#### Data and concept drift: https://www.evidentlyai.com/blog/machine-learning-monitoring-data-and-concept-drift
