🤖⚡ **scikit-learn tips**

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Record the scikit-learn version this notebook was run with;
# several estimators used further down are version-dependent.
import sklearn
print(sklearn.__version__)

0.23.2


In [5]:
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import plot_confusion_matrix

In [4]:
# select columns by data type
# Each selector is built from a dtype filter only; no column names are hard-coded here.
num_cols = make_column_selector(dtype_include='number')  # numeric columns
cat_cols = make_column_selector(dtype_exclude='number')  # every non-numeric column

Four options for handling missing values (NaNs):

Drop rows containing NaNs  
Drop columns containing NaNs  
Fill NaNs with imputed values  
Use a model that natively handles NaNs (NEW!)  

In [None]:
# KNNImputer is used below but is not imported by any of the visible import cells,
# so bring it into scope here (otherwise this cell raises NameError on a fresh kernel).
from sklearn.impute import KNNImputer

# Univariate baseline: with default arguments SimpleImputer fills NaNs with the column mean.
imp = SimpleImputer()

# add_indicator=True additionally appends a binary "missingness" indicator as a new feature:
# the indicator is 1 where the value was missing, and the original column holds the imputed mean.
imputer = SimpleImputer(add_indicator=True)


# Try KNNImputer or IterativeImputer (inspired by R's MICE package).
# Both are multivariate approaches (they take other features into account!)
# IterativeImputer is experimental: it relies on the enable_iterative_imputer import
# from the import cell having been executed first.
impute_it = IterativeImputer()
impute_knn = KNNImputer(n_neighbors=2)

Don't use drop='first' with OneHotEncoder. Here's why:

1. Multicollinearity is rarely an issue with scikit-learn models
2. drop='first' is incompatible with handle_unknown='ignore'
3. It may be problematic if you standardize all features or use a regularized model

In [None]:
# left-to-right column order is alphabetical (circle, oval, square)
# sparse=False asks for a dense array rather than a sparse matrix
ohe = OneHotEncoder(sparse=False)
# NOTE: this rebinds `ohe`, so only the handle_unknown='ignore' encoder remains in scope
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore') # category that has not been learnt in fit step is encoded as all zeros

# category ordering (within each feature) is defined by you
# One list per column, given in the same order as the columns passed to fit_transform.
oe = OrdinalEncoder(categories=[['first', 'second', 'third'], ['S', 'M', 'L', 'XL']])
# NOTE(review): X is not defined in the visible cells; presumably a DataFrame with
# 'Class' and 'Size' columns — confirm it is loaded earlier.
oe.fit_transform(X[['Class', 'Size']])

In [None]:
# The experimental enable_* import must be executed before the estimator import below;
# keep the two lines in this order.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

clf = HistGradientBoostingClassifier()
# no errors, despite NaNs in train and test!
# NOTE(review): train, label and test are not defined in the visible cells — confirm they are loaded earlier.
clf.fit(train, label)
clf.predict(test)

`Pipeline` requires you to name each step; `make_pipeline` generates the step names automatically.

(Same applies to ColumnTransformer vs make_column_transformer)

In [None]:
# ColumnTransformer and Pipeline are used below but are not imported by any of the
# visible import cells (only the make_* helpers are), so import them here to avoid
# a NameError on a fresh kernel.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Option 1: the make_* helpers take the transformers/estimators directly and
# generate the step names for you.
ct = make_column_transformer(
    (ohe, ['Embarked', 'Sex']),
    (imp, ['Age']),
    remainder='passthrough')
pipe = make_pipeline(ct, clf)
#-----------------------------------
# Option 2: the classes take explicit (name, estimator, ...) tuples.
# This rebinds ct and pipe, so the explicitly named versions are the ones left in scope.
ct = ColumnTransformer(
    [('encoder', ohe, ['Embarked', 'Sex']),
     ('imputer', imp, ['Age'])],
    remainder='passthrough')
pipe = Pipeline([('preprocessor', ct), ('classifier', clf)])

## Cross-validate the entire pipeline (not just the model)

In [None]:
# cross_val_score is also imported in the top import cell; repeated here so the cell runs on its own.
from sklearn.model_selection import cross_val_score
# Score the whole pipeline (preprocessing + model) with 5-fold CV and average the fold accuracies.
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()

## Find optimal tuning parameters for the entire pipeline

In [None]:
# specify parameter values to search
# Keys use the <step name>__<parameter name> convention for nested estimators.
# NOTE(review): these keys assume auto-generated step names ('columntransformer',
# 'countvectorizer', 'logisticregression'). The pipe built earlier in this notebook uses
# the explicit names 'preprocessor'/'classifier' and contains no CountVectorizer —
# confirm which pipeline this cell is meant to run against.
params = {}
params['columntransformer__countvectorizer__min_df'] = [1, 2]
params['logisticregression__C'] = [0.1, 1, 10]
# NOTE(review): 'l1' is presumably only valid for some LogisticRegression solvers — verify the solver used.
params['logisticregression__penalty'] = ['l1', 'l2']


# try all possible combinations of those parameter values
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
# trailing semicolon suppresses the fitted-estimator repr in the cell output
grid.fit(X, y);

# what was the best score found during the search?
grid.best_score_


# which combination of parameters produced the best score?
grid.best_params_

GridSearchCV taking too long? Try RandomizedSearchCV with a small number of iterations.

Make sure to specify a distribution (instead of a list of values) for continuous parameters!

In [None]:
# try "n_iter" random combinations of those parameter values
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV

# Start from the grid-search space, but sample the continuous parameter C from a
# distribution instead of a fixed list (lists remain fine for discrete parameters).
# uniform(loc, scale) samples from [loc, loc + scale], i.e. [0.1, 10] here — the same
# range the grid covered. The copy leaves `params` untouched for the grid-search cell.
rand_params = dict(params)
rand_params['logisticregression__C'] = uniform(loc=0.1, scale=9.9)

# random_state fixes the sampled candidates so the search is reproducible
rand = RandomizedSearchCV(pipe, rand_params, n_iter=10, cv=5, scoring='accuracy', random_state=1)
# trailing semicolon suppresses the fitted-estimator repr in the cell output
rand.fit(X, y);

# what was the best score found during the search?
rand.best_score_


# which combination of parameters produced the best score?
rand.best_params_

Hyperparameter search results (from GridSearchCV or RandomizedSearchCV) can be converted into a pandas DataFrame.

Makes it far easier to explore the results!

In [None]:
# convert results into a DataFrame
results = pd.DataFrame(grid.cv_results_)[['params', 'mean_test_score', 'rank_test_score']]
# sort by test score
results.sort_values('rank_test_score')

### Plotting confusion matrix

In [None]:
# pass it a trained model: it makes predictions for X_test and compares them to y_test
# cmap='Blues' picks the colour map; values_format='d' formats the cell counts as integers.
# NOTE(review): X_test and y_test are not defined in the visible cells — confirm they come from an earlier split.
# NOTE(review): plot_confusion_matrix matches the 0.23.x release printed above; newer scikit-learn
# releases provide ConfusionMatrixDisplay.from_estimator instead — verify before upgrading.
disp = plot_confusion_matrix(clf, X_test, y_test, cmap='Blues', values_format='d')