# 🤖⚡ **scikit-learn tips**

Reference material to pull up when using scikit-learn 

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Print the installed scikit-learn version so readers know which release
# the snippets in this notebook were written against.
import sklearn
print(sklearn.__version__)

0.23.2


In [5]:
# necessary imports
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import plot_confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_roc_curve

## Feature engineering and cleaning

In [4]:
# select columns by data type
# num_cols matches every numeric column; cat_cols matches every column that is NOT numeric.
# Both are callables intended to be passed where a column list is expected
# (e.g. in make_column_transformer).
num_cols = make_column_selector(dtype_include='number')
cat_cols = make_column_selector(dtype_exclude='number')

Four options for handling missing values (NaNs):

Drop rows containing NaNs  
Drop columns containing NaNs  
Fill NaNs with imputed values  
Use a model that natively handles NaNs (NEW!)  

In [None]:
# KNNImputer is not imported by the import cell at the top of the notebook,
# so bring it into scope here (otherwise the last line raises NameError).
from sklearn.impute import KNNImputer

# Numeric default: SimpleImputer() with no arguments fills NaNs with the column mean
imp = SimpleImputer()

# Categorical alternatives (pick one -- each assignment replaces the previous `imputer`)
imputer = SimpleImputer(strategy='constant', fill_value='missing')
imputer = SimpleImputer(strategy='most_frequent')

# add_indicator=True appends a "missingness" indicator column as a new feature:
# 1 where the original value was missing, while the value itself is replaced by the imputed mean
imputer = SimpleImputer(add_indicator=True)


# Try KNNImputer or IterativeImputer (inspired by R's MICE package).
# Both are multivariate approaches (they take other features into account!)

impute_it = IterativeImputer()
impute_knn = KNNImputer(n_neighbors=2)

Don't use drop='first' with OneHotEncoder. Here's why:

1. Multicollinearity is rarely an issue with scikit-learn models
2. drop='first' is incompatible with handle_unknown='ignore'
3. May be problematic if you standardize all features or use a regularized model

In [None]:
# One-hot encoding: the output columns follow the sorted category order,
# so left-to-right column order is alphabetical (e.g. circle, oval, square)
ohe = OneHotEncoder(sparse=False)
# handle_unknown='ignore': a category that was not seen during fit is encoded as all zeros
# instead of raising an error at transform time
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Ordinal encoding: category ordering (within each feature) is defined by you,
# one list per encoded column, in the same order as the columns passed in
oe = OrdinalEncoder(categories=[['first', 'second', 'third'], ['S', 'M', 'L', 'XL']])
# NOTE(review): assumes X is a DataFrame with 'Class' and 'Size' columns -- X is not defined in this notebook
oe.fit_transform(X[['Class', 'Size']])

Use drop='if_binary' with OneHotEncoder to drop the first category ONLY if it's a binary feature (meaning it has exactly two categories).

In [None]:
# drop='first' drops the first category in each feature
ohe = OneHotEncoder(sparse=False, drop='first')
ohe.fit_transform(X)

# drop='if_binary' drops the first category of binary features only;
# features with more than two categories keep all of their columns
ohe = OneHotEncoder(sparse=False, drop='if_binary')
ohe.fit_transform(X)

With a tree-based model, try OrdinalEncoder instead of OneHotEncoder even for nominal (unordered) features.

Accuracy will often be similar, but OrdinalEncoder will be much faster!

In [None]:
# Random Forests is a tree-based model
# (random_state fixes the result; n_jobs=-1 trains on all available cores)
rf = RandomForestClassifier(random_state=1, n_jobs=-1)

# Pipeline containing OneHotEncoder
# NOTE(review): `ohe` is whichever encoder was assigned most recently in an earlier cell
ohe_pipe = make_pipeline(ohe, rf)
# %time reports how long the cross-validation takes, for comparison with an OrdinalEncoder pipeline
%time cross_val_score(ohe_pipe, X, y).mean()

You don't need all of the columns produced by one-hot encoding. If the feature is binary, just one of the one-hot columns is sufficient to carry the full information. The same holds for multi-valued features: one column is redundant, so you can drop the first one (but see the caveats about drop='first' above).

Even with NaNs in the data, you can still fit a model that handles missing values natively.

In [None]:
# The experimental import must come first: it is what makes
# HistGradientBoostingClassifier importable in this scikit-learn version.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

clf = HistGradientBoostingClassifier()
# no errors, despite NaNs in train and test!
# NOTE(review): `train`, `label` and `test` are placeholders -- they are not defined in this notebook
clf.fit(train, label)
clf.predict(test)

Create more features based on interaction

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Create polynomial relationships between features
# interaction_only=True keeps products of distinct features (no squares);
# include_bias=False omits the constant column of ones
poly = PolynomialFeatures(include_bias=False, interaction_only=True)
poly.fit_transform(X)

# Input columns: A, B, C
# Output columns: A, B, C, A*B, A*C, B*C

## Pipelines

Pipeline requires naming of steps, make_pipeline does not.

(Same applies to ColumnTransformer vs make_column_transformer)

In [None]:
# ColumnTransformer and Pipeline are not imported by the import cell at the top
# of the notebook (only the make_* helpers are), so import them here.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Option 1: make_* helpers -- step names are generated automatically
ct = make_column_transformer(
    (ohe, ['Embarked', 'Sex']),
    (imp, ['Age']),
    remainder='passthrough')
pipe = make_pipeline(ct, clf)
#-----------------------------------
# Option 2: explicit classes -- same structure, but every step is named by you.
# (This rebinds `ct` and `pipe`; the two options are alternatives.)
ct = ColumnTransformer(
    [('encoder', ohe, ['Embarked', 'Sex']),
     ('imputer', imp, ['Age'])],
    remainder='passthrough')
pipe = Pipeline([('preprocessor', ct), ('classifier', clf)])

In [None]:
# `imp` is the SimpleImputer created in the imputation cell earlier in the notebook.

# impute A, passthrough B & C, then drop the remaining columns
ct = make_column_transformer(
    (imp, ['A']),
    ('passthrough', ['B', 'C']),
    remainder='drop')

# impute A, drop D & E, then passthrough the remaining columns
# (rebinds `ct`; the two transformers are alternatives)
ct = make_column_transformer(
    (imp, ['A']),
    ('drop', ['D', 'E']),
    remainder='passthrough')

In [None]:
# get the names of those one-hot encoded features
# NOTE(review): presumably requires `ct` to be fitted first and to contain a OneHotEncoder -- confirm which `ct` is in scope
ct.get_feature_names() 

#### Adding feature selection to the pipeline

In [None]:
from sklearn.feature_selection import SelectPercentile, chi2

# Text vectorizer feeding the selector. It is created here because this cell runs
# before the later cell that also defines `vect`.
vect = CountVectorizer()

# keep 50% of features with the best chi-squared scores
selection = SelectPercentile(chi2, percentile=50)
# feature selection sits inside the pipeline, so it is re-fit on each training fold
pipe = make_pipeline(vect, selection, clf)

cross_val_score(pipe, X, y, scoring='accuracy').mean()

#### Transform multiple features

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import make_column_transformer

# one vectorizer, applied to each text column in turn
vect = CountVectorizer()

# Build one (transformer, column) pair per text column. Each column name is a plain
# string (not a list) so the vectorizer receives 1-D input.
ct = make_column_transformer(*[(vect, text_col) for text_col in ('Name', 'Cabin')])
ct.fit_transform(X)

####  Lambda function transformer

In [None]:
from sklearn.preprocessing import FunctionTransformer

# Wrap an existing function as a transformer: kw_args are forwarded to np.clip,
# so values are limited to the range [100, 600]
clip_values = FunctionTransformer(np.clip, kw_args={'a_min':100, 'a_max':600})

def first_letter(df):
    """Return a frame of the same shape holding only the first character of each string.

    Every column of ``df`` is expected to contain strings; each one is processed
    independently through its ``.str`` accessor.
    """
    def _leading_char(column):
        # a slice (rather than an index) so that empty strings yield '' instead of NaN
        return column.str[:1]

    return df.apply(_leading_char)
# wrap the custom function so it can be used as a transformer step
get_first_letter = FunctionTransformer(first_letter)

# apply each function-based transformer to its own set of columns
ct = make_column_transformer(
    (clip_values, ['Fare']),
    (get_first_letter, ['Code', 'Deck']))

#### Display the pipeline and make it interactive

In [None]:
# display estimators as diagrams
from sklearn import set_config
set_config(display='diagram')

# `logreg` is not defined anywhere earlier in the notebook, so create it here
# (same settings as the other LogisticRegression examples in this notebook)
logreg = LogisticRegression(solver='liblinear', random_state=1)
pipe = make_pipeline(ct, selection, logreg)

# export the diagram to a file
# (explicit encoding so the generated HTML is written identically on every platform)
from sklearn.utils import estimator_html_repr
with open('pipeline.html', 'w', encoding='utf-8') as f:
    f.write(estimator_html_repr(pipe))

# Jupyter only renders the LAST expression of a cell, so the pipeline must come last
# for the interactive diagram to appear
pipe


Accessing parts of the pipeline individually

In [None]:
# Pipeline is not imported by the import cell at the top of the notebook
from sklearn.pipeline import Pipeline

# create Pipeline
# (`selection` is the SelectPercentile feature selector created in the feature-selection cell)
pipe = Pipeline([('preprocessor', ct), ('feature selector', selection), ('classifier', clf)])
pipe

# access step 0 (preprocessor)
pipe[0].fit_transform(X)

# access steps 0 and 1 (preprocessor and feature selector)
# slicing returns a sub-pipeline; fitting it also fits the selector queried below
pipe[0:2].fit_transform(X, y)

# access step 1 (feature selector): boolean mask of the features that were kept
pipe[1].get_support()

#### Cross-validate the entire pipeline (not just the model)

In [None]:
from sklearn.model_selection import cross_val_score
# Passing the whole pipeline means the preprocessing is re-fit inside every fold;
# the result is the mean accuracy over the 5 folds
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()

### GridSearch - Find optimal tuning parameters for the entire pipeline

In [None]:
# Search space: each key is <step name>__<parameter name> (nested steps are chained with '__').
# NOTE(review): these keys assume `pipe` was built with make_pipeline / make_column_transformer
# (auto-generated lowercase step names) and contains a CountVectorizer and a
# LogisticRegression -- confirm against the `pipe` currently in scope.
params = {
    'columntransformer__countvectorizer__min_df': [1, 2],
    'logisticregression__C': [0.1, 1, 10],
    'logisticregression__penalty': ['l1', 'l2'],
}

# exhaustively evaluate every combination with 5-fold cross-validation
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, param_grid=params, cv=5, scoring='accuracy')
grid.fit(X, y);

# best mean cross-validated score found during the search
grid.best_score_

# parameter combination that produced that score
grid.best_params_

You can tune 2+ models using the same grid search! Here's how:

Create multiple parameter dictionaries  
Specify the model within each dictionary  
Put the dictionaries in a list  

In [None]:
# Pipeline is not imported by the import cell at the top of the notebook
from sklearn.pipeline import Pipeline

# each of these models will take a turn as the second Pipeline step
clf1 = LogisticRegression(solver='liblinear', random_state=1)
clf2 = RandomForestClassifier(random_state=1)

# create the Pipeline (clf1 is only a placeholder -- the grid swaps the 'classifier' step)
pipe = Pipeline([('preprocessor', ct), ('classifier', clf1)])

# NOTE(review): the 'preprocessor__vectorizer__...' keys assume `ct` contains a step
# named 'vectorizer' -- confirm against the `ct` currently in scope.

# parameter dictionary for clf1: its own hyperparameters plus the model itself
params1 = {
    'preprocessor__vectorizer__ngram_range': [(1, 1), (1, 2)],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': [0.1, 1, 10],
    'classifier': [clf1],
}

# parameter dictionary for clf2
params2 = {
    'preprocessor__vectorizer__ngram_range': [(1, 1), (1, 2)],
    'classifier__n_estimators': [100, 200],
    'classifier__min_samples_leaf': [1, 2],
    'classifier': [clf2],
}

# create a list of parameter dictionaries
params = [params1, params2]

# this will search every parameter combination within each dictionary
# (combinations are never mixed ACROSS dictionaries)
grid = GridSearchCV(pipe, params)
grid.fit(X, y)
grid.best_params_

#### Random search

GridSearchCV taking too long? Try RandomizedSearchCV with a small number of iterations.

Make sure to specify a distribution (instead of a list of values) for continuous parameters!

In [None]:
# try "n_iter" random combinations of those parameter values
# NOTE(review): `params` comes from an earlier cell and holds lists of values; for continuous
# parameters a distribution should be supplied instead of a list
from sklearn.model_selection import RandomizedSearchCV
# random_state makes the sampled combinations reproducible
rand = RandomizedSearchCV(pipe, params, n_iter=10, cv=5, scoring='accuracy', random_state=1)
rand.fit(X, y);

# what was the best score found during the search?
rand.best_score_


# which combination of parameters produced the best score?
rand.best_params_

Hyperparameter search results (from GridSearchCV or RandomizedSearchCV) can be converted into a pandas DataFrame.

Makes it far easier to explore the results!

In [None]:
# Turn the raw search results into a DataFrame, keeping only the columns needed
# to compare candidates: the parameters tried, their mean score, and their rank.
results = pd.DataFrame(grid.cv_results_).loc[
    :, ['params', 'mean_test_score', 'rank_test_score']
]
# best-ranked candidates first
results.sort_values(by='rank_test_score')

In [None]:
# Run in parallel to finish quicker
# n_jobs=-1 uses all available CPU cores for the search
grid = GridSearchCV(pipe, params, n_jobs=-1)
# %time reports how long the fit takes
%time grid.fit(X, y)

### Plotting confusion matrix

In [None]:
# pass it a trained model: it makes predictions for X_test and compares them to y_test
# values_format='d' prints the cell counts as integers
# NOTE(review): `clf` must already be fitted, and X_test / y_test must exist -- neither is set up before this cell
disp = plot_confusion_matrix(clf, X_test, y_test, cmap='Blues', values_format='d')

#### Display multiple AUROC curves to compare classifiers

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# The tree and the forest are randomized, so seed them; otherwise the curves
# change on every run and the comparison is not reproducible.
lr = LogisticRegression()
dt = DecisionTreeClassifier(random_state=0)
rf = RandomForestClassifier(random_state=0)

lr.fit(X_train, y_train);
dt.fit(X_train, y_train);
rf.fit(X_train, y_train);

# the first call creates the figure; later calls draw on the same axes via ax=disp.ax_
disp = plot_roc_curve(lr, X_test, y_test)
plot_roc_curve(dt, X_test, y_test, ax=disp.ax_);
plot_roc_curve(rf, X_test, y_test, ax=disp.ax_);

### Display decision tree as either image or text

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree, export_text  # both are new in 0.21

# NOTE(review): `features` and `classes` are not defined in this notebook -- presumably the
# list of feature names and the list of class labels for the fitted tree `dt`

# graphical view of the fitted tree
plt.figure(figsize=(8, 6))
plot_tree(dt, feature_names=features, class_names=classes, filled=True);

# plain-text view of the same tree
print(export_text(dt, feature_names=features, show_weights=True))

Prune the tree (cut down the number of nodes) for better performance.

In [None]:
# Pruning 
# NOTE(review): the node counts quoted below come from the author's dataset, which is not part of this notebook

# default tree has 331 nodes
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X, y).tree_.node_count


# pruned tree has 121 nodes
# ccp_alpha sets the cost-complexity pruning strength
dt = DecisionTreeClassifier(ccp_alpha=0.001, random_state=0)
dt.fit(X, y).tree_.node_count

# pruning improved the cross-validated accuracy
cross_val_score(dt, X, y, cv=5, scoring='accuracy').mean()

### Saving Pipelines

In [None]:
# save the pipeline to a file
import joblib
joblib.dump(pipe, 'pipe.joblib')

# load the pipeline from a file
# NOTE(review): joblib files are pickle-based -- only load files you trust, and presumably
# with the same scikit-learn version that wrote them
same_pipe = joblib.load('pipe.joblib')

### Cross Validation

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold

# shuffle=True with a fixed random_state gives shuffled but reproducible folds
# NOTE(review): reg, X_reg, y_reg, X_clf and y_clf are placeholders -- not defined in this notebook

# For Regression
kf = KFold(5, shuffle=True, random_state=1)
cross_val_score(reg, X_reg, y_reg, cv=kf, scoring='r2')


# For Classification
skf = StratifiedKFold(5, shuffle=True, random_state=1)
cross_val_score(clf, X_clf, y_clf, cv=skf, scoring='accuracy')

#### For Multi-class 

In [None]:
from sklearn.metrics import roc_auc_score
# multiclass AUC is computed from the predicted class probabilities, not the predicted labels
y_score = clf.predict_proba(X_test)

# use 'ovo' (One-vs-One) or 'ovr' (One-vs-Rest)
roc_auc_score(y_test, y_score, multi_class='ovo')

In [None]:
from sklearn.model_selection import cross_val_score
# same metric, requested through the scoring string during cross-validation
# use 'roc_auc_ovo' (One-vs-One) or 'roc_auc_ovr' (One-vs-Rest)
cross_val_score(clf, X, y, cv=5, scoring='roc_auc_ovo').mean()

There's no need to use ".values" when passing a DataFrame or Series to scikit-learn... it knows how to access the underlying NumPy array!

In [None]:
print(type(X))
print(type(y))
# expected output:
# <class 'pandas.core.frame.DataFrame'>
# <class 'pandas.core.series.Series'>

# there's no need to use X.values or y.values
clf.fit(X, y)

## Ensemble

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB

# base classifiers that are combined by the VotingClassifier cells below
lr = LogisticRegression(solver='liblinear', random_state=1)
rf = RandomForestClassifier(max_features=None, random_state=1)
nb = MultinomialNB()

In [None]:
# create an ensemble for improved accuracy
# voting='soft' averages the predicted probabilities of the member classifiers
vc = VotingClassifier([('clf1', lr), ('clf2', rf)], voting='soft')
cross_val_score(vc, X, y).mean()

In [None]:
# create an ensemble of 3 classifiers
vc = VotingClassifier([('clf1', lr), ('clf2', rf), ('clf3', nb)])
cross_val_score(vc, X, y).mean()

# define VotingClassifier parameters to search:
# the voting scheme, and one weight per classifier (in the order clf1, clf2, clf3)
params = {'voting':['hard', 'soft'],
          'weights':[(1,1,1), (2,1,1), (1,2,1), (1,1,2)]}

# find the best set of parameters
grid = GridSearchCV(vc, params)
grid.fit(X, y)

grid.best_params_
# example output from the author's run:
# {'voting': 'soft', 'weights': (1, 2, 1)}

# accuracy has improved
grid.best_score_