<a href="https://colab.research.google.com/github/simonprudhomme/sklearn_tips/blob/master/Scikit_Learn_Tips.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## scikit-learn tip #1

Use ColumnTransformer to apply different preprocessing to different columns

In [14]:
# Load Python Package
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer

# Fetch the Titanic passenger table used throughout these tips
titanic_url = 'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'
data = pd.read_csv(titanic_url)

# Preview the first three rows
data.head(3)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925


In [17]:
# Describe each preprocessing step as an (estimator, columns) pair
one_hot_step = (OneHotEncoder(), ['Pclass', 'Sex'])
impute_step = (SimpleImputer(), ['Age'])

# Combine the steps; columns not named above are passed through unchanged
transformer = make_column_transformer(one_hot_step, impute_step, remainder='passthrough')

# Fit the transformer on the data and return the transformed array
transformer.fit_transform(data)

array([[0.0, 0.0, 1.0, ..., 1, 0, 7.25],
       [1.0, 0.0, 0.0, ..., 1, 0, 71.2833],
       [0.0, 0.0, 1.0, ..., 0, 0, 7.925],
       ...,
       [0.0, 0.0, 1.0, ..., 1, 2, 23.45],
       [1.0, 0.0, 0.0, ..., 0, 0, 30.0],
       [0.0, 0.0, 1.0, ..., 0, 0, 7.75]], dtype=object)

## scikit-learn tip #2

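There are SEVEN ways to select columns using ColumnTransformer:

1. column name(s)
2. integer position(s)
3. slice
4. boolean mask
5. regular expression pattern (`make_column_selector(pattern=...)`)
6. dtypes to include (`make_column_selector(dtype_include=...)`)
7. dtypes to exclude (`make_column_selector(dtype_exclude=...)`)

The example below uses option 6.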

In [22]:
# Load Python Package
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector


# Load the Titanic dataset and discard the free-text 'Name' column
titanic_url = 'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'
data = pd.read_csv(titanic_url).drop(columns='Name')

# Preview the first three rows
data.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925


In [23]:
# Pick columns by dtype rather than by name
object_columns = make_column_selector(dtype_include='object')
integer_columns = make_column_selector(dtype_include='int')

# One-hot encode the object columns, impute the integer columns, and drop
# every column that neither selector matches
transformer = make_column_transformer(
    (OneHotEncoder(), object_columns),
    (SimpleImputer(), integer_columns),
    remainder='drop',
)

# Fit the transformer on the data and return the transformed array
transformer.fit_transform(data)

array([[0., 1., 0., 3., 1., 0.],
       [1., 0., 1., 1., 1., 0.],
       [1., 0., 1., 3., 0., 0.],
       ...,
       [1., 0., 0., 3., 1., 2.],
       [0., 1., 1., 1., 0., 0.],
       [0., 1., 0., 3., 0., 0.]])

In [0]:
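# Source: Data School e-mail (private Gmail link; it only opens for the original author's account):
# https://mail.google.com/mail/u/0/#label/Data+School/WhctKJVqrsDJXwHPtwJkplqhDQppftcGXSMDqlvvNvSTCWVdkhwsNQGDznJPKqFjWSgLZzQ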

## scikit-learn tip #3

Two common ways to encode categorical features:

*   OneHotEncoder for unordered (nominal) data
*   OrdinalEncoder for ordered (ordinal) data

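Use `handle_unknown='ignore'` with OneHotEncoder so that a category that was not seen during `fit` is encoded as all zeros at `transform` time instead of raising an error.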

In [82]:
# Load Python Package
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Reload the full Titanic table (including 'Name') for the encoder examples
titanic_url = 'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'
data = pd.read_csv(titanic_url)

# Preview the first three rows
data.head(3)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925


In [83]:
# One-hot encode the nominal features. handle_unknown='ignore' makes transform()
# emit all zeros for a category that was not seen during fit instead of raising.
ohe = OneHotEncoder(handle_unknown='ignore')

# The encoder returns a SciPy sparse matrix by default; .toarray() densifies it
# for display. The `sparse=False` constructor argument is avoided because it was
# renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4, whereas
# this form runs on every version.
# Only the two selected columns are encoded and no target is needed for fitting,
# so 'Survived' is neither dropped first nor passed as y.
ohe.fit_transform(data[['Sex', 'Pclass']]).toarray()

array([[0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 0., 1.]])

In [0]:
# Ordinal-encode passenger class with an explicit category order (1, 2, 3).
# Pclass is an integer column, so the categories are declared as integers
# rather than as the strings '1', '2', '3' to match the values being encoded.
ce = OrdinalEncoder(categories=[[1, 2, 3]])

# Only 'Pclass' is encoded and no target is needed for fitting, so 'Survived'
# is neither dropped first nor passed as y.
ce.fit_transform(data[['Pclass']])

## scikit-learn tip #4



What does "pipeline" do? 

Chains together multiple steps: output of each step is used as input to the next step.

Makes it easy to apply the same preprocessing to train and test!

In [0]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# `sklearn.impute` has no member called `iter`; importing it raised ImportError
# and aborted the rest of this cell, so only SimpleImputer is imported.
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

# Load data (working with Titanic dataset)
data = pd.read_csv('https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')

# Hold out 33% of the rows for testing. The target is kept as a one-column
# DataFrame, which is why later cells flatten it with `.values.ravel()`.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('Survived', axis=1),
    data[['Survived']],
    test_size=0.33,
    random_state=42,
    shuffle=True,
)

In [126]:
# Preprocessing building blocks.
# `sparse=True` (OneHotEncoder) is the default and `verbose=1` (SimpleImputer)
# only affected messages; both keyword arguments were removed from recent
# scikit-learn releases (1.4 and 1.3 respectively), so they are omitted here.
ohe = OneHotEncoder(handle_unknown='ignore')
imputer = SimpleImputer(add_indicator=True)
scaler = StandardScaler()
clf = MLPClassifier(learning_rate_init=0.002, early_stopping=True, random_state=42)

# Numeric columns: impute then scale. Categorical columns: one-hot encode.
# Any column not listed is passed through unchanged.
preproces = make_column_transformer(
    (make_pipeline(imputer, scaler), ['Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']),
    (ohe, ['Pclass', 'Sex', 'Name']),
    remainder='passthrough',
    n_jobs=-1)

# Chain preprocessing and classifier so train and test get identical treatment
model = make_pipeline(preproces, clf)

# y_* are one-column DataFrames; ravel() turns them into the 1-D arrays expected as targets
model.fit(X_train, y_train.values.ravel())
print("Best score : %f" % model.score(X_test, y_test.values.ravel()))

Best score : 0.795222


In [101]:
# LogisticRegression is used below but was never imported anywhere in the
# notebook, so this cell raised NameError on a fresh kernel.
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Preprocessing building blocks.
# `sparse=True` (OneHotEncoder) is the default and `verbose=1` (SimpleImputer)
# only affected messages; both keyword arguments were removed from recent
# scikit-learn releases, so they are omitted here.
ohe = OneHotEncoder(handle_unknown='ignore')
imputer = SimpleImputer(add_indicator=True)
scaler = StandardScaler()

# Candidate classifiers; only clf_2 is plugged into the pipeline below
clf = LogisticRegression(verbose=0)
clf_1 = DecisionTreeClassifier()
clf_2 = MLPClassifier(max_iter=200, verbose=0)

preproces = make_column_transformer(
    (make_pipeline(imputer, scaler), ['Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']),
    (ohe, ['Pclass', 'Sex', 'Name']),
    remainder='passthrough',
    n_jobs=-1)

# Parameter names follow make_pipeline / make_column_transformer's automatic
# step naming (lower-cased class names joined with double underscores).
param_grid = {
    'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median']
    }

model = make_pipeline(preproces, clf_2)

# `iid=False` is no longer passed: the parameter was deprecated in scikit-learn
# 0.22 (where False became the default behaviour) and removed in 0.24.
grid_clf = GridSearchCV(model, param_grid, cv=10)
grid_clf.fit(X_train, y_train.values.ravel())
print("Best from grid search: %f" % grid_clf.best_estimator_.score(X_test, y_test.values.ravel()))



Best from grid search: 0.798635


In [0]:
#https://jorisvandenbossche.github.io/blog/2018/05/28/scikit-learn-columntransformer/

## scikit-learn tip #5


Need something better than SimpleImputer for missing value imputation?

Try KNNImputer or IterativeImputer (inspired by R's MICE package). Both are multivariate approaches (they take other features into account!)

In [179]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer, enable_hist_gradient_boosting
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.model_selection import train_test_split

# Load data (working with Titanic dataset) and discard the free-text 'Name' column
data = pd.read_csv('https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')
data.pop('Name')

# Stratified 80/20 split so both parts keep the same survival ratio
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('Survived', axis=1),
    data[['Survived']],
    test_size=0.2,
    random_state=13,
    shuffle=True,
    stratify=data[['Survived']],
)

# `sparse=False` is no longer passed to OneHotEncoder: the argument was renamed
# to `sparse_output` in scikit-learn 1.2 and removed in 1.4. The preprocessor
# output is kept dense via `sparse_threshold=0` below instead, which works on
# every version.
ohe = OneHotEncoder(handle_unknown='ignore')
imputer = KNNImputer(add_indicator=True)
scaler = StandardScaler()
clf = HistGradientBoostingClassifier()

preproces = make_column_transformer(
    (make_pipeline(imputer, scaler), ['Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']),
    (ohe, ['Pclass', 'Sex']),
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense array, as the original sparse=False encoder did
    n_jobs=-1)

model = make_pipeline(preproces, clf)

# y_* are one-column DataFrames; ravel() turns them into the 1-D arrays expected as targets
model.fit(X_train, y_train.values.ravel())
print("Best score : %f" % model.score(X_test, y_test.values.ravel()))

Best score : 0.831461
