### Scikit-Learn tip 1
Use make_column_transformer to apply different preprocessing to different columns

In [1]:
# Load Python Package
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer

In [2]:
# Load data (loading Titanic dataset)
data  = pd.read_csv('https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')

In [3]:
# Make Transformer
preprocessing = make_column_transformer(
    (OneHotEncoder(), ['Pclass','Sex']),
    (SimpleImputer(), ['Age']),
    remainder='passthrough')


In [4]:
# Fit-Transform data with transformer
preprocessing.fit_transform(data)

array([[0.0, 0.0, 1.0, ..., 1, 0, 7.25],
       [1.0, 0.0, 0.0, ..., 1, 0, 71.2833],
       [0.0, 0.0, 1.0, ..., 0, 0, 7.925],
       ...,
       [0.0, 0.0, 1.0, ..., 1, 2, 23.45],
       [1.0, 0.0, 0.0, ..., 0, 0, 30.0],
       [0.0, 0.0, 1.0, ..., 0, 0, 7.75]], dtype=object)

### Scikit-Learn Tip #2

Select columns using make_column_selector with make_columns_transformer

In [5]:
# Load Python Package
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

In [6]:
# Load data (loading Titanic dataset)
data = pd.read_csv('https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')

In [7]:
# Make Transformer
preprocessing = make_column_transformer(
    (OneHotEncoder(), make_column_selector(dtype_include='object')),
    (SimpleImputer(), make_column_selector(dtype_include='int')),
    remainder='drop'
)

In [8]:
# Fit-Transform data with transformer
preprocessing.fit_transform(data)

<887x889 sparse matrix of type '<class 'numpy.float64'>'
	with 1774 stored elements in Compressed Sparse Row format>

### Scikit-Learn Tip #3

Use Pipeline. Pipeline chains together multiple preprocessing steps. The output of each step is used as input to the next step, it makes it easy to apply the same preprocessing to Train and Test.

In [9]:
# Load Python Package
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [10]:
# Load data (loading Titanic dataset)
data = pd.read_csv('https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')

In [11]:
# Set X and y
X = data.drop('Survived',axis=1)
y = data[['Survived']]

In [12]:
# Split Train and Test
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)

In [13]:
# Set variables
ohe = OneHotEncoder(handle_unknown='ignore', sparse=True)
imputer = SimpleImputer(add_indicator=True, verbose=1)
scaler = StandardScaler()
clf = DecisionTreeClassifier()

In [14]:
# Make Transformer
preprocessing = make_column_transformer(
(make_pipeline(imputer,scaler),['Age','Siblings/Spouses Aboard','Parents/Children Aboard','Fare'])
,(ohe, ['Pclass','Sex','Name'])
,remainder='passthrough')

In [15]:
# Make pipeline
pipe = make_pipeline(preprocessing, clf)

In [16]:
# Fit model
pipe.fit(X_train, y_train.values.ravel())
print("Best score : %f" % pipe.score(X_test, y_test.values.ravel()))

Best score : 0.784983


### Scikit-Learn Tip #4
Need something better than SimpleImputer for missing value imputation? Try KNNImputer or IterativeImputer (inspired by MICE package). Both are multivariate approaches (they take other features into account!)

In [17]:
# Load Python Package
from sklearn.experimental import enable_iterative_imputer, enable_hist_gradient_boosting
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

### Scikit-Learn Tip #5
You can cross-validate an entire pipeline.

In [18]:
# Load Python Package
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [19]:
# Load data (loading Titanic dataset)
data = pd.read_csv('https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')

In [20]:
# Set X and y
X = data.drop('Survived',axis=1)
y = data[['Survived']]

In [21]:
# Set variables
ohe = OneHotEncoder(handle_unknown='ignore', sparse=True)
imputer = SimpleImputer(add_indicator=True, verbose=1)
clf = DecisionTreeClassifier()

In [22]:
# Make Transformer
preprocessing = make_column_transformer(
(make_pipeline(imputer),['Age','Siblings/Spouses Aboard','Parents/Children Aboard','Fare']),
(ohe, ['Pclass','Sex','Name']),remainder='passthrough')

In [23]:
# Make pipeline
pipe = make_pipeline(preprocessing, clf)

In [24]:
# Cross-validation
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()

0.82869929537231

### Scikit-Learn Tip #6
You can grid search an entire pipeline and fine optimal tuning parameters.

In [25]:
# Import Python Package
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [26]:
# Load data (loading Titanic dataset)
data = pd.read_csv('https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')

In [27]:
# Set X and y
X = data.drop('Survived',axis=1)
y = data[['Survived']]

In [28]:
# Set variables
clf = LogisticRegression()
ohe = OneHotEncoder()
scaler = StandardScaler()
imputer = SimpleImputer()

In [29]:
# Make Transformer
preprocessing = make_column_transformer((make_pipeline(imputer,scaler),['Age','Siblings/Spouses Aboard','Parents/Children Aboard','Fare']),(ohe, ['Sex']),remainder='drop')

In [30]:
# Make pipeline
pipe = make_pipeline(preprocessing, clf)

In [31]:
# Set params for Grid Search
params = {}
params['logisticregression__C'] = [0.1,0.2,0.3]
params['logisticregression__max_iter'] = [200,500]

In [32]:
# Run grid search
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X,y.values.ravel())
print(grid.best_score_)
print(grid.best)

0.7868977337649972


AttributeError: 'GridSearchCV' object has no attribute 'best'

### Scikit-Learn Tip #7
Are you using train_test_split and working with an imbalanced dataset ? Be sure to set stratify=y so that class proportions are preserved when splitting.

In [33]:
# Import Python Package
import pandas as pd
from sklearn.model_selection import train_test_split

In [34]:
# Load data (loading Titanic dataset)
data = pd.read_csv('https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')

In [35]:
# Set X and y
X = data.drop('Survived',axis=1)
y = data[['Survived']]

In [36]:
# Split Train Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y)

### Scikit-Learn Tip #8
Want to use three datasets (Train, Validation and Test) use train_test_split twice

In [37]:
# Import Python Package
import pandas as pd
from sklearn.model_selection import train_test_split

In [38]:
# Load data (loading Titanic dataset)
data = pd.read_csv('https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')

In [39]:
# Set X and y
X = data.drop('Survived',axis=1)
y = data[['Survived']]

In [40]:
# Split Train, Val and Test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25)

In [41]:
# Print dataFrames size
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(531, 7)
(178, 7)
(178, 7)


### Scikit-Learn Tip #9
Want to do feature engineering within a ColumnTransformer or Pipeline? Write your own function and convert it into a transformer using FunctionTransformer.

In [42]:
# Import Python Package
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
imputer = SimpleImputer()

In [43]:
# Load data (loading Titanic dataset)
data = pd.read_csv('https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')

In [44]:
# Set X and y
X = data.drop('Survived',axis=1)
y = data[['Survived']]

In [45]:
# Write function
def lower_letter(df):
   return df.apply(lambda x : x.str.lower())

In [46]:
# Convert function
get_lower_letter = FunctionTransformer(lower_letter)

In [47]:
# Make Pipeline
preprocess = make_column_transformer((imputer, ['Age']),(get_lower_letter,['Name']),remainder='drop')
preprocess.fit_transform(X)

array([[22.0, 'mr. owen harris braund'],
       [38.0, 'mrs. john bradley (florence briggs thayer) cumings'],
       [26.0, 'miss. laina heikkinen'],
       ...,
       [7.0, 'miss. catherine helen johnston'],
       [26.0, 'mr. karl howell behr'],
       [32.0, 'mr. patrick dooley']], dtype=object)