## (1) A Stateless Transformer

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html

In [None]:
from sklearn.preprocessing import Binarizer

data = [[13.0, 0.0, 1.0],
        [27.0, 1.0, 0.0]]

Binarizer().transform(data)

## (2) An Estimator / Transformer to standardize data

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

In [15]:
from sklearn.preprocessing import StandardScaler

data = [[0, 0], 
        [0, 0], 
        [1, 1], 
        [1, 1]]

scaler = StandardScaler()

fitted_scaler = scaler.fit(data) 

print("Internal state of the fitted scaler:")
print("Means: %s" % fitted_scaler.mean_)
print("Variances: %s" % fitted_scaler.var_)
print("# samples seen: %s" % fitted_scaler.n_samples_seen_)

new_data = [[2, 2]]

fitted_scaler.transform(new_data)

Internal state of the fitted scaler:
Means: [0.5 0.5]
Variances: [0.25 0.25]
# samples seen: 4


array([[3., 3.]])

## (3) An Estimator / Transformer to impute missing data

https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

In [19]:
import numpy as np
from sklearn.impute import SimpleImputer

data = [[7, 2, 3], 
        [4, np.nan, 6], 
        [10, 5, 9]]

mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

fitted_mean_imputer = mean_imputer.fit(data)

new_data = [
    [np.nan, 2, 3], 
    [4, np.nan, 6], 
    [10, np.nan, 9]]

fitted_mean_imputer.transform(new_data)

array([[ 7. ,  2. ,  3. ],
       [ 4. ,  3.5,  6. ],
       [10. ,  3.5,  9. ]])

## (4) Classifiers are a special kind of Estimator / Transformers

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [38]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris_dataset = load_iris()

train_data = iris_dataset.data[:120]
train_target = iris_dataset.target[:120]

test_data = iris_dataset.data[120:]
test_target = iris_dataset.target[120:]

classifier = DecisionTreeClassifier()
fitted_classifier = classifier.fit(train_data, train_target)

print("Accuracy on test set is %s" % classifier.score(test_data, test_target))
      
fitted_classifier.predict(test_data)      

Accuracy on test set is 0.7333333333333333


array([2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 1, 2, 2, 2])

## (5) Pipelines allow us to compose Estimator / Transformers

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

iris_dataset = load_iris()

train_data = iris_dataset.data[:120]
train_target = iris_dataset.target[:120]

test_data = iris_dataset.data[120:]
test_target = iris_dataset.target[120:]

pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()), 
    ('classifier', DecisionTreeClassifier())])

fitted_pipeline = pipeline.fit(train_data, train_target)

print("Accuracy on test set is %s" % fitted_pipeline.score(test_data, test_target))
      
fitted_pipeline.predict(test_data)   

Accuracy on test set is 0.8


array([2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2])

## (6) The ColumnTransformer allows us to feed pipelines with Pandas Dataframes

https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

In [48]:
import pandas as pd

raw_data = pd.read_csv('adult-sample.csv', na_values='?')
raw_data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income-per-year
0,28,Private,273269,Some-college,10,Never-married,Craft-repair,Not-in-family,Black,Male,0,0,40,United-States,<=50K
1,58,State-gov,123329,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,16,United-States,<=50K
2,34,Private,79637,Bachelors,13,Never-married,Exec-managerial,Own-child,Amer-Indian-Eskimo,Female,0,0,40,United-States,<=50K
3,71,Private,97870,Masters,14,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,15,Germany,<=50K
4,20,State-gov,41103,Some-college,10,Never-married,Other-service,Own-child,White,Female,0,0,20,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,32,Private,108116,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,1902,60,United-States,>50K
96,42,Self-emp-not-inc,32185,Bachelors,13,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,60,United-States,>50K
97,18,Private,333611,5th-6th,3,Never-married,Other-service,Other-relative,White,Male,0,0,54,Mexico,<=50K
98,25,Private,50053,HS-grad,9,Never-married,Other-service,Not-in-family,Black,Male,0,0,40,Japan,<=50K


In [60]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler, label_binarize
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

complete_raw_data = raw_data.dropna()

train_data, test_data = train_test_split(complete_raw_data, test_size=0.2,)

train_labels = label_binarize(train_data['income-per-year'], ['>50K', '<=50K'])
test_labels = label_binarize(test_data['income-per-year'], ['>50K', '<=50K'])

feature_transformation = ColumnTransformer(transformers=[
    ('categorical_features', OneHotEncoder(handle_unknown='ignore'), ['education', 'workclass']),
    ('scaled_numeric_features', StandardScaler(), ['age', 'hours-per-week'])
])

pipeline = Pipeline([
  ('features', feature_transformation),
  ('classifier', DecisionTreeClassifier())])

fitted_pipeline = pipeline.fit(train_data, train_labels)

fitted_pipeline.score(test_data, test_labels)

0.7368421052631579

## (7) Nesting Pipelines

In [50]:
raw_data[raw_data.isna().any(axis=1)]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income-per-year
25,62,State-gov,221558,Masters,14,Separated,Prof-specialty,Unmarried,White,Female,0,0,24,,<=50K
30,60,,41517,11th,7,Married-spouse-absent,,Unmarried,Black,Female,0,0,20,United-States,<=50K
40,19,,134974,Some-college,10,Never-married,,Own-child,White,Female,0,0,20,United-States,<=50K
60,23,,99399,Some-college,10,Never-married,,Unmarried,Amer-Indian-Eskimo,Female,0,0,25,United-States,<=50K
64,20,,150084,Some-college,10,Never-married,,Own-child,White,Male,0,0,25,United-States,<=50K
78,45,Private,274657,11th,7,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,,<=50K
86,72,,402306,Some-college,10,Married-civ-spouse,,Husband,White,Male,0,0,32,Canada,<=50K
93,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K


In [61]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler, label_binarize
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

train_data, test_data = train_test_split(raw_data, test_size=0.2,)

train_labels = label_binarize(train_data['income-per-year'], ['>50K', '<=50K'])
test_labels = label_binarize(test_data['income-per-year'], ['>50K', '<=50K'])

feature_transformation = ColumnTransformer(transformers=[
    ('categorical_features', OneHotEncoder(handle_unknown='ignore'), ['education', 'workclass']),
    ('scaled_numeric_features', StandardScaler(), ['age', 'hours-per-week'])
])

pipeline = Pipeline([
  ('features', feature_transformation),
  ('classifier', DecisionTreeClassifier())])

fitted_pipeline = pipeline.fit(train_data, train_labels)

fitted_pipeline.score(test_data, test_labels)

ValueError: Input contains NaN

In [69]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler, label_binarize
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

train_data, test_data = train_test_split(raw_data, test_size=0.2,)

train_labels = label_binarize(train_data['income-per-year'], ['>50K', '<=50K'])
test_labels = label_binarize(test_data['income-per-year'], ['>50K', '<=50K'])

categorical_feature_transformation = Pipeline(steps=[
    ('imputation', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore'))
])

feature_transformation = ColumnTransformer(transformers=[
    ('categorical_features', categorical_feature_transformation, ['education', 'workclass']),
    ('scaled_numeric_features', StandardScaler(), ['age', 'hours-per-week'])
])

pipeline = Pipeline([
  ('features', feature_transformation),
  ('classifier', DecisionTreeClassifier())])

fitted_pipeline = pipeline.fit(train_data, train_labels)

fitted_pipeline.score(test_data, test_labels)

0.75

## (TASK) Adjust the pipeline from the previous cell to apply 5-fold cross-validation and grid search over different hyperparameters of the decision tree

Hint: use https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html