<a href="https://colab.research.google.com/github/sensei-jirving/Online-DS-PT-01.24.22-cohort-notes/blob/main/Week_05/Lecture_02/OfficeHours/ColumnTransformer_Questions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Office Hours - ColumnTransformer 


- Date: 02/24/22
- Cohort:  01.24.22 Cohort
- Author: James Irving



# 🚢 Predicting Passenger Survival on the Titanic with `scikit-learn`

In [None]:
## All imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.compose import (make_column_selector,make_column_transformer, 
                             ColumnTransformer)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


In [None]:
## Read the published Titanic sheet as CSV: the first column becomes the index
## and '?' entries are parsed as missing values.
url ="https://docs.google.com/spreadsheets/d/e/2PACX-1vS7TaxsUixSyoL0Rn8LPfbWIjeTd2-QdoZ0B2Knk14XYEmUzHUL-UhMilWK34Fn9dGjTcuo0-teSLU2/pub?output=csv"

## Only these columns (features + the 'Survived' target) are kept
relevant_columns = ['Pclass', 'Age', 'SibSp', 'Fare', 'Sex', 'Embarked', 'Survived']
df = pd.read_csv(url, index_col=0, na_values='?')[relevant_columns]
df

Unnamed: 0_level_0,Pclass,Age,SibSp,Fare,Sex,Embarked,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3,22.0,1,7.2500,male,S,0
2,1,38.0,1,71.2833,female,C,1
3,3,26.0,0,7.9250,female,S,1
4,1,35.0,1,53.1000,female,S,1
5,3,35.0,0,8.0500,male,S,0
...,...,...,...,...,...,...,...
887,2,27.0,0,13.0000,male,S,0
888,1,19.0,0,30.0000,female,S,1
889,3,,1,23.4500,female,S,0
890,1,26.0,0,30.0000,male,C,1


## Exploratory Analysis

In [None]:
## Summarize each column's dtype and non-null count to spot columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Age       714 non-null    float64
 2   SibSp     891 non-null    int64  
 3   Fare      891 non-null    float64
 4   Sex       891 non-null    object 
 5   Embarked  889 non-null    object 
 6   Survived  891 non-null    int64  
dtypes: float64(2), int64(3), object(2)
memory usage: 55.7+ KB


In [None]:
## Count rows whose column values exactly repeat an earlier row
## (the index is not part of the comparison)
df.duplicated().sum()

112

In [None]:
## Drop repeated rows, then re-count to confirm none remain.
## NOTE(review): rows are compared only on the columns kept in `df`, so distinct
## passengers who happen to share every kept value are removed too - confirm this is intended.
df = df.drop_duplicates()
df.duplicated().sum()

0

In [None]:
## Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns, rounded to 2 decimals
df.describe().round(2)

Unnamed: 0,Pclass,Age,SibSp,Fare,Survived
count,779.0,676.0,779.0,779.0,779.0
mean,2.25,29.81,0.53,34.86,0.41
std,0.85,14.73,0.99,52.29,0.49
min,1.0,0.42,0.0,0.0,0.0
25%,1.0,20.0,0.0,8.05,0.0
50%,3.0,28.0,0.0,16.0,0.0
75%,3.0,39.0,1.0,34.38,1.0
max,3.0,80.0,8.0,512.33,1.0


In [None]:
## Count missing values in each column
df.isna().sum()

Pclass        0
Age         103
SibSp         0
Fare          0
Sex           0
Embarked      2
Survived      0
dtype: int64

## Preprocessing

In [None]:
## Split the frame into the feature matrix (X) and the label column (y)
target = 'Survived'

X = df.drop(columns=target)
y = df[target]

# Hold out a test set; the fixed random_state keeps the split reproducible.
# The class counts of the training labels are shown as the cell output.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
y_train.value_counts()

0    346
1    238
Name: Survived, dtype: int64

In [None]:
## Count missing values per column in the training features only
X_train.isna().sum()

Pclass       0
Age         74
SibSp        0
Fare         0
Sex          0
Embarked     1
dtype: int64

In [None]:
## Build a selector for the object-dtype (string) columns, then call it on
## X_train to preview which column names it picks up
cat_sel = make_column_selector(dtype_include=['object'])
cat_sel(X_train)

['Sex', 'Embarked']

> The selector we just made is a function that sklearn will use to find the string/object columns on the fly. 
- We can use it ourselves, as well as in our pipelines

In [None]:
# NEW/DIFFERENT - keep the list of column names returned by our cat selector
# so the same names can be reused when fitting and when labelling outputs
cat_features_in = cat_sel(X_train)
X_train.loc[:, cat_features_in]

Unnamed: 0_level_0,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
173,female,S
29,female,Q
807,male,S
206,female,S
233,male,S
...,...,...
73,male,S
113,male,S
288,male,S
484,female,S


## Preparing Individual Pipelines for Each Type of Data

### Categorical Pipeline

In [None]:
from sklearn import set_config
## Show estimators as diagrams when they are the last expression of a cell
set_config(display='diagram')

In [None]:
## Categorical pipeline: fill missing values with the constant 'MISSING',
## then one-hot encode into a dense array, ignoring categories unseen during fit.
## NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` -
## confirm against the installed version before upgrading.
cat_pipe = make_pipeline(SimpleImputer(strategy='constant',
                                       fill_value='MISSING'),
                         OneHotEncoder(handle_unknown='ignore',
                                       sparse=False))
cat_pipe

In [None]:
## Learn the imputer/encoder from the categorical training columns, then
## transform those same columns into a dense one-hot array
X_train_cat = (cat_pipe
               .fit(X_train[cat_features_in])
               .transform(X_train[cat_features_in]))
X_train_cat

array([[1., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1.]])

> We need to get to the OneHotEncoder in our pipeline to run `onehotencoder.get_feature_names_out()`
- Pipelines can be indexed like a dictionary if we use `cat_pipe.named_steps`

#### Slicing Out a Step with `pipeline.named_steps`

In [None]:
## Display the categorical pipeline to see which step holds the encoder we need to pull out
cat_pipe

In [None]:
## named_steps maps each step's name to its estimator
## (make_pipeline generated the names from the lowercased class names)
cat_pipe.named_steps

{'onehotencoder': OneHotEncoder(handle_unknown='ignore', sparse=False),
 'simpleimputer': SimpleImputer(fill_value='MISSING', strategy='constant')}

In [None]:
## Get the feature names from the fitted encoder - with no input names supplied
## the prefixes are generic (x0, x1, ...)
cat_pipe.named_steps['onehotencoder'].get_feature_names_out()

array(['x0_female', 'x0_male', 'x1_C', 'x1_MISSING', 'x1_Q', 'x1_S'],
      dtype=object)

In [None]:
## Get the FULL feature names: passing the original column names makes the
## encoder prefix each category with its source column (e.g. 'Sex_female')
cat_features_out = (
    cat_pipe.named_steps['onehotencoder']
    .get_feature_names_out(cat_features_in)
)
cat_features_out

array(['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_MISSING',
       'Embarked_Q', 'Embarked_S'], dtype=object)

In [None]:
## Make into a dataframe with feature names.
## The pipeline returns a bare ndarray, so reuse X_train's index; otherwise the
## frame gets a fresh 0..n-1 index and its rows no longer line up with
## X_train / y_train (which are indexed by PassengerId).
X_train_cat_df = pd.DataFrame(X_train_cat, columns=cat_features_out,
                              index=X_train.index)
X_train_cat_df

Unnamed: 0,Sex_female,Sex_male,Embarked_C,Embarked_MISSING,Embarked_Q,Embarked_S
0,1.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...
579,0.0,1.0,0.0,0.0,0.0,1.0
580,0.0,1.0,0.0,0.0,0.0,1.0
581,0.0,1.0,0.0,0.0,0.0,1.0
582,1.0,0.0,0.0,0.0,0.0,1.0


### Numeric Pipeline

In [None]:
## Build a selector for the numeric-dtype columns and keep the list of names it
## returns for X_train
num_sel = make_column_selector(dtype_include=['number'])
num_features_in = num_sel(X_train)
num_features_in

['Pclass', 'Age', 'SibSp', 'Fare']

In [None]:
## Numeric pipeline: fill missing values with the column mean, then standardize.
## fit_transform on the numeric training columns previews the resulting array.
num_pipe = make_pipeline(SimpleImputer(strategy='mean'),
                         StandardScaler())
num_pipe.fit_transform(X_train[num_features_in])

array([[ 0.88163784, -2.08107077,  0.47681387, -0.4641602 ],
       [ 0.88163784,  0.        , -0.53946094, -0.53046113],
       [-1.45870988,  0.61426005, -0.53946094, -0.69099656],
       ...,
       [ 0.88163784, -0.59154584, -0.53946094, -0.53012292],
       [ 0.88163784,  2.31657425, -0.53946094, -0.49565523],
       [ 0.88163784,  0.54333029, -0.53946094, -0.53012292]])

### Combining Pipelines with ColumnTransformer

In [None]:
## Make column transformer: each (pipeline, selector) pair routes the columns
## chosen by the selector through that pipeline - numeric first, categorical second
preprocessor = make_column_transformer( (num_pipe,num_sel),
                                       (cat_pipe,cat_sel))
preprocessor

In [None]:
## Fit the preprocessor on the training data so it learns the data and column names.
## named_transformers_ then exposes the fitted sub-pipelines by their
## auto-generated names.
preprocessor.fit(X_train)
preprocessor.named_transformers_

{'pipeline-1': Pipeline(steps=[('simpleimputer', SimpleImputer()),
                 ('standardscaler', StandardScaler())]),
 'pipeline-2': Pipeline(steps=[('simpleimputer',
                  SimpleImputer(fill_value='MISSING', strategy='constant')),
                 ('onehotencoder',
                  OneHotEncoder(handle_unknown='ignore', sparse=False))])}

In [None]:

# Drill down preprocessor -> categorical sub-pipeline -> one hot encoder, then
# ask the encoder for output names prefixed with the original column names
cat_feature_names = (
    preprocessor.named_transformers_['pipeline-2']
    .named_steps['onehotencoder']
    .get_feature_names_out(cat_features_in)
)
cat_feature_names

array(['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_MISSING',
       'Embarked_Q', 'Embarked_S'], dtype=object)

In [None]:
## Final column names: numeric features first, then the one-hot names - the
## same order in which the transformers were given to the column transformer
final_cols = [*num_sel(X_train), *cat_feature_names]
final_cols

['Pclass',
 'Age',
 'SibSp',
 'Fare',
 'Sex_female',
 'Sex_male',
 'Embarked_C',
 'Embarked_MISSING',
 'Embarked_Q',
 'Embarked_S']

In [None]:
## Transform the training features and wrap the resulting array in a DataFrame.
## X_train's index is passed along so each processed row stays aligned with
## X_train / y_train instead of being renumbered 0..n-1.
X_train_tf = preprocessor.transform(X_train)
X_train_df = pd.DataFrame(X_train_tf, columns=final_cols,
                          index=X_train.index)
X_train_df

Unnamed: 0,Pclass,Age,SibSp,Fare,Sex_female,Sex_male,Embarked_C,Embarked_MISSING,Embarked_Q,Embarked_S
0,0.881638,-2.081071,0.476814,-0.464160,1.0,0.0,0.0,0.0,0.0,1.0
1,0.881638,0.000000,-0.539461,-0.530461,1.0,0.0,0.0,0.0,1.0,0.0
2,-1.458710,0.614260,-0.539461,-0.690997,0.0,1.0,0.0,0.0,0.0,1.0
3,0.881638,-2.010141,-0.539461,-0.477827,1.0,0.0,0.0,0.0,0.0,1.0
4,-0.288536,2.032855,-0.539461,-0.415940,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
579,-0.288536,-0.662476,-0.539461,0.806535,0.0,1.0,0.0,0.0,0.0,1.0
580,0.881638,-0.591546,-0.539461,-0.526981,0.0,1.0,0.0,0.0,0.0,1.0
581,0.881638,-0.591546,-0.539461,-0.530123,0.0,1.0,0.0,0.0,0.0,1.0
582,0.881638,2.316574,-0.539461,-0.495655,1.0,0.0,0.0,0.0,0.0,1.0


In [None]:
## Transform the test features with the preprocessor fitted on the training data
## (no re-fitting). X_test's index is passed along so each processed row stays
## aligned with X_test / y_test instead of being renumbered 0..n-1.
X_test_tf = preprocessor.transform(X_test)
X_test_df = pd.DataFrame(X_test_tf, columns=final_cols,
                         index=X_test.index)
X_test_df

Unnamed: 0,Pclass,Age,SibSp,Fare,Sex_female,Sex_male,Embarked_C,Embarked_MISSING,Embarked_Q,Embarked_S
0,0.881638,-0.875265,-0.539461,-0.490476,1.0,0.0,0.0,0.0,0.0,1.0
1,0.881638,0.897979,-0.539461,-0.526981,0.0,1.0,0.0,0.0,0.0,1.0
2,-0.288536,-0.449686,0.476814,0.633351,1.0,0.0,0.0,0.0,0.0,1.0
3,0.881638,-0.378757,0.476814,-0.529528,1.0,0.0,0.0,0.0,0.0,1.0
4,-1.458710,2.387504,-0.539461,-0.161257,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
190,0.881638,-0.662476,-0.539461,-0.543281,0.0,1.0,0.0,0.0,0.0,1.0
191,-1.458710,0.046822,0.476814,0.368482,0.0,1.0,0.0,0.0,0.0,1.0
192,0.881638,-0.804335,-0.539461,-0.553468,0.0,1.0,0.0,0.0,1.0,0.0
193,-1.458710,-0.449686,-0.539461,1.003319,1.0,0.0,1.0,0.0,0.0,0.0


### Manually Making Pipelines and ColumnTransformer

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## Same numeric pipeline as before, built with the Pipeline class so that each
## step gets a name we choose ('mean_imputer', 'scaler')
num_pipe = Pipeline([
                     ('mean_imputer', SimpleImputer(strategy='mean')),
                      ('scaler',StandardScaler())])
num_pipe

In [None]:
## Look up a step by the name given to it when the Pipeline was built
num_pipe.named_steps['scaler']

In [None]:
## Same categorical pipeline as before, with step names we choose
## ('cat_imputer', 'encoder')
cat_pipe = Pipeline([
                     ('cat_imputer',SimpleImputer(strategy='constant', fill_value='MISSING')),
                      ('encoder',OneHotEncoder(handle_unknown='ignore',sparse=False))])
cat_pipe

In [None]:
## Column names the numeric selector picks from X_train
num_sel(X_train)

['Pclass', 'Age', 'SibSp', 'Fare']

In [None]:
## Build the ColumnTransformer by hand: each entry is (name, transformer, columns).
## NOTE(review): remainder='passthrough' would append any column matched by
## neither selector to the output; output column names built from only the
## numeric and categorical names would then be too short - confirm every column is selected.
column_transform = ColumnTransformer([
                                      ('num',num_pipe,num_sel),
                                      ('cat',cat_pipe, cat_sel)
], remainder='passthrough')
column_transform.fit(X_train)

In [None]:
## (rows, columns) of the transformed training data
column_transform.transform(X_train).shape

(584, 10)

In [None]:
## (rows, columns) of the raw training features, for comparison
X_train.shape

(584, 6)

In [None]:
 ## Save the column-name lists produced by each selector for X_train
 cat_cols = cat_sel(X_train)
 num_cols = num_sel(X_train)
 cat_cols

['Sex', 'Embarked']

In [None]:
## With explicit names the look-up reads transformer 'cat' -> step 'encoder';
## passing the input column names gives prefixed output names
cat_feature_names = (
    column_transform.named_transformers_['cat']
    .named_steps['encoder']
    .get_feature_names_out(cat_sel(X_train))
)
cat_feature_names

array(['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_MISSING',
       'Embarked_Q', 'Embarked_S'], dtype=object)

In [None]:
## Output column names: numeric features first, then the one-hot names
final_cols = [*num_sel(X_train), *cat_feature_names]
final_cols

['Pclass',
 'Age',
 'SibSp',
 'Fare',
 'Sex_female',
 'Sex_male',
 'Embarked_C',
 'Embarked_MISSING',
 'Embarked_Q',
 'Embarked_S']

In [None]:
## Wrap the transformed training array in a DataFrame. X_train's index is passed
## along so rows stay aligned with X_train / y_train instead of being
## renumbered 0..n-1.
X_train_df = pd.DataFrame( column_transform.transform(X_train),
                          columns=final_cols,
                          index=X_train.index)
X_train_df

Unnamed: 0,Pclass,Age,SibSp,Fare,Sex_female,Sex_male,Embarked_C,Embarked_MISSING,Embarked_Q,Embarked_S
0,0.881638,-2.081071,0.476814,-0.464160,1.0,0.0,0.0,0.0,0.0,1.0
1,0.881638,0.000000,-0.539461,-0.530461,1.0,0.0,0.0,0.0,1.0,0.0
2,-1.458710,0.614260,-0.539461,-0.690997,0.0,1.0,0.0,0.0,0.0,1.0
3,0.881638,-2.010141,-0.539461,-0.477827,1.0,0.0,0.0,0.0,0.0,1.0
4,-0.288536,2.032855,-0.539461,-0.415940,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
579,-0.288536,-0.662476,-0.539461,0.806535,0.0,1.0,0.0,0.0,0.0,1.0
580,0.881638,-0.591546,-0.539461,-0.526981,0.0,1.0,0.0,0.0,0.0,1.0
581,0.881638,-0.591546,-0.539461,-0.530123,0.0,1.0,0.0,0.0,0.0,1.0
582,0.881638,2.316574,-0.539461,-0.495655,1.0,0.0,0.0,0.0,0.0,1.0


In [None]:
## Wrap the transformed test array in a DataFrame (the transformer is only
## applied here, not re-fitted). X_test's index is passed along so rows stay
## aligned with X_test / y_test instead of being renumbered 0..n-1.
X_test_df = pd.DataFrame( column_transform.transform(X_test),
                          columns=final_cols,
                          index=X_test.index)
X_test_df

Unnamed: 0,Pclass,Age,SibSp,Fare,Sex_female,Sex_male,Embarked_C,Embarked_MISSING,Embarked_Q,Embarked_S
0,0.881638,-0.875265,-0.539461,-0.490476,1.0,0.0,0.0,0.0,0.0,1.0
1,0.881638,0.897979,-0.539461,-0.526981,0.0,1.0,0.0,0.0,0.0,1.0
2,-0.288536,-0.449686,0.476814,0.633351,1.0,0.0,0.0,0.0,0.0,1.0
3,0.881638,-0.378757,0.476814,-0.529528,1.0,0.0,0.0,0.0,0.0,1.0
4,-1.458710,2.387504,-0.539461,-0.161257,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
190,0.881638,-0.662476,-0.539461,-0.543281,0.0,1.0,0.0,0.0,0.0,1.0
191,-1.458710,0.046822,0.476814,0.368482,0.0,1.0,0.0,0.0,0.0,1.0
192,0.881638,-0.804335,-0.539461,-0.553468,0.0,1.0,0.0,0.0,1.0,0.0
193,-1.458710,-0.449686,-0.539461,1.003319,1.0,0.0,1.0,0.0,0.0,0.0


In [None]:
# NOTE(review): call left disabled by the author - presumably it is not usable
# with the scikit-learn version this notebook ran on; confirm before enabling.
# column_transform.get_feature_names_out()

### Why bother?

> Below will error!

In [None]:
## Intentional demonstration of a manual-preprocessing mistake: the imputer and
## encoder are fit on ALL of X_train, but then asked to transform only the
## categorical columns, so the features no longer match what was seen at fit time.
cat_imputer = SimpleImputer(strategy='constant',fill_value='MISSING')
encoder = OneHotEncoder(handle_unknown='ignore',sparse=False)

cat_imputer.fit(X_train)
encoder.fit(X_train)

cat_train = X_train[ cat_sel(X_train)]
cat_test = X_test[ cat_sel(X_test)]

# The expected ValueError is caught and printed so the lesson is still visible
# while "Restart & Run All" can continue past this cell.
try:
    X_train_cat = cat_imputer.transform(cat_train)
    X_train_cat = encoder.transform(X_train_cat)

    X_test_cat = cat_imputer.transform(cat_test)
    X_test_cat = encoder.transform(X_test_cat)
except ValueError as err:
    print(f"ValueError: {err}")

Feature names seen at fit time, yet now missing:
- Age
- Fare
- Pclass
- SibSp



ValueError: ignored

In [None]:
## Manual (no pipeline / ColumnTransformer) preprocessing done correctly.
## Every transformer is fit on the TRAINING split only, and only on the columns
## it is meant to handle; the test split is transformed with the fitted objects.
## This is the bookkeeping that pipelines + ColumnTransformer do for us.
num_cols = num_sel(X_train)
cat_cols = cat_sel(X_train)

num_imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
cat_imputer = SimpleImputer(strategy='constant', fill_value='MISSING')
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Numeric columns: impute first, then scale the imputed values
X_train_num = scaler.fit_transform(num_imputer.fit_transform(X_train[num_cols]))
X_test_num = scaler.transform(num_imputer.transform(X_test[num_cols]))

# Categorical columns: impute first, then one-hot encode the imputed values
X_train_cat = encoder.fit_transform(cat_imputer.fit_transform(X_train[cat_cols]))
X_test_cat = encoder.transform(cat_imputer.transform(X_test[cat_cols]))
X_test_cat

### All Together In One Cell

In [None]:
## Full preprocessing recipe in one cell: selectors -> per-type pipelines -> column transformer.

## selector for the object-dtype (categorical) columns
cat_sel = make_column_selector(dtype_include=['object'])
## selector for the numeric-dtype columns
num_sel = make_column_selector(dtype_include=['number'])

## cat pipe: constant-fill missing values with 'MISSING', then one-hot encode (dense output)
cat_pipe = make_pipeline(SimpleImputer(strategy='constant',fill_value='MISSING'),
                         OneHotEncoder(handle_unknown='ignore',sparse=False))
## num pipe: mean-impute missing values, then standardize
num_pipe = make_pipeline(SimpleImputer(strategy='mean'),
                         StandardScaler())

## Make column transformer: numeric pipeline first, categorical second.
## It is only constructed here - it still has to be fit before it can transform.
preprocessor = make_column_transformer( (num_pipe,num_sel),
                                       (cat_pipe,cat_sel))
preprocessor