In [3]:
# Pipelines and Column Transformers
# Pipelines, column selectors, column transformers and other preprocessing transformers can be combined together to perform complex transformations on different subsets of data. For instance you can
# impute and scale numeric data and impute and one-hot encode object data.
# Pipelines can go inside of a ColumnTransformer to apply sequential transformations after splitting columns, and ColumnTransformer objects can be put inside pipelines. You could achieve the
# transformations described above with either a series of ColumnTransformer in a Pipeline OR two pipelines inside of a ColumnTransformer. You could even put a ColumnTransformer in a pipeline inside a
# ColumnTransformer inside a pipeline!
# As you can see, this can get a bit complicated, so it can be useful to diagram the transformations you want on your data. Do you want integer numeric data median imputed, float data mean imputed, both
# types scaled, object data imputed with the most frequent values and then one-hot encoded?



In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import set_config
set_config(display='diagram')

In [4]:
# In this example we will take data with ordinal categorical, nominal, and numeric data. Some values are missing and the numeric data needs to be scaled.
# We will use two pipelines, one for numeric and one for categorical and put them in a column transformer. We will also ordinal encode a column before we split the data. None of the missing data is of
# integer type.

# Load the data
# The source is a Google Sheets document published as CSV; pandas reads it straight from the URL.
path = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSzb_CfjmApDMSXRn-Ga8X5rgoRVm7U_UNYotqQ0iW2JVx1qoKFr41XOA-FNKPqds83B0oUM6zKtLqK/pub?output=csv'
df = pd.read_csv(filepath_or_buffer=path)
# Peek at the first rows, then list each column's non-null count and dtype
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   State               995 non-null    object 
 1   Lat                 1000 non-null   float64
 2   Lng                 1000 non-null   float64
 3   Area                995 non-null    object 
 4   Children            993 non-null    float64
 5   Age                 1000 non-null   int64  
 6   Income              1000 non-null   float64
 7   Marital             995 non-null    object 
 8   Gender              995 non-null    object 
 9   ReAdmis             1000 non-null   int64  
 10  VitD_levels         1000 non-null   float64
 11  Doc_visits          1000 non-null   int64  
 12  Full_meals_eaten    1000 non-null   int64  
 13  vitD_supp           1000 non-null   int64  
 14  Soft_drink          1000 non-null   int64  
 15  Initial_admin       995 non-null    object 
 16  HighBlo

In [7]:
# Ordinal Encoding
# We can ordinal encode data without too much risk of data leakage. There are generally a small number of ordinal variables and they are likely to be in both training and testing data. If that is not the case,
# the sklearn transformer called OrdinalEncoder can be added to a preprocessing pipeline.

# Look at which labels occur in the column before mapping them to ranks
df['Complication_risk'].value_counts()


# Ordinal Encoding 'Complication_risk'
# 'Medium' and 'Med' both map to 1, so the two spellings share one rank.
replacement_dictionary = {'High':2, 'Medium':1, 'Med':1, 'Low':0}
# Assign the result back to the column instead of calling replace(..., inplace=True) on
# df['Complication_risk']. That selection is a temporary object, so the in-place call is a
# chained assignment: pandas 2.2 raises a FutureWarning for it and, with Copy-on-Write,
# it never modifies df.
# infer_objects() keeps the encoded column numeric on pandas versions where replace() no longer
# downcasts object-dtype results automatically; it is a no-op when the column is already numeric.
df['Complication_risk'] = df['Complication_risk'].replace(replacement_dictionary).infer_objects()
df['Complication_risk']


# Validation Split
# The target is 'Additional_charges'; the features are every other column.
y = df['Additional_charges']
X = df.drop(columns='Additional_charges')
# A fixed random_state makes the train/test split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Instantiate Column Selectors
# These selectors pick columns by dtype and are handed to the column transformer later. Explicit lists
# of column names would also work, but selecting by dtype is more algorithmic: the code keeps working
# even if the columns in the dataframe change after the pipeline has been put into production.

# Selectors
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

# Instantiate Transformers.
# Three kinds of transformer are needed: SimpleImputer, StandardScaler, and OneHotEncoder.
# SimpleImputer is instantiated twice, once per imputation strategy: 'mean' and 'most_frequent'.

# Imputers
mean_imputer = SimpleImputer(strategy='mean')
freq_imputer = SimpleImputer(strategy='most_frequent')

# Scaler
scaler = StandardScaler()

# One-hot encoder
# handle_unknown='ignore' avoids an error on categories that were not seen during fit;
# sparse_output=False makes the encoder return a dense array.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Instantiate Pipelines
# TWO pipelines are built here: one for nominal categorical data and one for numeric data.
# In both, imputation is the first step, so the second step receives data with no missing values.

# Categorical pipeline: most-frequent imputation, then one-hot encoding
categorical_pipe = make_pipeline(freq_imputer, ohe)
categorical_pipe

# Numeric pipeline: mean imputation, then standard scaling
numeric_pipe = make_pipeline(mean_imputer, scaler)
numeric_pipe


# Instantiate ColumnTransformer
# make_column_transformer takes (transformer, columns) tuples that pair each transformer with the
# columns it should act on. Here the transformers are the two pipelines and the columns are chosen
# by the dtype selectors.

# Tuples for Column Transformer
category_tuple = (categorical_pipe, cat_selector)
number_tuple = (numeric_pipe, num_selector)

# ColumnTransformer
# The numeric tuple is passed first, so the scaled numeric columns come before the one-hot columns
preprocessor = make_column_transformer(number_tuple, category_tuple)
preprocessor

# Transform Data
# The ColumnTransformer, which we called 'preprocessor', is fitted on the training data. (Never on testing data!)

# fit on train
preprocessor.fit(X_train)

# That one fit call fitted all 4 transformers inside the ColumnTransformer: both imputers, the scaler
# and the one-hot encoder. The fitted ColumnTransformer is now applied to each split in turn.

# transform train and test
X_train_processed, X_test_processed = (
    preprocessor.transform(split) for split in (X_train, X_test)
)


# Inspect the Result
# By default Scikit-Learn transformers return Numpy arrays, NOT Pandas dataframes. Because of this we use
# Numpy functions, such as np.isnan(), to inspect our data. The array itself carries no column names, and
# the OneHotEncoder created extra columns. If labelled output is needed, the fitted transformer can report
# its output column names with get_feature_names_out(), or be configured to return dataframes with
# set_output(transform='pandas').
# We will ensure that the missing data was replaced, the categorical data was one-hot encoded, and the numeric data was scaled.


# Check for missing values and that data is scaled and one-hot encoded
# np.isnan(...).sum() already totals over the whole array, so a single .sum() is enough
print(np.isnan(X_train_processed).sum(), 'missing values in training data')
print(np.isnan(X_test_processed).sum(), 'missing values in testing data')
print('\n')
# A single float dtype for the whole array shows that no object (string) columns remain
print('All data in X_train_processed are', X_train_processed.dtype)
print('All data in X_test_processed are', X_test_processed.dtype)
print('\n')
print('shape of data is', X_train_processed.shape)
print('\n')
X_train_processed

0 missing values in training data
0 missing values in testing data


All data in X_train_processed are float64
All data in X_test_processed are float64


shape of data is (750, 97)






array([[-0.50820472,  0.28193545, -0.06527826, ...,  0.        ,
         1.        ,  0.        ],
       [-0.72064168,  0.25283631,  1.23912135, ...,  0.        ,
         0.        ,  0.        ],
       [-0.49340318,  0.48282262, -0.50007813, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.27295848,  0.63816773, -0.93487801, ...,  0.        ,
         0.        ,  0.        ],
       [-0.89653885, -1.73729615, -0.93487801, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.30727477,  1.1082109 , -0.93487801, ...,  0.        ,
         0.        ,  0.        ]])