In [23]:
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [24]:
# Sample data
data = {
    'numeric_feature': [10, 20, 30, 40, 50],
    'categorical_feature': ['A', 'B', 'A', 'C', 'B'],
    'target': [0, 1, 0, 1, 1]
}
df = pd.DataFrame(data)
df

Unnamed: 0,numeric_feature,categorical_feature,target
0,10,A,0
1,20,B,1
2,30,A,0
3,40,C,1
4,50,B,1


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   numeric_feature      5 non-null      int64 
 1   categorical_feature  5 non-null      object
 2   target               5 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 248.0+ bytes


In [26]:
df

Unnamed: 0,numeric_feature,categorical_feature,target
0,10,A,0
1,20,B,1
2,30,A,0
3,40,C,1
4,50,B,1


In [27]:
# Separate features and target
X = df[['numeric_feature', 'categorical_feature']]
y = df['target']

# Define the column transformer
preprocessor = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=np.number)), ### this tells scale only numerical columns
    (OneHotEncoder(), make_column_selector(dtype_include=object)) ### this tells apply one hot encoder to only string columns
)

In [28]:
df_new = preprocessor.fit_transform(df)
df_new                      

array([[-1.41421356, -1.22474487,  1.        ,  0.        ,  0.        ],
       [-0.70710678,  0.81649658,  0.        ,  1.        ,  0.        ],
       [ 0.        , -1.22474487,  1.        ,  0.        ,  0.        ],
       [ 0.70710678,  0.81649658,  0.        ,  0.        ,  1.        ],
       [ 1.41421356,  0.81649658,  0.        ,  1.        ,  0.        ]])

In [29]:
preprocessor.get_feature_names_out()

array(['standardscaler__numeric_feature', 'standardscaler__target',
       'onehotencoder__categorical_feature_A',
       'onehotencoder__categorical_feature_B',
       'onehotencoder__categorical_feature_C'], dtype=object)

In [30]:
df2 = pd.DataFrame(df_new, columns=preprocessor.get_feature_names_out())
df2

Unnamed: 0,standardscaler__numeric_feature,standardscaler__target,onehotencoder__categorical_feature_A,onehotencoder__categorical_feature_B,onehotencoder__categorical_feature_C
0,-1.414214,-1.224745,1.0,0.0,0.0
1,-0.707107,0.816497,0.0,1.0,0.0
2,0.0,-1.224745,1.0,0.0,0.0
3,0.707107,0.816497,0.0,0.0,1.0
4,1.414214,0.816497,0.0,1.0,0.0


In [31]:

# Create a pipeline with the column transformer and the model
pipeline = make_pipeline(preprocessor, LogisticRegression())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Predict using the pipeline
y_pred = pipeline.predict(X_test)
