# Pipelines

A Pipeline is a way to combine all your data preprocessing and model steps into one object — so you can train and test everything together in a clean, consistent way.

### libraries

In [27]:
import pandas as pd
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from  sklearn.metrics import accuracy_score

## DataFrame

In [4]:
df=pd.read_csv('../Feature Engineering/customer.csv')
df.head(5)

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        50 non-null     int64 
 1   gender     50 non-null     object
 2   review     50 non-null     object
 3   education  50 non-null     object
 4   purchased  50 non-null     object
dtypes: int64(1), object(4)
memory usage: 2.1+ KB


### train test Split

We do **train-test split before using a ColumnTransformer or Pipeline** to prevent **data leakage** — meaning we don’t want information from the test set to influence how the model learns. When we split first, the preprocessing steps (like scaling, encoding, or imputing) are **fit only on the training data** and then **applied to the test data** later. This ensures that the model is evaluated on truly unseen data, giving a more accurate measure of how it will perform in the real world.


In [7]:
X = df.iloc[:,:4]       #Train Column
y = df.iloc[:,4:]       #Test Column

In [8]:
X_train, X_test, y_train, y_test= train_test_split(X, y, random_state=42, test_size=0.2)
X_train.shape, X_test.shape

((40, 4), (10, 4))

In [9]:
pd.DataFrame(X_train).head()

Unnamed: 0,age,gender,review,education
12,51,Male,Poor,School
4,16,Female,Average,UG
37,94,Male,Average,PG
8,65,Female,Average,UG
3,72,Female,Good,PG


## transform the column

A **ColumnTransformer** is used to apply different preprocessing steps to different columns in a dataset at once. For example, it can scale numeric features and encode categorical ones in a single step. This makes data preprocessing cleaner, more organized, and less error-prone, especially when combined with a pipeline.


In [11]:
transformer = ColumnTransformer(
    transformers=[
        ('age_sacle_t', StandardScaler(), ['age']),
        ('gender_t', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'), ['gender']),
        ('review_t', OrdinalEncoder(categories=[['Poor', 'Average', 'Good']], handle_unknown='use_encoded_value', unknown_value=-1), ['review']),
        ('education_t', OrdinalEncoder(categories=[['School', 'UG', 'PG']], handle_unknown='use_encoded_value', unknown_value=-1), ['education']),
        
    ], remainder='passthrough'
)
lb_en= LabelEncoder()

In [12]:
transformer

0,1,2
,transformers,"[('age_sacle_t', ...), ('gender_t', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['Poor', 'Average', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,"[['School', 'UG', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,


#### Datafarme of transformed data

In [29]:
#target
y_train_encoded = lb_en.fit_transform(y_train)

X_train_transformed = transformer.fit_transform(X_train)

# Get feature names
feature_names = transformer.get_feature_names_out()

#DataFrame
df = pd.DataFrame(X_train_transformed, columns=feature_names)

df.head()

  y = column_or_1d(y, warn=True)


Unnamed: 0,age_sacle_t__age,gender_t__gender_Male,review_t__review,education_t__education
0,-0.039711,1.0,0.0,0.0
1,-1.395705,0.0,1.0,1.0
2,1.626224,1.0,1.0,2.0
3,0.502686,0.0,1.0,1.0
4,0.773885,0.0,2.0,2.0


## Pipeline

In [25]:

pipe= Pipeline([
    ('process', transformer),
    ('model', LogisticRegression())
])

In [32]:
#method 2
pipe=make_pipeline(transformer, LogisticRegression())

In [34]:
pipe.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


0,1,2
,steps,"[('columntransformer', ...), ('logisticregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('age_sacle_t', ...), ('gender_t', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['Poor', 'Average', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,"[['School', 'UG', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


# Test via pipeline

In [35]:
y_pred=pipe.predict(X_test)
acc=accuracy_score(y_test, y_pred)
acc

0.5