## Example 1 (numberical data Processing)

In [1]:
#creating a dummy dataset

from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=21, n_classes=2)

In [2]:
#Spliting Training and Testing data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state=42)

In [3]:
#importing module

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

#This is for displaying the pipeline the diagram form 
from sklearn import set_config
set_config(display = 'diagram')

In [4]:
step = [('scaling', StandardScaler()),
        ('model', LogisticRegression())]


In [5]:

pipe = Pipeline(step)

In [6]:
pipe.fit(X_train, y_train)

In [7]:
y_pred = pipe.predict(X_test)

In [8]:
#checking the accuracy of the model
from sklearn import metrics

accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9575


In [9]:
print(metrics.confusion_matrix(y_test, y_pred))

[[194  10]
 [  7 189]]


## Example 2
**working with the both classification and numberical data Also using the Column Trasformer**

In [57]:
#importing all the required module
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

In [40]:
df = pd.read_csv('csvFile/titanic/train.csv')
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [41]:
#droping unwanted columns
df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace = True)


In [42]:
df.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C


In [84]:
#Spliting Training and Testing data

from sklearn.model_selection import train_test_split
X = df.drop(columns=['Survived'])
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state=42)
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
709,3,male,,1,1,15.2458,C
439,2,male,31.0,0,0,10.5000,S
840,3,male,20.0,0,0,7.9250,S
720,2,female,6.0,0,1,33.0000,S
39,3,female,14.0,1,0,11.2417,C
...,...,...,...,...,...,...,...
176,3,male,,3,1,25.4667,S
841,2,male,16.0,0,0,10.5000,S
678,3,female,43.0,1,6,46.9000,S
163,3,male,17.0,0,0,8.6625,S


In [44]:
X_train.isnull().sum()

Pclass        0
Sex           0
Age         107
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

**Process to follow**
```
1. replacing the null values of Age with mean
2. OneHotEncoding
3. Scaling
```

In [45]:
trf1 = ColumnTransformer([
    ('replace', SimpleImputer(), [2]),
    ('replaec2', SimpleImputer(strategy='most_frequent'), [6])],
    remainder='passthrough')

In [85]:
trf2 = ColumnTransformer([('ohe_sex_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'),[1,6])], remainder='passthrough')


In [70]:
trf3 =  ColumnTransformer([('scaling', MinMaxScaler(),slice(0,10))])

In [71]:
trf4 = DecisionTreeClassifier()

In [86]:
list = [
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3),
    ('trf4', trf4)
]

In [87]:
pipe = Pipeline(list)

In [88]:
pipe.fit(X_train, y_train)

In [89]:
y_pred = pipe.predict(X_test)

In [90]:
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,

In [91]:
from sklearn import metrics

metrics.accuracy_score(y_test, y_pred)

0.6162464985994398