<a href="https://colab.research.google.com/github/priyanshigarg17/mysql-workbench/blob/main/loan_data_set.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Simple Imputer, Column Transformer and Pipeline**

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [13]:
#Read the data set
df1 = pd.read_csv('loan_data_set.csv')

In [14]:
#seperating dependent(target) and independent variab;e
X = df1.drop(['Loan_Status','Loan_ID'], axis=1)
y = df1['Loan_Status']

In [15]:
#Using the simple imputer to handle the missing values
cat_cols = X.select_dtypes(include='object').columns
num_cols = X.select_dtypes(include='number').columns

In [16]:
cat_cols
num_cols

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')

In [17]:
# Finding the outliers
Q1 = X['ApplicantIncome'].quantile(0.25)
Q3 = X['ApplicantIncome'].quantile(0.75)

#Calculating the interquartile range
IQR = Q3 - Q1
upper_limit = Q3 + 1.5*IQR
lower_limit = Q1 - 1.5*IQR

#outliers = X[(X['ApplicantIncome'] < lower_limit) | (X['ApplicantIncome'] > upper_limit)]
#handling outliers by trimming or code for removing outliers
outliers = X[(X['ApplicantIncome'] > lower_limit) | (X['ApplicantIncome'] < upper_limit)]

In [18]:
#caping
import numpy as np
capping = np.where(X['ApplicantIncome'] < lower_limit, lower_limit, np.where(X['ApplicantIncome'] > upper_limit, upper_limit, X['ApplicantIncome']))

In [19]:
capping.shape

(614,)

In [20]:
outliers.shape

(614, 11)

Handling outliers has two methods Trimming and capping
Trimming - droping
Caping  - done from upper or lower limit

Trimming done on the complete data



---



In [21]:
#handling the missing values
cat_imput = SimpleImputer(strategy = 'most_frequent')
num_imput = SimpleImputer()

# column transformer and pipeline

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [28]:
num_pipeline =  Pipeline(steps = [('num_impute', SimpleImputer(strategy = 'mean')),
                                  ('Scaling', StandardScaler())])

In [29]:
#fit and transform
num_pipeline.fit(X[num_cols])

In [31]:
transformed_data = num_pipeline.transform(X[num_cols])
transformed_data


array([[ 0.07299082, -0.55448733,  0.        ,  0.27985054,  0.45164045],
       [-0.13441195, -0.03873155, -0.21927331,  0.27985054,  0.45164045],
       [-0.39374734, -0.55448733, -0.957641  ,  0.27985054,  0.45164045],
       ...,
       [ 0.43717437, -0.47240418,  1.26937121,  0.27985054,  0.45164045],
       [ 0.35706382, -0.55448733,  0.4833669 ,  0.27985054,  0.45164045],
       [-0.13441195, -0.55448733, -0.15972753,  0.27985054, -2.41044061]])

In [33]:
from sklearn.preprocessing import OneHotEncoder

In [41]:
#Cat_cols -> Missing values treatment -> Encoding
cat_pipline = Pipeline(steps=[('cat_impute', SimpleImputer(strategy = 'most_frequent')),
                              ('encoding', OneHotEncoder(sparse_output=False))])

In [44]:
cat_pipline.fit(X[cat_cols])

In [45]:
cat_transformed_data = cat_pipline.transform(X[cat_cols])
cat_transformed_data

array([[0., 1., 1., ..., 0., 0., 1.],
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 1.],
       ...,
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 1., 0.]])

In [46]:
from sklearn.compose import ColumnTransformer

In [47]:
transformer = ColumnTransformer([('num', num_pipeline, num_cols),
                                 ('cat', cat_pipline, cat_cols)])


In [48]:
transformer.fit(X)

In [49]:
Transformed_data = transformer.transform(X)

In [51]:
pd.DataFrame(Transformed_data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.072991,-0.554487,0.000000,0.279851,0.451640,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,-0.134412,-0.038732,-0.219273,0.279851,0.451640,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,-0.393747,-0.554487,-0.957641,0.279851,0.451640,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,-0.462062,0.251980,-0.314547,0.279851,0.451640,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4,0.097728,-0.554487,-0.064454,0.279851,0.451640,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,-0.410130,-0.554487,-0.898095,0.279851,0.451640,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
610,-0.212557,-0.554487,-1.267279,-2.518655,0.451640,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
611,0.437174,-0.472404,1.269371,0.279851,0.451640,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
612,0.357064,-0.554487,0.483367,0.279851,0.451640,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
