<h1> Create pipeline to preprocess numerical as well as categorical varivable and classify the data</h1>

In [1]:
# import modules

import numpy as np
import pandas as pd

# ignore warnings

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# read the data

df = pd.read_csv('D:/Study/DataScience/Data/loan_train.csv')

df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [13]:
# define feature and target variables

x = df.drop(columns=['Loan_ID', 'Loan_Status'])
y = df['Loan_Status']

In [15]:
print(y.head())

0    Y
1    N
2    Y
3    Y
4    Y
Name: Loan_Status, dtype: object


In [4]:
# Split the data into training and testing datasets

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [5]:
# import pipeline module

from sklearn.pipeline import Pipeline

In [6]:
# import modules for preprocessing the data

# For numerical variables : Imputer with median stratergy and Feature Scaling with Standard Scaler
# For categorical variables : Imputer with constant strateergy and One Hot Encoding

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [7]:
# Create separate list of numerical and categorical columns

numerical_features = df.select_dtypes(include= ['int64', 'float64']).columns
categorical_features = df.select_dtypes(include= ['object']).drop(['Loan_Status', 'Loan_ID'], axis=1).columns

In [17]:
categorical_features

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Property_Area'],
      dtype='object')

In [8]:
# Create seperate pipeline for numerical and categorical variables

numerical_pipe = Pipeline([('imputer', SimpleImputer(strategy = 'median')), ('scaler', StandardScaler())])

categorical_pipe = Pipeline([('imputer', SimpleImputer(strategy = 'constant', fill_value = 'missing')), ('ohe', OneHotEncoder())])

In [9]:
# Create columntransformer to map the pipelines to correct columns

from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(transformers=[('num', numerical_pipe, numerical_features), 
                                               ('cat', categorical_pipe, categorical_features)])

In [10]:
# Creating the classifier

from sklearn.ensemble import RandomForestClassifier

rf = Pipeline([('preprocessor', preprocessor), ('classifier', RandomForestClassifier())])

In [11]:
# Fitting the classifier

rf.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [12]:
# Print the score

print('Test score: {:0.2f}'.format(rf.score(x_test,y_test)))

Test score: 0.71


<b> Use Grid Search CV to find best model parameters </b>

In [35]:
# import module for Grid Search

from sklearn.model_selection import GridSearchCV

# Create parameter search grid

param_grid = { 'classifier__n_estimators' : [110, 50, 100],
               'classifier__max_features' : ['auto', 'sqrt', 'log2'],
               'classifier__max_depth' : [4,5,6,7,8],
               'classifier__criterion' : ['gini', 'entropy']}

In [36]:
# Use Grid Search CV

grid = GridSearchCV(rf, param_grid = param_grid, cv = 5)

In [37]:
# Fit the grid search cv

grid.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbo...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'classifier__n_estimators': [110, 50, 100], 'classifier__max_features': ['auto', 'sqrt', 'log2'], 'classifier__max_depth': [4, 5, 6, 7, 8], 'classifier__criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [38]:
# Print best parameters

print('Best Parameters:\n {}'.format(grid.best_params_))

Best Parameters:
 {'classifier__criterion': 'gini', 'classifier__max_depth': 4, 'classifier__max_features': 'sqrt', 'classifier__n_estimators': 110}
