#### Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
import pickle

import warnings
warnings.filterwarnings("ignore")

#### Importing Data

In [2]:
train_file = pd.read_csv('train_file.csv')

#### Display first 5 records from the file

In [3]:
train_file.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


#### Understand the structure of the datasets

In [4]:
train_file.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [5]:
print('The size of the train file:', train_file.shape)

The size of the train file: (614, 13)


In [6]:
#drop the null values
train_file.dropna(axis=0, inplace=True)

In [7]:
train_file.shape

(480, 13)

In [8]:
train_file.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

#### List of Preprocessing steps:                             
1) Retain only important fields 'Credit_History', 'Property_Area', 'Married' & 'LoanAmount'.      
2) Convert the field 'Married' into '0's & '1's using Label Encoder.              
3) Use one-hot encoder for the field 'Property_Area'.  
4) LoanAmount is rescaled using Standard Scaler

In [9]:
train_x = train_file[[ 'Credit_History', 'Property_Area', 'Married', 'LoanAmount']]
train_y = train_file['Loan_Status']
train_y

1      N
2      Y
3      Y
4      Y
5      Y
      ..
609    Y
610    Y
611    Y
612    Y
613    N
Name: Loan_Status, Length: 480, dtype: object

In [10]:
train_y = train_y.replace({'Y':1, 'N':0})

In [11]:
train_x.head()

Unnamed: 0,Credit_History,Property_Area,Married,LoanAmount
1,1.0,Rural,Yes,128.0
2,1.0,Urban,Yes,66.0
3,1.0,Urban,Yes,120.0
4,1.0,Urban,No,141.0
5,1.0,Urban,Yes,267.0


In [12]:
train_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 480 entries, 1 to 613
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Credit_History  480 non-null    float64
 1   Property_Area   480 non-null    object 
 2   Married         480 non-null    object 
 3   LoanAmount      480 non-null    float64
dtypes: float64(2), object(2)
memory usage: 18.8+ KB


In [13]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

numeric_features = ['LoanAmount']
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])    

categorical_features = ['Property_Area', 'Married']
categorical_transformer = OneHotEncoder(drop='first', sparse=False)

preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                               ('cat', categorical_transformer, categorical_features)])
preprocessor

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 ['LoanAmount']),
                                ('cat',
                                 OneHotEncoder(drop='first', sparse=False),
                                 ['Property_Area', 'Married'])])

In [15]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(random_state=100))])

In [16]:
clf.fit(train_x, train_y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['LoanAmount']),
                                                 ('cat',
                                                  OneHotEncoder(drop='first',
                                                                sparse=False),
                                                  ['Property_Area',
                                                   'Married'])])),
                ('classifier', LogisticRegression(random_state=100))])

In [18]:
# open a file, where you ant to store the data
file = open('model.pkl', 'wb')

# dump information to that file
pickle.dump(clf, file)