In [9]:
# data manipulation and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# for saving the pipeline
import joblib

# from Scikit-learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, accuracy_score, confusion_matrix, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer

# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
)

from feature_engine.transformation import LogTransformer

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

# import preprocessors as pp

In [2]:
# load dataset
data = pd.read_csv('CustomerCreditHistory.csv')

# rows and columns of the data
print(data.shape)

# visualise the dataset
data.head()

(32581, 14)


Unnamed: 0,Cust_Id,Location,Age,Car Ownership,Income,Home Type,Job Experience,Loan Purpose,Loan Grade,Loan Balance,Interest Rate,loan_status,Default History,Credit History
0,1,Hyderabad,22,0,4425000,RENT,123.0,PERSONAL,D,2625000,16.02,1,Y,3
1,2,Pune,21,1,720000,OWN,5.0,EDUCATION,B,75000,11.14,0,N,2
2,3,Mumbai,25,1,720000,MORTGAGE,1.0,MEDICAL,C,412500,12.87,1,N,3
3,4,Hyderabad,23,1,4912500,RENT,4.0,MEDICAL,C,2625000,15.23,1,N,2
4,5,Delhi,24,1,4080000,RENT,8.0,MEDICAL,C,2625000,14.27,1,Y,4


In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['Cust_Id', 'loan_status'], axis=1), # predictive variables
    data['loan_status'], # target
    test_size=0.1, # portion of dataset to allocate to test set
    random_state=0, # we are setting the seed here
)

X_train.shape, X_test.shape

((29322, 12), (3259, 12))

In [5]:
NUMERICAL_VARS_WITH_NA = ['Job Experience', 'Interest Rate']

# categorical variables to encode
CATEGORICAL_VARS = ['Location', 'Home Type', 'Loan Purpose', 'Loan Grade', 'Default History']

In [6]:
# set up the pipeline
status_pipe = Pipeline([

    # ===== IMPUTATION =====


    # add missing indicator
    ('missing_indicator', AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA)),

    # impute numerical variables with the mean
    ('mean_imputation', MeanMedianImputer(
        imputation_method='mean', variables=NUMERICAL_VARS_WITH_NA
    )),


    # == CATEGORICAL ENCODING
    ('rare_label_encoder', RareLabelEncoder(
        tol=0.01, n_categories=1, variables=CATEGORICAL_VARS
    )),

    # encode categorical and discrete variables using the target mean
    ('categorical_encoder', OrdinalEncoder(
        encoding_method='ordered', variables=CATEGORICAL_VARS)),
    
    
    ('scaler', MinMaxScaler()),

    ('RandomForest', RandomForestClassifier(n_estimators=500, bootstrap=True, max_features='sqrt')),
])

In [8]:
# train the pipeline
status_pipe.fit(X_train, y_train)

In [11]:
# evaluate the model:
# ====================

# make predictions for train set
pred = status_pipe.predict(X_train)

# determine accuracy
print('train accuracy: {}'.format(int(
    accuracy_score(y_train, pred))))

# make predictions for test set
pred = status_pipe.predict(X_test)

# determine accuracy
print('test accuracy: {}'.format(
    accuracy_score(y_test, pred)))



train accuracy: 1
test accuracy: 0.924823565510893
