In [1]:
# to handle datasets
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# for the yeo-johnson transformation
import scipy.stats as stats

# show every column when a dataframe is rendered in the notebook
# (None removes the column limit); call the documented top-level
# pd.set_option rather than going through the pd.pandas attribute
pd.set_option('display.max_columns', None)

In [2]:
# read the customer credit history CSV into a dataframe
data = pd.read_csv(filepath_or_buffer='CustomerCreditHistory.csv')

# report the (rows, columns) dimensions of the loaded data
print(data.shape)

# preview the first five rows
data.head(5)

(32581, 14)


Unnamed: 0,Cust_Id,Location,Age,Car Ownership,Income,Home Type,Job Experience,Loan Purpose,Loan Grade,Loan Balance,Interest Rate,loan_status,Default History,Credit History
0,1,Hyderabad,22,0,4425000,RENT,123.0,PERSONAL,D,2625000,16.02,1,Y,3
1,2,Pune,21,1,720000,OWN,5.0,EDUCATION,B,75000,11.14,0,N,2
2,3,Mumbai,25,1,720000,MORTGAGE,1.0,MEDICAL,C,412500,12.87,1,N,3
3,4,Hyderabad,23,1,4912500,RENT,4.0,MEDICAL,C,2625000,15.23,1,N,2
4,5,Delhi,24,1,4080000,RENT,8.0,MEDICAL,C,2625000,14.27,1,Y,4


In [3]:
# drop Cust_Id, it is just a number given to identify each customer.
# The result is rebound instead of mutating in place, and a missing column is
# tolerated, so re-running this cell does not raise a KeyError.
data = data.drop(columns=['Cust_Id'], errors='ignore')

data.shape

(32581, 13)

### Target

In [5]:
# number of rows for each value of the target column
target = data['loan_status']
target.value_counts()

loan_status
0    25473
1     7108
Name: count, dtype: int64

### Variable types

In [9]:
# categorical variables: the columns stored with dtype *object*,
# kept in the same order as they appear in the dataframe
cat_vars = data.select_dtypes(include='object').columns.tolist()

# how many categorical variables were found
len(cat_vars)

5

In [10]:
# store every categorical column with dtype object ('O');
# cat_vars was collected from object-dtype columns, so this leaves the
# dtypes as they are and does not produce the pandas 'category' dtype
for var in cat_vars:
    data[var] = data[var].astype('O')

In [19]:
# numerical variables: every column that was not flagged as categorical.
# Note that nothing is filtered out besides cat_vars, so the target
# column 'loan_status' is part of this list.
is_numerical = ~data.columns.isin(cat_vars)
num_vars = data.columns[is_numerical].tolist()

# number of numerical variables
len(num_vars)

8

### Missing values

In [20]:
# columns that hold at least one missing value
has_na = data.isnull().any()
vars_with_na = data.columns[has_na].tolist()

# fraction of missing values per column (expressed as decimals),
# displayed from the most to the least affected column
na_fraction = data[vars_with_na].isnull().mean()
na_fraction.sort_values(ascending=False)

Interest Rate     0.095639
Job Experience    0.027470
dtype: float64

In [21]:
# split the variables with missing data into categorical and numerical,
# keeping the order of cat_vars / num_vars
na_lookup = set(vars_with_na)
cat_na = [var for var in cat_vars if var in na_lookup]
num_na = [var for var in num_vars if var in na_lookup]

for label, found in (('categorical', cat_na), ('numerical', num_na)):
    print('Number of ' + label + ' variables with na: ', len(found))

Number of categorical variables with na:  0
Number of numerical variables with na:  2


In [22]:
# display the numerical variables that contain missing values
num_na

['Job Experience', 'Interest Rate']

### Discrete variables

In [23]:
# let's make a list of discrete variables: numerical columns with fewer
# than 20 distinct values (a missing value counts as one distinct value)
cardinality = data[num_vars].nunique(dropna=False)
discrete_vars = cardinality[cardinality < 20].index.tolist()

print('Number of discrete variables: ', len(discrete_vars))

Number of discrete variables:  2


In [24]:
# preview the first rows of the discrete variables only
data.loc[:, discrete_vars].head()

Unnamed: 0,Car Ownership,loan_status
0,0,1
1,1,0
2,1,1
3,1,1
4,1,1


In [25]:
# display the names of the categorical (object-dtype) variables
cat_vars

['Location', 'Home Type', 'Loan Purpose', 'Loan Grade', 'Default History']

In [26]:
# display the names of the variables treated as numerical
# (every column not in cat_vars, which includes the target 'loan_status')
num_vars

['Age',
 'Car Ownership',
 'Income',
 'Job Experience',
 'Loan Balance',
 'Interest Rate',
 'loan_status',
 'Credit History']