# Import, read and slice the data

In [1]:
import pandas as pd

In [2]:
# Read the loan data from a CSV file (path relative to the working directory)
dataset = pd.read_csv('loan_small.csv')
# A bare expression on the last line of a cell displays the frame
dataset

Unnamed: 0,Loan_ID,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Area,Loan_Status
0,LP001002,,5849.0,0.0,,urban,Y
1,LP001003,Male,4583.0,,128.0,semi,N
2,LP001005,Male,3000.0,0.0,66.0,,Y
3,LP001006,Female,2583.0,2358.0,120.0,semi,
4,LP001008,Male,,0.0,141.0,urban,Y
5,LP001011,Male,5417.0,4196.0,267.0,semi,Y
6,LP001013,Male,2333.0,1516.0,,rural,Y
7,LP001014,Female,3036.0,2504.0,158.0,semi,N
8,LP001018,Male,4006.0,1526.0,168.0,rural,Y
9,LP001020,Male,12841.0,10968.0,349.0,semi,N


In [3]:
# Access the data by integer position using iloc
# iloc[row(s), column(s)] -- slice stops are exclusive
# First 5 rows, every column. ':' selects all columns explicitly; the frame
# has only 7, so a larger stop such as 8 would merely be clamped by slicing.
subset = dataset.iloc[:5, :]
subset

Unnamed: 0,Loan_ID,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Area,Loan_Status
0,LP001002,,5849.0,0.0,,urban,Y
1,LP001003,Male,4583.0,,128.0,semi,N
2,LP001005,Male,3000.0,0.0,66.0,,Y
3,LP001006,Female,2583.0,2358.0,120.0,semi,
4,LP001008,Male,,0.0,141.0,urban,Y


In [4]:
# Select columns by label, then keep the first 5 rows
subsetC = dataset.loc[:, ['Loan_ID', 'ApplicantIncome', 'LoanAmount']].head(5)
subsetC

Unnamed: 0,Loan_ID,ApplicantIncome,LoanAmount
0,LP001002,5849.0,
1,LP001003,4583.0,128.0
2,LP001005,3000.0,66.0
3,LP001006,2583.0,120.0
4,LP001008,,141.0


In [5]:
# preview the first 5 rows (5 is also head()'s default)
dataset.head(n=5)

Unnamed: 0,Loan_ID,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Area,Loan_Status
0,LP001002,,5849.0,0.0,,urban,Y
1,LP001003,Male,4583.0,,128.0,semi,N
2,LP001005,Male,3000.0,0.0,66.0,,Y
3,LP001006,Female,2583.0,2358.0,120.0,semi,
4,LP001008,Male,,0.0,141.0,urban,Y


In [6]:
# preview the first 10 rows
dataset.head(n=10)

Unnamed: 0,Loan_ID,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Area,Loan_Status
0,LP001002,,5849.0,0.0,,urban,Y
1,LP001003,Male,4583.0,,128.0,semi,N
2,LP001005,Male,3000.0,0.0,66.0,,Y
3,LP001006,Female,2583.0,2358.0,120.0,semi,
4,LP001008,Male,,0.0,141.0,urban,Y
5,LP001011,Male,5417.0,4196.0,267.0,semi,Y
6,LP001013,Male,2333.0,1516.0,,rural,Y
7,LP001014,Female,3036.0,2504.0,158.0,semi,N
8,LP001018,Male,4006.0,1526.0,168.0,rural,Y
9,LP001020,Male,12841.0,10968.0,349.0,semi,N


In [7]:
# get the shape of our dataset as a (rows, columns) tuple
dataset.shape

(16, 7)

In [8]:
# list the column labels of our dataset
dataset.columns

Index(['Loan_ID', 'Gender', 'ApplicantIncome', 'CoapplicantIncome',
       'LoanAmount', 'Area', 'Loan_Status'],
      dtype='object')

In [9]:
# count the missing values in each column
dataset.isna().sum()

Loan_ID              0
Gender               1
ApplicantIncome      2
CoapplicantIncome    1
LoanAmount           3
Area                 1
Loan_Status          1
dtype: int64

In [10]:
# keep only the rows that have a value in every column
clean_dataset = dataset.dropna(axis=0, how='any')
print(clean_dataset.shape)
clean_dataset

(9, 7)


Unnamed: 0,Loan_ID,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Area,Loan_Status
5,LP001011,Male,5417.0,4196.0,267.0,semi,Y
7,LP001014,Female,3036.0,2504.0,158.0,semi,N
8,LP001018,Male,4006.0,1526.0,168.0,rural,Y
9,LP001020,Male,12841.0,10968.0,349.0,semi,N
10,LP001024,Female,3200.0,700.0,70.0,urban,Y
11,LP001027,Male,2500.0,1840.0,109.0,urban,Y
13,LP001029,Male,1853.0,2840.0,114.0,urban,N
14,LP001030,Male,1299.0,1086.0,17.0,semi,Y
15,LP001032,Male,4950.0,0.0,125.0,semi,Y


In [11]:
# keep only the rows where Loan_Status is present
clean_dataset = dataset[dataset['Loan_Status'].notna()]
clean_dataset

Unnamed: 0,Loan_ID,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Area,Loan_Status
0,LP001002,,5849.0,0.0,,urban,Y
1,LP001003,Male,4583.0,,128.0,semi,N
2,LP001005,Male,3000.0,0.0,66.0,,Y
4,LP001008,Male,,0.0,141.0,urban,Y
5,LP001011,Male,5417.0,4196.0,267.0,semi,Y
6,LP001013,Male,2333.0,1516.0,,rural,Y
7,LP001014,Female,3036.0,2504.0,158.0,semi,N
8,LP001018,Male,4006.0,1526.0,168.0,rural,Y
9,LP001020,Male,12841.0,10968.0,349.0,semi,N
10,LP001024,Female,3200.0,700.0,70.0,urban,Y


In [12]:
# work on an independent (deep) copy so the original dataset stays untouched
dataset_copy = dataset.copy(deep=True)

In [13]:
# replace missing categorical values with the mode (most frequent value)
cols = ['Gender', 'Area', 'Loan_Status']
# Compute the mode only for the columns being filled. mode() returns one row
# per tied mode, so iloc[0] takes the first mode of each column.
dataset_copy[cols] = dataset_copy[cols].fillna(dataset_copy[cols].mode().iloc[0])
# confirm that the categorical columns no longer contain missing values
dataset_copy.isnull().sum(axis=0)

Loan_ID              0
Gender               0
ApplicantIncome      2
CoapplicantIncome    1
LoanAmount           3
Area                 0
Loan_Status          0
dtype: int64

In [14]:
# replace missing numerical values with the column mean
cols2 = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
# Take the mean of the numeric columns only: the frame still holds string
# columns (e.g. Loan_ID), and DataFrame.mean() over those raises TypeError
# on pandas >= 2.0.
dataset_copy[cols2] = dataset_copy[cols2].fillna(dataset_copy[cols2].mean())
# confirm that no missing values remain in any column
dataset_copy.isnull().sum(axis=0)

Loan_ID              0
Gender               0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Area                 0
Loan_Status          0
dtype: int64

In [15]:
# display the copy after the categorical and numerical imputation steps
dataset_copy

Unnamed: 0,Loan_ID,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Area,Loan_Status
0,LP001002,Male,5849.0,0.0,140.923077,urban,Y
1,LP001003,Male,4583.0,2509.333333,128.0,semi,N
2,LP001005,Male,3000.0,0.0,66.0,semi,Y
3,LP001006,Female,2583.0,2358.0,120.0,semi,Y
4,LP001008,Male,4103.571429,0.0,141.0,urban,Y
5,LP001011,Male,5417.0,4196.0,267.0,semi,Y
6,LP001013,Male,2333.0,1516.0,140.923077,rural,Y
7,LP001014,Female,3036.0,2504.0,158.0,semi,N
8,LP001018,Male,4006.0,1526.0,168.0,rural,Y
9,LP001020,Male,12841.0,10968.0,349.0,semi,N


In [16]:
# Encoding of categorical values

# let's check our data types to see which columns are non-numeric
dataset_copy.dtypes

Loan_ID               object
Gender                object
ApplicantIncome      float64
CoapplicantIncome    float64
LoanAmount           float64
Area                  object
Loan_Status           object
dtype: object

In [17]:
# convert the categorical columns from object dtype to pandas' category dtype
cols = ['Gender', 'Area', 'Loan_Status']
dataset_copy = dataset_copy.astype({name: 'category' for name in cols})
dataset_copy.dtypes

Loan_ID                object
Gender               category
ApplicantIncome       float64
CoapplicantIncome     float64
LoanAmount            float64
Area                 category
Loan_Status          category
dtype: object

In [18]:
# label encoding: swap each category value for its integer code
dataset_copy = dataset_copy.assign(
    **{name: dataset_copy[name].cat.codes for name in cols}
)

dataset_copy

Unnamed: 0,Loan_ID,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Area,Loan_Status
0,LP001002,1,5849.0,0.0,140.923077,2,1
1,LP001003,1,4583.0,2509.333333,128.0,1,0
2,LP001005,1,3000.0,0.0,66.0,1,1
3,LP001006,0,2583.0,2358.0,120.0,1,1
4,LP001008,1,4103.571429,0.0,141.0,2,1
5,LP001011,1,5417.0,4196.0,267.0,1,1
6,LP001013,1,2333.0,1516.0,140.923077,0,1
7,LP001014,0,3036.0,2504.0,158.0,1,0
8,LP001018,1,4006.0,1526.0,168.0,0,1
9,LP001020,1,12841.0,10968.0,349.0,1,0


In [19]:
# creating dummy variables (one-hot encoding)

# Start from the original frame without Loan_ID: it is an object column, so
# get_dummies would otherwise expand it into one indicator column per ID.
#
# get_dummies one-hot encodes the object/category columns and passes numeric
# columns through unchanged. dtype='uint8' pins the indicators to 0/1 integers
# on every pandas version (the default became bool in pandas 2.0).
#
# Missing values are not imputed here: a row whose category is NaN gets 0 in
# every indicator column for that feature, and numeric NaNs stay NaN.
dataset2 = pd.get_dummies(dataset.drop(columns=['Loan_ID']), dtype='uint8')
dataset2

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Gender_Female,Gender_Male,Area_rural,Area_semi,Area_urban,Loan_Status_N,Loan_Status_Y
0,5849.0,0.0,,0,0,0,0,1,0,1
1,4583.0,,128.0,0,1,0,1,0,1,0
2,3000.0,0.0,66.0,0,1,0,0,0,0,1
3,2583.0,2358.0,120.0,1,0,0,1,0,0,0
4,,0.0,141.0,0,1,0,0,1,0,1
5,5417.0,4196.0,267.0,0,1,0,1,0,0,1
6,2333.0,1516.0,,0,1,1,0,0,0,1
7,3036.0,2504.0,158.0,1,0,0,1,0,1,0
8,4006.0,1526.0,168.0,0,1,1,0,0,0,1
9,12841.0,10968.0,349.0,0,1,0,1,0,1,0
