# CA_01: Import a CSV file into a DataFrame and preprocess the dataset

Description:

This notebook imports a bank's loan-eligibility dataset and preprocesses it. The dataset records whether each account holder is eligible or not eligible to avail a loan from the bank, based on the account holder's account balance.

### Loading the dataset

In [1]:
# Importing required library
import pandas as pd

In [2]:
# Read the loan-eligibility CSV, keeping every column as text for now
df = pd.read_csv(
    "Loan_Eligibility.csv",
    dtype="str",
)

### Printing some Properties of the dataset

In [3]:
# Preview the first five rows of the dataset
df.head(n=5)

Unnamed: 0,Acc_No,Acc_Balance,Loan_Eligibility
0,4561237895,247126336.0,Yes
1,1596789425,44552007.0,Yes
2,4478945574,15466636.0,Yes
3,4478945574,15466636.0,Yes
4,2563256322,3998520.0,Yes


In [4]:
# Preview the last five rows of the dataset
df.tail(n=5)

Unnamed: 0,Acc_No,Acc_Balance,Loan_Eligibility
27,125879634,25622005.0,Yes
28,2365853223,2552220.0,No
29,2478963356,45000217.0,Yes
30,4587522652,750000.0,Yes
31,3644795562,457800.0,No


In [5]:
# Display the dimensions of the dataset as a (rows, columns) tuple
df.shape

(32, 3)

In [6]:
# Display the total number of elements in the dataset (rows x columns)
df.size

96

In [7]:
# Display the column (feature) labels of the dataset
df.columns

Index(['Acc_No', 'Acc_Balance', 'Loan_Eligibility'], dtype='object')

In [8]:
# Show a concise summary of the dataset: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Acc_No            32 non-null     object
 1   Acc_Balance       30 non-null     object
 2   Loan_Eligibility  32 non-null     object
dtypes: object(3)
memory usage: 896.0+ bytes


In [9]:
# Show descriptive statistics for every column. All columns are still text at
# this point, so only count / unique / top / freq are reported.
df.describe(include='all')

Unnamed: 0,Acc_No,Acc_Balance,Loan_Eligibility
count,32,30.0,32
unique,27,24.0,2
top,4478945574,72300.0,Yes
freq,2,3.0,24


### Feature Selection

In [10]:
# Keep only the balance and the eligibility label; Acc_No is dropped
df = df.loc[:, ["Acc_Balance", "Loan_Eligibility"]]

In [11]:
# Preview the first five rows to confirm the column selection
df.head(n=5)

Unnamed: 0,Acc_Balance,Loan_Eligibility
0,247126336.0,Yes
1,44552007.0,Yes
2,15466636.0,Yes
3,15466636.0,Yes
4,3998520.0,Yes


### Changing datatype of a Feature

In [12]:
# Cast Acc_Balance from text to float so it can be summarised and scaled.
# Loan_Eligibility is deliberately not re-cast: it is already text (the CSV
# was read with dtype='str'), and astype(str) would turn any missing label
# into the literal string "nan", hiding it from the null checks that follow.
df = df.astype({"Acc_Balance": float})

In [13]:
# Show the dataset summary again to confirm the dtype change
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Acc_Balance       30 non-null     float64
 1   Loan_Eligibility  32 non-null     object 
dtypes: float64(1), object(1)
memory usage: 640.0+ bytes


In [14]:
# Show descriptive statistics again; the numeric column now also reports
# mean, std, min and quartiles
df.describe(include='all')

Unnamed: 0,Acc_Balance,Loan_Eligibility
count,30.0,32
unique,,2
top,,Yes
freq,,24
mean,190725900.0,
std,636724600.0,
min,22323.0,
25%,1659421.0,
50%,5106416.0,
75%,25622000.0,


### Handling Missing Values

In [15]:
# Boolean mask of the dataset: True where a value is missing, False otherwise
df.isna()

Unnamed: 0,Acc_Balance,Loan_Eligibility
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
5,False,False
6,False,False
7,False,False
8,False,False
9,False,False


In [16]:
# Count the missing values in each column
df.isna().sum()

Acc_Balance         2
Loan_Eligibility    0
dtype: int64

In [17]:
# Total number of missing values across the whole dataset
df.isna().to_numpy().sum()

2

In [18]:
# Fill missing account balances with 0. The fill is limited to Acc_Balance so
# that a missing value in any other column is not silently turned into 0.
df = df.fillna({"Acc_Balance": 0})

In [19]:
# Re-count the missing values to confirm none remain
df.isna().to_numpy().sum()

0

### Handling String Data

In [20]:
# Encode "Yes" as 1 in the Loan_Eligibility column only, and rebind df to the
# result instead of mutating the frame in place
df = df.assign(
    Loan_Eligibility=df["Loan_Eligibility"].replace(to_replace="Yes", value=1)
)

In [21]:
# Encode "No" as 0 in the Loan_Eligibility column only, and rebind df to the
# result instead of mutating the frame in place
df = df.assign(
    Loan_Eligibility=df["Loan_Eligibility"].replace(to_replace="No", value=0)
)

In [22]:
# Preview the first five rows to confirm the label encoding
df.head(n=5)

Unnamed: 0,Acc_Balance,Loan_Eligibility
0,247126336.0,1
1,44552007.0,1
2,15466636.0,1
3,15466636.0,1
4,3998520.0,1


In [23]:
# Look up the dtype of the Loan_Eligibility column after encoding
df.dtypes["Loan_Eligibility"]

dtype('int64')

### Standardization (z-score standardization)

In [24]:
# Importing required library
from sklearn.preprocessing import StandardScaler
    
# Creating object for StandardScaler (z-score standardization)
scaling = StandardScaler()

# Fit the scaler on both columns and display the standardized values.
# NOTE(review): Loan_Eligibility is the encoded 0/1 label and is standardized
# too, and the result is only displayed, not stored - confirm both are intended.
scaling.fit_transform(df[["Acc_Balance", "Loan_Eligibility"]])

array([[ 0.11238835,  0.57735027],
       [-0.22084825,  0.57735027],
       [-0.26869395,  0.57735027],
       [-0.26869395,  0.57735027],
       [-0.2875591 ,  0.57735027],
       [-0.29389681,  0.57735027],
       [-0.28558207,  0.57735027],
       [-0.29159412,  0.57735027],
       [-0.29409998, -1.73205081],
       [-0.25753717,  0.57735027],
       [-0.28992018,  0.57735027],
       [-0.21640036,  0.57735027],
       [ 3.86217969,  0.57735027],
       [ 3.86217969,  0.57735027],
       [-0.13084927,  0.57735027],
       [-0.2941367 , -1.73205081],
       [-0.29401777, -1.73205081],
       [-0.29401777, -1.73205081],
       [-0.29401777, -1.73205081],
       [-0.2864673 ,  0.57735027],
       [-0.28982131,  0.57735027],
       [-0.29084539,  0.57735027],
       [-0.25370419,  0.57735027],
       [-0.25370419,  0.57735027],
       [-0.28589113,  0.57735027],
       [-0.2941367 , -1.73205081],
       [-0.25198827,  0.57735027],
       [-0.25198827,  0.57735027],
       [-0.28993828,

### Normalization (MinMaxNormalization)

In [25]:
# Importing required library
from sklearn.preprocessing import MinMaxScaler
    
# Creating object for MinMaxScaler (min-max normalization)
norm = MinMaxScaler()

# Fit the scaler on both columns and display the normalized values.
# NOTE(review): the result is only displayed, not stored, and the 0/1 label
# column is passed through the scaler as well - confirm both are intended.
norm.fit_transform(df[["Acc_Balance", "Loan_Eligibility"]])

array([[9.78089770e-02, 1.00000000e+00],
       [1.76330305e-02, 1.00000000e+00],
       [6.12146755e-03, 1.00000000e+00],
       [6.12146755e-03, 1.00000000e+00],
       [1.58255554e-03, 1.00000000e+00],
       [5.77173739e-05, 1.00000000e+00],
       [2.05822379e-03, 1.00000000e+00],
       [6.11738858e-04, 1.00000000e+00],
       [8.83511580e-06, 0.00000000e+00],
       [8.80576119e-03, 1.00000000e+00],
       [1.01448565e-03, 1.00000000e+00],
       [1.87031830e-02, 1.00000000e+00],
       [1.00000000e+00, 1.00000000e+00],
       [1.00000000e+00, 1.00000000e+00],
       [3.92865743e-02, 1.00000000e+00],
       [0.00000000e+00, 0.00000000e+00],
       [2.86152790e-05, 0.00000000e+00],
       [2.86152790e-05, 0.00000000e+00],
       [2.86152790e-05, 0.00000000e+00],
       [1.84523905e-03, 1.00000000e+00],
       [1.03827235e-03, 1.00000000e+00],
       [7.91881736e-04, 1.00000000e+00],
       [9.72796791e-03, 1.00000000e+00],
       [9.72796791e-03, 1.00000000e+00],
       [1.983864