# Loan Regression

## import libraries and read dataset

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
# Load the loans CSV and promote client_id to the row index.
# Note: client_id repeats (one row per loan), so the resulting index is not unique.

df = pd.read_csv('./loans.csv').set_index("client_id")
df.head()

Unnamed: 0_level_0,loan_type,loan_amount,repaid,loan_id,loan_start,loan_end,rate
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
46109,home,13672,0,10243,2002-04-16,2003-12-20,2.15
46109,credit,9794,0,10984,2003-10-21,2005-07-17,1.25
46109,home,12734,1,10990,2006-02-01,2007-07-05,0.68
46109,cash,12518,1,10596,2010-12-08,2013-05-05,1.24
46109,credit,14049,1,11415,2010-07-07,2012-05-21,3.13


### exploration of the dataset

In [22]:
# Report the (rows, columns) shape followed by the column labels, one per line of output.

print(df.shape, df.columns, sep='\n')

(443, 7)
Index(['loan_type', 'loan_amount', 'repaid', 'loan_id', 'loan_start',
       'loan_end', 'rate'],
      dtype='object')


In [23]:
# Inspect the dtype pandas inferred for each column.
# loan_start and loan_end come back as object dtype here, not as datetimes.

df.dtypes

loan_type       object
loan_amount      int64
repaid           int64
loan_id          int64
loan_start      object
loan_end        object
rate           float64
dtype: object

In [24]:
# Print a summary of the frame: index range, per-column non-null counts,
# dtypes and approximate memory usage.

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 443 entries, 46109 to 26945
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   loan_type    443 non-null    object 
 1   loan_amount  443 non-null    int64  
 2   repaid       443 non-null    int64  
 3   loan_id      443 non-null    int64  
 4   loan_start   443 non-null    object 
 5   loan_end     443 non-null    object 
 6   rate         443 non-null    float64
dtypes: float64(1), int64(3), object(3)
memory usage: 27.7+ KB


In [25]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# repaid and loan_id are stored as integers, so they are included here as well.

df.describe()

Unnamed: 0,loan_amount,repaid,loan_id,rate
count,443.0,443.0,443.0,443.0
mean,7982.311512,0.534989,11017.10158,3.217156
std,4172.891992,0.499338,581.826222,2.397168
min,559.0,0.0,10009.0,0.01
25%,4232.5,0.0,10507.5,1.22
50%,8320.0,1.0,11033.0,2.78
75%,11739.0,1.0,11526.0,4.75
max,14971.0,1.0,11991.0,12.62


In [26]:
# Summary (count, unique, top, freq) for the non-numeric columns;
# exclude=[np.number] leaves out every numeric dtype.

df.describe(exclude=[np.number])

Unnamed: 0,loan_type,loan_start,loan_end
count,443,443,443
unique,4,430,428
top,home,2007-05-16,2008-08-29
freq,121,2,2


In [27]:
# Count the number of distinct values in each column.

df.nunique()

loan_type        4
loan_amount    438
repaid           2
loan_id        443
loan_start     430
loan_end       428
rate           336
dtype: int64

In [28]:
# Frequency of each loan_type value (how many rows fall into each category).

df['loan_type'].value_counts()

loan_type
home      121
cash      108
credit    107
other     107
Name: count, dtype: int64

In [29]:
# Frequency of each repaid value (0 / 1), i.e. how the rows split between the two.

df['repaid'].value_counts()

repaid
1    237
0    206
Name: count, dtype: int64

In [30]:
# Tally the missing entries in every column (0 means the column is complete).

df.isna().sum()

loan_type      0
loan_amount    0
repaid         0
loan_id        0
loan_start     0
loan_end       0
rate           0
dtype: int64

#### remove the loan_id feature

In [31]:
# Drop loan_id: it takes a distinct value on every row (443 unique values in
# 443 rows), i.e. it is a pure row identifier.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because loan_id has already been removed from df.

df = df.drop(columns=['loan_id'], errors='ignore')
df.head()

Unnamed: 0_level_0,loan_type,loan_amount,repaid,loan_start,loan_end,rate
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
46109,home,13672,0,2002-04-16,2003-12-20,2.15
46109,credit,9794,0,2003-10-21,2005-07-17,1.25
46109,home,12734,1,2006-02-01,2007-07-05,0.68
46109,cash,12518,1,2010-12-08,2013-05-05,1.24
46109,credit,14049,1,2010-07-07,2012-05-21,3.13


#### change the data types

In [32]:
# Give each column a dtype that matches its meaning:
#   - repaid and loan_type hold a handful of repeated labels -> category
#   - loan_start and loan_end hold date strings              -> datetime64
# The conversion is written as one chain that yields a new frame.

df = df.astype({'repaid': 'category', 'loan_type': 'category'}).assign(
    loan_start=lambda frame: pd.to_datetime(frame['loan_start']),
    loan_end=lambda frame: pd.to_datetime(frame['loan_end']),
)

df.dtypes

loan_type            category
loan_amount             int64
repaid               category
loan_start     datetime64[ns]
loan_end       datetime64[ns]
rate                  float64
dtype: object