In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Read the loan-prediction CSV from the Kaggle input folder and preview the first five rows.
data = pd.read_csv(
    filepath_or_buffer='../input/loan-prediction-analysis/Loan Prediction Dataset.csv'
)
data.head(n=5)

In [3]:
# Display summary statistics of `data` (output of DataFrame.describe()).
data.describe()

In [4]:
# Count the missing values in each column before any imputation.
data.isna().sum()

In [5]:
# Impute missing values column by column.

# Loan amount and term: replace gaps with the column mean.
data['LoanAmount'] = data['LoanAmount'].fillna(data['LoanAmount'].mean())
data['Loan_Amount_Term'] = data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].mean())

# `mode()` returns a Series, and `fillna(Series)` aligns on the index, so passing
# it directly would fill at most the row labelled 0. Take the scalar most-frequent
# value instead so every missing entry is filled.
data['Married'] = data['Married'].fillna(data['Married'].mode().iloc[0])

# Treat a missing dependents value as "no dependents".
# NOTE(review): assumes Dependents is read as text labels (e.g. '0', '3+'), so the
# fill value is the string '0' rather than the integer 0 to avoid mixing types
# within the column -- confirm against the CSV.
data['Dependents'] = data['Dependents'].fillna('0')

In [6]:
# (rows, columns) of `data` before rows with missing values are dropped.
data.shape

In [7]:
# Gender cannot be imputed by guessing, so rows with missing values are dropped.
# Note: dropna(axis=0) with no `subset` removes every row that has a missing value
# in ANY column, not only the rows where Gender is missing.
data = data.dropna(axis=0)

In [8]:
# Re-check the per-column missing-value counts after imputation and row removal.
data.isna().sum()

In [9]:
# (rows, columns) of `data` after rows with missing values were dropped.
data.shape

In [10]:
# Remove the Loan_ID column from the working frame.
data = data.drop(columns=['Loan_ID'])

Checking the labels in each categorical column and how often each value occurs

In [11]:
# Print the label frequencies of four columns, one headed section per column.
sections = [
    ('....gender....\n', 'Gender'),
    ('\n....Self_Employed....\n', 'Self_Employed'),
    ('\n....Credit history....\n', 'Credit_History'),
    ('\n....Loan_Status....\n', 'Loan_Status'),
]
for header, column in sections:
    print(header, data[column].value_counts())

# Observation from the original run: only 353 loans were approved versus 170 not approved.

visualization

In [12]:
import warnings 
# Ignore all warnings from here on (no category is given, so every warning class is
# affected -- this also hides deprecation notices raised by the libraries used below).
warnings.simplefilter("ignore")


In [13]:
# Loan_Status counts within each Self_Employed group.
data.groupby('Self_Employed')['Loan_Status'].value_counts()

In [14]:
# Count plot of Loan_Status within each Self_Employed group.
plt.figure(figsize=(4, 4))
# Pass the frame and column names by keyword: in seaborn >= 0.12 the first
# positional argument of countplot is `data`, not `x`.
sns.countplot(data=data, x='Self_Employed', hue='Loan_Status')
plt.title("self employed vs loan status")
plt.show()  # must be called; a bare `plt.show` only echoes the function object

# Observation: people who are not self-employed received the higher number of loan approvals.

In [15]:
# Loan_Status counts within each Education group, as a table and as a count plot.
print(data.groupby('Education')['Loan_Status'].value_counts())

plt.figure(figsize=(4, 4))
# Pass the frame and column names by keyword: in seaborn >= 0.12 the first
# positional argument of countplot is `data`, not `x`.
sns.countplot(data=data, x='Education', hue='Loan_Status')
plt.title("education vs loan status")
plt.show()  # must be called; a bare `plt.show` only echoes the function object

In [16]:
# Loan_Status counts within each Dependents group, as a table and as a count plot.
print(data.groupby('Dependents')['Loan_Status'].value_counts())

plt.figure(figsize=(5, 5))
# Pass the frame and column names by keyword: in seaborn >= 0.12 the first
# positional argument of countplot is `data`, not `x`.
sns.countplot(data=data, x='Dependents', hue='Loan_Status')
plt.title("dependents vs loan status")
plt.show()  # must be called; a bare `plt.show` only echoes the function object

# Observation: the approval percentage is almost the same for every label, so the
# number of dependents does not make much difference.


In [17]:
# Loan_Status counts within each Credit_History group, as a table and as a count plot.
print(data.groupby('Credit_History')['Loan_Status'].value_counts())

# Observation: the chance of getting a loan is very low without any credit history.

plt.figure(figsize=(5, 5))
# Pass the frame and column names by keyword: in seaborn >= 0.12 the first
# positional argument of countplot is `data`, not `x`.
sns.countplot(data=data, x='Credit_History', hue='Loan_Status')
plt.title("credit history vs loan status")
plt.show()  # must be called; a bare `plt.show` only echoes the function object

In [18]:
# Remove the three columns that are not used as model features, in a single call.
data = data.drop(columns=['Property_Area', 'Gender', 'Married'])

In [19]:
# Preview the frame after the column drops.
data.head()

In [20]:

# Cast every column of `data` to string, then print the frame summary.
# NOTE(review): this also stringifies the numeric columns (incomes, loan amount,
# loan term); any label encoding applied to them afterwards would order the
# values lexicographically rather than numerically -- confirm this is intended.
data= data.astype('str')
data.info()

In [21]:

# Columns handed to the encoding step below, in processing order.
cat_data = [
    'Dependents',
    'Education',
    'Self_Employed',
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Loan_Status',
]

In [22]:
from sklearn.preprocessing import LabelEncoder

# Columns in `cat_data` that hold real quantities. The frame was cast to strings
# earlier, and label-encoding stringified numbers assigns codes in lexicographic
# order ('100' < '20' < '3'), which destroys their magnitude. These columns are
# therefore converted back to numbers instead of being label-encoded.
numeric_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

enc = LabelEncoder()
for col in cat_data:
    if col in numeric_cols:
        # Restore the numeric value from its string form.
        data[col] = pd.to_numeric(data[col])
    else:
        # Genuine categorical column: map each label to an integer code.
        data[col] = enc.fit_transform(data[col])


In [23]:
# Preview the frame after the encoding step.
data.head()

In [24]:
# Features: every column except the target. Target: the Loan_Status column.
x = data.drop(columns=['Loan_Status'])
y = data.loc[:, 'Loan_Status']

In [70]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [26]:
from sklearn.model_selection import cross_val_score, train_test_split


def model_train(model, x, y):
    """Fit `model` on an 80/20 split of (x, y) and print two scores.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``score`` (scikit-learn style).
    x : feature table passed to ``train_test_split`` and ``cross_val_score``.
    y : target values aligned with ``x``.

    Prints
    ------
    "model score"     : ``model.score`` on the held-out 20%, times 100.
    "cross val score" : mean of a 5-fold ``cross_val_score`` over all of
                        (x, y), times 100.

    Returns
    -------
    None. After the call, `model` is left fitted on the 80% training split.
    """
    # Same split parameters on every call (random_state=0), so all models
    # are scored on identical hold-out rows.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
    model.fit(x_train, y_train)

    print("model score  ", model.score(x_test, y_test)*100)

    cross_val = cross_val_score(model, x, y, cv=5)
    print("cross val score  ", np.mean(cross_val)*100)

In [27]:
from sklearn.linear_model import LogisticRegression
# Baseline: logistic regression with default settings, scored by model_train.
model = LogisticRegression()

model_train(model,x,y)

In [28]:
from sklearn.ensemble import RandomForestClassifier
# Baseline random forest. The forest is randomized, so a fixed random_state is
# needed for the printed scores to be repeatable between runs.
model = RandomForestClassifier(random_state=0)
model_train(model, x, y)

In [32]:
from sklearn.tree import DecisionTreeClassifier
# Baseline decision tree. A fixed random_state keeps the printed scores repeatable
# between runs (the tree's split selection can involve random tie-breaking).
model = DecisionTreeClassifier(random_state=0)

model_train(model, x, y)

In [35]:
from xgboost import XGBClassifier

# Baseline: XGBoost classifier with default settings, scored by model_train.
model = XGBClassifier()

model_train(model,x,y)

In [42]:
from catboost import CatBoostClassifier
# Baseline: CatBoost classifier with default settings, scored by model_train.
# NOTE(review): CatBoost presumably logs every boosting iteration by default, for
# each of the fits model_train triggers -- consider a quieter verbosity setting.
cb = CatBoostClassifier()

model_train(cb,x,y)

hyperparameter tuning 

In [44]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for the random-forest randomized search.

# Number of trees: 100, 200, ..., 1200.
n_estimators = list(range(100, 1201, 100))
# 'auto' was removed from scikit-learn (1.3) and was only an alias of 'sqrt' for
# classifiers, so the old list held one effective option. Use two distinct ones.
max_features = ['sqrt', 'log2']
# Maximum tree depth: 5, 10, ..., 30.
max_depth = list(range(5, 31, 5))
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]


In [45]:
# Parameter distributions for the random-forest search, keyed by estimator argument name.
grid_search = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)


In [47]:
# Randomized hyperparameter search for the random forest over `grid_search`.
# Seed the estimator as well: the search's random_state only fixes which parameter
# combinations are sampled, not the forest's own randomness.
random_forest = RandomForestClassifier(random_state=42)

random_forest = RandomizedSearchCV(
    estimator=random_forest,
    param_distributions=grid_search,
    scoring='accuracy',
    cv=5,       # 5-fold cross-validation per candidate
    n_iter=10,  # number of sampled parameter combinations
    n_jobs=-1,
    random_state=42,
    verbose=2,
)
# Fit on the training split only, so the rows in x_test stay unseen by the tuned model.
random_forest.fit(x_train, y_train)


In [49]:
# Parameter combination selected by the random-forest search.
random_forest.best_params_

In [57]:
# Best score found by the random-forest search (scoring='accuracy', cv=5).
random_forest.best_score_

In [51]:
# Base decision tree for the randomized search. Because max_features restricts the
# candidate features per split, the tree is randomized; seed it so results repeat.
dt = DecisionTreeClassifier(random_state=42)


# Candidate values for the decision-tree randomized search.
# 'auto' was removed from scikit-learn (1.3) and was only an alias of 'sqrt' for
# classifiers, so the old list held one effective option. Use two distinct ones.
max_features = ['sqrt', 'log2']
# Maximum tree depth: 5, 10, ..., 30.
max_depth = list(range(5, 31, 5))
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

In [52]:
# Parameter distributions for the decision-tree search, keyed by estimator argument name.
grid_search = dict(
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)


In [53]:
# Randomized hyperparameter search for the decision tree over `grid_search`.
dt = RandomizedSearchCV(
    estimator=dt,
    param_distributions=grid_search,
    scoring='accuracy',
    cv=5,       # 5-fold cross-validation per candidate
    n_iter=10,  # number of sampled parameter combinations
    n_jobs=-1,
    random_state=42,
    verbose=2,
)
# Fit on the training split only: this search is later scored on x_test, and
# fitting on all of (x, y) would let it see those rows during training.
dt.fit(x_train, y_train)


In [54]:
# Parameter combination selected by the decision-tree search.
dt.best_params_

In [56]:
# Best score found by the decision-tree search (scoring='accuracy', cv=5).
dt.best_score_

In [60]:
from scipy.stats import uniform, randint

# Base XGBoost classifier (default settings) that the randomized search below tunes.
xgb = XGBClassifier()



In [61]:
# Sampling distributions for the XGBoost randomized search.
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from low to high - 1.
params = {
    # The XGBoost parameter name is lowercase 'gamma'; a capitalised 'Gamma' key
    # is not a recognised parameter, so it would never actually be tuned.
    'gamma': uniform(0, 0.5),            # [0, 0.5]
    'learning_rate': uniform(0.03, 0.3),  # [0.03, 0.33]
    'max_depth': randint(2, 6),           # 2..5
    'n_estimators': randint(100, 150),    # 100..149
    'subsample': uniform(0.6, 0.4),       # [0.6, 1.0]
}


In [62]:
# Randomized hyperparameter search for the XGBoost classifier over `params`.
xgboost = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=params,
    scoring='accuracy',
    cv=5,       # 5-fold cross-validation per candidate
    n_iter=10,  # number of sampled parameter combinations
    n_jobs=-1,
    random_state=42,
    verbose=2,
)
# Fit on the training split only: this search is later scored on x_test, and
# fitting on all of (x, y) would let it see those rows during training.
xgboost.fit(x_train, y_train)


In [63]:
# Parameter combination selected by the XGBoost search.
xgboost.best_params_

In [64]:
# Best score found by the XGBoost search (scoring='accuracy', cv=5).
xgboost.best_score_

confusion matrix

In [67]:
from sklearn.metrics import confusion_matrix

In [75]:
# Confusion matrix of the tuned XGBoost search's predictions on x_test.
y_pred = xgboost.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# fmt='d' prints the integer counts verbatim; the default annotation format can
# switch to scientific notation for counts of 100 or more.
ax = sns.heatmap(cm, annot=True, fmt='d')
# confusion_matrix(y_true, y_pred) puts true labels on rows, predictions on columns.
ax.set(xlabel='predicted label', ylabel='true label', title='XGBoost confusion matrix')
plt.show()

In [76]:
# Confusion matrix of the tuned decision-tree search's predictions on x_test.
y_pred = dt.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# fmt='d' prints the integer counts verbatim; the default annotation format can
# switch to scientific notation for counts of 100 or more.
ax = sns.heatmap(cm, annot=True, fmt='d')
# confusion_matrix(y_true, y_pred) puts true labels on rows, predictions on columns.
ax.set(xlabel='predicted label', ylabel='true label', title='Decision tree confusion matrix')
plt.show()