In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
# plt must be the pyplot interface; the bare matplotlib package does not
# provide plt.figure(), plt.show(), etc.
import matplotlib.pyplot as plt
%matplotlib inline

## Loading the dataset 

In [None]:
# Read the Kaggle loan-prediction CSV into df and preview its first rows
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv"
)
df.head(n=5)

In [None]:
# Display pandas' summary statistics for df
df.describe()

In [None]:
# Print pandas' structural summary of df (columns, dtypes, non-null counts)
df.info()

## Preprocessing of Dataset

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Numerical columns: replace missing entries with the column mean.
# NOTE(review): Credit_History is mean-filled as well; if it is a 0/1 flag this
# introduces fractional values - confirm that this is intended.
for num_col in ('LoanAmount', 'Loan_Amount_Term', 'Credit_History'):
    df[num_col] = df[num_col].fillna(df[num_col].mean())

In [None]:
# Categorical columns: replace missing entries with the most frequent value.
for cat_col in ('Gender', 'Married', 'Dependents', 'Self_Employed'):
    most_frequent = df[cat_col].mode()[0]
    df[cat_col] = df[cat_col].fillna(most_frequent)

In [None]:
# Re-count the missing values per column after the fills above
df.isna().sum()

## Exploratory Data Analysis

### Categorical Attributes Visualization

In [None]:
# Pass the column by keyword: newer seaborn treats the first positional
# argument as `data`, not as the x variable.
sns.countplot(x='Gender', data=df)

In [None]:
# Pass the column by keyword: newer seaborn treats the first positional
# argument as `data`, not as the x variable.
sns.countplot(x='Dependents', data=df)

In [None]:
# Pass the column by keyword: newer seaborn treats the first positional
# argument as `data`, not as the x variable.
sns.countplot(x='Education', data=df)

In [None]:
# Pass the column by keyword: newer seaborn treats the first positional
# argument as `data`, not as the x variable.
sns.countplot(x='Self_Employed', data=df)

In [None]:
# Pass the column by keyword: newer seaborn treats the first positional
# argument as `data`, not as the x variable.
sns.countplot(x='Property_Area', data=df)

In [None]:
# Pass the column by keyword: newer seaborn treats the first positional
# argument as `data`, not as the x variable.
sns.countplot(x='Loan_Status', data=df)

### Numerical attributes visualization

In [None]:
# sns.distplot is deprecated; a density-scaled histogram with a KDE overlay
# is its histplot equivalent.
sns.histplot(df['ApplicantIncome'], kde=True, stat='density')

In [None]:
# Preview the effect of a log transformation without modifying df.
# The column itself is transformed once, with log(x + 1), in a later cell;
# assigning np.log(...) back to df here as well would log-transform it twice.
np.log(df['ApplicantIncome']).describe()

In [None]:
# Distribution of ApplicantIncome as it stands at this point in the notebook.
# sns.distplot is deprecated; histplot with a KDE overlay replaces it.
sns.histplot(df['ApplicantIncome'], kde=True, stat='density')

In [None]:
# Overwrite ApplicantIncome with log(1 + value)
df['ApplicantIncome'] = np.log(1 + df['ApplicantIncome'])

In [None]:
# Distribution of ApplicantIncome after the log(1 + value) step.
# sns.distplot is deprecated; histplot with a KDE overlay replaces it.
sns.histplot(df['ApplicantIncome'], kde=True, stat='density')

In [None]:
# sns.distplot is deprecated; a density-scaled histogram with a KDE overlay
# is its histplot equivalent.
sns.histplot(df['CoapplicantIncome'], kde=True, stat='density')

In [None]:
# Overwrite CoapplicantIncome with log(1 + value)
df['CoapplicantIncome'] = np.log(1 + df['CoapplicantIncome'])

In [None]:
# Distribution of CoapplicantIncome after the log(1 + value) step.
# sns.distplot is deprecated; histplot with a KDE overlay replaces it.
sns.histplot(df['CoapplicantIncome'], kde=True, stat='density')

In [None]:
# sns.distplot is deprecated; a density-scaled histogram with a KDE overlay
# is its histplot equivalent.
sns.histplot(df['LoanAmount'], kde=True, stat='density')

In [None]:
# Overwrite LoanAmount with log(1 + value)
df['LoanAmount'] = np.log(1 + df['LoanAmount'])

In [None]:
# Distribution of LoanAmount after the log(1 + value) step.
# sns.distplot is deprecated; histplot with a KDE overlay replaces it.
sns.histplot(df['LoanAmount'], kde=True, stat='density')

In [None]:
# sns.distplot is deprecated; a density-scaled histogram with a KDE overlay
# is its histplot equivalent.
sns.histplot(df['Loan_Amount_Term'], kde=True, stat='density')

In [None]:
# Overwrite Loan_Amount_Term with its natural log (no +1 offset is applied here)
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].pipe(np.log)

In [None]:
# Distribution of Loan_Amount_Term after the log step.
# sns.distplot is deprecated; histplot with a KDE overlay replaces it.
sns.histplot(df['Loan_Amount_Term'], kde=True, stat='density')

In [None]:
# sns.distplot is deprecated; a density-scaled histogram with a KDE overlay
# is its histplot equivalent.
sns.histplot(df['Credit_History'], kde=True, stat='density')

## Correlation Matrix

In [None]:
# Correlate the numeric columns only: the categorical columns are not
# label-encoded until later, and DataFrame.corr() raises on string-valued
# columns in recent pandas versions.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True, cmap='BuPu')

In [None]:
# Preview the first five rows of df
df.head(n=5)

In [None]:
# Remove the Loan_ID and CoapplicantIncome columns, then preview the result
cols = ['Loan_ID', 'CoapplicantIncome']
df = df.drop(columns=cols)
df.head(n=5)

## Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns whose values are replaced by integer codes
cols = [
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
    'Loan_Status',
    'Dependents',
]
le = LabelEncoder()
# The same encoder object is re-fitted on each column in turn
for name in cols:
    df[name] = le.fit_transform(df[name])

In [None]:
# Preview the first five rows after label encoding
df.head(n=5)

## Train-Test Split

In [None]:
# Target is the Loan_Status column; inputs are all remaining columns
y = df['Loan_Status']
X = df.drop(columns='Loan_Status')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows using a fixed seed.
# The mixed-case names (X_train / x_test) are kept exactly as originally defined.
X_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

## Model Training

In [None]:
# classify function
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
def classify(model, x, y, test_size=0.2, random_state=42, cv=5):
    """Fit ``model`` on a hold-out split and print its evaluation results.

    Prints the hold-out score and the mean cross-validation score (both
    multiplied by 100), then passes the hold-out predictions to
    ``confusionmatrix`` for printing and plotting.

    Parameters
    ----------
    model : estimator providing ``fit``, ``score`` and ``predict``.
    x : input features.
    y : target values.
    test_size : share of the data held out for testing (default 0.2).
    random_state : seed of the shuffled hold-out split (default 42).
    cv : ``cv`` argument forwarded to ``cross_val_score`` (default 5).

    Returns
    -------
    None
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state, shuffle=True
    )
    model.fit(x_train, y_train)
    print("Accuracy is ", model.score(x_test, y_test) * 100)
    # Cross-validation is run on the full x / y, not only on the training split
    score = cross_val_score(model, x, y, cv=cv)
    print("Cross validation is ", np.mean(score) * 100)
    y_pred = model.predict(x_test)
    confusionmatrix(y_pred, y_test)
def confusionmatrix(y_pred, y_test):
    """Print the confusion matrix and draw it as an annotated heatmap.

    Note the argument order: predictions first, true labels second. They are
    forwarded to ``confusion_matrix`` as ``(y_test, y_pred)``.
    """
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    # fmt='d' renders the cell counts as plain integers; the default annotation
    # format would show larger counts in scientific notation (e.g. 1e+02).
    sns.heatmap(cm, annot=True, fmt='d')

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the library's default settings
model = LogisticRegression()
classify(model, x=X, y=y)

In [None]:
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier

# random_state pins the forest's randomness so the printed scores are
# reproducible from one run to the next.
model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=25,
    max_depth=7,
    max_features=1,
    random_state=42,
)
classify(model,X,y)

In [None]:
# random_state pins the ensemble's randomness so the printed scores are
# reproducible from one run to the next.
model = ExtraTreesClassifier(
    n_estimators=100,
    min_samples_split=25,
    max_depth=7,
    max_features=1,
    random_state=42,
)
classify(model,X,y)

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees configured through keyword arguments.
model = XGBClassifier(
    learning_rate=0.01,
    n_estimators=250,
    max_depth=5,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    n_jobs=4,            # scikit-learn style name for the former `nthread`
    scale_pos_weight=1,
    use_label_encoder=False,
    # 'logloss' is the binary log loss matching objective='binary:logistic';
    # 'mlogloss' is the multiclass variant.
    eval_metric='logloss',
    random_state=45,     # scikit-learn style name for the former `seed`
)
classify(model,X,y)