In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn/sklearn
# that signal upcoming breakage in the cells below.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [None]:
# Load the train and test CSV files from the dataset input directory
# (paths are relative to the notebook's working directory).
train_df = pd.read_csv('../input/loan-eligible-dataset/loan-train.csv')
test_df = pd.read_csv('../input/loan-eligible-dataset/loan-test.csv')

In [None]:
# Preview the first 10 rows of the training frame.
train_df.head(10)

In [None]:
# Preview the first 10 rows of the test frame.
test_df.head(10)

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# Column dtypes and non-null counts of the raw training frame.
train_df.info()

In [None]:
# Summary statistics for the training frame.
train_df.describe()

In [None]:
# Percentage of missing values per column in the raw training frame.
train_df.isnull().sum().mul(100).div(len(train_df))

In [None]:
# Frequency of each Gender value in the training frame.
train_df['Gender'].value_counts()

Columns with null values:
1. LoanAmount, Loan_Amount_Term, Credit_History: float
2. Gender, Married, Dependents, Self_Employed: object

## Null value Treatment and Imputation

In [None]:
# Frequency of each Married value; used to pick the fill value for missing entries.
train_df['Married'].value_counts()

In [None]:
# Frequency of each Dependents value; used to pick the fill value for missing entries.
train_df['Dependents'].value_counts()

In [None]:
# Frequency of each Self_Employed value; used to pick the fill value for missing entries.
train_df['Self_Employed'].value_counts()

In [None]:
# Fill missing categorical entries with fixed values
# (presumably the most frequent category from the value_counts above; verify).
categorical_fill_values = {'Married': 'Yes', 'Dependents': '0', 'Self_Employed': 'No'}
for column_name, fill_value in categorical_fill_values.items():
    train_df[column_name] = train_df[column_name].fillna(fill_value)

In [None]:
# Frequency of each Gender value before filling its missing entries.
train_df['Gender'].value_counts()

In [None]:
# Fill missing Gender entries with the fixed value 'Male'.
train_df['Gender'] = train_df['Gender'].fillna('Male')

In [None]:
# Percentage of missing values per column after the categorical fills.
train_df.isnull().sum().mul(100).div(len(train_df))

In [None]:
# Replace missing values in each numeric column with that column's median.
for numeric_col in ('LoanAmount', 'Loan_Amount_Term', 'Credit_History'):
    column_median = train_df[numeric_col].median()
    train_df[numeric_col] = train_df[numeric_col].fillna(column_median)

In [None]:
# Percentage of missing values per column after all imputation steps.
train_df.isnull().sum().mul(100).div(len(train_df))

## Exploratory Data Analysis and Data Visualization

In [None]:
# Column dtypes and non-null counts after imputation.
train_df.info()

In [None]:
# Column names available for plotting.
train_df.columns

### Categorical Column Plot and Visualization

In [None]:
def plot_count(var_list):
    """Draw one count plot per column of ``train_df`` on a 4x4 subplot grid.

    Parameters
    ----------
    var_list : list of str
        Names of ``train_df`` columns to plot. The grid has 16 slots, so at
        most 16 names can be drawn in one figure.
    """
    plt.figure(figsize=(30,30))
    # enumerate() gives each entry its own slot; var_list.index(var) sent
    # repeated names to the same subplot and rescanned the list every pass.
    for position, var in enumerate(var_list, start=1):
        plt.subplot(4,4,position)
        # Pass the column by keyword: seaborn >= 0.12 takes `data` as the first
        # positional argument, so a positional column plus data= raises TypeError.
        sns.countplot(x=var, data=train_df)
    plt.show()

In [None]:
# Count plots for the categorical columns and the Loan_Status target.
plot_count(['Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed','Property_Area', 'Loan_Status'])

In [None]:
plt.figure(figsize = (10,10))
# train_df still holds object columns here; pandas >= 2.0 raises on them in
# corr(), so restrict the correlation matrix to numeric columns explicitly.
sns.heatmap(train_df.select_dtypes(include='number').corr(), annot = True)

In [None]:
# Draw seaborn's pairwise relationship grid for train_df.
sns.pairplot(train_df)

In [None]:
# Gender counts split by marital status. The column is passed as x= because
# seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x = 'Gender', data = train_df, hue = 'Married')

In [None]:
def plot_count1(var_list):
    """Draw one count plot per column of ``train_df``, split by ``Loan_Status``.

    Parameters
    ----------
    var_list : list of str
        Names of ``train_df`` columns to plot. The 3x2 grid has 6 slots, so at
        most 6 names can be drawn in one figure.
    """
    plt.figure(figsize=(30,30))
    # enumerate() gives each entry its own slot; var_list.index(var) sent
    # repeated names to the same subplot and rescanned the list every pass.
    for position, var in enumerate(var_list, start=1):
        plt.subplot(3,2,position)
        # Pass the column by keyword: seaborn >= 0.12 takes `data` as the first
        # positional argument, so a positional column plus data= raises TypeError.
        sns.countplot(x=var, data=train_df, hue='Loan_Status')
    plt.show()

In [None]:
# Count plots of each categorical column, split by Loan_Status.
plot_count1(['Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed','Property_Area'])

In [None]:
# Encode the target as integers (N -> 0, Y -> 1) and assign the result back.
# replace(..., inplace=True) on the attribute-accessed Series is chained
# assignment: it is not guaranteed to update train_df and is a no-op under
# pandas copy-on-write. Run this cell once; a second run maps 0/1 to NaN.
train_df['Loan_Status'] = train_df['Loan_Status'].map({'N': 0, 'Y': 1})

In [None]:
# Check the first rows after encoding Loan_Status.
train_df.head()

In [None]:
plt.figure(figsize = (10,10))
# The categorical columns are still object dtype here; pandas >= 2.0 raises on
# them in corr(), so restrict the correlation matrix to numeric columns.
sns.heatmap(train_df.select_dtypes(include='number').corr(), annot = True)

Pairs worth noting in the correlation heatmap above:

- CoApplicantIncome and ApplicantIncome
- LoanAmount and ApplicantIncome
- CoApplicantIncome and LoanAmount
- Credit_History and Loan_Status

In [None]:
# Column names before one-hot encoding the categorical features.
train_df.columns

In [None]:
# Select the categorical feature columns that will be one-hot encoded.
df_cat = train_df[['Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed','Property_Area']]

In [None]:
# Dtypes and non-null counts of the selected categorical columns.
df_cat.info()

In [None]:
# One-hot encode the categorical columns (one indicator column per category).
df_cat_dummies = pd.get_dummies(df_cat)

In [None]:
# Preview the indicator columns instead of dumping the whole frame.
df_cat_dummies.head()

In [None]:
# Append the indicator columns to the training frame (column-wise concat).
# NOTE: rebinds train_df, so re-running this cell appends the columns again.
train_df = pd.concat([train_df, df_cat_dummies], axis = 1)
train_df.head()

In [None]:
# Remove the identifier and the original categorical columns, now that the
# indicator columns replace them.
encoded_source_columns = ['Loan_ID','Gender', 'Married', 'Dependents', 'Education','Self_Employed','Property_Area']
train_df = train_df.drop(columns=encoded_source_columns)

In [None]:
# Preview the fully encoded training frame instead of dumping the whole frame.
train_df.head()

In [None]:
# Correlation heatmap over every column of the encoded training frame.
fig, ax = plt.subplots(figsize=(20, 10))
correlation_matrix = train_df.corr()
sns.heatmap(correlation_matrix, annot=True, ax=ax)

## Data Preprocessing and Modelling

In [None]:
# Separate the target column from the feature matrix.
y = train_df['Loan_Status']
X = train_df.drop(columns=['Loan_Status'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; random_state fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 142)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler


# Fit the scaler on the training split only and reuse it for the held-out
# split, so no statistics from X_test influence the scaling.
# NOTE: X_train / X_test are rebound from DataFrames to NumPy arrays here;
# use X.columns when feature names are needed later.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
# plot_roc_curve was removed in scikit-learn 1.2, which made this import (and
# therefore the whole cell) fail; RocCurveDisplay is the supported replacement.
from sklearn.metrics import  mean_squared_error, r2_score, RocCurveDisplay
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report

# Fit a default-parameter logistic regression on the scaled training split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Evaluate on the held-out split.
lr_pred = logreg.predict(X_test)
print("Accuracy {}".format(metrics.accuracy_score(y_test, lr_pred)))
print("Recall/Sensitivity {}".format(metrics.recall_score(y_test, lr_pred)))
print(confusion_matrix(y_test, lr_pred))
print(classification_report(y_test, lr_pred))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the fitted model, its metrics and its feature importances
# are reproducible across runs (it was previously unseeded).
rf_model = RandomForestClassifier(random_state=142)

rf_model.fit(X_train, y_train)

# Evaluate on the held-out split.
rf_pred = rf_model.predict(X_test)
print("Accuracy {}".format(metrics.accuracy_score(y_test, rf_pred)))
print("Recall/Sensitivity {}".format(metrics.recall_score(y_test, rf_pred)))
print(confusion_matrix(y_test,rf_pred))
print(classification_report(y_test, rf_pred))

### SVM

In [None]:
from sklearn.svm import SVC

# Train a default-parameter support vector classifier on the scaled training split.
svm = SVC()
svm.fit(X_train, y_train)

# Score the held-out split and report the same metrics as the other models.
svm_pred = svm.predict(X_test)
svm_accuracy = metrics.accuracy_score(y_test, svm_pred)
svm_recall = metrics.recall_score(y_test, svm_pred)

print("Accuracy {}".format(svm_accuracy))
print("Recall/Sensitivity {}".format(svm_recall))
print(confusion_matrix(y_test, svm_pred))
print(classification_report(y_test, svm_pred))

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based tree capped at 10 leaf nodes, with a fixed seed.
dt_clf = DecisionTreeClassifier(
    criterion='entropy',
    max_leaf_nodes=10,
    random_state=30,
)
dt_clf.fit(X_train, y_train)

# Score the held-out split and report the same metrics as the other models.
dt_pred = dt_clf.predict(X_test)
dt_accuracy = metrics.accuracy_score(y_test, dt_pred)
dt_recall = metrics.recall_score(y_test, dt_pred)

print("Accuracy {}".format(dt_accuracy))
print("Recall/Sensitivity {}".format(dt_recall))
print(confusion_matrix(y_test, dt_pred))
print(classification_report(y_test, dt_pred))

### Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Seed the booster so the fitted model and its metrics are reproducible
# across runs (it was previously unseeded).
gbc = GradientBoostingClassifier(random_state=142)
gbc.fit(X_train, y_train)

# Evaluate on the held-out split.
gbc_pred = gbc.predict(X_test)

print("Accuracy {}".format(metrics.accuracy_score(y_test, gbc_pred)))
print("Recall/Sensitivity {}".format(metrics.recall_score(y_test, gbc_pred)))
print(confusion_matrix(y_test, gbc_pred))
print(classification_report(y_test, gbc_pred))

Accuracy scores on the held-out split (from one recorded run; re-running may give slightly different numbers):
- Logistic Regression: 81.62%
- Random Forest: 81.08%
- Decision Tree Classifier: 80.54%
- SVM(Support Vector Machine): 81.62%
- Gradient Boosting: 77.83%

In [None]:
# Rank the features by the random forest's importance scores, highest first.
importance_table = pd.DataFrame({
    'variable': X.columns,
    'importance': rf_model.feature_importances_,
})
importance_table.sort_values(by='importance', ascending=False)

### Apply the same preprocessing and the trained models to the test set (loan-test.csv)

In [None]:
# Preview the first rows of the raw test frame.
test_df.head()

In [None]:
# Percentage of missing values per column in the raw test frame.
test_df.isnull().sum().mul(100).div(len(test_df))

In [None]:
# Frequency of each Gender value in the test frame.
test_df['Gender'].value_counts()

In [None]:
# Fill missing categorical entries with the same fixed values used for the
# training frame. 'Married' is included for consistency with the training
# imputation: a missing value there would otherwise become an all-zero pair of
# Married_* indicator columns after get_dummies.
test_df['Gender'] = test_df['Gender'].fillna('Male')
test_df['Married'] = test_df['Married'].fillna('Yes')
test_df['Dependents'] = test_df['Dependents'].fillna('0')
test_df['Self_Employed'] = test_df['Self_Employed'].fillna('No')

In [None]:
# Impute the test frame with medians learned from the TRAINING frame, so the
# test data goes through exactly the preprocessing the models were trained
# with and no statistic is derived from the test set itself. (Filling
# train_df's gaps with its own median earlier did not change that median.)
for numeric_col in ('LoanAmount', 'Loan_Amount_Term', 'Credit_History'):
    test_df[numeric_col] = test_df[numeric_col].fillna(train_df[numeric_col].median())

In [None]:
# Percentage of missing values per column in the test frame after imputation.
test_df.isnull().sum().mul(100).div(len(test_df))

In [None]:
# Column names of the test frame before one-hot encoding.
test_df.columns

In [None]:
# Column dtypes and non-null counts of the test frame after imputation.
test_df.info()

In [None]:
# Select the same categorical columns that were one-hot encoded for training.
test_cat = test_df[['Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed','Property_Area']]

In [None]:
# One-hot encode the test categoricals.
# NOTE(review): encoded independently of the training frame, so the resulting
# columns match the training columns only if both files contain the same
# categories; confirm before predicting.
test_cat_dummies = pd.get_dummies(test_cat)

In [None]:
# Append the indicator columns to the test frame (column-wise concat).
# NOTE: rebinds test_df, so re-running this cell appends the columns again.
test_df = pd.concat([test_df, test_cat_dummies], axis = 1)
test_df.head()

In [None]:
# Remove the identifier and the original categorical columns, now that the
# indicator columns replace them.
encoded_source_columns = ['Loan_ID','Gender', 'Married', 'Dependents', 'Education','Self_Employed','Property_Area']
test_df = test_df.drop(columns=encoded_source_columns)

#### Logistic Regression

In [None]:
# The model was fitted on MinMax-scaled arrays, so the test frame must be
# aligned to the training feature columns (same names and order, missing
# indicator columns filled with 0) and passed through the fitted scaler first.
test_lr_predict = logreg.predict(
    scaler.transform(test_df.reindex(columns=X.columns, fill_value=0))
)
test_lr_predict

#### Random Forest

In [None]:
# The model was fitted on MinMax-scaled arrays, so the test frame must be
# aligned to the training feature columns (same names and order, missing
# indicator columns filled with 0) and passed through the fitted scaler first.
test_rf_predict = rf_model.predict(
    scaler.transform(test_df.reindex(columns=X.columns, fill_value=0))
)
test_rf_predict

#### SVM

In [None]:
# The model was fitted on MinMax-scaled arrays, so the test frame must be
# aligned to the training feature columns (same names and order, missing
# indicator columns filled with 0) and passed through the fitted scaler first.
test_sv_predict = svm.predict(
    scaler.transform(test_df.reindex(columns=X.columns, fill_value=0))
)
test_sv_predict

#### Decision Tree

In [None]:
# The tree's split thresholds were learned on MinMax-scaled arrays, so the
# test frame must be aligned to the training feature columns (same names and
# order, missing indicator columns filled with 0) and scaled the same way.
test_dt_predict = dt_clf.predict(
    scaler.transform(test_df.reindex(columns=X.columns, fill_value=0))
)
test_dt_predict

#### Gradient Boosting Classifier

In [None]:
# The model's split thresholds were learned on MinMax-scaled arrays, so the
# test frame must be aligned to the training feature columns (same names and
# order, missing indicator columns filled with 0) and scaled the same way.
test_gbc_predict = gbc.predict(
    scaler.transform(test_df.reindex(columns=X.columns, fill_value=0))
)
test_gbc_predict