### 1. Import Libraries 

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

### 2. Import Dataset

In [None]:
# Load the training CSV and display it.
train_csv = '../input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv'
df_train = pd.read_csv(train_csv)
df_train

In [None]:
# Load the test CSV (the rows to predict on later) and display it.
test_csv = '../input/loan-prediction-problem-dataset/test_Y3wMUE5_7gLdaTN.csv'
df_test = pd.read_csv(test_csv)
df_test

### 3. Check the information of the Dataset

In [None]:
# Print the column dtypes and non-null counts of the training frame.
df_train.info()

In [None]:
# Print the column dtypes and non-null counts of the test frame.
df_test.info()

From the result, we can see that there are several types of data. For prediction purpose, we wish to only use the **numerical** types of data, so we need to change the `object type` data into `numerical type`. Also, we can see that the number of data (rows) for each column is different. For `df_train` the rows for each column should have been 614 an for `df_test` is 367.

This following event indicates that there are **missing values**

In [None]:
# Summary statistics of the training frame.
df_train.describe()

Knowing the statistical value of the data help us to know further to data transformation and analysis.

In [None]:
# For every object-dtype column, print its distinct values followed by
# how often each value occurs, with a separator line between columns.
object_columns = [name for name in df_train.columns if df_train[name].dtype == object]
for name in object_columns:
    values = df_train[name]
    print(f"{name!s} : {values.unique()!s}")
    print(values.value_counts())
    print('_____________________________________________________________________')

As we can see from the result above, IDs are different for each costumer. So, it's definitely not a predictor variable. You can drop it or just leave it as it is.

Also we can see for some columns there are `nan` or zero value. Let's check it out more

In [None]:
# Number of missing entries in each column of the training frame.
df_train.isnull().sum()

In [None]:
# Number of missing entries in each column of the test frame.
df_test.isnull().sum()

There are missing values in both dataset. For the train dataset, we should transform the **missing values** into some values, but for the test dataset I choose to drop **it** instead.

### 4. Handling Missing Values

#### 4.1 Data Train

* Missing Values of `Gender`, `Married`, `Self_Employed`
    
    using Mode

In [None]:
# Impute the categorical columns with their most frequent value.
# Series.mode() returns a Series (several values can tie), and fillna() with a
# Series aligns on the index, so passing it directly fills at most the row
# labelled 0 and leaves every other NaN untouched. Use the first mode as a
# scalar so that every missing entry is filled.
for column in ['Gender', 'Married', 'Self_Employed']:
    most_frequent = df_train[column].mode().iloc[0]
    df_train[column] = df_train[column].fillna(most_frequent)

* Missing Values of `LoanAmount`

    using Mean

In [None]:
# Replace missing loan amounts with the column mean.
loan_amount_mean = df_train['LoanAmount'].mean()
df_train['LoanAmount'] = df_train['LoanAmount'].fillna(value=loan_amount_mean)

* Missing Values of `Credit_History`
    
    using Median

In [None]:
# Replace missing credit-history flags with the column median.
credit_history_median = df_train['Credit_History'].median()
df_train['Credit_History'] = df_train['Credit_History'].fillna(value=credit_history_median)

* Missing Values of `Dependents` and `Loan_Amount_Term`

    Drop Missing Values
   

In [None]:
# Remove, in place, every training row that still has a missing value in any column.
df_train.dropna(axis=0, how='any', inplace=True)

#### 4.2 Data Test

Drop Missing Values

In [None]:
# Remove, in place, every test row that has a missing value in any column.
df_test.dropna(axis=0, how='any', inplace=True)

#### 4.3 Drop `Loan_ID`

In [None]:
# Loan_ID is a per-row identifier, not a feature; remove the column in place.
df_train.drop(columns=['Loan_ID'], inplace=True)

In [None]:
# Remove the Loan_ID column from the test frame in place.
df_test.drop(columns=['Loan_ID'], inplace=True)

**Check!**

In [None]:
# Re-count missing entries per training column after imputation and row removal.
df_train.isnull().sum()

In [None]:
# Re-count missing entries per test column after row removal.
df_test.isnull().sum()

### 5. Object type data mapping

#### 5.1 Data Train

In [None]:
# Integer codes for each categorical training column, including the target.
train_encodings = {
    'Gender': {'Male': 1, 'Female': 0},
    'Married': {'Yes': 1, 'No': 0},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed': {'Yes': 1, 'No': 0},
    'Dependents': {'0': 0, '1': 1, '2': 2, '3+': 3},
    'Property_Area': {'Urban': 1, 'Rural': 0, 'Semiurban': 2},
    'Loan_Status': {'Y': 1, 'N': 0},
}

# Replace every listed column with its mapped integer codes.
for column, codes in train_encodings.items():
    df_train[column] = df_train[column].map(codes)

In [None]:
# Integer codes for each categorical test column.
test_encodings = {
    'Gender': {'Male': 1, 'Female': 0},
    'Married': {'Yes': 1, 'No': 0},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed': {'Yes': 1, 'No': 0},
    'Dependents': {'0': 0, '1': 1, '2': 2, '3+': 3},
    'Property_Area': {'Urban': 1, 'Rural': 0, 'Semiurban': 2},
}

# Replace every listed column with its mapped integer codes.
for column, codes in test_encodings.items():
    df_test[column] = df_test[column].map(codes)

In [None]:
# Show the dtype of every training column after the encoding step.
df_train.dtypes

In [None]:
# Show the dtype of every test column after the encoding step.
df_test.dtypes

### 6. Data Visualization

In [None]:
# One count plot per encoded categorical feature, split by loan outcome.
# Columns are passed by name together with `data=`: recent seaborn releases
# treat the first positional argument as `data`, not `x`, so passing a Series
# positionally no longer draws the intended plot.
fig, axes = plt.subplots(2, 3, figsize=(20,10))

fig.suptitle('Each Column Value Counts')

categorical_columns = [
    'Gender', 'Married', 'Education',
    'Self_Employed', 'Dependents', 'Property_Area',
]
# axes.flat walks the 2x3 grid row by row, matching the order listed above.
for ax, column in zip(axes.flat, categorical_columns):
    sns.countplot(x=column, hue='Loan_Status', data=df_train, ax=ax, palette='rainbow')

### 7. Check Correlation

In [None]:
# Pairwise correlation between the columns of the training frame.
corr = df_train.corr()
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format(precision=...) is its replacement for rounding the display.
corr.style.background_gradient(cmap='gist_earth_r').format(precision=2)

from the result shown above,, we can see that `Credit_History` have a quite high positive correlation number (0.52). That means, our target (`Loan_Status`) is highly dependant with `Credit_History`

### 8. Random Forest Classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

#### 8.1 Train and Test Data Split

In [None]:
# Separate the target column from the predictor columns.
y = df_train['Loan_Status']
X = df_train.drop(columns=['Loan_Status'])

# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

#### 8.2 Train and Validate Model using Random Forest Classifier



In [None]:
# Fit a random forest on the training split. A fixed random_state makes the
# fitted model, and therefore the metrics printed below, reproducible
# across notebook runs.
RF = RandomForestClassifier(random_state=0)
RF.fit(X_train, y_train)

# Predict labels for the held-out validation split.
y_predict = RF.predict(X_test)


# Classification report
print(classification_report(y_test, y_predict))

# Accuracy score (scikit-learn metrics take the true labels first)
RF_score = accuracy_score(y_test, y_predict)
print(f"Accurate {round(RF_score*100,2)}%")

In [None]:
# Side-by-side view of the true validation labels and the model's predictions.
validation_columns = {'y_test': y_test, 'prediction': y_predict}
Loan_Status_Validation = pd.DataFrame(validation_columns)
Loan_Status_Validation

### 9. Make Prediction

In [None]:
# Predict a loan status for every remaining row of the test frame.
y_test_predict = RF.predict(X=df_test)

In [None]:
# Wrap the test-set predictions in a single-column frame for display.
prediction_columns = {'y_test_predict': y_test_predict}
Loan_Status_Prediction = pd.DataFrame(prediction_columns)
Loan_Status_Prediction

**Done!**