# **Import Libraries**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import os
# Print the full path of every file found under the Kaggle input directory
for folder, _subdirs, names in os.walk('/kaggle/input'):
    paths = [os.path.join(folder, name) for name in names]
    if paths:
        print(*paths, sep='\n')

In [None]:
# Notebook-wide pandas display settings: output width and number of printed columns
pd.options.display.width = 100
pd.options.display.max_columns = 20

# Seaborn theme applied to every figure drawn below
sns.set_theme(style='darkgrid', palette='deep',
              font='sans-serif', color_codes=True)

# **Load Data Train**

In [None]:
# Read the training CSV into a DataFrame; the path is kept in one named
# constant so it is easy to find and change
TRAIN_CSV_PATH = '../input/loan-prediction-based-on-customer-behavior/Training Data.csv'
trainData = pd.read_csv(TRAIN_CSV_PATH)

# **Check and Clean Data Train**

In [None]:
# Preview the first five rows of the training data
trainData.head(n=5)

In [None]:
# Preview the last five rows of the training data
trainData.tail(n=5)

In [None]:
# Number of rows and columns in the training data, as a (rows, columns) tuple
trainData.shape

In [None]:
# Count the missing values in each column
trainData.isna().sum()

In [None]:
# Summarise the columns: dtype, non-null count and memory usage
trainData.info()

In [None]:
# Class distribution of the target column
trainData['Risk_Flag'].value_counts()

# **Encode**

The data contains categorical (text) columns, so they must be encoded as numbers before modelling. `LabelEncoder` is used here.

In [None]:
# Encoder that maps each distinct label of a column to an integer code
labelEncoder = LabelEncoder()

In [None]:
# Work on a copy so the encoding does not mutate the loaded frame in place
# (a plain `data = trainData` would only add a second name for the same object).
data = trainData.copy()

# One fitted encoder per categorical column. Keeping them lets the integer
# codes be mapped back to the original labels with `inverse_transform`;
# a single shared encoder would only remember the last column it was fit on.
labelEncoders = {}

# Replace every object-typed (string) column with integer codes
for column in data.select_dtypes(include='object').columns:
    labelEncoders[column] = LabelEncoder()
    data[column] = labelEncoders[column].fit_transform(data[column])

# Rebind once, after all columns are encoded, so the following cells see the
# encoded frame under the name they already use
trainData = data

In [None]:
# Preview the first five rows after encoding
trainData.head(n=5)

In [None]:
# Re-check the column dtypes and non-null counts after the encoding step
trainData.info()

# **EDA**

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each column
trainData.describe()

In [None]:
# Pairwise Pearson correlation between all columns: the strength and direction
# of each linear relationship
corr = trainData.corr(method='pearson')
corr

In [None]:
# Draw the correlation matrix as an annotated heatmap.
# An explicit Axes is used so the size, heatmap and title all target the same figure.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, cbar=True, square=True, fmt='.1f', annot=True,
            annot_kws={'size': 8}, cmap='YlGnBu', ax=ax)
ax.set_title('Correlation matrix')
# plt.show() renders the figure; plt.plot() with no arguments draws nothing
# and only echoes an empty list as the cell output.
plt.show()

In [None]:
# Correlation of every column with the target, strongest positive first
corr.loc[:, 'Risk_Flag'].sort_values(ascending=False)

Id - City Ownership have a better correlation.

In [None]:
# Plot one histogram per column to inspect each distribution
histFigsize = (12, 12)
trainData.hist(figsize=histFigsize)
plt.show()

In [None]:
# Show 11 reproducibly sampled rows, transposed so each column reads as a row
trainData.sample(n=11, random_state=1).transpose()

**Coefficient of Variation**

The coefficient of variation (the standard deviation divided by the mean) is a measure of relative dispersion that can be used to compare data distributions that have different units.

* **A higher coefficient of variation** = the data are spread more widely relative to the mean (more difficult to predict).
* **A lower coefficient of variation** = the data are concentrated more tightly around the mean (easier to predict).

In [None]:
# Coefficient of variation of the target: standard deviation as a percentage of the mean
riskFlag = trainData['Risk_Flag']
covRiskFlag = (riskFlag.std() / riskFlag.mean()) * 100
# The label names the column the value is computed from (it previously said "Potability")
print(f'Coefficient Of Variation Risk_Flag : {covRiskFlag}%')

As the output above shows, the **coefficient of variation is very high**, which means the target is **difficult to predict**.

# **Divide and Split Data**

In [None]:
# Separate the predictors from the target.
# 'Id' is dropped along with the target so it is not used as a feature.
# NOTE(review): 'Id' is presumed to be a row identifier with no predictive
# meaning (SMOTE would also interpolate it) - confirm against the dataset.
# errors='ignore' keeps the cell working if the frame has no 'Id' column.
dataX = trainData.drop(columns=['Risk_Flag', 'Id'], errors='ignore')
dataY = trainData['Risk_Flag']

In [None]:
# Hold out 20% of the rows for evaluation.
# stratify=dataY keeps the Risk_Flag class ratio the same in both parts, so the
# minority class is not over- or under-represented in the test set by chance.
trainX, testX, trainY, testY = train_test_split(dataX, dataY,
                                                test_size=.2,
                                                random_state=12,
                                                stratify=dataY)

In [None]:
# Count how often each distinct row occurs in the training features
# (on a DataFrame, value_counts() counts unique rows, not single values)
trainX.value_counts()

In [None]:
# Count how often each distinct row occurs in the test features
# (on a DataFrame, value_counts() counts unique rows, not single values)
testX.value_counts()

In [None]:
# Class distribution of the target in the training split
trainY.value_counts()

In [None]:
# Class distribution of the target in the test split
testY.value_counts()

# **Upsampling Data**

The training data is upsampled with **SMOTE**, because the two Risk_Flag classes **differ hugely in size**.

In [None]:
# Oversample the training split only; the test split is left untouched
sm = SMOTE(random_state=12)
resampled = sm.fit_resample(trainX, trainY)
trainXres, trainYres = resampled

# **Train and Predict**

In [None]:
# Random forest hyper-parameters gathered in one place so they are easy to tune
rfParams = dict(
    n_estimators=500,
    criterion='entropy',
    min_samples_leaf=2,
    random_state=12,
)
model = RandomForestClassifier(**rfParams)

In [None]:
# Train the forest on the SMOTE-resampled training data
model.fit(X=trainXres, y=trainYres)

In [None]:
# Hard class predictions for the held-out test features
predY = model.predict(X=testX)

In [None]:
# ROC AUC measures how well the model ranks positives above negatives, so it
# needs a continuous score. Passing hard 0/1 predictions collapses the curve to
# a single threshold and misstates the metric.
# Column 1 of predict_proba is the probability of model.classes_[1], the class
# with the greater label, which roc_auc_score treats as the positive class.
probaY = model.predict_proba(testX)[:, 1]
print(roc_auc_score(testY, probaY))