In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file beneath the Kaggle input directory and print its full path.
input_root = '/kaggle/input'
for folder, _subfolders, names in os.walk(input_root):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Let's begin by reading our data so we can see what it looks like and how much cleaning it requires.
# The dataset directory is kept in one constant so it only has to be changed in a single place.
DATA_DIR = "/kaggle/input/loan-prediction-based-on-customer-behavior"

sample_prediction = pd.read_csv(f"{DATA_DIR}/Sample Prediction Dataset.csv")
training_data = pd.read_csv(f"{DATA_DIR}/Training Data.csv")
test_data = pd.read_csv(f"{DATA_DIR}/Test Data.csv")

In [None]:
# Preview the first rows of each loaded frame, one after another.
for frame in (sample_prediction, training_data, test_data):
    print(frame.head())

In [None]:
# Display the (rows, columns) tuple of the training frame.
training_data.shape

In [None]:
# Let's see what our data has for us: display the summary statistics produced by describe().
training_data.describe()

Let's check whether the data has any missing values, so that we can fix them before they cause mistakes in the analysis.

In [None]:
# Count the missing entries in each column of the training frame.
training_data.isna().sum()

No missing entries means we're good to go with any analysis

## Preprocessing

let's find pearson correlation of the dataframe to check how strongly the values are related with each other.

In [None]:
# Pearson correlation between the numeric columns only.
# The raw training frame still contains text columns (they are label-encoded in a later cell),
# and recent pandas versions raise an error when corr() is asked to correlate non-numeric data,
# so the numeric columns are selected explicitly first.
correlation = training_data.select_dtypes(include="number").corr()
correlation

Since most models work best with numeric rather than categorical data, let's convert the categorical columns to numeric values.

In [None]:
# Convert the categorical columns to integer codes so that the correlation (and later the models)
# can use every column, unlike the numeric-only correlation above.
from sklearn.preprocessing import LabelEncoder
col = ["Profession","CITY", "STATE","Married/Single", "House_Ownership", "Car_Ownership"]
train_data_labelled = training_data.copy()
test_data_labelled = test_data.copy()
for item in col:
    if(train_data_labelled[item].dtype=='object' and test_data_labelled[item].dtype=='object'):
        # Fill missing categories with the placeholder 'N' in this column only; filling the
        # whole test frame would also write the string 'N' into numeric columns.
        train_values = train_data_labelled[item].fillna('N')
        test_values = test_data_labelled[item].fillna('N')

        # Fit ONE encoder on the categories seen in either frame and reuse it for both, so a
        # given category maps to the same integer in train and test. Fitting a second time on
        # the test data alone would assign codes independently and silently change the meaning
        # of the feature between training and prediction.
        lbl = LabelEncoder()
        lbl.fit(pd.concat([train_values, test_values]).values)
        train_data_labelled[item] = lbl.transform(train_values.values)
        test_data_labelled[item] = lbl.transform(test_values.values)

train_data_labelled.head()

In [None]:
# Correlation matrix of the label-encoded training frame, with the (default) Pearson method spelled out.
train_corr = train_data_labelled.corr(method="pearson")
train_corr

In [None]:
# Let's visualize it: draw the histograms of the encoded training frame.
# pyplot is imported here because no earlier cell brings `plt` into scope,
# so on a fresh kernel plt.show() would otherwise raise NameError.
import matplotlib.pyplot as plt

train_data_labelled.hist(figsize=(13,13))
plt.show()

In [None]:
# Visualize pairwise relationships with a seaborn pairplot.
# seaborn is imported here because no earlier cell brings `sns` into scope,
# so on a fresh kernel sns.pairplot() would otherwise raise NameError.
import seaborn as sns

# Columns left out of the plot: the identifier, the target, and the CITY / STATE codes.
rem = ['Id','Risk_Flag','CITY','STATE']
cols = [item for item in train_data_labelled.columns.to_list() if item not in rem]
sns.pairplot(train_data_labelled[cols])

The pairplot shows each column plotted against every other column. Some pairs fall roughly along a straight line, indicating a positive linear correlation, such as experience and current job years.

## Visualization Through GML SWEETVIZ

In [None]:
pip install sweetviz

In [None]:
import sweetviz as sv

In [None]:
# A single call to sweetviz's analyze() builds the report object for the encoded training frame;
# show_html() is then called with the output file name 'train_data_labelled.html'.
report = sv.analyze(train_data_labelled)
report.show_html('train_data_labelled.html')

The visualization from GML provides a direct way to find relations between all columns, both numerical and categorical.

In [None]:
# Compare two slices of the encoded training frame with sweetviz: the rows from position 1000
# onward against the first 1000 rows. show_html() is then called with the file name 'compare.html'.
comparision = sv.compare(train_data_labelled[1000:], train_data_labelled[:1000])
comparision.show_html('compare.html')

## **Modeling, Training, Splitting and Evaluating**

we'll perform our analysis through different Machine Learning Models and see which one works best in terms of accuracy

### Ridge Regression

In [None]:
# Feature columns: every column of the encoded training frame except the target 'Risk_Flag'.
# NOTE(review): this keeps 'Id' as a feature; it looks like a row identifier - confirm whether
# it should be excluded from the model inputs.
col = train_data_labelled.columns.to_list()
col.remove('Risk_Flag')

x_train= train_data_labelled[col]
y_train= train_data_labelled['Risk_Flag']

# The identifier column is named 'ID' in the test data but 'Id' in the training data; align them.
test_data_labelled = test_data_labelled.rename(columns={'ID':'Id'})

# Select the test features through the training column list so both frames are guaranteed to
# carry exactly the same features in the same order (a missing column fails loudly here
# instead of at predict time).
x_test = test_data_labelled[col]
# NOTE(review): y_test comes from the "Sample Prediction Dataset" file - confirm that it holds
# real labels for the test rows; if it is only a submission template, every score computed
# against it below is meaningless and a hold-out split of the training data should be used.
y_test = sample_prediction['risk_flag']
print("number of test samples:", x_test.shape[0])
print("number of training samples:",x_train.shape[0])

In [None]:
from sklearn.linear_model import Ridge

# Build the ridge model (alpha=0.1), fit it on the training features/target in one chained
# call, then produce continuous predictions for the test features.
ridge = Ridge(alpha=0.1).fit(x_train, y_train)
pred = ridge.predict(x_test)

**Evaluate based on ROC_AUC**

Since the ridge regression output is a continuous value rather than a binary label, we can't use accuracy_score directly; instead we'll use the ROC AUC score.

In [None]:
from sklearn.metrics import roc_auc_score

# Score the continuous ridge predictions against the reference labels with ROC AUC.
auc = roc_auc_score(y_true=y_test, y_score=pred)
print('The accuracy score based on roc_auc_score is :',auc)

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based splitting with a fixed seed, gathered in one place before building the tree.
tree_params = {'criterion': 'entropy', 'random_state': 0}
DTClassifier = DecisionTreeClassifier(**tree_params)


In [None]:
# Fit the decision tree on the training data and predict class labels for the test features.
# fit() returns the fitted estimator, so the two steps are chained.
y_pred = DTClassifier.fit(x_train, y_train).predict(x_test)

In [None]:
# Evaluate based on roc_auc_score.
# ROC AUC ranks samples by a continuous score, so it is given the predicted probability of the
# positive class rather than hard 0/1 labels (hard labels collapse the ROC curve to a single
# threshold). Column 1 of predict_proba is used - this assumes Risk_Flag is a binary 0/1 target.
tree_scores = DTClassifier.predict_proba(x_test)[:, 1]
print("The accuracy score based on roc_auc_score is : ", roc_auc_score(y_test, tree_scores))

As we can see that score is better, let's try with RFC then we will compare all three.

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 500-tree random forest. The seed is fixed (matching the decision tree's random_state=0)
# so that re-running the notebook reproduces the same model and the same score.
rc = RandomForestClassifier(n_estimators=500, random_state=0)
rc.fit(x_train, y_train)

In [None]:
# Predict class labels for the test features with the fitted random forest.
y_pred = rc.predict(x_test)

In [None]:
# Evaluate based on roc_auc_score.
# ROC AUC ranks samples by a continuous score, so it is given the forest's predicted probability
# of the positive class rather than hard 0/1 labels. Column 1 of predict_proba is used - this
# assumes Risk_Flag is a binary 0/1 target.
forest_scores = rc.predict_proba(x_test)[:, 1]
print("The accuracy score based on roc_auc_score is : ", roc_auc_score(y_test, forest_scores))

## Comparison with voting classifiers

We also add Logistic Regression to the list of estimators to check whether it scores higher than the models we used earlier. Note that the cell below does not add a Support Vector Classifier.

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

# Seed the stochastic estimators so the comparison is reproducible between runs.
rmd_clf = RandomForestClassifier(random_state=0)
DTC_clf = DecisionTreeClassifier(random_state=0)
log_clf = LogisticRegression()

voting_clf = VotingClassifier(estimators = [('lr',log_clf),('rf',rmd_clf),('dc',DTC_clf)], voting="hard")

# Score each individual model AND the voting ensemble; without voting_clf in this list the
# ensemble would be constructed but never fitted or evaluated.
for clf in [log_clf,rmd_clf,DTC_clf,voting_clf]:
    clf.fit(x_train, y_train)
    if hasattr(clf, "predict_proba"):
        # ROC AUC needs a continuous score: use the positive-class probability
        # (column 1 - assumes Risk_Flag is a binary 0/1 target).
        y_pred = clf.predict_proba(x_test)[:, 1]
    else:
        # A hard-voting ensemble exposes no probabilities, so fall back to its class labels.
        y_pred = clf.predict(x_test)
    print(clf.__class__.__name__,roc_auc_score(y_test, y_pred))

# Conclusion
We can see that the Logistic Regression has the highest accuracy score so we can use that model for our prediction analysis.