In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Table Of Contents
- ## <a href='#0'>  Dataset Introduction </a> 
- ## <a href='#1'>1. Importing Libraries and Dataset </a> 
- ## <a href='#2'>2. Data </a> 
- ## <a href='#3'>3.Exploratory Data Analysis </a> 
- ### <a href='#4'>3.1 Describe Function  </a> 
- ### <a href='#5'>3.2 Missing Value </a>
- ### <a href='#6'>3.3 Questions </a> 
- ## <a href='#7'>4.Gender  </a> 
- ## <a href='#8'>5.Marriage   </a>
- ## <a href='#9'>6.Education  </a>
- ## <a href='#10'>7.Self Employed  </a>
- ## <a href='#11'>8.Applicant Income </a>
- ## <a href='#12'>9.Location Of The Houses </a>
- ## <a href='#13'>10.Credit History </a>
- ## <a href='#14'>11.Loan Status </a>
- ## <a href='#15'>12.Correlation Matrix </a>
- ## <a href='#16'>13.Dependents  </a> 
- ## <a href='#17'>14.Summary Conclusion </a>
- ## <a href='#18'>15.Machine Learning </a>
- ### <a href='#19'>15.1 Data Preprocessing </a>
- ### <a href='#20'>15.2 Local Outlier Factor </a>
- ### <a href='#21'>15.3 Normalize </a> 
- ## <a href='#22'>16.Logistic Regression </a>
- ## <a href='#23'>17.KNN </a>
- ## <a href='#24'>18.Artificial Neural Networks </a>
- ## <a href='#25'>19.Random Forest </a>
- ## <a href='#26'>20.Gaussian Naive Bayes </a>
- ## <a href='#27'>21.SVC </a>
- ## <a href='#28'>22.Model Comparison </a>
- ## <a href='#29'>23.Conclusion</a>
- ## <a href='#30'>24.End Note</a>

# <a id='0'> Dataset Introduction </a>

![](https://media.giphy.com/media/5xtDarqCp0eomZaFJW8/giphy.gif)

 * Let's say you are the owner of a **Housing Finance Company** and you want to<br> build your own model to assess the customers who apply for a home loan, because the<br> company wants to check and validate whether each customer is eligible for the loan.<br>
 * The company wants to automate the loan eligibility process in real time,<br> based on the details customers provide when they fill in<br> the home loan application form.<br>
 * We will try to build a model using data from loan applications. <br>
 * Let's start <br>
 


# <a id='1'> 1. Importing Libraries and Dataset</a>

In [None]:
import numpy as np 
import pandas as pd
import datetime
import seaborn as sns 
import matplotlib.pyplot as plt
# Plotly libraries
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import warnings
warnings.filterwarnings("ignore")

# <a id='2'> 2.Data </a>

## Data Columns Means
Loan_ID --------------> Unique Loan ID. <br>
Gender --------------> Male/ Female (cinsiyet) <br>
Married --------------> Applicant married (medeni hali) (Y/N) <br>
Dependents ------------> Number of dependents (bakması gereken kişi sayısı) <br>
Education -------------> Applicant Education  (egitim durumu) (Graduate/ Under Graduate) <br>
Self_Employed ---------> Self-employed (kendi işinde çalışan) (Y/N) <br>
ApplicantIncome -------> Applicant income  (basvuru yapanın geliri)<br>
CoapplicantIncome -----> Coapplicant income (basvuru yapanın eşinin geliri) <br>
LoanAmount -----------> Loan amount in thousands (1 bin karsı odeditleri miktar) <br>
Loan_Amount_Term ------> Term of a loan in months ( kaç aylık ) <br>
Credit_History --------> Credit history meets guidelines (kredi gecmişi kosuları saglıyor mu,kara liste  ) <br>
Property_Area ---------> Urban/ Semi-Urban/ Rural (evlerin mekanları) <br>
Loan_Status -----------> Loan approved (Y/N) (kredi onay) <br>

In [None]:
# Read the training CSV from the Kaggle input directory.
loan_data=pd.read_csv("/kaggle/input/loan-eligible-dataset/loan-train.csv")
# Work on a copy so the frame as loaded stays available in `loan_data`.
df_train=loan_data.copy()
# Preview the first rows.
df_train.head()

# <a id='3'> 3.Exploratory Data Analysis</a>


* Exploratory Data Analysis refers to the critical process of performing 
initial investigations on data so as to discover patterns,to spot anomalies, 
to test hypothesis and to check assumptions with
the help of summary statistics and graphical representations. <br>

* Your goal during EDA is to develop an understanding of your data. The easiest way to do this is to use questions as tools to guide your investigation. When you ask a question, the question focuses your attention on a specific part of your dataset and helps you decide which graphs, models, or transformations to make.<br>

* Generate questions about your data.<br>

* Search for answers by visualising, transforming, and modelling your data. <br>

* Use what you learn to refine your questions and/or generate new questions. <br>

* EDA is not a formal process with a strict set of rules.<br> 
* More than anything, EDA  is a state of mind.<br> 
* During the initial phases of EDA you should feel free to investigate every idea that occurs to you.<br> 
* Some of these ideas will pan out, and some will be dead ends.<br>
* As your exploration continues, you will home in on a few particularly productive areas that you’ll eventually write up and communicate to others.<br>

* Let's start exploring our data

In [None]:
# Dataset dimensions as a (rows, columns) tuple
df_train.shape

* The dataset comprises 614 observations and 13 columns.<br>
* One of them (`Loan_Status`) is the dependent variable; the remaining 12 are treated as independent variables. <br>

In [None]:
# Column names of the training frame
df_train.columns

In [None]:
# Data type of each column
df_train.dtypes

In [None]:
# Concise summary: index range, per-column non-null counts, dtypes and memory usage
df_train.info()

* The data contains only float, object and integer columns.<br>
* Some columns have null/missing values. <br>

In [None]:
# Rows that exactly repeat an earlier row (an empty result means no duplicates).
# `duplicated()` already returns a boolean mask, so it can index the frame directly.
df_train[df_train.duplicated()]

* We don't have duplicated data 


# <a id='4'> 3.1 Describe Function</a>

* Generate descriptive statistics.<br>
* Descriptive statistics include those that summarize the central tendency, dispersion and shape of a dataset’s distribution, excluding NaN values.<br>
* The **describe**() function in pandas is very handy
in getting various **summary statistics**.<br>
* This function returns the **count**, **mean**, **standard deviation**,
**minimum** and **maximum** **values** and the **quantiles of the data**.<br>

In [None]:
# Summary statistics of the numeric columns, transposed so each column is one row
df_train.describe().T

# <a id='5'> 3.2 Missing Value</a>

* Missing data is a big topic, I'll try to explain it at another time. <br>
* In this project, I will remove all of the missing data from the data set. <br>

In [None]:
# True if at least one cell anywhere in the frame is null
df_train.isnull().values.any()

In [None]:
# Number of null cells in each column
df_train.isnull().sum()

In [None]:
def missing_data(data):
    """Summarise missing values per column.

    Args:
        data: the DataFrame to inspect.

    Returns:
        A DataFrame indexed by column name with two columns: 'Total' (number
        of null cells) and 'Percent' (null cells as a percentage of all rows),
        listed from the most to the least affected column.
    """
    null_mask = data.isnull()
    total = null_mask.sum().sort_values(ascending=False)
    # count() on the boolean mask equals the number of rows, since the mask has no nulls.
    percent = (null_mask.sum() / null_mask.count() * 100).sort_values(ascending=False)
    return pd.concat({'Total': total, 'Percent': percent}, axis=1)
missing_data(df_train)

In [None]:
# Build the cleaned frame in one chain: drop every row that has a missing
# value, then drop the Loan_ID identifier column. `df_train` is left untouched.
df_train_clean = df_train.dropna().drop(columns=['Loan_ID'])
df_train_clean.info()

In [None]:
# Distinct values of every object-dtype column, keyed by column name
{
    column: list(df_train_clean[column].unique())
    for column in df_train_clean.select_dtypes('object').columns
}

# <a id='6'> 3.3 Questions</a>
* Generate questions about your data.
* Find answers, visualize them, and make inferences.
* Q1.How is the gender distribution?
* Q2.How is the marriage distribution?
* Q3.How is the distribution of marriage by gender?
* Q4.How is the education distribution?
* Q5.How is the distribution of education by gender,marriage?
* Q6.How is the self employed distribution?
* Q7.How is the distribution of self employed by gender?
* Q8.How is the  applicant income distribution? 
* Q9.How much is the average,min,max applicant income?
* Q10.How much is the average,min,max applicant income by self employed?
* Q11.How is the location of the houses? 
* Q12.How much is the income of the applicants distributed according to the location of the houses?
* Q13.How much is the Loan Amount of the applicants distributed according to the location of the houses?
* Q14.How is the Credit History Distribution ?
* Q15.How does the credit history relate to applicant income,loan amount,loan amount term?(min,max mean)
* Q16.How is the Loan Status Distribution ?
* Q17.How is the Loan Status Distribution with credit history?
* .....

## <a id='7'> 4.Gender </a>

* How is the gender distribution? <br>


In [None]:
# Count applicants per gender.
# rename_axis + reset_index(name=...) yields the columns ['Gender', 'count'] on
# every pandas version. The previous rename({'index': 'Gender', 'Gender': 'count'})
# relied on pandas < 2.0 naming; on pandas >= 2.0 it produced two 'count'
# columns and no 'Gender' column, so df_gender['Gender'] raised KeyError.
df_gender = (
    df_train_clean['Gender']
    .value_counts()
    .rename_axis('Gender')
    .reset_index(name='count')
)

# Donut chart; the second slice is pulled out of the ring.
fig = go.Figure([go.Pie(labels=df_gender['Gender'], values=df_gender['count'], pull=[0, 0.2], hole=0.4)])
fig.update_traces(hoverinfo='label+percent', textinfo='value+percent', textfont_size=12, insidetextorientation='radial')
fig.update_layout(title="Gender Count", title_x=0.5)
fig.show()

In [None]:
# Count applicants per gender.
# rename_axis + reset_index(name=...) gives ['Gender', 'count'] on every pandas
# version; the previous rename-based form lost the 'Gender' column on pandas >= 2.0.
df_gender = (
    df_train_clean['Gender']
    .value_counts()
    .rename_axis('Gender')
    .reset_index(name='count')
)

# Plain pie chart of the same counts.
fig = go.Figure([go.Pie(labels=df_gender['Gender'], values=df_gender['count'])])
fig.update_traces(hoverinfo='label+percent', textinfo='value+percent', textfont_size=12, insidetextorientation='radial')
fig.update_layout(title="Gender Count", title_x=0.5)
fig.show()

In [None]:
# Count applicants per gender.
# rename_axis + reset_index(name=...) gives ['Gender', 'count'] on every pandas
# version; the previous rename-based form lost the 'Gender' column on pandas >= 2.0.
df_gender = (
    df_train_clean['Gender']
    .value_counts()
    .rename_axis('Gender')
    .reset_index(name='count')
)

# Bar chart coloured by count, with the count printed above each bar.
fig = go.Figure(go.Bar(
    x=df_gender['Gender'],
    y=df_gender['count'],
    marker={'color': df_gender['count'], 'colorscale': 'Viridis'},
    text=df_gender['count'],
    textposition="outside",
))
fig.update_layout(title_text='Gender Distribution', xaxis_title="Gender", yaxis_title="Count ", title_x=0.5)
fig.show()

**Conclude Q1**
* Most of the people who apply are men.<br>
* We will work in an area where men are dominant. <br>


## <a id='8'> 5.Married </a>
* How is the marriage distribution? <br>
* How is the distribution of marriage by gender? <br>

In [None]:
# Count applicants per marital status.
# rename_axis + reset_index(name=...) gives ['Married', 'count'] on every pandas
# version; the previous rename-based form lost the 'Married' column on pandas >= 2.0.
df_Married = (
    df_train_clean['Married']
    .value_counts()
    .rename_axis('Married')
    .reset_index(name='count')
)

fig = go.Figure([go.Pie(labels=df_Married['Married'], values=df_Married['count'], hole=0.2)])
fig.update_traces(hoverinfo='label+percent', textinfo='value+percent', textfont_size=12, insidetextorientation='radial')
fig.update_layout(title="Married Count", title_x=0.5)
fig.show()

In [None]:
# Number of applicants for each (Gender, Married) combination.
# Counting the non-null 'Dependents' values per group gives the group size
# here because df_train_clean was built with dropna().
# The bare `df_M_and_G` expression that used to follow was not the last
# statement of the cell, so it displayed nothing and has been removed.
df_M_and_G = (
    df_train_clean
    .groupby(by=['Gender', 'Married'])['Dependents']
    .count()
    .reset_index(name='Count')
)

# Grouped bars: one cluster per marital status, one bar per gender.
fig = px.bar(df_M_and_G, x="Married", y="Count", color="Gender", barmode="group")
fig.update_layout(title_text='Married Count With Gender', title_x=0.5)
fig.show()

In [None]:
# seaborn is already imported as `sns` in the notebook's import cell, so the
# repeated import that used to sit here has been dropped.
ax = sns.countplot(x="Married", data=df_train_clean)

In [None]:
# Applicants per marital status, split into one bar per gender
ax = sns.countplot(data=df_train_clean, x="Married", hue="Gender")

## Conclusion 2 
* More than half of the people are married <br>
* The marriage rate for men is 72% <br>
* The marriage rate for women is 30% <br>

## <a id='9'> 6.Education </a>
* How is the Education distribution? <br>
* How is the distribution of education by gender,marriage? <br>

In [None]:
# Count applicants per education level.
# rename_axis + reset_index(name=...) gives ['Education', 'count'] on every pandas
# version; the previous rename-based form lost the 'Education' column on pandas >= 2.0.
df_Education = (
    df_train_clean['Education']
    .value_counts()
    .rename_axis('Education')
    .reset_index(name='count')
)

fig = go.Figure([go.Pie(labels=df_Education['Education'], values=df_Education['count'], hole=0.2)])
fig.update_traces(hoverinfo='label+percent', textinfo='value+percent', textfont_size=12, insidetextorientation='radial')
fig.update_layout(title="Education Count", title_x=0.5)
fig.show()

In [None]:
# Number of applicants for each (Gender, Education, Married) combination,
# taken as the count of non-null 'Dependents' values per group.
df_M_and_G_and_E = (
    df_train_clean
    .groupby(by=['Gender', 'Education', 'Married'])['Dependents']
    .count()
    .reset_index(name='count')
)

# One facet per education level; within a facet, bars are grouped by gender
# and coloured by marital status.
fig = px.bar(
    df_M_and_G_and_E,
    x="Gender",
    y="count",
    color="Married",
    barmode="group",
    facet_col="Education",
)
hover_style = dict(bgcolor="white", font_size=16, font_family="Rockwell")
fig.update_layout(
    title_text='Education Count With Gender,Married',
    title_x=0.5,
    hoverlabel=hover_style,
)
fig.show()

In [None]:
# Applicants per education level
ax = sns.countplot(data=df_train_clean, x="Education")

In [None]:
# Grid of education count plots: one row per gender, one column per marital status.
g = sns.FacetGrid(data=df_train_clean, row="Gender", col="Married")
g.map_dataframe(sns.countplot, x="Education")
# The trailing semicolon suppresses the repr of the returned grid.
g.set_axis_labels(" ", "Count");

## Conclusion 3
* About 80% of all applicants are graduates <br>
* About 78% of male applicants are graduates <br>
* About 84% of female applicants are graduates <br>

## <a id='10'> 7.Self Employed </a>
* How is the Self Employed distribution? <br>
* How is the distribution of self employed by gender? <br>

In [None]:
# Count applicants per self-employment status.
# rename_axis + reset_index(name=...) gives ['Self_Employed', 'count'] on every
# pandas version; the previous rename-based form lost the 'Self_Employed'
# column on pandas >= 2.0.
df_Self_Employed = (
    df_train_clean['Self_Employed']
    .value_counts()
    .rename_axis('Self_Employed')
    .reset_index(name='count')
)

fig = go.Figure([go.Pie(labels=df_Self_Employed['Self_Employed'], values=df_Self_Employed['count'], hole=0.2)])
fig.update_traces(hoverinfo='label+percent', textinfo='value+percent', textfont_size=12, insidetextorientation='radial')
fig.update_layout(title="Self Employed Count", title_x=0.5)
fig.show()

In [None]:
# Number of applicants for each (Gender, Self_Employed) combination.
# NOTE(review): this reuses the name df_M_and_G_and_E from the education cell
# although it now holds a gender x self-employed table; the name is kept in
# case other cells read it.
df_M_and_G_and_E = (
    df_train_clean
    .groupby(by=['Gender', 'Self_Employed'])['Dependents']
    .count()
    .reset_index(name='count')
)

# Bars grouped by gender and coloured by self-employment status.
fig = px.bar(
    df_M_and_G_and_E,
    x="Gender",
    y="count",
    color="Self_Employed",
    barmode="group",
)
hover_style = dict(bgcolor="white", font_size=16, font_family="Rockwell")
fig.update_layout(
    title_text='Self Employed Count With Gender',
    title_x=0.5,
    hoverlabel=hover_style,
)
fig.show()

In [None]:
# Applicants per self-employment status, split into one bar per gender
ax = sns.countplot(data=df_train_clean, x="Self_Employed", hue="Gender")

## Conclusion 4 
* About 14% of all applicants are self employed
* About 14% of male applicants are self employed
* About 14% of female applicants are self employed

## <a id='11'> 8.Applicant Income </a>
* How is the  applicant income distribution? <br>
* How much is the average,min,max applicant income?<br>
* How much is the average,min,max applicant income by self employed?<br>

In [None]:
# Summary statistics of applicant income.
# Transposing a Series returns the Series itself, so `.T` is not needed.
df_train_clean['ApplicantIncome'].describe()

In [None]:
# Histogram of applicant income with fixed bins of width 1000 from 0 to 25000.
# NOTE(review): incomes above 25000 presumably fall outside these bins and are
# not drawn; confirm that this cap is intended.
income_bins = dict(
    start=0,     # start range of bin
    end=25000,   # end range of bin
    size=1000,   # size of bin
)
income_hist = go.Histogram(
    x=df_train_clean['ApplicantIncome'],
    marker_color="Crimson",
    xbins=income_bins,
)
fig = go.Figure(data=[income_hist])
fig.update_layout(
    title="Distribution Of Applicant Income",
    xaxis_title="Applicant Income",
    yaxis_title="Counts",
    title_x=0.5,
)
fig.show()

In [None]:
# Box plot of applicant income; boxmean=True also draws the mean.
fig = go.Figure()
fig.add_trace(go.Box(
    y=df_train_clean['ApplicantIncome'],
    marker_color='royalblue',
    boxmean=True,  # represent mean
))
# The y-axis shows income values, not counts, so it is labelled accordingly
# (it was previously labelled "Count").
fig.update_layout(title_text='Applicant Income', yaxis_title="Applicant Income", title_x=0.5)
fig.show()

In [None]:
# Applicant income distribution, one box per self-employment status
fig = px.box(data_frame=df_train_clean, x="Self_Employed", y="ApplicantIncome")
fig.update_layout(
    title_text='Applicant Income With Self Employed',
    xaxis_title="Self Employed",
    yaxis_title="Applicant Income",
    title_x=0.5,
)
fig.show()

In [None]:
# Three stacked bullet gauges: min, max and mean applicant income.

min_income = df_train_clean.ApplicantIncome.min()
max_income = df_train_clean.ApplicantIncome.max()
mean_income = df_train_clean.ApplicantIncome.mean()


def _income_bullet(value, y_band, label, axis_max, bar_color):
    """Build one bullet-shaped indicator occupying the vertical band `y_band`."""
    return go.Indicator(
        mode="number+gauge",
        value=value,
        domain={'x': [0.25, 1], 'y': y_band},
        title={'text': label, 'font': {'color': 'black', 'size': 15}},
        number={'font': {'color': 'black'}},
        gauge={
            'shape': "bullet",
            'axis': {'range': [None, axis_max]},
            'bar': {'color': bar_color},
        },
    )


# (value, vertical band, title, hard-coded axis upper bound, bar colour)
_income_bullet_specs = [
    (min_income, [0.4, 0.5], "Min Income", 200, "blue"),
    (max_income, [0.6, 0.7], "Max Income", 100500, "cyan"),
    (mean_income, [0.8, 0.9], "Mean Income", 6000, "darkblue"),
]

fig = go.Figure()
for _bullet_spec in _income_bullet_specs:
    fig.add_trace(_income_bullet(*_bullet_spec))

fig.update_layout(title="Applicant Income Statistics ", title_x=0.5)
fig.show()

In [None]:
# Six stacked bullet gauges: min, max and mean applicant income for
# self-employed applicants and for applicants who are not self-employed.

SE_Y_income = df_train_clean[df_train_clean["Self_Employed"] == "Yes"]
SE_N_income = df_train_clean[df_train_clean["Self_Employed"] == "No"]

minY_income = SE_Y_income.ApplicantIncome.min()
maxY_income = SE_Y_income.ApplicantIncome.max()
meanY_income = SE_Y_income.ApplicantIncome.mean()

minN_income = SE_N_income.ApplicantIncome.min()
maxN_income = SE_N_income.ApplicantIncome.max()
meanN_income = SE_N_income.ApplicantIncome.mean()


def _group_income_bullet(value, y_band, label, axis_max, bar_color):
    """Build one bullet-shaped indicator occupying the vertical band `y_band`."""
    return go.Indicator(
        mode="number+gauge",
        value=value,
        domain={'x': [0.25, 1], 'y': y_band},
        title={'text': label, 'font': {'color': 'black', 'size': 12}},
        number={'font': {'color': 'black'}},
        gauge={
            'shape': "bullet",
            'axis': {'range': [None, axis_max]},
            'bar': {'color': bar_color},
        },
    )


# (value, vertical band, title, hard-coded axis upper bound, bar colour),
# listed from the bottom of the figure to the top.
_group_bullet_specs = [
    (minY_income, [0.05, 0.15], "Self Employed Min Income ", 1500, "blue"),
    (maxY_income, [0.22, 0.32], "Self Employed Max Income", 42500, "cyan"),
    (meanY_income, [0.39, 0.49], "Self Employed Mean Income", 8000, "darkblue"),
    (minN_income, [0.55, 0.65], "Not Self Employed Min Income", 200, "darkcyan"),
    (maxN_income, [0.72, 0.82], "Not Self Employed Max Income", 90000, "red"),
    (meanN_income, [0.88, 0.98], "Not Self Employed Mean Income", 6000, "red"),
]

fig = go.Figure()
for _bullet_spec in _group_bullet_specs:
    fig.add_trace(_group_income_bullet(*_bullet_spec))

fig.update_layout(title=" Self Employed And Not Self Employed Applicant Income Statistics ", title_x=0.5)
fig.show()

## Conclusion 5

* Min Income =150
* Max Income =81k
* Mean Income =5350
* Self Employed Min Income =1000
* Self Employed Max Income =39.1k
* Self Employed Mean Income =7790
* Not Self Employed Min Income =150
* Not Self Employed Max Income =81k
* Not Self Employed Mean Income =4980

## <a id='12'> 9.Location Of The Houses </a>
* How is the location of the houses? <br>
* How much is the income of the applicants distributed according to the location of the houses?<br>
* How much is the Loan Amount of the applicants distributed according to the location of the houses?<br>

In [None]:
# Count applications per property area.
# rename_axis + reset_index(name=...) gives ['Property_Area', 'count'] on every
# pandas version; the previous rename-based form lost the 'Property_Area'
# column on pandas >= 2.0.
df_Property_Area = (
    df_train_clean['Property_Area']
    .value_counts()
    .rename_axis('Property_Area')
    .reset_index(name='count')
)

# Bubble chart: one marker per area, coloured and sized by its count.
fig = go.Figure(data=[go.Scatter(
    x=df_Property_Area['Property_Area'],
    y=df_Property_Area['count'],
    mode='markers',
    marker=dict(
        color=df_Property_Area['count'],
        size=df_Property_Area['count'] * 0.3,  # scaled by 0.3 to reduce marker size uniformly across all points
        showscale=True,
    ),
)])

fig.update_layout(title='Property Area', xaxis_title="Property Area ", yaxis_title="Number Of Property ", title_x=0.5)
fig.show()

In [None]:
# Count applications per property area.
# rename_axis + reset_index(name=...) gives ['Property_Area', 'count'] on every
# pandas version; the previous rename-based form lost the 'Property_Area'
# column on pandas >= 2.0.
df_Property_Area = (
    df_train_clean['Property_Area']
    .value_counts()
    .rename_axis('Property_Area')
    .reset_index(name='count')
)

# Slice colours, applied in the order of the rows of df_Property_Area.
colors = ['cyan', 'darkblue', "darkcyan"]

fig = go.Figure([go.Pie(labels=df_Property_Area['Property_Area'], values=df_Property_Area['count'])])
fig.update_traces(hoverinfo='label+percent', textinfo='percent+value', textfont_size=15,
                  marker=dict(colors=colors, line=dict(color='#000000', width=2)))
fig.update_layout(title="Property Area Count", title_x=0.5)
fig.show()

In [None]:
# Count applications per property area.
# rename_axis + reset_index(name=...) gives ['Property_Area', 'count'] on every
# pandas version; the previous rename-based form lost the 'Property_Area'
# column on pandas >= 2.0.
df_Property_Area = (
    df_train_clean['Property_Area']
    .value_counts()
    .rename_axis('Property_Area')
    .reset_index(name='count')
)

# Bar chart coloured by count, with the count printed above each bar.
fig = go.Figure(go.Bar(
    x=df_Property_Area['Property_Area'],
    y=df_Property_Area['count'],
    marker={'color': df_Property_Area['count'], 'colorscale': 'Viridis'},
    text=df_Property_Area['count'],
    textposition="outside",
))
fig.update_layout(title_text='Property Area Count', xaxis_title="Property Area", yaxis_title="Number Of Property ", title_x=0.5)
fig.show()

In [None]:
# Mean, min and max applicant income per property area, combined into one
# table with the columns Property_Area, mean, min, max.
_income_by_area = df_train_clean.groupby(by=['Property_Area'])['ApplicantIncome']
df_PA_mean = _income_by_area.mean().reset_index(name='mean')
df_PA_min = _income_by_area.min().reset_index(name='min').rename(columns={'Property_Area': 'Property_Area1'})
df_PA_max = _income_by_area.max().reset_index(name='max').rename(columns={'Property_Area': 'Property_Area2'})
# The renamed key columns are only there so that the duplicates can be dropped after the concat.
result = pd.concat([df_PA_mean, df_PA_min, df_PA_max], axis=1).drop(columns=['Property_Area1', 'Property_Area2'])

fig = make_subplots(
    rows=3, cols=1,
    subplot_titles=("Mean Applicant Income",
                    "Min Applicant Income",
                    "Max  Applicant Income"),  # Subplot titles
)

# One bar trace per statistic, stacked vertically in the order of the subplot titles.
for _subplot_row, _stat in enumerate(('mean', 'min', 'max'), start=1):
    fig.add_trace(
        go.Bar(
            x=result['Property_Area'],
            y=result[_stat],
            marker={'color': result[_stat], 'colorscale': 'fall'},
            text=result[_stat],
            textposition="inside",
        ),
        row=_subplot_row, col=1,
    )

fig.update_layout(title="Property Area With Applicant Income", title_x=0.5)
fig.show()

In [None]:
# Mean applicant income per property area
df_PA = (
    df_train_clean
    .groupby(by=['Property_Area'])['ApplicantIncome']
    .mean()
    .reset_index(name='mean')
)

# Bar chart coloured by the mean, with the value printed above each bar.
_mean_income_bars = go.Bar(
    x=df_PA['Property_Area'],
    y=df_PA['mean'],
    marker={'color': df_PA['mean'], 'colorscale': 'twilight'},
    text=df_PA['mean'],
    textposition="outside",
)
fig = go.Figure(_mean_income_bars)
fig.update_layout(
    title_text='Property Area With Applicant Mean Income',
    xaxis_title="Property Area",
    yaxis_title="Applicant Income",
    title_x=0.5,
)
fig.show()

In [None]:
# Mean, min and max loan amount per property area, combined into one table
# with the columns Property_Area, mean, min, max.
_loan_by_area = df_train_clean.groupby(by=['Property_Area'])['LoanAmount']
df_PA_mean = _loan_by_area.mean().reset_index(name='mean')
df_PA_min = _loan_by_area.min().reset_index(name='min').rename(columns={'Property_Area': 'Property_Area1'})
df_PA_max = _loan_by_area.max().reset_index(name='max').rename(columns={'Property_Area': 'Property_Area2'})
# The renamed key columns are only there so that the duplicates can be dropped after the concat.
result = pd.concat([df_PA_mean, df_PA_min, df_PA_max], axis=1).drop(columns=['Property_Area1', 'Property_Area2'])

fig = make_subplots(
    rows=3, cols=1,
    subplot_titles=("Mean Loan Amount",
                    "Min Loan Amount",
                    "Max  Loan Amount"),  # Subplot titles
)

# One bar trace per statistic, stacked vertically in the order of the subplot titles.
for _subplot_row, _stat in enumerate(('mean', 'min', 'max'), start=1):
    fig.add_trace(
        go.Bar(
            x=result['Property_Area'],
            y=result[_stat],
            marker={'color': result[_stat], 'colorscale': 'curl'},
            text=result[_stat],
            textposition="inside",
        ),
        row=_subplot_row, col=1,
    )

fig.update_layout(title="Property Area With Loan Amount", title_x=0.5)
fig.show()

In [None]:
# Loan amount against applicant income, one colour per property area
fig = px.scatter(
    data_frame=df_train_clean,
    x='ApplicantIncome',
    y='LoanAmount',
    color="Property_Area",
)
fig.update_layout(
    title='Applicant Income Vs Loan Amount With Property Area ',
    xaxis_title="Applicant Income",
    yaxis_title="Loan Amount ",
    title_x=0.5,
)
fig.show()

## **Conclusion** 6

* Number of houses in **Semiurban** Area:191 
* Number of houses in **Rural** Area:139 
* Number of houses in **Urban** Area:150
<br>
<br>
* Applicant Income with **Rural** Area
* **mean**=5764,**min**=150, **max**=81000
* Applicant Income with **Semiurban** Area 
* **mean**=5368,**min**=1500, **max**=39999
* Applicant Income with **Urban** Area
* **mean**=4988,**min**=1000, **max**=18333
<br>
<br>
* Loan Amount with **Semiurban** Area:
* **mean**=146,**min**=25, **max**=600
* Loan Amount with **Rural** Area:
* **mean**=155,**min**=40, **max**=570
* Loan Amount with **Urban** Area:
* **mean**=132,**min**=9, **max**=500
<br>
<br>
* Loan Amount Term with **Semiurban** Area:
* **mean**=344,**min**=36, **max**=480
* Loan Amount Term with **Rural** Area:
* **mean**=345,**min**=84, **max**=480
* Loan Amount Term with **Urban** Area:
* **mean**=336,**min**=60, **max**=480

## <a id='13'> 10.Credit History </a>
* How is the Credit History Distribution ?
* How does the credit history relate to applicant income,loan amount,loan amount term?(min,max mean) <br>


In [None]:
# Count applicants per credit-history value.
# rename_axis + reset_index(name=...) gives ['Credit_History', 'count'] on every
# pandas version; the previous rename-based form lost the 'Credit_History'
# column on pandas >= 2.0.
df_C_H = (
    df_train_clean['Credit_History']
    .value_counts()
    .rename_axis('Credit_History')
    .reset_index(name='count')
)

# Pie chart; the second slice is pulled out.
fig = go.Figure([go.Pie(labels=df_C_H['Credit_History'], values=df_C_H['count'], pull=[0, 0.3])])
fig.update_traces(hoverinfo='label+percent', textinfo='value+percent', textfont_size=12, insidetextorientation='radial')
fig.update_layout(title="Credit History Count ", title_x=0.5)
fig.show()

In [None]:
# Count applicants per credit-history value.
# rename_axis + reset_index(name=...) gives ['Credit_History', 'count'] on every
# pandas version; the previous rename-based form lost the 'Credit_History'
# column on pandas >= 2.0.
df_C_H = (
    df_train_clean['Credit_History']
    .value_counts()
    .rename_axis('Credit_History')
    .reset_index(name='count')
)

# Bar chart coloured by count, with the count printed above each bar.
fig = go.Figure(go.Bar(
    x=df_C_H['Credit_History'],
    y=df_C_H['count'],
    marker={'color': df_C_H['count'], 'colorscale': 'Viridis'},
    text=df_C_H['count'],
    textposition="outside",
))
fig.update_layout(title_text='Credit History Distribution', xaxis_title="Result", yaxis_title="Count ", title_x=0.5)
fig.show()

In [None]:
# Mean, min and max loan term per credit-history value, combined into one
# table with the columns Credit_History, mean, min, max.
_term_by_history = df_train_clean.groupby(by=['Credit_History'])['Loan_Amount_Term']
df_CH_mean = _term_by_history.mean().reset_index(name='mean')
df_CH_min = _term_by_history.min().reset_index(name='min').rename(columns={'Credit_History': 'Credit_History1'})
df_CH_max = _term_by_history.max().reset_index(name='max').rename(columns={'Credit_History': 'Credit_History2'})
# The renamed key columns are only there so that the duplicates can be dropped after the concat.
result = pd.concat([df_CH_mean, df_CH_min, df_CH_max], axis=1).drop(columns=['Credit_History1', 'Credit_History2'])

fig = make_subplots(
    rows=3, cols=1,
    subplot_titles=("Mean Loan Amount Term",
                    "Min Loan Amount Term",
                    "Max  Loan Amount Term"),  # Subplot titles
)

# One bar trace per statistic, stacked vertically in the order of the subplot titles.
for _subplot_row, _stat in enumerate(('mean', 'min', 'max'), start=1):
    fig.add_trace(
        go.Bar(
            x=result['Credit_History'],
            y=result[_stat],
            marker={'color': result[_stat], 'colorscale': 'balance'},
            text=result[_stat],
            textposition="inside",
        ),
        row=_subplot_row, col=1,
    )

fig.update_layout(title="Credit History With Loan Amount Term", title_x=0.5)
fig.show()

In [None]:
# Mean, min and max applicant income per credit-history value, combined into
# one table with the columns Credit_History, mean, min, max.
_income_by_history = df_train_clean.groupby(by=['Credit_History'])['ApplicantIncome']
df_CH_mean = _income_by_history.mean().reset_index(name='mean')
df_CH_min = _income_by_history.min().reset_index(name='min').rename(columns={'Credit_History': 'Credit_History1'})
df_CH_max = _income_by_history.max().reset_index(name='max').rename(columns={'Credit_History': 'Credit_History2'})
# The renamed key columns are only there so that the duplicates can be dropped after the concat.
result = pd.concat([df_CH_mean, df_CH_min, df_CH_max], axis=1).drop(columns=['Credit_History1', 'Credit_History2'])

fig = make_subplots(
    rows=3, cols=1,
    subplot_titles=("Mean Applicant Income",
                    "Min Applicant Income",
                    "Max  Applicant Income"),  # Subplot titles
)

# One bar trace per statistic, stacked vertically in the order of the subplot titles.
for _subplot_row, _stat in enumerate(('mean', 'min', 'max'), start=1):
    fig.add_trace(
        go.Bar(
            x=result['Credit_History'],
            y=result[_stat],
            marker={'color': result[_stat], 'colorscale': 'balance'},
            text=result[_stat],
            textposition="inside",
        ),
        row=_subplot_row, col=1,
    )

fig.update_layout(title="Credit History With Applicant Income", title_x=0.5)
fig.show()

In [None]:
# Mean, min and max loan amount per credit-history value, combined into one
# table with the columns Credit_History, mean, min, max.
_loan_by_history = df_train_clean.groupby(by=['Credit_History'])['LoanAmount']
df_CH_mean = _loan_by_history.mean().reset_index(name='mean')
df_CH_min = _loan_by_history.min().reset_index(name='min').rename(columns={'Credit_History': 'Credit_History1'})
df_CH_max = _loan_by_history.max().reset_index(name='max').rename(columns={'Credit_History': 'Credit_History2'})
# The renamed key columns are only there so that the duplicates can be dropped after the concat.
result = pd.concat([df_CH_mean, df_CH_min, df_CH_max], axis=1).drop(columns=['Credit_History1', 'Credit_History2'])

fig = make_subplots(
    rows=3, cols=1,
    subplot_titles=("Mean Loan Amount ",
                    "Min Loan Amount ",
                    "Max  Loan Amount "),  # Subplot titles
)

# One bar trace per statistic, stacked vertically in the order of the subplot titles.
for _subplot_row, _stat in enumerate(('mean', 'min', 'max'), start=1):
    fig.add_trace(
        go.Bar(
            x=result['Credit_History'],
            y=result[_stat],
            marker={'color': result[_stat], 'colorscale': 'balance'},
            text=result[_stat],
            textposition="inside",
        ),
        row=_subplot_row, col=1,
    )

fig.update_layout(title="Credit History With Loan Amount ", title_x=0.5)
fig.show()

## **Conclusion** 7

* 85 percent of accepted applications have a positive credit history


## <a id='14'> 11.Loan Status </a>
* How is the Loan Status Distribution ?
* How is the Loan Status Distribution with credit history?

In [None]:
# Number of applications per loan status.
# rename_axis + reset_index(name=...) produces the columns ['Loan_Status', 'count']
# on every pandas version. Renaming 'index' / 'Loan_Status' after reset_index only
# works before pandas 2.0; from 2.0 on value_counts() already names the counts
# 'count', so that rename created two 'count' columns and no 'Loan_Status'.
df_L_S = (
    df_train_clean['Loan_Status']
    .value_counts()
    .rename_axis('Loan_Status')
    .reset_index(name='count')
)

# Pie chart with the second slice pulled out for emphasis.
fig = go.Figure([go.Pie(labels=df_L_S['Loan_Status'], values=df_L_S['count'],pull=[0,0.2])])

fig.update_traces(hoverinfo='label+percent', textinfo='value+percent', textfont_size=12,insidetextorientation='radial')

fig.update_layout(title="Loan Status Count ",title_x=0.5)
fig.show()

In [None]:
# Number of applications per loan status.
# rename_axis + reset_index(name=...) produces the columns ['Loan_Status', 'count']
# on every pandas version; the rename-after-reset_index pattern breaks on
# pandas >= 2.0, where value_counts() already names the counts column 'count'.
df_L_S = (
    df_train_clean['Loan_Status']
    .value_counts()
    .rename_axis('Loan_Status')
    .reset_index(name='count')
)

# Bar chart of the counts; bars are coloured by their height.
fig = go.Figure(go.Bar(
    x=df_L_S['Loan_Status'],y=df_L_S['count'],
    marker={'color': df_L_S['count'],
    'colorscale': 'Viridis'},
    text=df_L_S['count'],
    textposition = "outside",
))
fig.update_layout(title_text='Loan Status Distribution',xaxis_title="Result",yaxis_title="Count ",title_x=0.5)
fig.show()

In [None]:
# Loan-status counts, one facet per credit-history value.
# Every facet is drawn from its own subset of rows, so the category order is
# fixed explicitly; otherwise two facets could place 'Y' and 'N' at different
# x positions while sharing a single set of tick labels.
status_order = sorted(df_train_clean["Loan_Status"].unique())
g = sns.FacetGrid(df_train_clean, col="Credit_History")
g.map_dataframe(sns.countplot, x="Loan_Status", order=status_order)
g.set_axis_labels(" ", "Count");

In [None]:
# Count applications for every (loan status, property area) pair; the count is
# taken over the non-null 'Dependents' values of each group.
df_M_and_G_and_E = (
    df_train_clean
    .groupby(['Loan_Status', 'Property_Area'])['Dependents']
    .count()
    .reset_index(name='count')
)

# Grouped bars: one cluster per property area, one bar per loan status.
fig = px.bar(
    df_M_and_G_and_E,
    x="Property_Area",
    y="count",
    color="Loan_Status",
    barmode="group",
)
fig.update_layout(
    title_text='Loan Status With Property Area',
    title_x=0.5,
    hoverlabel={
        "bgcolor": "white",
        "font_size": 16,
        "font_family": "Rockwell",
    },
    xaxis_title="Property Area",
    yaxis_title="Count",
)
fig.show()

In [None]:
# Count applications for every (loan status, education) pair; the count is
# taken over the non-null 'Dependents' values of each group.
df_M_and_G_and_E = (
    df_train_clean
    .groupby(['Loan_Status', 'Education'])['Dependents']
    .count()
    .reset_index(name='count')
)

# Grouped bars: one cluster per loan status, one bar per education level.
fig = px.bar(df_M_and_G_and_E, x="Loan_Status", y="count", color="Education", barmode="group")
fig.update_layout(title_text='Loan Status With Education ',title_x=0.5,
                  hoverlabel=dict(
                      bgcolor="white",
                      font_size=16,
                      font_family="Rockwell",
                  ))
# The x axis carries Loan_Status (Education is the colour), so label it as such;
# it was previously titled "Education".
fig.update_layout(xaxis_title="Loan Status",yaxis_title="Count")
fig.show()

In [None]:
# Scatter of applicant income against loan amount, coloured by loan status.
fig = px.scatter(
    df_train_clean,
    x='ApplicantIncome',
    y='LoanAmount',
    color="Loan_Status",
)
fig.update_layout(
    title='Applicant Income Vs Loan Amount With Loan Status ',
    title_x=0.5,
    xaxis_title="Applicant Income",
    yaxis_title="Loan Amount ",
)
fig.show()

## **Conclusion** 8

* About 70 percent of applications are accepted, which means the data set is imbalanced


## <a id='15'> 12.Correlation Matrix </a>

* A correlation matrix is a table showing correlation coefficients between variables. Each cell in the table shows the correlation between two variables.<br>
* A correlation matrix is used to summarize data, as an input into a more advanced analysis, and as a diagnostic for advanced analyses.<br>

* **There are three broad reasons for computing a correlation matrix** <br>

* To summarize a large amount of data where the goal is to see patterns. <br>

* To input into other analyses. For example, people commonly use correlation matrixes as inputs for exploratory factor analysis, confirmatory factor analysis, structural equation models, and linear regression when excluding missing values pairwise. <br>

* As a diagnostic when checking other analyses. For example, with linear regression, a high amount of correlations suggests that the linear regression estimates will be unreliable.<br>

In [None]:
# Work on a copy so the numeric encoding below leaves df_train_clean untouched.
df_CM_train=df_train_clean.copy()

In [None]:
# Lookup tables that translate the categorical text values into numeric codes.
Gender_map = dict(Male=0, Female=1)
Married_map = dict(Yes=1, No=0)
Education_map = {'Graduate': 1, 'Not Graduate': 0}
Self_Employed_map = dict(Yes=1, No=0)
# '0', '1', '2', '3+' -> 0, 1, 2, 3
Dependents_map = {label: code for code, label in enumerate(('0', '1', '2', '3+'))}
Loan_Status_map = dict(Y=1, N=0)

# Indicator tables for Property_Area: 1 for the named area, 0 for the other two.
Rural_map = {area: int(area == 'Rural') for area in ('Rural', 'Urban', 'Semiurban')}
Urban_map = {area: int(area == 'Urban') for area in ('Rural', 'Urban', 'Semiurban')}
Semiurban_map = {area: int(area == 'Semiurban') for area in ('Rural', 'Urban', 'Semiurban')}

In [None]:
# Replace each categorical column by its numeric code.
for column, lookup in (
    ("Gender", Gender_map),
    ("Married", Married_map),
    ("Education", Education_map),
    ("Dependents", Dependents_map),
    ("Self_Employed", Self_Employed_map),
    ("Loan_Status", Loan_Status_map),
):
    df_CM_train[column] = df_CM_train[column].map(lookup)

# Add one 0/1 indicator column per property area.
for column, lookup in (
    ("Rural_Area", Rural_map),
    ("Urban_Area", Urban_map),
    ("Semiurban_Area", Semiurban_map),
):
    df_CM_train[column] = df_CM_train["Property_Area"].map(lookup)

In [None]:
# Display the encoded frame to check the result of the mappings.
df_CM_train

In [None]:
# Move the target to the last column under the name 'Loan_Status1':
# pop() removes 'Loan_Status' and the assignment appends it at the end.
df_CM_train['Loan_Status1'] = df_CM_train.pop('Loan_Status')
df_CM_train

In [None]:
print("Correlation Matrix")
plt.rcParams['figure.figsize']=(12,8)
# Correlate numeric columns only: df_CM_train still contains the text column
# Property_Area, and DataFrame.corr() raises on non-numeric data from pandas 2.0
# on (older versions silently skipped such columns).
numeric_columns = df_CM_train.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr(),cmap='coolwarm',linewidths=.5,fmt=".2f",annot = True);

## <a id='16'> 13.Dependents </a>
* How is the Dependents Distribution ?
* How much is the per capita income ?

In [None]:
# Number of applicants per dependents value.
# rename_axis + reset_index(name=...) produces the columns ['Dependents', 'count']
# on every pandas version; the rename-after-reset_index pattern breaks on
# pandas >= 2.0, where value_counts() already names the counts column 'count'.
df_Dependents = (
    df_CM_train['Dependents']
    .value_counts()
    .rename_axis('Dependents')
    .reset_index(name='count')
)

# Bar chart of the counts; bars are coloured by their height.
fig = go.Figure(go.Bar(
    x=df_Dependents['Dependents'],y=df_Dependents['count'],
    marker={'color': df_Dependents['count'],
    'colorscale': 'Viridis'},
    text=df_Dependents['count'],
    textposition = "outside",
))
fig.update_layout(title_text='Dependents Distribution',xaxis_title="Dependents",yaxis_title="Count ",title_x=0.5)
fig.show()

In [None]:
# Count applications for every (dependents, loan status) pair; the count is
# taken over the non-null 'Education' values of each group. The status is cast
# to a categorical so it is plotted as discrete colours.
df_M_and_G_and_E = (
    df_CM_train
    .groupby(['Dependents', 'Loan_Status1'])['Education']
    .count()
    .reset_index(name='count')
    .astype({'Loan_Status1': 'category'})
)

# Grouped bars: one cluster per dependents value, one bar per loan status.
fig = px.bar(
    df_M_and_G_and_E,
    x="Dependents",
    y="count",
    color="Loan_Status1",
    barmode="group",
)
fig.update_layout(
    title_text='Loan Status With Dependents ',
    title_x=0.5,
    hoverlabel={
        "bgcolor": "white",
        "font_size": 16,
        "font_family": "Rockwell",
    },
    xaxis_title="Dependents",
    yaxis_title="Count",
)
fig.show()

In [None]:
# Mean applicant income per dependents value.
df_M_and_G = (
    df_CM_train
    .groupby('Dependents')['ApplicantIncome']
    .mean()
    .reset_index(name='Mean')
)
# Household size = the applicant plus the encoded number of dependents
# ('3+' is encoded as 3). It is derived from the group key instead of a
# hard-coded [1, 2, 3, 4] list, which raised a length error whenever a
# dependents value was missing from the data.
df_M_and_G["household_size"] = df_M_and_G['Dependents'] + 1
df_M_and_G["per_capita_income"] = df_M_and_G['Mean'] / df_M_and_G['household_size']

# Bar chart of per-capita income; bars are coloured by their height.
fig = go.Figure(go.Bar(
    x=df_M_and_G['Dependents'],y=df_M_and_G['per_capita_income'],
    marker={'color': df_M_and_G['per_capita_income'],
    'colorscale': 'Viridis'},
    text=df_M_and_G['per_capita_income'],
    textposition = "outside",
))
fig.update_layout(title_text=' Mean Per Capita Income Distribution With Dependents ',xaxis_title="Dependents",yaxis_title="Income ",title_x=0.5)
fig.show()

In [None]:
# Median applicant income per dependents value.
df_M_and_G = (
    df_CM_train
    .groupby('Dependents')['ApplicantIncome']
    .median()
    .reset_index(name='Median')
)

# Bar chart of the medians; bars are coloured by their height.
median_bars = go.Bar(
    x=df_M_and_G['Dependents'],
    y=df_M_and_G['Median'],
    marker={'color': df_M_and_G['Median'], 'colorscale': 'Viridis'},
    text=df_M_and_G['Median'],
    textposition="outside",
)
fig = go.Figure(median_bars)
fig.update_layout(
    title_text=' Median Applicant Income Distribution With Dependents ',
    xaxis_title=" Dependents",
    yaxis_title="Income ",
    title_x=0.5,
)
fig.show()

## <a id='17'> 14.Summary Conclusion </a>

* We will work in an area where men are dominant.

* More than half of the people are married

* About 80% of applicants are graduates

* About 14% of applicants are self-employed

* Min Income =150

* Max Income =81k

* Mean Income =5350

* 40 percent of homes are in semi-urban areas

* 31 percent of homes are in urban areas

* 29 percent of homes are in rural areas

* 85 percent of accepted applications have a positive credit history

* About 70 percent of applications are accepted, which means the data set is imbalanced



 ## <a id='18'> 15.Machine Learning </a>
![](https://miro.medium.com/max/700/1*1NNRxaTDtmZFF51JZOd-Cw.jpeg)

* Applications range from datamining programs that discover general rules in large data sets, to information filtering systems that automatically learn users' interests.


 ## <a id='19'> 15.1 Data Preprocessing</a>


In [None]:
# Work on a copy so the preprocessing below leaves df_train_clean untouched.
df_NMV_train=df_train_clean.copy()

In [None]:
# Lookup tables that translate the categorical text values into numeric codes.
Gender_map = dict(Male=0, Female=1)
Married_map = dict(Yes=1, No=0)
Education_map = {'Graduate': 1, 'Not Graduate': 0}
Self_Employed_map = dict(Yes=1, No=0)
# '0', '1', '2', '3+' -> 0, 1, 2, 3
Dependents_map = {label: code for code, label in enumerate(('0', '1', '2', '3+'))}
Loan_Status_map = dict(Y=1, N=0)

# Indicator tables for Property_Area: 1 for the named area, 0 for the other two.
Rural_map = {area: int(area == 'Rural') for area in ('Rural', 'Urban', 'Semiurban')}
Urban_map = {area: int(area == 'Urban') for area in ('Rural', 'Urban', 'Semiurban')}
Semiurban_map = {area: int(area == 'Semiurban') for area in ('Rural', 'Urban', 'Semiurban')}

In [None]:
# Replace each categorical column by its numeric code.
for column, lookup in (
    ("Gender", Gender_map),
    ("Married", Married_map),
    ("Education", Education_map),
    ("Dependents", Dependents_map),
    ("Self_Employed", Self_Employed_map),
    ("Loan_Status", Loan_Status_map),
):
    df_NMV_train[column] = df_NMV_train[column].map(lookup)

# Add one 0/1 indicator column per property area.
for column, lookup in (
    ("Rural_Area", Rural_map),
    ("Urban_Area", Urban_map),
    ("Semiurban_Area", Semiurban_map),
):
    df_NMV_train[column] = df_NMV_train["Property_Area"].map(lookup)

In [None]:
# Display the encoded frame to check the result of the mappings.
df_NMV_train

In [None]:
# Property_Area is now covered by the three *_Area indicator columns,
# so the original text column is removed.
df_NMV_train = df_NMV_train.drop(columns=['Property_Area'])
df_NMV_train.head()

 ## <a id='20'> 15.2 Local Outlier Factor</a>

![](http://upload.wikimedia.org/wikipedia/commons/4/4e/LOF-idea.svg)
*  The local outlier factor is based on a concept of a local density, where locality is given by k nearest neighbors, whose distance is used to estimate the density. By comparing the local density of an object to the local densities of its neighbors, one can identify regions of similar density, and points that have a substantially lower density than their neighbors. These are considered to be outliers.

In [None]:
# Local Outlier Factor detector using the 20 nearest neighbours of each row.
# NOTE(review): the outliers are later selected with a hand-picked score
# threshold rather than with the labels that `contamination` controls.
from sklearn.neighbors import LocalOutlierFactor
clf = LocalOutlierFactor(n_neighbors = 20, contamination = 0.1)

In [None]:
# Fit the detector on a copy of the encoded frame and keep one LOF score per
# row (the more negative the score, the more outlying the row). Only the
# scores are used, so a plain fit() is enough.
df_out = df_NMV_train.copy()
clf.fit(df_out)
df_scores = clf.negative_outlier_factor_

In [None]:
# Show the 70 lowest LOF scores in ascending order (presumably to pick a cut-off).
np.sort(df_scores)[0:70]

In [None]:
# Outlier cut-off: the 15th-smallest LOF score (index 14 of the ascending sort).
# Rows scoring strictly below this value are treated as outliers further down.
threshold_value = np.sort(df_scores)[14]
threshold_value

In [None]:
# Rows whose LOF score lies strictly below the cut-off, and their index labels.
Outlier_df = df_out.loc[df_scores < threshold_value]
indexs = Outlier_df.index
Outlier_df

In [None]:
# Remove the outlier rows in a single call instead of dropping them one index
# label at a time inside a Python loop.
df_NMV_train.drop(index=indexs, inplace=True)

In [None]:
# Column dtypes and non-null counts of the frame after outlier removal.
df_NMV_train.info()

In [None]:
# Features are every column except the target; the target is Loan_Status.
X = df_NMV_train.drop(columns='Loan_Status')
y = df_NMV_train['Loan_Status']

for label, part in (('X', X), ('y', y)):
    print(label + ' shape :', part.shape)

 ## <a id='21'> 15.3  Normalize</a>


In [None]:
# Min-max scale every feature column to the range [0, 1].
# DataFrame.min()/max() reduce column-wise by default. np.min(X)/np.max(X)
# forward axis=None to pandas, which newer pandas versions treat as a reduction
# over the whole frame (a single scalar), silently turning the per-column
# scaling into a global one.
col_min = X.min()
# A constant column has a range of 0; dividing by 1 instead keeps it at 0
# rather than filling it with NaN.
col_range = (X.max() - col_min).replace(0, 1)
# NOTE(review): the statistics are computed before the train/test split, so the
# test rows influence the scaling of the training data.
X = (X - col_min) / col_range
X.head()

### Data split

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Report the shape of each part of the split.
for label, part in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(label + ' :', part.shape)

 ## <a id='22'> 16.Logistic Regression </a>


In [None]:
# Logistic regression with the liblinear solver, fitted on the training split.
from sklearn.linear_model import LogisticRegression
loj = LogisticRegression(solver = "liblinear")
loj_model = loj.fit(X_train,y_train)  # fit() returns the estimator itself
loj_model

In [None]:
# Predicted class labels for the test split.
y_pred_loj = loj_model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_test, y_pred_loj)

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split.
cm = confusion_matrix(y_test, y_pred_loj)
plt.rcParams['figure.figsize'] = (5, 5)
# fmt='d' prints the integer counts verbatim; the default '.2g' switches to
# scientific notation as soon as a cell reaches 100.
sns.heatmap(cm, annot = True, fmt = 'd', annot_kws = {'size':15}, cmap = 'PuBu');

In [None]:
# Accuracy on the data the model was fitted on versus the held-out split.
for split_name, features, target in (
    ("Training", X_train, y_train),
    ("Testing", X_test, y_test),
):
    print(split_name + " Accuracy :", loj_model.score(features, target))

In [None]:
# Mean 10-fold cross-validated accuracy, estimated on the training split.
# Cross-validating on the test split would refit the model on small slices of
# the 20% hold-out and spend the data reserved for the final evaluation.
cross_val_score(loj_model, X_train, y_train, cv = 10).mean()

In [None]:
# Per-class precision, recall and F1 of the logistic regression on the test split.
print(classification_report(y_test, y_pred_loj))

In [None]:
from sklearn.metrics import roc_auc_score,roc_curve

# Predicted probability of the positive class (column 1) on the held-out split.
# The curve and the AUC are both computed from these probabilities: AUC taken
# from hard 0/1 predictions collapses the curve to a single operating point and
# does not match the plotted line. Only the test split is scored, because the
# training rows were already seen by the model.
test_proba = loj_model.predict_proba(X_test)[:,1]

logit_roc_auc = roc_auc_score(y_test, test_proba)

fpr, tpr, thresholds = roc_curve(y_test, test_proba)
plt.figure()
plt.plot(fpr, tpr, label='AUC (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # diagonal of a random classifier, for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Oranı')
plt.ylabel('True Positive Oranı')
plt.title('ROC')
plt.show()

 ## <a id='23'> 17.KNN </a>

In [None]:
# k-nearest-neighbours classifier with default settings, fitted on the training split.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn_model = knn.fit(X_train, y_train)  # fit() returns the estimator itself
knn_model

In [None]:
# Predicted class labels of the default KNN model for the test split.
y_pred_knn = knn_model.predict(X_test)

In [None]:
# Test-split accuracy of the default KNN model.
accuracy_score(y_test, y_pred_knn)

In [None]:
# Confusion matrix of the default KNN predictions on the test split.
cm = confusion_matrix(y_test, y_pred_knn)
plt.rcParams['figure.figsize'] = (5, 5)
# fmt='d' prints the integer counts verbatim; the default '.2g' switches to
# scientific notation as soon as a cell reaches 100.
sns.heatmap(cm, annot = True, fmt = 'd', annot_kws = {'size':15}, cmap = 'PuBu');

In [None]:
# Per-class precision, recall and F1 of the default KNN model on the test split.
print(classification_report(y_test, y_pred_knn))

### Model Tuning

In [None]:
# Search grid: every neighbour count from 1 to 49 (np.arange excludes the stop value).
knn_params = {"n_neighbors": np.arange(1,50)}

In [None]:
# 10-fold cross-validated grid search over knn_params on the training split.
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, knn_params, cv=10)
knn_cv.fit(X_train, y_train)

In [None]:
# Best mean cross-validation score and the neighbour count that achieved it.
print("Best score:", knn_cv.best_score_, sep="")
print("Best params:", knn_cv.best_params_)

In [None]:
# Refit KNN with the neighbour count selected by the grid search.
# Reading it from knn_cv.best_params_ keeps the "tuned" model in sync with the
# search result; a hard-coded value goes stale as soon as the data or the
# split changes.
knn = KNeighborsClassifier(**knn_cv.best_params_)
knn_tuned = knn.fit(X_train, y_train)

In [None]:
# Test-split accuracy of the tuned KNN model.
knn_tuned.score(X_test, y_test)

In [None]:
# Predicted class labels of the tuned KNN model for the test split.
y_pred_tuned = knn_tuned.predict(X_test)

In [None]:
# Confusion matrix of the tuned KNN predictions on the test split.
cm = confusion_matrix(y_test, y_pred_tuned)
plt.rcParams['figure.figsize'] = (5, 5)
# fmt='d' prints the integer counts verbatim; the default '.2g' switches to
# scientific notation as soon as a cell reaches 100.
sns.heatmap(cm, annot = True, fmt = 'd', annot_kws = {'size':15}, cmap = 'PuBu');

 ## <a id='24'> 18.Artificial Neural Networks </a>
 
 
 

In [None]:
# Multi-layer perceptron classifier with default settings, fitted on the training split.
# NOTE(review): no random_state is set, so results can differ between runs.
from sklearn.neural_network import MLPClassifier
mlpc = MLPClassifier().fit(X_train, y_train)

In [None]:
# Predictions of the default MLP for the test split and their accuracy.
y_pred_mlpc = mlpc.predict(X_test)
accuracy_score(y_test,y_pred_mlpc)

In [None]:
# Test-split accuracy of the default MLP (same inputs as the previous cell).
accuracy_score(y_test, y_pred_mlpc)

In [None]:
# Confusion matrix of the default MLP predictions on the test split.
cm = confusion_matrix(y_test, y_pred_mlpc)
plt.rcParams['figure.figsize'] = (5, 5)
# fmt='d' prints the integer counts verbatim; the default '.2g' switches to
# scientific notation as soon as a cell reaches 100.
sns.heatmap(cm, annot = True, fmt = 'd', annot_kws = {'size':15}, cmap = 'PuBu');

In [None]:
# Per-class precision, recall and F1 of the default MLP on the test split.
print(classification_report(y_test, y_pred_mlpc))

In [None]:
# Search grid for the MLP: 6 alphas x 5 layer layouts x 3 solvers x 2 activations
# = 180 parameter combinations.
mlpc_params = {"alpha": [0.1, 0.01, 0.02, 0.005, 0.0001,0.00001],
              "hidden_layer_sizes": [(10,10,10),
                                     (100,100,100),
                                     (100,100),
                                     (3,5), 
                                     (5, 3)],
              "solver" : ["lbfgs","adam","sgd"],
              "activation": ["relu","logistic"]}


In [None]:
# 10-fold cross-validated grid search over mlpc_params (180 combinations, so
# 1800 fits) on the training split; n_jobs=-1 runs the fits on all CPU cores.
# Note that `mlpc` is rebound here to a fresh, unfitted estimator.
mlpc = MLPClassifier()
mlpc_cv_model = GridSearchCV(mlpc, mlpc_params, 
                         cv = 10, 
                         n_jobs = -1,
                         verbose = 2)

mlpc_cv_model.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validation score.
print("Best params:", mlpc_cv_model.best_params_)

In [None]:
# Build the tuned MLP from the parameters selected by the grid search.
# Reading them from mlpc_cv_model.best_params_ keeps the "tuned" model in sync
# with the search result; hard-coded values go stale whenever a re-run selects
# a different combination.
mlpc_tuned = MLPClassifier(**mlpc_cv_model.best_params_)

In [None]:
# Fit the tuned MLP on the training split.
mlpc_tuned.fit(X_train, y_train)

In [None]:
# Predictions of the tuned MLP for the test split and their accuracy.
# y_pred_mlpc is overwritten: from here on it holds the tuned model's predictions.
y_pred_mlpc = mlpc_tuned.predict(X_test)
accuracy_score(y_test, y_pred_mlpc)

In [None]:
# Confusion matrix of the tuned MLP predictions on the test split.
cm = confusion_matrix(y_test, y_pred_mlpc)
plt.rcParams['figure.figsize'] = (5, 5)
# fmt='d' prints the integer counts verbatim; the default '.2g' switches to
# scientific notation as soon as a cell reaches 100.
sns.heatmap(cm, annot = True, fmt = 'd', annot_kws = {'size':15}, cmap = 'PuBu');

 ## <a id='25'> 19.Random Forest </a>

In [None]:
# Random forest with default settings, fitted on the training split.
# NOTE(review): no random_state is set, so results can differ between runs.
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier().fit(X_train, y_train)

In [None]:
# Predictions of the default random forest for the test split and their accuracy.
y_pred_rf = rf_model.predict(X_test)
accuracy_score(y_test, y_pred_rf)

In [None]:
# Confusion matrix of the default random-forest predictions on the test split.
cm = confusion_matrix(y_test, y_pred_rf)
plt.rcParams['figure.figsize'] = (5, 5)
# fmt='d' prints the integer counts verbatim; the default '.2g' switches to
# scientific notation as soon as a cell reaches 100.
sns.heatmap(cm, annot = True, fmt = 'd', annot_kws = {'size':15}, cmap = 'PuBu');

In [None]:
# Per-class precision, recall and F1 of the default random forest on the test split.
print(classification_report(y_test, y_pred_rf))

In [None]:
# Feature importances of the fitted forest, scaled by 100 and indexed by
# feature name, as a one-column frame.
Importance = pd.Series(
    rf_model.feature_importances_ * 100,
    index=X_train.columns,
    name="Importance",
).to_frame()

In [None]:
# Horizontal bar chart of the importances, sorted in ascending order.
Importance.sort_values("Importance").plot.barh(color="r")

plt.xlabel("Variable Significance Levels")

## Model Tuning

In [None]:
# Search grid for the random forest: 4 depths x 3 feature counts x 3 forest
# sizes x 3 split thresholds = 108 parameter combinations.
rf_params = {"max_depth": [2,5,8,10],
            "max_features": [2,5,8],
            "n_estimators": [10,500,1000],
            "min_samples_split": [2,5,10]}

In [None]:
# 10-fold cross-validated grid search over rf_params (108 combinations, so
# 1080 fits) on the training split; n_jobs=-1 runs the fits on all CPU cores.
# Note that rf_model is rebound here to a fresh, unfitted estimator.
rf_model = RandomForestClassifier()

rf_cv = GridSearchCV(rf_model, 
                           rf_params, 
                           cv = 10, 
                           n_jobs = -1, 
                           verbose = 2) 
rf_cv_model=rf_cv.fit(X_train, y_train)  # fit() returns the search object itself

In [None]:
# Parameter combination with the best mean cross-validation score.
print("Best params:", rf_cv_model.best_params_)

In [None]:
# Build and fit the tuned forest from the parameters selected by the grid search.
# Reading them from rf_cv_model.best_params_ keeps the "tuned" model in sync
# with the search result; hard-coded values go stale whenever a re-run selects
# a different combination.
rf_tuned = RandomForestClassifier(**rf_cv_model.best_params_)

rf_tuned.fit(X_train, y_train)

In [None]:
# Predictions of the tuned forest for the test split and their accuracy.
# y_pred_tuned is overwritten: it previously held the tuned KNN predictions.
y_pred_tuned = rf_tuned.predict(X_test)
accuracy_score(y_test, y_pred_tuned)

In [None]:
# Confusion matrix of the tuned random-forest predictions on the test split.
cm = confusion_matrix(y_test, y_pred_tuned)
plt.rcParams['figure.figsize'] = (5, 5)
# fmt='d' prints the integer counts verbatim; the default '.2g' switches to
# scientific notation as soon as a cell reaches 100.
sns.heatmap(cm, annot = True, fmt = 'd', annot_kws = {'size':15}, cmap = 'PuBu');

 ## <a id='26'> 20.Gaussian Naive Bayes </a>

In [None]:
# Gaussian naive Bayes classifier, fitted on the training split.
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb_model = nb.fit(X_train, y_train)  # fit() returns the estimator itself
nb_model

In [None]:
# Predicted class labels of the naive Bayes model for the test split.
y_pred_nb = nb_model.predict(X_test)

In [None]:
# Test-split accuracy of the naive Bayes model.
accuracy_score(y_test, y_pred_nb)

In [None]:
# Confusion matrix of the naive Bayes predictions on the test split.
cm = confusion_matrix(y_test, y_pred_nb)
plt.rcParams['figure.figsize'] = (5, 5)
# fmt='d' prints the integer counts verbatim; the default '.2g' switches to
# scientific notation as soon as a cell reaches 100.
sns.heatmap(cm, annot = True, fmt = 'd', annot_kws = {'size':15}, cmap = 'PuBu');

In [None]:
# Per-class precision, recall and F1 of the naive Bayes model on the test split.
print(classification_report(y_test, y_pred_nb))

In [None]:
# Mean 10-fold cross-validated accuracy, estimated on the training split.
# Cross-validating on the test split would refit the model on small slices of
# the 20% hold-out and spend the data reserved for the final evaluation.
cross_val_score(nb_model, X_train, y_train, cv = 10).mean()

 ## <a id='27'> 21.SVC</a>

In [None]:
# Support vector classifier with a linear kernel, fitted on the training split.
from sklearn.svm import SVC
svm_model = SVC(kernel = "linear").fit(X_train, y_train)

In [None]:
# Predicted class labels of the SVC for the test split.
y_pred = svm_model.predict(X_test)

In [None]:
# Test-split accuracy of the SVC.
accuracy_score(y_test, y_pred)

In [None]:
# Per-class precision, recall and F1 of the SVC on the test split.
# The report previously used y_pred_mlpc, i.e. the neural network's
# predictions; the SVC is queried directly so the report matches this section.
print(classification_report(y_test, svm_model.predict(X_test)))

In [None]:
# Confusion matrix of the SVC predictions on the test split.
# The matrix previously used y_pred_nb, i.e. the naive Bayes predictions; the
# SVC is queried directly so the plot matches this section.
cm = confusion_matrix(y_test, svm_model.predict(X_test))
plt.rcParams['figure.figsize'] = (5, 5)
# fmt='d' prints the integer counts verbatim; the default '.2g' switches to
# scientific notation as soon as a cell reaches 100.
sns.heatmap(cm, annot = True, fmt = 'd', annot_kws = {'size':15}, cmap = 'PuBu');

 ## <a id='28'> 22.Model Comparison </a>

In [None]:
# Fitted models to compare, in display order.
models = [loj_model, knn_tuned, mlpc_tuned, rf_tuned, nb_model, svm_model]

# Print each model's class name and its accuracy on the test split.
for model in models:
    names = type(model).__name__
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("-" * 28, names + ":", f"Accuracy: {accuracy:.4%}", sep="\n")

In [None]:
# Collect one record per model and build the frame once. DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row copies it on every
# iteration.
records = []
for model in models:
    names = model.__class__.__name__
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    records.append({"Models": names, "Accuracy": accuracy * 100})

results = pd.DataFrame(records, columns=["Models", "Accuracy"])

# Horizontal bars: one per model, length = test accuracy in percent.
sns.barplot(x= 'Accuracy', y = 'Models', data=results, color="r")
plt.xlabel('Accuracy %')
plt.title('Accuracy Ratios of Models');

## Test Data Preparation

In [None]:
# Lookup tables that translate the categorical text values into numeric codes.
Gender_map = dict(Male=0, Female=1)
Married_map = dict(Yes=1, No=0)
Education_map = {'Graduate': 1, 'Not Graduate': 0}
Self_Employed_map = dict(Yes=1, No=0)
# '0', '1', '2', '3+' -> 0, 1, 2, 3
Dependents_map = {label: code for code, label in enumerate(('0', '1', '2', '3+'))}
Loan_Status_map = dict(Y=1, N=0)

# Indicator tables for Property_Area: 1 for the named area, 0 for the other two.
Rural_map = {area: int(area == 'Rural') for area in ('Rural', 'Urban', 'Semiurban')}
Urban_map = {area: int(area == 'Urban') for area in ('Rural', 'Urban', 'Semiurban')}
Semiurban_map = {area: int(area == 'Semiurban') for area in ('Rural', 'Urban', 'Semiurban')}

In [None]:
# Load the separate test file of the loan dataset.
df_test=pd.read_csv("/kaggle/input/loan-eligible-dataset/loan-test.csv")

In [None]:
# Replace each categorical column by its numeric code.
for column, lookup in (
    ("Gender", Gender_map),
    ("Married", Married_map),
    ("Education", Education_map),
    ("Dependents", Dependents_map),
    ("Self_Employed", Self_Employed_map),
):
    df_test[column] = df_test[column].map(lookup)

# Add one 0/1 indicator column per property area.
for column, lookup in (
    ("Rural_Area", Rural_map),
    ("Urban_Area", Urban_map),
    ("Semiurban_Area", Semiurban_map),
):
    df_test[column] = df_test["Property_Area"].map(lookup)

In [None]:
# Remove the text area column (covered by the indicator columns) and the row identifier.
df_test = df_test.drop(columns=['Property_Area', 'Loan_ID'])

In [None]:
# Keep only the test rows without any missing value.
df_test=df_test.dropna()

In [None]:
# Draw 20 test rows for a spot check. The seed makes the sample, and every
# prediction printed from it, identical on each run.
sample_df = df_test.sample(20, random_state=42)
sample_df.head()

In [None]:
# Column dtypes and non-null counts of the prepared test frame.
df_test.info()

In [None]:
# First rows of the prepared test frame.
df_test.head()

In [None]:
models = [
    loj_model,
    knn_tuned,
    mlpc_tuned,
    rf_tuned,
    nb_model,
    svm_model
]

# The models were fitted on min-max scaled features, so the raw test rows must
# be put on the same scale before predicting; feeding unscaled values makes the
# predictions meaningless. The scaling statistics are recomputed from
# df_NMV_train, which still holds the un-normalised training features that X
# was derived from. Selecting train_raw.columns also guarantees the column
# order the models were trained with.
train_raw = df_NMV_train.drop(columns='Loan_Status')
train_min = train_raw.min()
train_range = (train_raw.max() - train_min).replace(0, 1)  # avoid 0/0 on constant columns
sample_scaled = (sample_df[train_raw.columns] - train_min) / train_range

# There are no ground-truth labels for the test file, so there is nothing to
# compare against; the predictions are only printed.
for model in models:
    names = model.__class__.__name__
    y_pred = model.predict(sample_scaled)
    print("-"*28)
    print(names + ":" )
    print(" Sunuclar:" )
    print(y_pred)

## <a id='29'>23.Conclusion </a>

* As you can see, our models are overfitting.<br>
* We have little data. <br>
* The models learn less because the dataset is imbalanced.<br>
* We either approve or reject all incoming loan applications <br>
* We can create new data columns. <br>
* We can drop the columns that are not important. <br>
* We need to do feature engineering.<br>


**Models**

![ ](https://media.giphy.com/media/l22ysLe54hZP0wubek/giphy.gif) 
                    


## <a id='30'><font color="LIGHTSEAGREEN" size=+2.5><b>24.End Note</b></font> </a>

I hope you enjoyed my kernel. If you like this notebook, an <font color="DARKCYAN"><b>Upvote</b></font> would be great! :)

I am new to data science. Please leave your <font color="GREEN"><b>feedback</b></font> in the <font color="GREEN"><b>comments</b></font> to help me improve.
    
Thanks for your time

## <a id='31'> <font size="+2" color="LIGHTSEAGREEN"><b>Reference</b></font></a><br>
* https://seaborn.pydata.org/api.html
* https://plotly.com/python/
* https://stackoverflow.com 
* https://towardsdatascience.com/exploratory-data-analysis-8fc1cb20fd15 <br>
* https://r4ds.had.co.nz/exploratory-data-analysis.html#introduction-3  <br>
* https://pandas.pydata.org/pandas-docs/stable/reference/index.html <br>
* https://www.displayr.com/what-is-a-correlation-matrix/ <br>
* https://medium.com/towards-artificial-intelligence/differences-between-ai-and-machine-learning-and-why-it-matters-1255b182fc6 <br>
* http://www.cs.cmu.edu/afs/cs.cmu.edu/user/mitchell/ftp/mlbook.html <br>

<font size="+2" color="LIGHTSEAGREEN"><b>My Other Kernels</b></font><br>

<a href="https://www.kaggle.com/drfrank/lego-transfer-cnn-classification" class="btn btn-primary" style="color:white;">Lego Transfer-CNN Classification</a>


<a href="https://www.kaggle.com/drfrank/face-image-classification" class="btn btn-primary" style="color:white;">Face Image Classification</a>

<a href="https://www.kaggle.com/drfrank/book-review-ratings-data-analysis-visualization" class="btn btn-primary" style="color:white;">Book Review Ratings Analysis & Visualization</a>

<a href="https://www.kaggle.com/drfrank/insurance-prediction-lgbm-gbm-xgboost-eda" class="btn btn-primary" style="color:white;">Insurance Prediction- LGBM,GBM,XGBoost EDA</a>

<a href="https://www.kaggle.com/drfrank/fish-market-data-visualisation-machine-learning" class="btn btn-primary" style="color:white;">Fish Market Data Visualisation & Machine Learning</a>

<a href="https://www.kaggle.com/drfrank/seabron-plotly-for-beginners" class="btn btn-primary" style="color:white;">Seabron & Plotly For Beginners</a>

<a href="https://www.kaggle.com/drfrank/basketball-players-stats-data-visualisation" class="btn btn-primary" style="color:white;">Basketball Players Stats Data Visualisation</a>

<a href="https://www.kaggle.com/drfrank/women-s-football-results-visualization" class="btn btn-primary" style="color:white;">Women's Football Results Visualization</a>

<a href="https://www.kaggle.com/drfrank/us-police-shootings-data-visualisation" class="btn btn-primary" style="color:white;">Us Police Shootings Data Visualisation</a>