In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load both CSVs in one pass: `x` is the training file, `y` is the test file
# (note: `y` is NOT a label vector here).
x, y = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/loan-eligible-dataset/loan-train.csv",
        "/kaggle/input/loan-eligible-dataset/loan-test.csv",
    )
)

*Let's get an idea of what the data actually looks like!*

In [None]:
# Preview the first rows of the training frame to see its columns and sample values.
x.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the training frame.
x.describe()

*A histogram shows the number of instances
(on the vertical axis) that have a given value range (on the horizontal axis)*

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline 
import matplotlib.pyplot as plt
# Draw histograms of the training frame's columns with 25 bins each on a 15x15 figure.
x.hist(bins=25, figsize=(15,15))
plt.show()


In [None]:
# Lookup table used to encode the target column: 'Y' -> 1, 'N' -> 0.
Loan_Status = {'Y': 1,'N': 0}

# Encode the Loan_Status column of the training frame.
# Values that are not keys of the table (NaN, or 0/1 left over from a previous
# run of this cell) are passed through unchanged, so the cell can be re-run
# safely instead of raising KeyError.
x["Loan_Status"] = x["Loan_Status"].map(lambda status: Loan_Status.get(status, status))

x.head()

In [None]:
# Lookup table used to encode Education: "Graduate" -> 1, "Not Graduate" -> 0.
Education = {"Graduate" : 1,"Not Graduate":0}

# Encode the training frame's Education column. Values that are not keys of the
# table (NaN, or already-encoded 0/1 from a previous run) are kept as they are,
# so re-running the cell does not raise KeyError.
x["Education"] = x["Education"].map(lambda level: Education.get(level, level))

x.head()

In [None]:
# Lookup table used to encode Education: "Graduate" -> 1, "Not Graduate" -> 0.
Education = {"Graduate" : 1,"Not Graduate":0}

# Encode the test frame's Education column. Values that are not keys of the
# table (NaN, or already-encoded 0/1 from a previous run) are kept as they are,
# so re-running the cell does not raise KeyError.
y["Education"] = y["Education"].map(lambda level: Education.get(level, level))
y.head()

In [None]:
# Number of training rows for each distinct Property_Area value.
x.Property_Area.value_counts()

In [None]:
# Scatter of applicant income against the encoded loan status.
x.plot.scatter(x="Loan_Status", y="ApplicantIncome")

 *As we can see, applicants with a higher income appear more likely to have their loan accepted.*

## Looking for Correlations

In [None]:
# Pairwise correlation of the numeric columns of the training frame.
# numeric_only=True is needed because `x` still contains string columns at this
# point (e.g. Loan_ID, Property_Area); without it pandas >= 2.0 raises instead
# of silently skipping them. (The parameter exists since pandas 1.5.)
corr_matrix = x.corr(numeric_only=True)
# Rank every numeric column by its correlation with the encoded target.
corr_matrix["Loan_Status"].sort_values(ascending=False)

*When it is close to 1, it means that
there is a strong positive correlation; for example, the loan status value tends to go
up when the credit history goes up. When the coefficient is close to –1, it means
that there is a strong negative correlation*

In [None]:

# Bubble plot: marker size follows LoanAmount, marker colour follows
# Loan_Amount_Term (jet colormap, with a colour bar).
x.plot.scatter(
    x="Loan_Status",
    y="ApplicantIncome",
    s=x["LoanAmount"]/100,
    c="Loan_Amount_Term",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    alpha=0.4,
    label="LoanAmount",
    figsize=(15,13),
    sharex=False,
)
plt.legend()


In [None]:
from pandas.plotting import scatter_matrix

# Columns to compare pairwise in the scatter-matrix grid.
attributes = [
    "Loan_Status",
    "Credit_History",
    "ApplicantIncome",
    "Loan_Amount_Term",
    "LoanAmount",
]
scatter_matrix(x.loc[:, attributes], figsize=(15, 10))


## **Data Cleaning** 

*Visualizing missing data*

In [None]:
# Count of missing values in each column of the training frame.
x.isna().sum()

In [None]:
import seaborn as sns
# Set the default figure size used by the plots that follow.
sns.set(rc={'figure.figsize':(11.7,8.27)})
# Heatmap of the boolean missing-value mask of the training frame (no colour bar).
sns.heatmap(x.isnull(), cbar=False)

In [None]:
# Binary-encode the two-valued text columns identically in the training (x)
# and test (y) frames. Values not listed in a mapping (e.g. NaN) are left as is.
binary_codes = {
    "Gender": {"Male": 1, "Female" : 0},
    "Married": {"Yes": 1, "No" : 0},
    "Self_Employed": {"Yes": 1, "No" : 0},
}
for frame in (x, y):
    for column, codes in binary_codes.items():
        frame[column] = frame[column].replace(codes)

In [None]:
# Number of training rows for each distinct Dependents value.
x.Dependents.value_counts()

In [None]:
from sklearn.impute import SimpleImputer
# Imputer that fills missing entries with the median of each column it was fitted on.
imputer = SimpleImputer(strategy="median")

*Since the median can only be computed on numerical attributes, we need to create a
copy of the data without the text attribute*

In [None]:
# Copy of the training frame without the listed columns.
# NOTE(review): CoapplicantIncome is dropped here but kept in the test copy
# (y_df), so the two copies do not share the same column set.
df = x.drop(columns=["Property_Area","Loan_ID","Dependents","CoapplicantIncome"])

In [None]:
# Copy of the test frame without the listed columns.
y_df = y.drop(columns=["Property_Area","Loan_ID","Dependents"])

In [None]:
# Display the reduced test frame.
y_df

In [None]:
# Display the reduced training frame.
df

In [None]:
# Learn per-column medians from the reduced training frame.
# NOTE(review): the following cell calls imputer.fit(y_df) on this same object,
# which replaces the statistics learned here.
imputer.fit(df)

In [None]:
# Refit the same imputer on the reduced test frame; this overwrites whatever
# statistics it held from an earlier fit.
imputer.fit(y_df)

In [None]:
# Per-column fill values stored by the imputer's most recent fit.
imputer.statistics_


In [None]:
# `imputer` was most recently fitted on y_df, whose columns differ from df
# (y_df keeps CoapplicantIncome and has no Loan_Status). Transforming df with it
# either raises a feature-name mismatch error or fills training columns with
# misaligned test-set medians. Use a separate imputer fitted on df itself, and
# leave `imputer` untouched for the test-set transform.
train_imputer = SimpleImputer(strategy="median")
X = train_imputer.fit_transform(df)
X

In [None]:
# Fill missing test values using `imputer`, whose most recent fit was on y_df.
Y = imputer.transform(y_df)
Y

In [None]:
# Wrap the imputed training array back into a DataFrame, reusing df's column names.
df1 = pd.DataFrame(X, columns=df.columns)
df1

In [None]:
# Wrap the imputed test array back into a DataFrame, reusing y_df's column names.
y_df1 =  pd.DataFrame(Y, columns=y_df.columns)
y_df1

*Now all numerical attributes in the training set and test set have been handled.*

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# Fit once on the Property_Area values of both frames. Calling fit() twice
# keeps only the classes from the second call, so a category that appears only
# in the training frame would be lost.
# NOTE(review): `le` is only fitted in the visible notebook; nothing calls
# le.transform, so Property_Area stays a text column downstream.
le.fit(pd.concat([x.Property_Area, y.Property_Area], ignore_index=True))

In [None]:
# Re-attach the text columns that were left out of imputation (aligned on the
# row index). Any copy of them already present in df1 is dropped first, so
# re-running this cell does not fail with a column-overlap error.
df1 = (
    df1.drop(columns=["Property_Area", "Dependents"], errors="ignore")
    .join(x[["Property_Area", "Dependents"]])
)

In [None]:
# Display the training frame after the text columns were joined back.
df1

In [None]:
# Re-attach the text columns that were left out of imputation (aligned on the
# row index). Any copy of them already present in y_df1 is dropped first, so
# re-running this cell does not fail with a column-overlap error.
y_df1 = (
    y_df1.drop(columns=["Property_Area", "Dependents"], errors="ignore")
    .join(y[["Property_Area", "Dependents"]])
)

In [None]:
# Display the test frame after the text columns were joined back.
y_df1

### Data Visualization

In [None]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Number of rows in each (Loan_Status, Self_Employed) group.
# size() counts every row of a group; counting the non-null 'Dependents'
# entries instead would silently leave out rows whose Dependents is missing.
ls_with_se = (
    df1.groupby(by=['Loan_Status', 'Self_Employed'])
    .size()
    .reset_index(name="counts")
)



In [None]:
# Display the grouped counts table.
ls_with_se

In [None]:
# Hover-label appearance shared by all bars.
hover_style = dict(bgcolor="Red", font_size=16, font_family="TimesNewRoman")

# Bar chart of the grouped counts, coloured by Self_Employed, with a centred title.
fig = px.bar(ls_with_se, x="Loan_Status", y="counts", color="Self_Employed")
fig.update_layout(
    title_text='Self Employed Count With loan_status',
    title_x=0.5,
    hoverlabel=hover_style,
)
fig.show()

***As we can see, if a person is employed, the chance of getting a loan is high!***

In [None]:
# Applicant income against loan amount, coloured by loan status, with a centred title.
fig = px.scatter(df1, x='ApplicantIncome', y='LoanAmount', color="Loan_Status")
fig.update_layout(
    title='Applicant Income Vs Loan Amount With Loan Status ',
    title_x=0.5,
    xaxis_title="Applicant Income",
    yaxis_title="Loan Amount ",
)
fig.show()

In [None]:
# Number of rows in each (Loan_Status, Education) group.
# size() counts every row; counting non-null 'Dependents' entries would leave
# out rows whose Dependents value is missing.
df_M_and_G_and_E = (
    df1.groupby(by=['Loan_Status', 'Education'])
    .size()
    .reset_index(name='count')
)

# Grouped bars: Loan_Status on the x-axis, one bar colour per Education value.
fig = px.bar(df_M_and_G_and_E, x="Loan_Status", y="count", color="Education", barmode="group")

# The x-axis shows Loan_Status (Education is the colour), so title it accordingly.
fig.update_layout(xaxis_title="Loan Status",yaxis_title="Count")
fig.show()

In [None]:
# One count plot per categorical column, split by loan status, on a 3x2 grid.
groups = ['Gender','Married','Education','Self_Employed','Dependents','Property_Area']
fig, ax = plt.subplots(3,2, figsize = (12,12))
axs=ax.ravel()
for panel, group in zip(axs, groups):
    sns.countplot(data=df1, x=group, hue='Loan_Status', palette="husl", ax=panel)

### Applying Machine Learning

In [None]:
from sklearn.linear_model import LinearRegression
# NOTE(review): this is a least-squares regressor, while the target used below
# (Loan_Status) is encoded as 0/1; a classifier such as LogisticRegression may
# be the intended tool — confirm.
lin_reg = LinearRegression()

In [None]:
# Feature columns fed to the models below.
tf = ['Credit_History', 'Education','Self_Employed','ApplicantIncome']

In [None]:
# Training features and target taken from the imputed training frame.
x_train = df1[tf].to_numpy()
y_train = df1['Loan_Status'].to_numpy()

# Same feature columns taken from the imputed test frame (it has no target column).
x_test = y_df1[tf].to_numpy()

In [None]:
# Fit the linear model on the training features and target.
lin_reg.fit(x_train,y_train)

In [None]:
# Predict on the test features; LinearRegression returns continuous values, not class labels.
predicted = lin_reg.predict(x_test)

In [None]:
# Show the fitted coefficients (one per column in `tf`) and the intercept.
print('Coefficient of model :', lin_reg.coef_)
print('\nIntercept of model',lin_reg.intercept_)

In [None]:
# LinearRegression.score returns the coefficient of determination (R^2) of the
# predictions, not a classification accuracy, so the output is labelled as R^2.
# It is computed on the training data.
score = lin_reg.score(x_train, y_train)
print('R^2 score (train) overall :', score)
print('R^2 score (train) percent :', round(score*100,2))

*It looks like the linear model performed pretty badly.*

In [None]:
from sklearn.ensemble import RandomForestClassifier
# A fixed random_state makes the forest (and therefore the reported score)
# reproducible across Restart & Run All; without it every run differs.
rc = RandomForestClassifier(random_state=42)

In [None]:
# Fit the random forest on the training data, then predict labels for the test features.
rc.fit(x_train,y_train)
predicted_rc = rc.predict(x_test)

In [None]:
from sklearn.model_selection import cross_val_score

# Accuracy on the very rows the forest was trained on. This is an optimistic
# number, so it is labelled as training accuracy.
score = rc.score(x_train, y_train)
print('training accuracy_score overall :', score)
print('training accuracy_score percent :', round(score*100,2))

# 5-fold cross-validation gives an estimate on rows each fold's model has not seen.
cv_scores = cross_val_score(rc, x_train, y_train, cv=5)
print('5-fold cross-validated accuracy percent :', round(cv_scores.mean()*100,2))

*The Random Forest classifier reaches **97.39%** accuracy on the training data (note that this is not a held-out estimate).*

### *Thanks for viewing this far. Kindly share your thoughts and upvote ☺*