## Importing Libraries

In [None]:
import pandas as pd                                                                #used for data manipulation and analysis
import numpy as np                                              #high performance multidimensional array processing package
import matplotlib.pyplot as plt                                                                         #visualisation tool
import seaborn as sns                                                                                   #visualisation tool
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

### Importing the dataset

In [None]:
# Load the training CSV (path is relative to the notebook's working directory)
# into `df` and preview its first rows.
df = pd.read_csv("../input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv")
df.head()

### Understanding the dataset

In [None]:
# Preview the first rows of the dataframe.
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Report the dataset dimensions in a readable sentence.
n_rows, n_cols = df.shape
print(f"The dataset has {n_rows} rows and {n_cols} columns")

In [None]:
# Column names, non-null counts and dtypes.
df.info()

In [None]:
# Summary statistics of the dataframe's columns.
df.describe()

In [None]:
# Pairwise relationship plots for the dataframe, coloured by Loan_Status.
sns.pairplot(df,hue='Loan_Status')

## Exploratory Data Analysis

In [None]:
# Names of all columns stored with the generic 'object' dtype (text columns).
Categorical_features = [
    name for name, dtype in df.dtypes.items() if dtype == 'object'
]
Categorical_features

In [None]:
# Names of all columns whose dtype is anything other than 'object'.
Numerical_features = [
    name for name, dtype in df.dtypes.items() if dtype != 'object'
]
Numerical_features

In [None]:
# Split the numerical columns by cardinality: more than 15 distinct values
# counts as continuous, everything else as discrete.
Continous_features = []
Discrete_features = []
for feature in Numerical_features:
    n_unique = df[feature].nunique()
    print(f'Number of unique values in column {feature} is {n_unique}')
    bucket = Continous_features if n_unique > 15 else Discrete_features
    bucket.append(feature)

In [None]:
#Numerical columns with more than 15 unique values are treated as continuous;
#columns with 15 or fewer unique values are treated as discrete
Continous_features

In [None]:
# Numerical columns that were classified as discrete (15 or fewer unique values).
Discrete_features

In [None]:
# One count plot per categorical column, split by loan status.
# The first categorical column is skipped (presumably the Loan_ID identifier).
for feature in Categorical_features[1:]:
    plt.figure(figsize=(12,6))
    ax = sns.countplot(data=df, x=feature, hue='Loan_Status', palette="rainbow")
    ax.set_title(f"{feature} Countplot")
    plt.tight_layout()
    # The title already names the feature, so the x-axis label is blanked.
    ax.set_xlabel("")
    plt.show()

In [None]:
# One count plot per discrete numerical column, split by loan status.
for feature in Discrete_features:
    plt.figure(figsize=(12,6))
    ax = sns.countplot(data=df, x=feature, hue='Loan_Status', palette="rainbow")
    ax.set_title(f"{feature} Countplot")
    plt.tight_layout()
    # The title already names the feature, so the x-axis label is blanked.
    ax.set_xlabel("")
    plt.show()

In [None]:
# Histogram with a KDE overlay for every continuous column.
for feature in Continous_features:
    plt.figure(figsize=(12,6))
    ax = sns.histplot(df[feature], palette="rainbow", kde=True)
    ax.set_title(f"{feature} Distribution")
    plt.tight_layout()
    plt.show()

In [None]:
# Missing values: print the count and the percentage of nulls for every column.
for feature in df.columns:
    null_mask = df[feature].isnull()
    print(f'No. of missing values in column {feature} is {null_mask.sum()}')
    print(f'NUll value percentage : {round(null_mask.mean()*100,2)} % \n')

In [None]:
#From the report above, every column except Credit_History has less than 5 percent null values,
#so the missing entries can easily be substituted by a suitable measure of central tendency.
#For categorical and discrete columns the mode is suitable,
#while for continuous columns a boxplot is used to decide between mean and median.
#The heatmap below draws the boolean df.isnull() mask (row labels and colour bar hidden).
plt.figure(figsize=(15,15),dpi=150)
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='magma')

In [None]:
# Impute missing values in the categorical and discrete columns with the
# column's most frequent value (mode).
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on df[feature]: that form is chained assignment, which
# operates on a temporary object and leaves `df` unchanged when pandas
# Copy-on-Write is active.
for feature in Categorical_features+Discrete_features:
    df[feature] = df[feature].fillna(df[feature].mode()[0])

In [None]:
# Box plot of every continuous column to inspect its spread and outliers.
for feature in Continous_features:
    plt.figure(figsize=(12,6))
    ax = sns.boxplot(data=df, y=feature)
    ax.set_title(f"{feature}")
    plt.tight_layout()
    plt.show()

In [None]:
#Since the continuous features contain a large number of outliers, their null values are filled with the median

In [None]:
# Impute missing values in the continuous columns with the column median
# (Series.median skips NaN, matching np.nanmedian).
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on df[feature]: that form is chained assignment, which
# operates on a temporary object and leaves `df` unchanged when pandas
# Copy-on-Write is active.
for feature in Continous_features:
    df[feature] = df[feature].fillna(df[feature].median())

In [None]:
# Re-draw the heatmap of the boolean df.isnull() mask to check the result of
# the imputation cells.
plt.figure(figsize=(15,15),dpi=150)
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='magma')

In [None]:
#Understanding the correlation between the numerical features

In [None]:
# Pairwise correlation of the numeric columns only.
# df still contains text columns, and DataFrame.corr() raises on those in
# pandas >= 2.0 (numeric_only defaults to False), so they are filtered out
# explicitly; select_dtypes also works on older pandas versions.
df.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlation matrix of the numeric columns.
# Text columns are filtered out first because DataFrame.corr() raises on them
# in pandas >= 2.0 (numeric_only defaults to False).
plt.figure(figsize=(15,15))
sns.heatmap(df.select_dtypes(include='number').corr(),annot=True,cmap='rainbow')

In [None]:
# Preview the dataframe after the imputation steps.
df.head()

In [None]:
#Feature Engineering

In [None]:
#Create a copy so that feature engineering works on a new dataframe and leaves df untouched
data=df.copy()

In [None]:
# List the column names available for feature engineering.
data.columns

In [None]:
# Combine applicant and co-applicant income into a single feature.
applicant_income = data["ApplicantIncome"]
data["Total_Income"] = applicant_income.add(data["CoapplicantIncome"])

In [None]:
# Remove the identifier, the two income columns (now merged into Total_Income)
# and Dependents from the working dataframe, in place.
columns_to_drop = ['Loan_ID', 'ApplicantIncome', 'CoapplicantIncome', 'Dependents']
data.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Empty dataframe that the following cells fill with the encoded features.
Encoded_data=pd.DataFrame()

In [None]:
# Dummy-encode every low-cardinality column (at most 3 distinct non-null
# values) into Encoded_data; columns with more distinct values are skipped
# here and copied over by later cells.
#
# A column with k categories gets k-1 indicator columns (the last category,
# in order of first appearance, is the implicit baseline):
#   * binary column  -> one indicator that keeps the original column name,
#                       1 where the value equals the first category seen;
#   * 3-category col -> two indicators named "<feature>_<category>".
# Writing every indicator to the same "<feature>" column would keep only the
# last one written and lose the distinction between the other categories.
for feature in data.columns:
    categories = data[feature].dropna().unique()
    if len(categories) > 3:
        continue
    indicator_categories = categories[:-1]
    for category in indicator_categories:
        if len(indicator_categories) == 1:
            column = feature
        else:
            column = f"{feature}_{category}"
        # Compare values directly: comparing against str(category) can never
        # match a numeric column.
        Encoded_data[column] = np.where(data[feature] == category, 1, 0)

In [None]:
# Carry the numeric features over unchanged from the working dataframe.
for column in ("LoanAmount", "Loan_Amount_Term", "Total_Income"):
    Encoded_data[column] = data[column]

In [None]:
# Use the original Credit_History values, replacing any Credit_History column
# already present in Encoded_data.
Encoded_data["Credit_History"]=data["Credit_History"]


In [None]:
# Preview the encoded feature table.
Encoded_data.head()

In [None]:
# Min-max scale the three numeric features to the [0, 1] range on a copy of
# the encoded data: (value - min) / (max - min).
Standard_data = Encoded_data.copy()
for column in ('LoanAmount', 'Loan_Amount_Term', 'Total_Income'):
    lowest = Standard_data[column].min()
    highest = Standard_data[column].max()
    Standard_data[column] = (Standard_data[column] - lowest) / (highest - lowest)
Standard_data.head()

In [None]:
# The data is now scaled. Labels are not provided for the test dataset, so
# performance is checked by splitting the training dataset instead.
# y is the target column; x is every remaining column.
y = Standard_data['Loan_Status']
x = Standard_data.drop(columns=['Loan_Status'])

In [None]:
# Hold out 20% of the rows for validation.
# random_state fixes the shuffle so the reported metrics are reproducible
# between runs; stratify=y keeps the class ratio of Loan_Status the same in
# the train and validation parts.
x_train, x_cv, y_train, y_cv = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Fit a logistic regression classifier with default settings on the training
# split, then predict labels for the held-out split.
loreg = LogisticRegression().fit(x_train, y_train)
y_pred = loreg.predict(x_cv)

In [None]:
# Accuracy on the held-out split (x_cv, y_cv). This is validation accuracy,
# not training accuracy, so the label says so.
print("Validation Accuracy: ", loreg.score(x_cv, y_cv) * 100,' %')

In [None]:
# Classification report comparing the held-out labels with the predictions.
print(classification_report(y_cv,y_pred))

In [None]:
# Confusion matrix of the held-out labels (first argument) against the
# predictions (second argument).
print(confusion_matrix(y_cv,y_pred))

In [None]:
# 5-fold cross validation of the model on the full feature set; report the
# mean fold score as a percentage.
score = cross_val_score(loreg, x, y, cv=5)
mean_score_percent = np.mean(score) * 100
print("Cross validation is", mean_score_percent)