<a href="https://www.kaggle.com/code/samithsachidanandan/loan-prediction-analysis-classification?scriptVersionId=231267372" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## Import the necessary libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings 
warnings.filterwarnings("ignore")

## Loading the Dataset 

In [None]:
df_train =pd.read_csv("/kaggle/input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv")
df_train.head()

In [None]:
df_train.shape

In [None]:
df_test  =pd.read_csv("/kaggle/input/loan-prediction-problem-dataset/test_Y3wMUE5_7gLdaTN.csv")
df_test .head()

In [None]:
df_test.shape

In [None]:
df_train.describe()

In [None]:
df_train.info()

## Preprocessing the Dataset

In [None]:
# Find the null values 
df_train.isnull().sum()

In [None]:
def Preprocess(df_train, df_test):
    """Impute, feature-engineer and log-transform the train and test frames.

    Both frames are stacked so they receive the same imputation values, then
    split back apart in their original row order.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows. Must contain the columns referenced below.
    df_test : pandas.DataFrame
        Test rows with the same feature columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``. Both are independent copies (not views of the
        intermediate combined frame); ``test`` has no ``Loan_Status`` column.
    """
    # Remember the split point up front so it never depends on a rebound name.
    n_train = len(df_train)
    df = pd.concat([df_train, df_test], axis=0)

    # Numerical gaps are filled with the column mean of the combined frame.
    for col in ('LoanAmount', 'Loan_Amount_Term', 'Credit_History'):
        df[col] = df[col].fillna(df[col].mean())

    # Categorical gaps are filled with the most frequent value.
    for col in ('Gender', 'Dependents', 'Married', 'Self_Employed'):
        df[col] = df[col].fillna(df[col].mode()[0])

    # Creating a new attribute
    df['Total_Income'] = df['ApplicantIncome'] + df['CoapplicantIncome']

    # source column -> name of its log-transformed counterpart
    log_columns = {
        'ApplicantIncome': 'ApplicantIncomeLog',
        'CoapplicantIncome': 'CoapplicantIncomeLog',
        'LoanAmount': 'LoanAmountLog',
        'Loan_Amount_Term': 'Loan_Amount_TermLog',
        'Total_Income': 'Total_Incomelog',
    }
    for source, target in log_columns.items():
        # log1p(x) == log(1 + x): defined at x == 0 and accurate for small x.
        df[target] = np.log1p(df[source])

    # Copy the slices so later column assignments on the results do not act
    # on views of the combined frame.
    train_part = df.iloc[:n_train].copy()
    # errors='ignore' keeps this working when neither input has a target column.
    test_part = df.iloc[n_train:].drop(columns='Loan_Status', errors='ignore')

    return train_part, test_part


In [None]:
train_df, test_df = Preprocess(df_train, df_test)

In [None]:
train_df.isnull().sum()

## Exploratory Data Analysis

In [None]:
# Categorical attributes visualization

categorical = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Loan_Status']

# One count plot per categorical column of the preprocessed training frame.
for col in categorical:
    sns.countplot(x=col, data=train_df)
    plt.title(f"Chart for {col}")
    # No legend: the bars carry no labels (no hue is used), so there is
    # nothing for a legend to show.
    plt.tight_layout()
    plt.show()

In [None]:
# Numerical attributes visualization

numerical = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

# One histogram per numerical column of the preprocessed training frame.
for col in numerical:
    sns.histplot(x=col, data=train_df)
    plt.title(f"Chart for {col}")
    # No legend: the histogram has no labelled artists (no hue is used), so
    # there is nothing for a legend to show.
    plt.tight_layout()
    plt.show()

## Visualizing the log-transformed attributes

In [None]:

numerical_log = [
    'ApplicantIncomeLog',
    'CoapplicantIncomeLog',
    'LoanAmountLog',
    'Loan_Amount_TermLog',
    'Total_Incomelog',
]

# One histogram with a KDE overlay per log-transformed column.
for col in numerical_log:
    ax = sns.histplot(train_df[col], kde=True)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")

    plt.tight_layout()
    plt.show()

## Correlation matrix

In [None]:
# Pairwise correlations of the numeric columns, drawn as an annotated heatmap.
fig, ax = plt.subplots(figsize=(12, 8))
corr = train_df.corr(numeric_only=True)
sns.heatmap(data=corr, ax=ax, annot=True, cmap='BuPu')

### Drop unnecessary columns

In [None]:
train_df.head()

In [None]:
# Drop the identifier, the raw (un-logged) numeric columns and
# CoapplicantIncomeLog from the training frame.
cols = [
    'Loan_ID',
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Total_Income',
    'CoapplicantIncomeLog',
]
train_df = train_df.drop(columns=cols)
train_df.head()

In [None]:
test_df.head()

In [None]:
# Drop the same columns from the test frame so it keeps the same feature
# layout as the training frame.
cols = [
    'Loan_ID',
    'ApplicantIncome', 'CoapplicantIncome',
    'LoanAmount', 'Loan_Amount_Term',
    'Total_Income',
    'CoapplicantIncomeLog',
]
test_df = test_df.drop(columns=cols)
test_df.head()

In [None]:
 # Placeholder target column so test_df has the same columns as train_df;
 # it is dropped again before predicting on the test data.
 test_df['Loan_Status'] = 0

## Label Encoding 

In [None]:
from sklearn.preprocessing import LabelEncoder
categorical_new = ['Gender', 'Married', 'Education', 'Dependents', 'Self_Employed', 'Property_Area', 'Loan_Status']
for col in categorical_new:
    # A fresh encoder per column, fitted on the training data only, so that a
    # given category always maps to the same integer in both frames.
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    # The test frame's Loan_Status is only a placeholder (constant 0), not a
    # real label, so it is left untouched. For the feature columns,
    # transform() raises ValueError if the test data holds a category that
    # never appeared in training, instead of silently assigning a
    # mismatched code.
    if col != 'Loan_Status':
        test_df[col] = le.transform(test_df[col])

In [None]:
train_df.head()

In [None]:
test_df.head()

## Train-Test Split

In [None]:
# Separate the target column (y) from the remaining input columns (X).
y = train_df['Loan_Status']
X = train_df.drop(columns='Loan_Status')

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)

## Model Training 

In [None]:
# Classify function 
from sklearn.model_selection import cross_val_score
def classify(model, X, y):
    """Fit ``model`` and print its hold-out and cross-validated scores.

    The data is split 75/25 with a fixed seed, the model is fitted on the
    larger part and scored on the rest. The model is then evaluated again
    with 5-fold cross-validation over all of ``X`` and ``y``. Both scores
    are printed as percentages; nothing is returned.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``score``
    X : feature table
    y : target values aligned with ``X``
    """
    features_fit, features_holdout, target_fit, target_holdout = train_test_split(
        X, y, test_size=0.25, random_state=42
    )
    model.fit(features_fit, target_fit)
    holdout_score = model.score(features_holdout, target_holdout) * 100
    print("Accuracy is ", holdout_score)

    # Cross-validation gives a steadier estimate than one split: with cv=5,
    # each round trains on four folds and scores on the remaining one.
    fold_scores = cross_val_score(model, X, y, cv=5)
    print("Cross Validation is", np.mean(fold_scores) * 100)

In [None]:
from sklearn.linear_model import LogisticRegression
logreg  = LogisticRegression()
classify(logreg, X, y)

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtree  = DecisionTreeClassifier()
classify(dtree , X, y)


In [None]:
#Xgboost
from xgboost import XGBClassifier 
xgb = XGBClassifier()
classify(xgb, X, y)

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

rfc = RandomForestClassifier()
classify(rfc, X, y)

In [None]:


etc = ExtraTreesClassifier()
classify(etc, X, y)

### Hyperparameter Tuning

In [None]:
rfc_h = RandomForestClassifier(n_estimators=140, min_samples_split=25, max_depth=7, max_features=1)
classify(rfc_h,X,y)

## Confusion Matrix

In [None]:
model = RandomForestClassifier()
model.fit(X_train,y_train)

In [None]:
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
cm = confusion_matrix (y_test, y_pred)
cm

In [None]:
# fmt='d' prints the cell counts as plain integers; the default annotation
# format ('.2g') would show counts of 100 or more in scientific notation.
sns.heatmap(cm, annot=True, fmt='d')

## Predicting on the Test Dataset

In [None]:
test_df = test_df.drop(columns=['Loan_Status'], axis=1)

In [None]:
y_pred = model.predict(test_df)

In [None]:
df_test['Loan_Status'] = y_pred

In [None]:
df_test.head()

In [None]:
df_test['Loan_Status'] = df_test['Loan_Status'].map({1: 'Y', 0: 'N'})

In [None]:
df_test.head()