# Loan Prediction Model

In [1]:
# importing libraries
import pandas as pd # for data handling
import numpy as np # for mathematical calculations
import seaborn as sns # for data visualisations
import matplotlib.pyplot as plt # for plotting graphs
%matplotlib inline
import warnings # used below to silence warning output in the notebook
# NOTE(review): "ignore" with no category filter suppresses every warning for the
# whole kernel session, including library deprecation and model-fitting warnings.
# Consider narrowing it to specific categories while developing.
warnings.filterwarnings("ignore")

In [2]:
# Reading the Data
# The CSV path is relative, so it is resolved against the kernel's working directory.
data = pd.read_csv("data_set.csv")

In [3]:
def preprocess(df):
    """
    Fill missing values and add a log-transformed loan amount column.

    Missing entries in Gender, Married, Dependents, Self_Employed,
    Credit_History and Loan_Amount_Term are replaced with the most frequent
    value of the respective column; missing LoanAmount entries are replaced
    with the column median. A LoanAmount_log_transformed column holding the
    natural log of LoanAmount is then added (outlier treatment).

    The work is done on a copy, so the frame passed in is not modified.

    Parameters:
    df (DataFrame): input dataset for training; must contain the seven
        columns named above

    Returns:
    DataFrame: preprocessed copy of the input dataset

    Raises:
    KeyError: if one of the required columns is missing
    """

    # Work on a copy so the caller's frame is never left partially modified.
    df = df.copy()

    # Missing Value Treatment
    # Assign the filled column back instead of calling fillna(inplace=True) on a
    # column selection: that chained in-place form does not reliably write
    # through to the parent frame on recent pandas versions.
    mode_imputed_columns = [
        "Gender",
        "Married",
        "Dependents",
        "Self_Employed",
        "Credit_History",
        "Loan_Amount_Term",
    ]
    for column in mode_imputed_columns:
        most_frequent = df[column].mode()
        # mode() is empty when the column contains nothing but missing values;
        # in that case there is no value to impute with, so leave it as is.
        if not most_frequent.empty:
            df[column] = df[column].fillna(most_frequent.iloc[0])

    df["LoanAmount"] = df["LoanAmount"].fillna(df["LoanAmount"].median())

    # Outlier Treatment
    # NOTE(review): np.log gives -inf for 0 and NaN for negative values; this
    # assumes LoanAmount is strictly positive - confirm against the dataset.
    df["LoanAmount_log_transformed"] = np.log(df["LoanAmount"])

    return df

In [4]:
# Rebind `data` to the frame returned by preprocess() (missing values filled,
# LoanAmount_log_transformed column added).
data = preprocess(data)

In [5]:
def feature_engineering(df):
    """
    Derive income / instalment features and drop the raw columns they replace.

    New columns:
    TotalIncome_log_transformed: natural log of ApplicantIncome + CoapplicantIncome
    EMI: LoanAmount divided by Loan_Amount_Term
    Balance_Income: total income minus EMI * 1000

    ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term and the
    intermediate TotalIncome column are not part of the result.

    The work is done on a copy, so the frame passed in is not modified.

    Parameters:
    df (DataFrame): input dataset; must contain ApplicantIncome,
        CoapplicantIncome, LoanAmount and Loan_Amount_Term

    Returns:
    DataFrame: dataset with new features

    Raises:
    KeyError: if one of the required columns is missing
    """

    # Copy first: adding columns directly would leave the caller's frame with
    # the new columns but without the drops applied below.
    features = df.copy()

    features["TotalIncome"] = features["ApplicantIncome"] + features["CoapplicantIncome"]
    # NOTE(review): a total income of 0 yields -inf here, and a Loan_Amount_Term
    # of 0 yields inf for EMI - assumes both are strictly positive; confirm.
    features["TotalIncome_log_transformed"] = np.log(features["TotalIncome"])
    features["EMI"] = features["LoanAmount"] / features["Loan_Amount_Term"]
    # To make the units equal we multiply with 1000
    # (presumably LoanAmount is recorded in thousands - verify against the data dictionary)
    features["Balance_Income"] = features["TotalIncome"] - features["EMI"] * 1000

    raw_columns = [
        "ApplicantIncome",
        "CoapplicantIncome",
        "LoanAmount",
        "Loan_Amount_Term",
        "TotalIncome",
    ]
    return features.drop(raw_columns, axis=1)

In [6]:
# Rebind `data` to the engineered frame. feature_engineering() drops the raw
# income / loan columns it reads, so this cell cannot be run twice in a row
# without re-running the cells above it first.
data = feature_engineering(data)

In [7]:
def preparing_dataset_for_training(df):
    """
    Encode, split and scale the dataset so it is ready for model training.

    Steps:
    1. drop the Loan_ID identifier and separate the Loan_Status target
    2. one-hot encode the remaining features with pd.get_dummies
    3. split features and target 70/30 into training and test sets (random_state=1)
    4. standardise the features with a scaler fitted on the training rows only,
       so no test-set statistics leak into training

    Parameters:
    df (DataFrame): input dataset; must contain Loan_ID and Loan_Status

    Returns:
    tuple: (x_train, x_test, y_train, y_test, X, y, X1) where
        x_train, x_test: scaled training / test feature arrays
        y_train, y_test: single-column DataFrames with the matching Loan_Status rows
        X: the full feature matrix scaled with the training-set scaler
        y: single-column DataFrame with Loan_Status for every row
        X1: the unscaled one-hot encoded feature DataFrame (keeps the column
            names, later to be used for feature importance labels)

    Raises:
    KeyError: if Loan_ID or Loan_Status is missing
    """
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    df = df.drop("Loan_ID", axis=1)
    y = df[["Loan_Status"]]

    # Converting the Categorical Variables into Numericals
    # axis is passed by keyword: pandas 2.0 removed the positional form drop(label, 1).
    X1 = pd.get_dummies(df.drop("Loan_Status", axis=1))

    # Splitting the Dataset into Training and Test set
    x_train, x_test, y_train, y_test = train_test_split(X1, y, test_size=0.3, random_state=1)

    # Feature Scaling
    # Fit on the training rows only, then apply the same transformation everywhere.
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)
    X = sc.transform(X1)

    return x_train, x_test, y_train, y_test, X, y, X1

In [8]:
# Unpack the seven objects returned by preparing_dataset_for_training():
# the train/test feature and target splits, the scaled full feature matrix X,
# the full target y, and X1 (the one-hot encoded features before scaling).
x_train,x_test,y_train,y_test, X, y, X1 = preparing_dataset_for_training(data)

## Training a Logistic Regression Model

In [9]:
# Training the Logistic Regression model on the Training set
from sklearn.linear_model import LogisticRegression
# The L1 penalty is only implemented by the 'liblinear' and 'saga' solvers; the
# default 'lbfgs' solver of current scikit-learn releases rejects it, so the
# solver is named explicitly.
classifier_lr = LogisticRegression(C=0.1, penalty='l1', solver='liblinear', random_state=0)
# y_train is a single-column frame; flatten it to the 1-D target scikit-learn expects.
classifier_lr.fit(x_train, np.ravel(y_train))

# Predicting the Test set results
y_pred_lr = classifier_lr.predict(x_test)

In [10]:
def model_validation(df, classifier, y_pred, x_train, y_train, param_grid=None, cv=10):
    """
    Report k-fold cross-validation accuracy and run a grid search for the classifier.

    Prints the mean and standard deviation of the cross-validated accuracy,
    followed by the best accuracy and the best parameter combination found by
    GridSearchCV on the training data.

    Parameters:
    df (DataFrame): input dataset; not used here, returned unchanged
    classifier: scikit-learn estimator to evaluate
    y_pred: test-set predictions; not used here, kept for interface compatibility
    x_train: training feature matrix
    y_train: training target (a single-column DataFrame or 1-D array)
    param_grid (dict or list of dict, optional): grid handed to GridSearchCV.
        Defaults to the logistic-regression grid over C and penalty.
    cv (int, optional): number of folds for both cross validation and grid
        search. Defaults to 10.

    Returns:
    DataFrame: the df argument, unchanged
    """
    from sklearn.model_selection import GridSearchCV, cross_val_score

    if param_grid is None:
        # default grid targets logistic regression: l1 lasso, l2 ridge
        param_grid = [{"C": [0.001, 0.01, 0.1, 1, 10, 100], "penalty": ["l1", "l2"]}]

    # scikit-learn expects a 1-D target; flatten a single-column frame.
    target = np.ravel(y_train)

    # Applying k-fold cross validation
    accuracies = cross_val_score(estimator=classifier, X=x_train, y=target, cv=cv)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

    # Using Grid Search to find optimal parameters
    grid_search = GridSearchCV(estimator=classifier,
                               param_grid=param_grid,
                               scoring='accuracy',
                               cv=cv,
                               n_jobs=-1)
    grid_search = grid_search.fit(x_train, target)
    print("Best Accuracy: {:.2f} %".format(grid_search.best_score_*100))
    print("Best Parameters:", grid_search.best_params_)

    return df

In [11]:
# model_validation() prints the cross-validation and grid-search results and
# returns its first argument, so `data` is rebound to the same dataset.
data = model_validation(data, classifier_lr, y_pred_lr, x_train, y_train)

Accuracy: 81.82 %
Standard Deviation: 3.06 %
Best Accuracy: 81.82 %
Best Parameters: {'C': 0.1, 'penalty': 'l1'}


