In [92]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.tree import DecisionTreeClassifier
import joblib

In [93]:
df = pd.read_csv('Dataset/data.csv')

In [94]:
df['LoanApproved'] = df['ApprovalDate'].apply(lambda x: 1 if pd.notnull(x) else 0)
df['LoanApproved'] = df['LoanApproved'].astype(int)

In [95]:
# # Custom transformer to create 'LoanApproved' column based on 'ApprovalDate'
# class CreateTargetCol(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self
    
#     def transform(self, X):
#         # Create the 'LoanApproved' column (1 if date exists, 0 if null)
#         X['LoanApproved'] = X['ApprovalDate'].apply(lambda x: 1 if pd.notnull(x) else 0)
#         X['LoanApproved'] = X['LoanApproved'].astype(int)
        
#         return X
        

In [96]:
numerical_cols = ['CoBorrowerTotalMonthlyIncome', 'CoBorrowerAge', 'CoBorrowerYearsInSchool',
                  'BorrowerTotalMonthlyIncome', 'BorrowerAge', 'DTI', 'CLTV', 'CreditScore',
                  'TotalLoanAmount', 'LoanApproved']
categorical_cols = ['LeadSourceGroup','Group','LoanPurpose']


df = df[['CoBorrowerTotalMonthlyIncome', 'CoBorrowerAge', 'CoBorrowerYearsInSchool','BorrowerTotalMonthlyIncome', 'BorrowerAge', 'DTI', 'CLTV', 'CreditScore', 'TotalLoanAmount', 'LoanApproved', 'LeadSourceGroup','Group','LoanPurpose']]

In [97]:
# df['Group'] = df['Group'].apply(lambda x : x.strip() if x is not np.nan else x)
# df['LeadSourceGroup'] = df['LeadSourceGroup'].apply(lambda x : x.strip() if x is not np.nan else x)
df['LoanPurpose'] = df['LoanPurpose'].apply(lambda x : x.strip() if x is not np.nan else x)

In [98]:
# class SelectedFeatureDF(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self
    
#     def transform(self, X):
#         X = X[['CoBorrowerTotalMonthlyIncome', 'CoBorrowerAge', 'CoBorrowerYearsInSchool',
#          'BorrowerTotalMonthlyIncome', 'BorrowerAge', 'DTI', 'CLTV', 'CreditScore',
#          'TotalLoanAmount', 'LoanApproved', 'LeadSourceGroup','Group','LoanPurpose']]
#         numerical_cols = ['CoBorrowerTotalMonthlyIncome', 'CoBorrowerAge', 'CoBorrowerYearsInSchool',
#                   'BorrowerTotalMonthlyIncome', 'BorrowerAge', 'DTI', 'CLTV', 'CreditScore',
#                   'TotalLoanAmount', 'LoanApproved']
#         categorical_cols = ['LeadSourceGroup','Group','LoanPurpose']
        
#         return X

In [99]:
# Custom transformer for KNN imputation
class KNNImputerCustom(BaseEstimator, TransformerMixin):
    def __init__(self, columns, n_neighbors=5):
        self.columns = columns
        self.knn_imputer = KNNImputer(n_neighbors=n_neighbors)
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X[self.columns] = self.knn_imputer.fit_transform(X[self.columns])
        return X

In [100]:
# Custom transformer for OneHotEncoding
class OneHotEncoderCustom(BaseEstimator, TransformerMixin):
    def __init__(self, categorical_cols):
        self.categorical_cols = categorical_cols
        self.ohe = OneHotEncoder(drop='first', sparse_output=False)  # Changed sparse to sparse_output
    
    def fit(self, X, y=None):
        self.ohe.fit(X[self.categorical_cols])
        return self
    
    def transform(self, X):
        encoded_cols = pd.DataFrame(self.ohe.transform(X[self.categorical_cols]), columns=self.ohe.get_feature_names_out(self.categorical_cols))
        X = X.drop(columns=self.categorical_cols)
        X = pd.concat([X, encoded_cols], axis=1)
        return X

In [101]:
# Custom transformer for Outlier Imputation
class OutlierImputer(BaseEstimator, TransformerMixin):
    def __init__(self, method='median'):
        self.method = method

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        for col in X.select_dtypes(include=[np.number]).columns:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            outliers = X[(X[col] < lower_bound) | (X[col] > upper_bound)].index
            if self.method == 'median':
                X.loc[outliers, col] = X[col].median()
            elif self.method == 'mean':
                X.loc[outliers, col] = X[col].mean()
        return X

In [102]:
class StandardScalerWithExclusion(BaseEstimator, TransformerMixin):
    def __init__(self, exclude_column='LoanApproved'):
        """
        Initialize the StandardScalerWithExclusion transformer.
        This will apply StandardScaler to all numeric columns except for the column to be excluded.
        """
        self.exclude_column = exclude_column
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        columns_to_scale = X.select_dtypes(include=['number']).drop(columns=[self.exclude_column], errors='ignore').columns
        self.scaler.fit(X[columns_to_scale])
        
        return self

    def transform(self, X):
        columns_to_scale = X.select_dtypes(include=['number']).drop(columns=[self.exclude_column], errors='ignore').columns
        X[columns_to_scale] = self.scaler.transform(X[columns_to_scale])
        
        return X


In [103]:
# Define pipeline steps
pipeline_steps = [
    # ('create_target_col', CreateTargetCol()),
    # ('SelectedFeatureDF', SelectedFeatureDF()),
    ('knn_imputer', KNNImputerCustom(columns=['DTI'])),
    ('ohe', OneHotEncoderCustom(categorical_cols=categorical_cols)),
    ('outlier_imputer', OutlierImputer(method='median')),
    ('scaler', StandardScalerWithExclusion(exclude_column='LoanApproved'))

]

In [104]:
# Build the pipeline
full_pipeline = Pipeline(steps=pipeline_steps)

# Apply the pipeline to the dataset (Fit and transform in one step)
df_transformed = full_pipeline.fit_transform(df)
# Save the fitted pipeline to a file using joblib
joblib.dump(full_pipeline, 'Preprocessing_Pipeline.pkl')

df_transformed

Unnamed: 0,CoBorrowerTotalMonthlyIncome,CoBorrowerAge,CoBorrowerYearsInSchool,BorrowerTotalMonthlyIncome,BorrowerAge,DTI,CLTV,CreditScore,TotalLoanAmount,LoanApproved,LeadSourceGroup_Direct Mail,LeadSourceGroup_Internet,LeadSourceGroup_Radio,LeadSourceGroup_Referral,LeadSourceGroup_Repeat Client,LeadSourceGroup_Self Sourced,LeadSourceGroup_Social Media,LeadSourceGroup_TV,LeadSourceGroup_Trigger,LeadSourceGroup_nan,Group_Loan Coordinator,Group_Purchase Team - 4105,Group_Purchase Team - 4112,Group_Refinance Team - 4101,Group_Refinance Team - 4102,Group_Refinance Team - 4103,Group_Reverse Team - 4340,Group_nan,LoanPurpose_Purchase,LoanPurpose_Refinance,LoanPurpose_Refinance Cash-out,LoanPurpose_VA IRRRL
0,-0.552165,-0.907823,-0.913737,-0.372266,-1.195128,-0.152088,-0.702158,-0.509303,-0.379073,0,0.0,-1.002009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.625993,1.718339,0.0,0.0,0.0,1.252576,-0.963996,0.0
1,-0.552165,1.199373,0.629752,0.332363,0.156102,-0.152088,-1.107986,0.897318,-1.360396,1,0.0,-1.002009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.625993,1.718339,0.0,0.0,0.0,-0.798355,1.037349,0.0
2,-0.552165,-0.907823,-0.913737,2.509766,0.831717,0.051889,0.604410,-1.545760,0.113892,1,0.0,-1.002009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.625993,1.718339,0.0,0.0,0.0,-0.798355,1.037349,0.0
3,-0.552165,-0.907823,-0.913737,1.443226,0.325006,-0.152088,0.139853,1.100908,1.772467,1,0.0,-1.002009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.597463,-0.581957,0.0,0.0,0.0,-0.798355,1.037349,0.0
4,-0.552165,-0.907823,-0.913737,-0.329015,-0.097254,0.663821,0.105539,-0.805434,-0.779895,1,0.0,-1.002009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.597463,-0.581957,0.0,0.0,0.0,-0.798355,1.037349,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13446,0.034113,1.927313,0.629752,-1.097598,2.182948,0.459843,-0.338562,1.360022,-0.162537,0,0.0,0.997995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.625993,1.718339,0.0,0.0,0.0,1.252576,-0.963996,0.0
13447,-0.552165,-0.907823,-0.913737,-0.089228,0.409458,-1.783906,0.274469,1.193449,2.417468,0,0.0,-1.002009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.625993,-0.581957,0.0,0.0,0.0,-0.798355,1.037349,0.0
13448,-0.552165,0.394807,1.144248,-0.497977,-1.195128,-0.356065,-0.141257,0.934334,-0.319180,0,0.0,0.997995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.597463,-0.581957,0.0,0.0,0.0,-0.798355,1.037349,0.0
13449,2.191613,1.429249,1.144248,0.927696,0.916169,-0.458054,0.198582,-0.860958,2.420693,0,0.0,0.997995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.625993,-0.581957,0.0,0.0,0.0,-0.798355,1.037349,0.0


In [105]:
df_transformed = pd.DataFrame(df_transformed)

# Split data into features (X) and target (Y)
X = df_transformed.drop(columns='LoanApproved')
Y = df_transformed['LoanApproved']

# Train-Test Split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Logistic Regression Model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# dump the model
joblib.dump(model, 'Application_Approval_Model.pkl')
# Predict and evaluate
y_pred = model.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred)
lr_precision = precision_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}, Precision: {lr_precision:.4f}") 

Logistic Regression Accuracy: 0.7265, Precision: 0.7280


In [106]:
# # Random Forest Model
# model = RandomForestClassifier()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# rf_accuracy = accuracy_score(y_test, y_pred)
# rf_precision = precision_score(y_test, y_pred)
# print(f"Random Forest Accuracy: {rf_accuracy:.4f}, Precision: {rf_precision:.4f}")

In [111]:
# df = pd.read_csv('Dataset/data.csv')

In [110]:
# df['LeadSourceGroup'].unique()

array([nan, 'TV', 'Self Sourced', 'Internet', 'Radio', 'Referral',
       'Repeat Client', 'Direct Mail', '3rd Party', 'Social Media',
       'Trigger'], dtype=object)

In [107]:
# Function for user input (same as before)
def get_user_input():
    CoBorrowerTotalMonthlyIncome = float(input("Enter Co-Borrower Total Monthly Income: "))
    CoBorrowerAge = float(input("Enter Co-Borrower Age: "))
    CoBorrowerYearsInSchool = float(input("Enter Co-Borrower Years In School: "))
    BorrowerTotalMonthlyIncome = float(input("Enter Borrower Total Monthly Income: "))
    BorrowerAge = float(input("Enter Borrower Age: "))
    DTI = float(input("Enter DTI (Debt-to-Income Ratio Range ): "))
    CLTV = float(input("Enter CLTV (Loan-to-Value Ratio): "))
    CreditScore = float(input("Enter Credit Score: "))
    TotalLoanAmount = float(input("Enter Total Loan Amount: "))
    LeadSourceGroup = input("Enter Lead Source Group (TV, Self Sourced, Internet, Radio, Referral, Repeat Client, Direct Mail, 3rd Party, Social Media): ")
    Group = input("Enter Group (Admin, Loan Coordinator, Refinance Team - #number): ")
    LoanPurpose = input("Enter Loan Purpose (Purchase, VA IRRRL, Refinance Cash-out, FHA Streamlined Refinance): ")
    return {
        "CoBorrowerTotalMonthlyIncome": CoBorrowerTotalMonthlyIncome,
        "CoBorrowerAge": CoBorrowerAge,
        "CoBorrowerYearsInSchool": CoBorrowerYearsInSchool,
        "BorrowerTotalMonthlyIncome": BorrowerTotalMonthlyIncome,
        "BorrowerAge": BorrowerAge,
        "DTI": DTI,
        "CLTV": CLTV,
        "CreditScore": CreditScore,
        "TotalLoanAmount": TotalLoanAmount,
        "LeadSourceGroup": LeadSourceGroup,
        "Group": Group,
        "LoanPurpose": LoanPurpose
    }


In [108]:
user_input = get_user_input()
user_input_df = pd.DataFrame([user_input])

ValueError: could not convert string to float: ''

In [78]:
# user_input_pipeline_steps = [
#     ('ohe', OneHotEncoderCustom(categorical_cols=categorical_cols)),  # One-hot encode categorical columns
#     ('scaler', StandardScalerWithExclusion(exclude_column='LoanApproved'))  # Scale the data, excluding 'LoanApproved'
# ]

# # Create the user input pipeline
# user_input_pipeline = Pipeline(steps=user_input_pipeline_steps)

In [79]:
user_input_transformed = full_pipeline.transform(user_input_df)
user_input_transformed

Unnamed: 0,CoBorrowerTotalMonthlyIncome,CoBorrowerAge,CoBorrowerYearsInSchool,BorrowerTotalMonthlyIncome,BorrowerAge,DTI,CLTV,CreditScore,TotalLoanAmount,LeadSourceGroup_Direct Mail,LeadSourceGroup_Internet,LeadSourceGroup_Radio,LeadSourceGroup_Referral,LeadSourceGroup_Repeat Client,LeadSourceGroup_Self Sourced,LeadSourceGroup_Social Media,LeadSourceGroup_TV,LeadSourceGroup_Trigger,LeadSourceGroup_nan,Group_Loan Coordinator,Group_Purchase Team - 4105,Group_Purchase Team - 4112,Group_Refinance Team - 4101,Group_Refinance Team - 4102,Group_Refinance Team - 4103,Group_Reverse Team - 4340,Group_nan,LoanPurpose_Purchase,LoanPurpose_Refinance,LoanPurpose_Refinance Cash-out,LoanPurpose_VA IRRRL
0,1.057987,-0.026632,0.887,-1.902684,-2.884167,1.581718,-0.055473,-13.557562,16.146778,0.0,0.997995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.625993,-0.581957,0.0,0.0,1.0,-0.798355,-0.963996,0.0


In [80]:
# Load the save model pkl_file
loaded_model = joblib.load('Application_Approval_Model.pkl')

# Use the trained model to make predictions
prediction = loaded_model.predict(user_input_transformed)
prediction


array([0])

In [81]:
if prediction == 1:
    print("Loan Approved!")
else:
    print("Loan Denied!")

Loan Denied!
