## **Transformers and Pipline**

### **User-Input Testing on Models**

## **Without Pipline / Transformer**

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.impute import KNNImputer

In [2]:
# Upload dataset to dataframe
df = pd.read_csv('../datasets/data.csv')

In [3]:
# Create 'LoanApproved' column using 'ApprovalDate'
df['LoanApproved'] = df['ApprovalDate'].apply(lambda x: 1 if pd.notnull(x) else 0)
df['LoanApproved'] = df['LoanApproved'].astype(int)

In [4]:
# Make a df with selected columns
df = df[['CoBorrowerTotalMonthlyIncome', 'CoBorrowerAge', 'CoBorrowerYearsInSchool',
         'BorrowerTotalMonthlyIncome', 'BorrowerAge', 'DTI', 'CLTV', 'CreditScore',
         'TotalLoanAmount', 'LoanApproved', 'LeadSourceGroup','Group','LoanPurpose']]

In [None]:
# Print rows where NaN
df[df.isna().any(axis=1)]

In [None]:
df.isna().sum()

In [5]:
# Replace NaN in DTI with KNNImputers
knn_imputer = KNNImputer(n_neighbors=5)

In [6]:
df['DTI'] = knn_imputer.fit_transform(df[['DTI']])

In [7]:
# Convert numerical columns with decimal points to float64
numerical_cols = ['CoBorrowerTotalMonthlyIncome', 'CoBorrowerAge', 'CoBorrowerYearsInSchool',
                  'BorrowerTotalMonthlyIncome', 'BorrowerAge', 'DTI', 'CLTV', 'CreditScore',
                  'TotalLoanAmount', 'LoanApproved']
for col in numerical_cols:
    df[col] = df[col].astype('float64')

In [8]:
# Categorical Columns to Encode
categorical_cols = ['LeadSourceGroup','Group','LoanPurpose']

In [9]:
# Adding key steps for the feature work to process the Categorical_columns
# Step 1: Remove numeric columns from categorical columns list (do not apply one-hot encoding on them)
categorical_cols = [col for col in categorical_cols if col not in numerical_cols]

In [13]:
# Step 2: Apply One-Hot Encoding for the categorical columns (exclude the numeric columns)
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [14]:
df_encoded.columns = df_encoded.columns.str.replace(' ', '_')

In [51]:
# # Step 3: Separate the numeric columns and the encoded columns
# encoded_columns = df_encoded.columns
# numeric_columns = [col for col in df_encoded.select_dtypes(include=[np.number]).columns if col not in categorical_cols]

In [15]:
df_encoded

Unnamed: 0,CoBorrowerTotalMonthlyIncome,CoBorrowerAge,CoBorrowerYearsInSchool,BorrowerTotalMonthlyIncome,BorrowerAge,DTI,CLTV,CreditScore,TotalLoanAmount,LoanApproved,...,Group_Purchase_Team_-_4105,Group_Purchase_Team_-_4112,Group_Refinance_Team_-_4101,Group_Refinance_Team_-_4102,Group_Refinance_Team_-_4103,Group_Reverse_Team_-_4340,LoanPurpose_Purchase__________________________________________,LoanPurpose_Refinance_________________________________________,LoanPurpose_Refinance_Cash-out________________________________,LoanPurpose_VA_IRRRL__________________________________________
0,0.0,0.0,0.0,6083.00,36.0,33.0,60.20,705.0,206500.0,0.0,...,False,False,False,False,True,False,False,True,False,False
1,0.0,55.0,12.0,8883.71,52.0,33.0,54.05,781.0,100000.0,1.0,...,False,False,False,False,True,False,False,False,True,False
2,0.0,0.0,0.0,17538.30,60.0,35.0,80.00,649.0,260000.0,1.0,...,False,False,False,False,True,False,False,False,True,False
3,0.0,0.0,0.0,13299.09,54.0,33.0,72.96,792.0,440000.0,1.0,...,False,False,False,True,False,False,False,False,True,False
4,0.0,0.0,0.0,6254.91,49.0,41.0,72.44,689.0,163000.0,1.0,...,False,False,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13446,1250.0,74.0,12.0,3200.00,76.0,39.0,65.71,806.0,230000.0,0.0,...,False,False,False,False,True,False,False,True,False,False
13447,0.0,0.0,0.0,26050.00,55.0,17.0,75.00,797.0,510000.0,0.0,...,False,False,True,False,False,False,False,False,True,False
13448,0.0,34.0,16.0,5583.33,36.0,31.0,68.70,783.0,213000.0,0.0,...,False,False,False,True,False,False,False,False,True,False
13449,5850.0,61.0,16.0,11250.00,61.0,30.0,73.85,686.0,510350.0,0.0,...,False,False,True,False,False,False,False,False,True,False


In [16]:
# Ensure there are no duplicates by checking column names
if df_encoded.columns.duplicated().any():
    print("Warning: There are duplicate column names after encoding!")
    print(df_encoded.columns[df_encoded.columns.duplicated()])

In [54]:
df_encoded.dtypes

CoBorrowerTotalMonthlyIncome                                      float64
CoBorrowerAge                                                     float64
CoBorrowerYearsInSchool                                           float64
BorrowerTotalMonthlyIncome                                        float64
BorrowerAge                                                       float64
DTI                                                               float64
CLTV                                                              float64
CreditScore                                                       float64
TotalLoanAmount                                                   float64
LoanApproved                                                      float64
LeadSourceGroup_Direct_Mail                                          bool
LeadSourceGroup_Internet                                             bool
LeadSourceGroup_Radio                                                bool
LeadSourceGroup_Referral              

In [17]:
# Check for missing values in the DataFrame
missing_values = df_encoded.isna().sum()

# Print the columns with missing values if any
if missing_values.any():
    print(missing_values[missing_values > 0])
else:
    print("No missing values in the DataFrame.")

No missing values in the DataFrame.


In [56]:
# column = df.get('LeadSourceGroup_Direct_Mail', None)  # Returns None if column doesn't exist
# if column is not None:
#     column.astype(int)
# else: print('none')


In [18]:
# Convert only boolean columns to integers
for col in df_encoded.select_dtypes(include='bool').columns:
    if col in df_encoded.columns:  # Check if the column exists
        df_encoded[col] = df_encoded[col].astype(int)


In [58]:
pd.set_option('display.max_columns', None)
df_encoded

Unnamed: 0,CoBorrowerTotalMonthlyIncome,CoBorrowerAge,CoBorrowerYearsInSchool,BorrowerTotalMonthlyIncome,BorrowerAge,DTI,CLTV,CreditScore,TotalLoanAmount,LoanApproved,LeadSourceGroup_Direct_Mail,LeadSourceGroup_Internet,LeadSourceGroup_Radio,LeadSourceGroup_Referral,LeadSourceGroup_Repeat_Client,LeadSourceGroup_Self_Sourced,LeadSourceGroup_Social_Media,LeadSourceGroup_TV,LeadSourceGroup_Trigger,Group_Loan_Coordinator,Group_Purchase_Team_-_4105,Group_Purchase_Team_-_4112,Group_Refinance_Team_-_4101,Group_Refinance_Team_-_4102,Group_Refinance_Team_-_4103,Group_Reverse_Team_-_4340,LoanPurpose_Purchase__________________________________________,LoanPurpose_Refinance_________________________________________,LoanPurpose_Refinance_Cash-out________________________________,LoanPurpose_VA_IRRRL__________________________________________
0,0.0,0.0,0.0,6083.00,36.0,33.0,60.20,705.0,206500.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
1,0.0,55.0,12.0,8883.71,52.0,33.0,54.05,781.0,100000.0,1.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0
2,0.0,0.0,0.0,17538.30,60.0,35.0,80.00,649.0,260000.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
3,0.0,0.0,0.0,13299.09,54.0,33.0,72.96,792.0,440000.0,1.0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0
4,0.0,0.0,0.0,6254.91,49.0,41.0,72.44,689.0,163000.0,1.0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13446,1250.0,74.0,12.0,3200.00,76.0,39.0,65.71,806.0,230000.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
13447,0.0,0.0,0.0,26050.00,55.0,17.0,75.00,797.0,510000.0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
13448,0.0,34.0,16.0,5583.33,36.0,31.0,68.70,783.0,213000.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
13449,5850.0,61.0,16.0,11250.00,61.0,30.0,73.85,686.0,510350.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0


In [19]:
# IQR Method for Outlier Detection
def iqr_outliers(df):
    outliers_iqr = {}
    for column in df.select_dtypes(include=[np.number]).columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = df[column][(df[column] < lower_bound) | (df[column] > upper_bound)]
        if not outliers.empty:
            outliers_iqr[column] = outliers.index.tolist()
    return outliers_iqr

In [20]:
# Imputation Function for IQR Outliers
def impute_outliers_iqr(df, method='median'):
    outliers = iqr_outliers(df)
    for column, outlier_indices in outliers.items():
        if method == 'median':
            impute_value = df[column].median()
        elif method == 'mean':
            impute_value = df[column].mean()
        else:
            raise ValueError("Invalid method. Choose 'median' or 'mean'.")
        df.loc[outlier_indices, column] = impute_value
    return df

In [21]:
# Detect and impute outliers with the median
df_imputed = impute_outliers_iqr(df_encoded, method='median')

In [None]:
# Standardize the numerical features (after encoding categorical variables)
features_to_scale = df_imputed.drop(columns='LoanApproved')

In [23]:
# Initialize the StandardScler
scaler = StandardScaler()
df_imputed[features_to_scale.columns] = scaler.fit_transform(features_to_scale)

In [24]:
# Split data into features (X) and target (Y)
X = df_imputed.drop(columns='LoanApproved')
Y = df_imputed['LoanApproved']

In [63]:
# Train-Test Split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [66]:
# from sklearn.model_selection import GridSearchCV
# # Logistic Regression Model
# model = LogisticRegression(max_iter=1000)
# # Define the parameter grid
# param_grid = {
#     'C': [0.1, 1, 10, 100],                # Regularization strength
#     'penalty': ['l2'],                     # Regularization type (L2 is most common)
#     'solver': ['liblinear', 'saga']    # Optimization algorithms (liblinear is good for small datasets, saga is more scalable)
#     # 'multi_class': ['ovr', 'multinomial']  # Multi-class strategies (one-vs-rest or multinomial)
# }
# # Create the GridSearchCV object
# grid_search = GridSearchCV(model, param_grid, cv=10, scoring='accuracy')
# # Fit the model with the best hyperparameters
# grid_search.fit(X_train, y_train)
# # Get the best estimator
# model = grid_search.best_estimator_

In [None]:
# Logistic Regression Model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
# Predict and evaluate
y_pred = model.predict(X_test)
# Calcualte accuracy and precision
lr_accuracy = accuracy_score(y_test, y_pred)
lr_precision = precision_score(y_test, y_pred)
lr_accuracy, lr_precision

In [None]:
# Random Forest Model
model = RandomForestClassifier()
model.fit(X_train, y_train)
# Predict and evaluate
y_pred = model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
rf_precision = precision_score(y_test, y_pred)
rf_accuracy, rf_precision

### **User-Input Testing on Logestic Regressions**

In [70]:
def get_user_input():
    CoBorrowerTotalMonthlyIncome = float(input("Enter CoBorrowerTotalMonthlyIncome: "))
    CoBorrowerAge = float(input("Enter CoBorrowerAge: "))
    CoBorrowerYearsInSchool = float(input("Enter CoBorrowerYearsInSchool: "))
    BorrowerTotalMonthlyIncome = float(input("Enter BorrowerTotalMonthlyIncome: "))
    BorrowerAge = float(input("Enter BorrowerAge: "))
    DTI = float(input("Enter DTI (Debt-to-Income Ratio): "))
    CLTV = float(input("Enter CLTV (Loan-to-Value Ratio): "))
    CreditScore = float(input("Enter CreditScore: "))
    TotalLoanAmount = float(input("Enter TotalLoanAmount: "))
    LeadSourceGroup = input("Enter LeadSourceGroup (Direct_Mail, Internet, Mobile, Other, Phone): ")
    Group = input("Enter Group (1-9): ")
    LoanPurpose = input("Enter LoanPurpose (1-4): ")
    return {
        "CoBorrowerTotalMonthlyIncome": CoBorrowerTotalMonthlyIncome,
        "CoBorrowerAge": CoBorrowerAge,
        "CoBorrowerYearsInSchool": CoBorrowerYearsInSchool,
        "BorrowerTotalMonthlyIncome": BorrowerTotalMonthlyIncome,
        "BorrowerAge": BorrowerAge,
        "DTI": DTI,
        "CLTV": CLTV,
        "CreditScore": CreditScore,
        "TotalLoanAmount": TotalLoanAmount,
        "LeadSourceGroup": LeadSourceGroup,
        "Group": Group,
        "LoanPurpose": LoanPurpose
    }

In [71]:
user_input = get_user_input()

In [72]:
user_input

{'CoBorrowerTotalMonthlyIncome': 4800.0,
 'CoBorrowerAge': 34.0,
 'CoBorrowerYearsInSchool': 16.0,
 'BorrowerTotalMonthlyIncome': 1090.0,
 'BorrowerAge': 0.0,
 'DTI': 34.0,
 'CLTV': 56.0,
 'CreditScore': 476.0,
 'TotalLoanAmount': 800000.0,
 'LeadSourceGroup': 'Phone',
 'Group': '7',
 'LoanPurpose': '2'}

In [73]:
user_input_df = pd.DataFrame([user_input])
user_input_encoded = pd.get_dummies(user_input_df, columns=["LeadSourceGroup", "Group", "LoanPurpose"], drop_first=True)
user_input_encoded = user_input_encoded.reindex(columns=X.columns, fill_value=0)
user_input_scaled = scaler.transform(user_input_encoded)

In [74]:
user_input_df

Unnamed: 0,CoBorrowerTotalMonthlyIncome,CoBorrowerAge,CoBorrowerYearsInSchool,BorrowerTotalMonthlyIncome,BorrowerAge,DTI,CLTV,CreditScore,TotalLoanAmount,LeadSourceGroup,Group,LoanPurpose
0,4800.0,34.0,16.0,1090.0,0.0,34.0,56.0,476.0,800000.0,Phone,7,2


In [75]:
user_input_encoded

Unnamed: 0,CoBorrowerTotalMonthlyIncome,CoBorrowerAge,CoBorrowerYearsInSchool,BorrowerTotalMonthlyIncome,BorrowerAge,DTI,CLTV,CreditScore,TotalLoanAmount,LeadSourceGroup_Direct_Mail,LeadSourceGroup_Internet,LeadSourceGroup_Radio,LeadSourceGroup_Referral,LeadSourceGroup_Repeat_Client,LeadSourceGroup_Self_Sourced,LeadSourceGroup_Social_Media,LeadSourceGroup_TV,LeadSourceGroup_Trigger,Group_Loan_Coordinator,Group_Purchase_Team_-_4105,Group_Purchase_Team_-_4112,Group_Refinance_Team_-_4101,Group_Refinance_Team_-_4102,Group_Refinance_Team_-_4103,Group_Reverse_Team_-_4340,LoanPurpose_Purchase__________________________________________,LoanPurpose_Refinance_________________________________________,LoanPurpose_Refinance_Cash-out________________________________,LoanPurpose_VA_IRRRL__________________________________________
0,4800.0,34.0,16.0,1090.0,0.0,34.0,56.0,476.0,800000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [76]:
prediction = model.predict(user_input_scaled)
prediction



array([1.])

In [77]:
if prediction == 1:
    print("Loan Approved!")
else:
    print("Loan Denied!")

Loan Approved!


In [37]:
# Predict and evaluate
y_pred = model.predict(X_test)

In [38]:
# Calcualte accuracy and precision
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7194351542177629

In [39]:
precision = precision_score(y_test, y_pred)
precision

np.float64(0.7401606425702811)

In [40]:
# Evaluate the DecisionTree Model
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [41]:
# Predict and evaluate
y_pred = model.predict(X_test)

In [42]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.6191007060572278

In [43]:
precision = precision_score(y_test, y_pred)
precision

np.float64(0.7465388711395101)