# Script Assignment 2 - 
## Zach Novak, Marco Bogani, Ivan Lima, Daman Sawhney and Sulaiman Karmali. 


In [27]:
# Import the relevant libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.discriminant_analysis import StandardScaler
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier


### 1. Pre-processing

In [28]:
# Read CSV into a DataFrame
df = pd.read_csv(r'C:\Users\Setup User\University of Central Florida\CAP5619_GRP - General\Script Assigments\Script_Assign_2\LoanData.csv')

# Selecting the relevant columns
df = df.loc[:, ['income', 'age', 'Credit_Score', 'dtir1', 'loan_amount','Status']]

# df preview
print("\nDataFrame head preview...\n",df.head())

# data types before processing
print("\nData types before processing...\n",df.dtypes)

# Remove NA values
df = df.dropna()


# before data type can be converted to int, we need to fill in the missing values
df['income'] = df['income'].astype(int)
df['Credit_Score'] = df['Credit_Score'].astype(int)
df['dtir1'] = df['dtir1'].astype(int)
df['loan_amount'] = df['loan_amount'].astype(int)
df['Status'] = df['Status'].astype(int)
df['age'] = df['age'].astype(str)

# Function to reformat age column
def process_age_range(value):
    if '-' in value:
        age_range = value.split('-')
        return age_range[1]
    elif value.startswith('>'):
        return value[1:]
    elif value.startswith('<'):
        return value[1:]
    else:
        return value
df['age'] = df['age'].apply(process_age_range)

# Set age to int data type
df['age'] = df['age'].fillna(0).astype(int)



df_with_nan = df.copy()



DataFrame head preview...
     income    age  Credit_Score  dtir1  loan_amount  Status
0   1740.0  25-34           758   45.0       116500       1
1   4980.0  55-64           552    NaN       206500       1
2   9480.0  35-44           834   46.0       406500       0
3  11880.0  45-54           587   42.0       456500       0
4  10440.0  25-34           602   39.0       696500       0

Data types before processing...
 income          float64
age              object
Credit_Score      int64
dtir1           float64
loan_amount       int64
Status            int64
dtype: object


* Above, it can be observed that the columns chosen for this study are different data types (float64, object, int64). Code is implemented to convert all columns to an integer type as required by the Gradient Descent. For the age column, first the Process_age_function is defined to remove the value range format as seen in the DataFrame head preview per the above cell's output.

In [29]:
# DataFrame after preprocessing

print("\nData types post processing...\n",df.dtypes)
print("\nDataFrame head preview...\n",df.head(3))
print("\nDataFrame tail preview...\n",df.tail(3))
print("\nDataFrame description...\n",df.describe().round(2))





Data types post processing...
 income          int32
age             int32
Credit_Score    int32
dtir1           int32
loan_amount     int32
Status          int32
dtype: object

DataFrame head preview...
    income  age  Credit_Score  dtir1  loan_amount  Status
0    1740   34           758     45       116500       1
2    9480   44           834     46       406500       0
3   11880   54           587     42       456500       0

DataFrame tail preview...
         income  age  Credit_Score  dtir1  loan_amount  Status
148667    6900   54           702     49       446500       0
148668    7140   64           737     29       196500       0
148669    7260   54           830     44       406500       0

DataFrame description...
           income        age  Credit_Score      dtir1  loan_amount     Status
count  124439.00  124439.00     124439.00  124439.00    124439.00  124439.00
mean     7013.19      54.85        699.80      37.74    328865.90       0.16
std      6508.56      13.26     

* The outcome of the preprocessing can be viewed in the above cell's output. All columns are now an integer data type, the DataFrame head preview shows the data is formatted, and the DataFrame description is able to now give meaningful analysis for the dataset. However, there is one item of concern. The min values for income and age are 0 which signifies incomplete data for those rows in the DataFrame. 

In [30]:
# set columns to replace 0 values to NaN, then have the NaN values removed

df['age'] = df['age'].replace(0, np.nan)
df['income'] = df['income'].replace(0, np.nan)

# remove NaN values from the DataFrame
df = df.dropna()

# DataFrame statistics after removing NaN values
print("\nDataFrame description...\n",df.describe().round(2))


print("\n\nDataFrame WITH NaN values row count...\n", len(df_with_nan))
print("\nDataFrame WITHOUT NaN values row count...\n", len(df))


DataFrame description...
           income        age  Credit_Score      dtir1  loan_amount     Status
count  124091.00  124091.00     124091.00  124091.00    124091.00  124091.00
mean     7032.86      54.85        699.80      37.81    328905.25       0.16
std      6507.06      13.26        115.83      10.47    182183.98       0.37
min        60.00      25.00        500.00       5.00     16500.00       0.00
25%      3780.00      44.00        600.00      31.00    196500.00       0.00
50%      5820.00      54.00        699.00      39.00    296500.00       0.00
75%      8580.00      64.00        800.00      45.00    436500.00       0.00
max    578580.00      74.00        900.00      61.00   3576500.00       1.00


DataFrame WITH NaN values row count...
 124439

DataFrame WITHOUT NaN values row count...
 124091


* With the 0 values removed, the dataset description shows a more meaningful statistical overview. 
* It can be confirmed by the before and after DataFrame len() change. 10,410 rows contained 0 or NaN which were dropped from the DataFrame.

### 2. Splitting training and test data

In [31]:

# Prepare the dataset for training
X = df[['income', 'age', 'Credit_Score', 'dtir1', 'loan_amount']].values
y = df['Status'].values  # Assuming the target variable column is 'Status'

# Split the dataset into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### 3. Scaling and cost computation

In [32]:

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 4. Define Models

In [33]:
rf = RandomForestClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)
vc = VotingClassifier(estimators=[('rf', rf), ('gb', gb)], voting='soft')

In [34]:
# Create a dictionary to hold models and their names
models = {'Random Forest': rf, 'Gradient Boosting': gb, 'Voting Classifier': vc}


In [35]:
# Train and evaluate models
results = {}
for name, model in models.items():
    # Train model
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)

    # Store results
    results[name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 Score': f1, 'ROC-AUC': roc_auc}


In [36]:
# Compare models
best_model = max(results, key=lambda k: results[k]['F1 Score'])
print(f"Best performing model: {best_model}\n")
print("**Performance metrics**")
for metric, value in results[best_model].items():
    print(f"{metric}: {value:.4f}")

Best performing model: Random Forest

**Performance metrics**
Accuracy: 0.8433
Precision: 0.5604
Recall: 0.1423
F1 Score: 0.2269
ROC-AUC: 0.6662


In [37]:
# Insights
print("\nInsights:")
print(
    f"The **{best_model}** model\n"
    f"performed best in terms of F1 Score,\n"
    f"indicating a balanced performance between precision and recall.\n"
    f"This model might be well-suited for credit risk assessment in fintech\n"
    f"as it effectively balances the trade-off between identifying potential\n"
    f"defaults (recall) and minimizing false alarms (precision), crucial for\n"
    f"making informed loan approval decisions.")


Insights:
The **Random Forest** model
performed best in terms of F1 Score,
indicating a balanced performance between precision and recall.
This model might be well-suited for credit risk assessment in fintech
as it effectively balances the trade-off between identifying potential
defaults (recall) and minimizing false alarms (precision), crucial for
making informed loan approval decisions.
