# Deliverable 3


### Section 1: Full Pipeline Construction (Preprocessing + Model)


In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, recall_score, f1_score
from sklearn.base import BaseEstimator, TransformerMixin

# Custom class so that we can use it in the pipeline
class Outlier_Capper(BaseEstimator, TransformerMixin):
    """Cap selected columns at their upper fence, Q3 + 1.5 * IQR.

    The fences are computed in ``fit`` from the data passed to it and are
    re-applied unchanged in ``transform``, so inside a ``Pipeline`` they are
    learned from the training portion only.

    Parameters
    ----------
    cols_to_cap_indices : iterable of int
        Positional indices of the columns whose high values are capped.

    Attributes
    ----------
    upper_fences : dict
        Maps each column index to the fence learned by the most recent
        ``fit`` call. Empty until ``fit`` has been called.
    """

    def __init__(self, cols_to_cap_indices):
        self.cols_to_cap_indices = cols_to_cap_indices
        # Kept for backward compatibility: calling transform() before fit()
        # returns an uncapped copy instead of raising.
        self.upper_fences = {}

    def fit(self, X, y=None):
        """Learn the upper fence of every configured column.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data; columns are addressed by position.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # asarray lets DataFrame input work with positional indexing.
        X_arr = np.asarray(X)

        # Build a fresh dict so fences from an earlier fit (e.g. for columns
        # that are no longer in cols_to_cap_indices) cannot linger.
        fences = {}
        for col_idx in self.cols_to_cap_indices:
            q1, q3 = np.percentile(X_arr[:, col_idx], [25, 75])
            fences[col_idx] = q3 + 1.5 * (q3 - q1)
        self.upper_fences = fences
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with fitted columns capped at their fence.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to cap; it is never modified in place.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray
            Floating-point array of the same shape as ``X``.
        """
        X_capped = np.array(X, copy=True)
        # A fence is generally fractional; writing it into an integer array
        # would silently truncate it, so promote non-float input first.
        if not np.issubdtype(X_capped.dtype, np.floating):
            X_capped = X_capped.astype(float)

        for col_idx, fence in self.upper_fences.items():
            column = X_capped[:, col_idx]  # view into X_capped
            # NaN values compare False and are therefore left untouched.
            column[column > fence] = fence
        return X_capped

# Load the training data; drop the leftover index column if the CSV has one
data = pd.read_csv("./datasets/cs-training.csv").drop(
    columns=['Unnamed: 0'], errors='ignore'
)

# The three past-due counters are merged into a single feature to avoid the
# multicollinearity discovered in deliverable 2
past_due_columns = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
]
total_past_due = data[past_due_columns[0]]
for past_due_column in past_due_columns[1:]:
    total_past_due = total_past_due + data[past_due_column]
data['Total_Times_Past_Due'] = total_past_due
data = data.drop(columns=past_due_columns)

# Separate the target from the features
y = data["SeriousDlqin2yrs"]
X = data.drop(columns=["SeriousDlqin2yrs"])


# Pipelines

# Column positions in X of RevolvingUtilization, DebtRatio and MonthlyIncome
outlier_indices_to_cap = [0, 2, 3]


def _build_pipeline(model):
    """Return the shared impute -> cap -> scale steps followed by ``model``."""
    return Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('capper', Outlier_Capper(cols_to_cap_indices=outlier_indices_to_cap)),
        ('scaler', StandardScaler()),
        ('model', model),
    ])


# Both candidates get identical preprocessing; only the final estimator differs
log_pipeline = _build_pipeline(
    LogisticRegression(max_iter=1000, class_weight='balanced')
)
rf_pipeline = _build_pipeline(
    RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        random_state=42,
        class_weight='balanced',
    )
)


# Comparison metrics
scoring = {
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'roc_auc': 'roc_auc',
}

# The same splitter (fixed random_state) is used for both models
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Outlier_Capper addresses columns by position, so the pipelines are fed a
# plain ndarray; convert once and reuse it for both models.
X_values = X.to_numpy()


def _cross_validate_and_report(title, pipeline):
    """Cross-validate ``pipeline`` and print mean ± std of every metric.

    Parameters
    ----------
    title : str
        Model name shown in the "<title> Results:" header line.
    pipeline : estimator
        Pipeline passed to ``cross_validate`` together with the module-level
        ``X_values``, ``y``, ``cv`` and ``scoring``.

    Returns
    -------
    dict
        The raw ``cross_validate`` output, with per-fold scores stored under
        the ``test_<metric>`` keys.
    """
    print(f"{title} Results:")
    results = cross_validate(
        pipeline,
        X_values,
        y,
        cv=cv,
        scoring=scoring,
        return_train_score=False,
    )
    for metric in scoring:
        fold_scores = results[f'test_{metric}']
        mean = np.mean(fold_scores)
        std = np.std(fold_scores)
        print(f"  {metric:8}: {mean:.3f} ± {std:.3f}")
    return results


print("Model Comparison: \n")
log_cv_results = _cross_validate_and_report("Logistic Regression", log_pipeline)
rf_cv_results = _cross_validate_and_report("Random Forest", rf_pipeline)


Model Comparison: 

Logistic Regression Results:
  recall  : 0.738 ± 0.012
  f1      : 0.295 ± 0.005
  roc_auc : 0.831 ± 0.005
Random Forest Results:
  recall  : 0.146 ± 0.007
  f1      : 0.229 ± 0.010
  roc_auc : 0.838 ± 0.003


## Analysis of Results

**Recall:**
The logistic regression model boasts a far higher recall score of 0.738 versus the random forest model's 0.146. This means that the logistic regression model would be able to more successfully predict if someone will experience financial distress in the next two years. This is a very important metric to the bank: if they give loans to people who won't be able to pay them back, they risk great financial loss.

**F1-Score:**
The F1-scores for both models are similar, but the logistic regression model pulls ahead slightly at 0.295 versus the random forest model's 0.229. This means the logistic regression model achieves a better balance between precision and recall compared to the random forest model.

**ROC AUC:**
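The random forest model is marginally superior in this metric, with a score of 0.838 versus the logistic regression model's 0.831. ROC AUC measures how well a model ranks risky borrowers above safe ones across all classification thresholds, so this means the random forest model is slightly better at discriminating between people who are risky to loan to and people who are not. Its low recall score shows that it is more conservative when classifying people as risky at the default decision threshold; that is a property of the threshold rather than of its ranking ability, which is why its ROC AUC can still be high. However, given that the difference is so small (0.007, comparable to the fold-to-fold standard deviations of 0.003–0.005), it is not very significant.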

Based on these results, the logistic regression model seems like a clear winner due to its superiority in recall and F1-score, as well as its comparable ROC AUC score.


### Design Choice Justification

We decided to use just a pipeline and not a column transformer, as the latter is not necessary for our dataset. Because we only have numerical columns, using a single pipeline works well and is the simpler solution.