Data Preproccesing

In [None]:
#1]Data Preprocessing: Handle missing values,
#  clean data inconsistencies, 
# encode categorical features (e.g., loan purpose, credit history),
#scale numerical features (e.g., income, loan amount).
import pandas as pd
df = pd.read_csv('Loan_default.csv')
# print(df.head())

print(df.isnull().sum())




LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64


In [5]:
# Import necessary libraries for preprocessing
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)


In [13]:
# 1. EXPLORATORY DATA ANALYSIS
print("="*50)
print("EXPLORATORY DATA ANALYSIS")
print("="*50)

print(f"Dataset Shape: {df.shape}")
print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

print("\nDataset Info:")
print(df.info())

print("\nBasic Statistics:")
print(df.describe())

print("\nTarget Variable Distribution:")
print(df['Default'].value_counts())
print(f"Default Rate: {df['Default'].mean():.2%}")


EXPLORATORY DATA ANALYSIS
Dataset Shape: (255347, 18)
Memory Usage: 126.81 MB

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255347 entries, 0 to 255346
Data columns (total 18 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   LoanID          255347 non-null  object 
 1   Age             255347 non-null  int64  
 2   Income          255347 non-null  int64  
 3   LoanAmount      255347 non-null  int64  
 4   CreditScore     255347 non-null  int64  
 5   MonthsEmployed  255347 non-null  int64  
 6   NumCreditLines  255347 non-null  int64  
 7   InterestRate    255347 non-null  float64
 8   LoanTerm        255347 non-null  int64  
 9   DTIRatio        255347 non-null  float64
 10  Education       255347 non-null  object 
 11  EmploymentType  255347 non-null  object 
 12  MaritalStatus   255347 non-null  object 
 13  HasMortgage     255347 non-null  object 
 14  HasDependents   255347 non-null  object 
 15  LoanPurpo

In [20]:
# 2. HANDLE MISSING VALUES
print("="*50)
print("MISSING VALUES ANALYSIS")
print("="*50)

# Check for missing values
missing_values = df.isnull().sum()
print("Missing values per column:")
print(missing_values)

# Check for any missing values
if missing_values.sum() == 0:
    print("\n✅ No missing values found in the dataset!")
else:
    print(f"\n⚠️ Found {missing_values.sum()} missing values")
    # Handle missing values if any exist
    for col in missing_values[missing_values > 0].index:
        if df[col].dtype in ['object']:
            df[col].fillna(df[col].mode()[0], inplace=True)
        else:
            df[col].fillna(df[col].median(), inplace=True)
    print("Missing values handled!")


MISSING VALUES ANALYSIS
Missing values per column:
LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64

✅ No missing values found in the dataset!


In [26]:
# 3. CLEAN DATA INCONSISTENCIES AND OUTLIERS
print("="*50)
print("DATA CLEANING AND OUTLIER DETECTION")
print("="*50)

# Create a copy for preprocessing
df_clean = df.copy()

# Check for duplicates
duplicates = df_clean.duplicated().sum()
print(f"Duplicate rows: {duplicates}")
if duplicates > 0:
    df_clean = df_clean.drop_duplicates()
    print(f"Removed {duplicates} duplicate rows")

# Identify numerical columns
numerical_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
numerical_cols.remove('Default')  # Remove target variable

print(f"\nNumerical columns: {numerical_cols}")

# Detect outliers using IQR method
outlier_info = {}
for col in numerical_cols:
    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = df_clean[(df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)]
    outlier_count = len(outliers)
    outlier_info[col] = outlier_count
    
    print(f"{col}: {outlier_count} outliers ({outlier_count/len(df_clean)*100:.2f}%)")

# Cap outliers for critical features (optional - can be adjusted based on business logic)
critical_features = ['Income', 'LoanAmount', 'CreditScore']
for col in critical_features:
    if col in outlier_info and outlier_info[col] > 0:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Cap outliers instead of removing them
        df_clean[col] = np.where(df_clean[col] < lower_bound, lower_bound, df_clean[col])
        df_clean[col] = np.where(df_clean[col] > upper_bound, upper_bound, df_clean[col])
        print(f"Capped outliers in {col}")

print("\n✅ Data cleaning completed!")


DATA CLEANING AND OUTLIER DETECTION
Duplicate rows: 0

Numerical columns: ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']
Age: 0 outliers (0.00%)
Income: 0 outliers (0.00%)
LoanAmount: 0 outliers (0.00%)
CreditScore: 0 outliers (0.00%)
MonthsEmployed: 0 outliers (0.00%)
NumCreditLines: 0 outliers (0.00%)
InterestRate: 0 outliers (0.00%)
LoanTerm: 0 outliers (0.00%)
DTIRatio: 0 outliers (0.00%)

✅ Data cleaning completed!


In [125]:
# 4. ENCODE CATEGORICAL FEATURES

# Identify categorical columns
categorical_cols = df_clean.select_dtypes(include=['object']).columns.tolist()
categorical_cols.remove('LoanID')  # Remove ID column

print(f"Categorical columns: {categorical_cols}")

# Check unique values in each categorical column
for col in categorical_cols:
    unique_vals = df_clean[col].unique()
    print(f"\n{col}: {len(unique_vals)} unique values")
    print(f"Values: {unique_vals}")

# Create a copy for encoding
df_encoded = df_clean.copy()

# Initialize label encoders
label_encoders = {}

# Encode categorical variables
for col in categorical_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le
    print(f"✅ Encoded {col}")

# Display encoded values mapping for reference

for col, le in label_encoders.items():
    print(f"\n{col}:")
    for i, class_name in enumerate(le.classes_):
        print(f"  {class_name} -> {i}")

print("\n✅ Categorical encoding completed!")


Categorical columns: ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']

Education: 4 unique values
Values: ["Bachelor's" "Master's" 'High School' 'PhD']

EmploymentType: 4 unique values
Values: ['Full-time' 'Unemployed' 'Self-employed' 'Part-time']

MaritalStatus: 3 unique values
Values: ['Divorced' 'Married' 'Single']

HasMortgage: 2 unique values
Values: ['Yes' 'No']

HasDependents: 2 unique values
Values: ['Yes' 'No']

LoanPurpose: 5 unique values
Values: ['Other' 'Auto' 'Business' 'Home' 'Education']

HasCoSigner: 2 unique values
Values: ['Yes' 'No']
✅ Encoded Education
✅ Encoded EmploymentType
✅ Encoded MaritalStatus
✅ Encoded HasMortgage
✅ Encoded HasDependents
✅ Encoded LoanPurpose
✅ Encoded HasCoSigner

Education:
  Bachelor's -> 0
  High School -> 1
  Master's -> 2
  PhD -> 3

EmploymentType:
  Full-time -> 0
  Part-time -> 1
  Self-employed -> 2
  Unemployed -> 3

MaritalStatus:
  Divorced -> 0
  Married -> 1
  Sing

Feature Engineering


In [126]:
# 2]Feature Engineering (Optional):
#  Create features like debt-to-income ratio, loan term to income ratio, credit score bands, etc.
# Approximate monthly loan payment (ignoring interest)
df['MonthlyLoanPayment'] = df['LoanAmount'] / df['LoanTerm']

# Debt-to-Income ratio (if not already present or to reinforce)
df['DTI'] = df['MonthlyLoanPayment'] / (df['Income'] / 12 + 1e-6)  # Adding small epsilon to avoid div zero

# Loan Term to Income ratio
df['LoanTerm_to_Income'] = df['LoanTerm'] / (df['Income'] + 1)

# Credit score bands (categorical)
def credit_band(score):
    if score < 580:
        return 'Poor'
    elif score < 670:
        return 'Fair'
    elif score < 740:
        return 'Good'
    elif score < 800:
        return 'Very Good'
    else:
        return 'Exceptional'

df['CreditScoreBand'] = df['CreditScore'].apply(credit_band)


Train & Test split

In [127]:
X = df_encoded.drop(columns=['Default', 'LoanID'])
y = df_encoded['Default']
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(204277, 16) (51070, 16) (204277,) (51070,)


Model Building



3]Train Classification Models: Logistic Regression, Decision Trees, Random Forest, Gradient Boosting (XGBoost, LightGBM). Explore different models and compare their performance.


In [128]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression() #build model

# log_reg.fit(X_train, y_train) #train model

# y_pred = log_reg.predict(X_test) #make predictionsa


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# print(accuracy_score(y_test, y_pred))

In [129]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# models = {
#     "Logistic Regression": log_reg,  #already built this
#     "Decision Tree": DecisionTreeClassifier(random_state=42),
#     "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
#     "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
#     "LightGBM": LGBMClassifier(random_state=42)
# }





| Model               | Accuracy | Precision (1) | Recall (1) | F1-score (1) |
| ------------------- | -------- | ------------- | ---------- | ------------ |
| Logistic Regression | 0.8841   | 0.55          | 0.01       | 0.02         |
| Decision Tree       | 0.8034   | 0.20          | 0.23       | 0.21         |
| Random Forest       | 0.8856   | 0.60          | 0.05       | 0.09         |
| XGBoost             | 0.8849   | 0.53          | 0.08       | 0.14         |
| LightGBM            | 0.8865   | 0.61          | 0.07       | 0.12         |


Handling Class Imbalance


In [114]:
dt_model = DecisionTreeClassifier(class_weight='balanced', random_state=42)
rf_model = RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42)

neg_count = y_train.value_counts()[0]
pos_count = y_train.value_counts()[1]
scale_pos_weight = neg_count / pos_count

xgb_model = XGBClassifier(scale_pos_weight=scale_pos_weight, use_label_encoder=False, eval_metric="logloss", random_state=42)
lgbm_model = LGBMClassifier(scale_pos_weight=scale_pos_weight, random_state=42)

models = {
    "Decision Tree": dt_model,
    "Random Forest": rf_model,
    "XGBoost": xgb_model,
    "LightGBM": lgbm_model
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # evaluate metrics

for name, model in models.items():
    model.fit(X_train, y_train)                # Train
    y_pred = model.predict(X_test)             # Predict
    acc = accuracy_score(y_test, y_pred)       # Accuracy
    print(f"{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))



Decision Tree Accuracy: 0.8174
              precision    recall  f1-score   support

           0       0.90      0.90      0.90     45139
           1       0.20      0.20      0.20      5931

    accuracy                           0.82     51070
   macro avg       0.55      0.55      0.55     51070
weighted avg       0.81      0.82      0.82     51070

Random Forest Accuracy: 0.8853
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     45139
           1       0.64      0.03      0.05      5931

    accuracy                           0.89     51070
   macro avg       0.76      0.51      0.50     51070
weighted avg       0.86      0.89      0.84     51070

XGBoost Accuracy: 0.7128
              precision    recall  f1-score   support

           0       0.94      0.73      0.82     45139
           1       0.23      0.62      0.33      5931

    accuracy                           0.71     51070
   macro avg       0.58      0.67      0.

| Model         | Accuracy | Precision (1) | Recall (1) | F1-score (1) |
| ------------- | -------- | ------------- | ---------- | ------------ |
| Decision Tree | 0.8174   | 0.20          | 0.20       | 0.20         |
| Random Forest | 0.8853   | 0.64          | 0.03       | 0.05         |
| XGBoost       | 0.7128   | 0.23          | 0.62       | 0.33         |
| LightGBM      | 0.6895   | 0.23          | 0.69       | 0.34         |


Hyperparameter tuning
and smoting


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
from imblearn.over_sampling import SMOTE


# Apply SMOTE on training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Before SMOTE:\n", y_train.value_counts())
print("After SMOTE:\n", y_train_res.value_counts())



xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42
)

lgbm_model = LGBMClassifier(
    random_state=42,
    verbose=-1)
#defining parameters

xgb_params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7, 9],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "subsample": [0.7, 0.8, 1.0],
    "colsample_bytree": [0.7, 0.8, 1.0]
}

lgbm_params = {
    "n_estimators": [100, 200, 300],
    "num_leaves": [31, 50, 70],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "subsample": [0.7, 0.8, 1.0],
    "colsample_bytree": [0.7, 0.8, 1.0]
}

#f1 scorer

f1_scorer = make_scorer(f1_score, pos_label=1)

#randomized search

xgb_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=xgb_params,
    scoring=f1_scorer,
    n_iter=20,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

lgbm_search = RandomizedSearchCV(
    estimator=lgbm_model,
    param_distributions=lgbm_params,
    scoring=f1_scorer,
    n_iter=20,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

#fitting smote data

xgb_search.fit(X_train_res, y_train_res)
lgbm_search.fit(X_train_res, y_train_res)


print("XGBoost Best Params:", xgb_search.best_params_)
print("LightGBM Best Params:", lgbm_search.best_params_)


Before SMOTE:
 Default
0    180555
1     23722
Name: count, dtype: int64
After SMOTE:
 Default
0    180555
1    180555
Name: count, dtype: int64
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Fitting 3 folds for each of 20 candidates, totalling 60 fits
XGBoost Best Params: {'subsample': 1.0, 'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.2, 'colsample_bytree': 0.8}
LightGBM Best Params: {'subsample': 0.7, 'num_leaves': 70, 'n_estimators': 300, 'learning_rate': 0.1, 'colsample_bytree': 0.8}


In [116]:
#  Now, the next step is to train the final models using these best parameters on your SMOTE-resampled data:
# Train final XGBoost
xgb_best = XGBClassifier(
    subsample=1.0,
    n_estimators=200,
    max_depth=7,
    learning_rate=0.2,
    colsample_bytree=0.8,
    random_state=42
)
xgb_best.fit(X_train_res, y_train_res)

# Train final LightGBM
lgb_best = LGBMClassifier(
    subsample=0.7,
    num_leaves=70,
    n_estimators=300,
    learning_rate=0.1,
    colsample_bytree=0.8,
    random_state=42
)
lgb_best.fit(X_train_res, y_train_res)


0,1,2
,boosting_type,'gbdt'
,num_leaves,70
,max_depth,-1
,learning_rate,0.1
,n_estimators,300
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [117]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

def evaluate_model(model, X_test, y_test):
    y_probs = model.predict_proba(X_test)[:,1]  # probabilities for class 1
    y_pred = (y_probs >= 0.5).astype(int)       # default threshold = 0.5
    
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision (1)": precision_score(y_test, y_pred),
        "Recall (1)": recall_score(y_test, y_pred),
        "F1 Score (1)": f1_score(y_test, y_pred),
        "ROC-AUC": roc_auc_score(y_test, y_probs),
        "PR-AUC": average_precision_score(y_test, y_probs)
    }
    return metrics

# Evaluate both models
xgb_metrics = evaluate_model(xgb_best, X_test, y_test)
lgb_metrics = evaluate_model(lgb_best, X_test, y_test)

print("XGBoost:", xgb_metrics)
print("LightGBM:", lgb_metrics)


XGBoost: {'Accuracy': 0.8638731153318974, 'Precision (1)': 0.336115569823435, 'Recall (1)': 0.17653009610520992, 'F1 Score (1)': 0.2314835286314393, 'ROC-AUC': 0.7199983864449663, 'PR-AUC': 0.2599753950880558}
LightGBM: {'Accuracy': 0.8796162130409242, 'Precision (1)': 0.43731946851530906, 'Recall (1)': 0.1276344629910639, 'F1 Score (1)': 0.19759853824066823, 'ROC-AUC': 0.736566787356086, 'PR-AUC': 0.28759946988180163}


Optimal Threshold


In [118]:
import numpy as np

def find_best_threshold(model, X_test, y_test):
    y_probs = model.predict_proba(X_test)[:,1]
    best_thresh, best_f1 = 0.5, 0
    
    for thresh in np.arange(0.1, 0.9, 0.01):
        y_pred = (y_probs >= thresh).astype(int)
        f1 = f1_score(y_test, y_pred)
        if f1 > best_f1:
            best_f1, best_thresh = f1, thresh
    
    return best_thresh, best_f1

# Apply for both
xgb_thresh, xgb_f1 = find_best_threshold(xgb_best, X_test, y_test)
lgb_thresh, lgb_f1 = find_best_threshold(lgb_best, X_test, y_test)

print(f"XGBoost Optimal Threshold: {xgb_thresh:.2f}, F1 Score: {xgb_f1:.4f}")
print(f"LightGBM Optimal Threshold: {lgb_thresh:.2f}, F1 Score: {lgb_f1:.4f}")


XGBoost Optimal Threshold: 0.23, F1 Score: 0.3266
LightGBM Optimal Threshold: 0.24, F1 Score: 0.3431


In [119]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, auc

final_results = {}

for name, model in models.items():
    # Predict probabilities
    y_probs = model.predict_proba(X_test)[:, 1]
    
    # Find best threshold based on F1
    best_thresh = 0
    best_f1 = 0
    for thresh in np.arange(0.1, 0.9, 0.01):
        y_pred_thresh = (y_probs >= thresh).astype(int)
        f1 = f1_score(y_test, y_pred_thresh)
        if f1 > best_f1:
            best_f1 = f1
            best_thresh = thresh
    
    # Apply best threshold
    y_pred_final = (y_probs >= best_thresh).astype(int)
    
    # Compute metrics
    acc = accuracy_score(y_test, y_pred_final)
    prec = precision_score(y_test, y_pred_final)
    rec = recall_score(y_test, y_pred_final)
    f1 = f1_score(y_test, y_pred_final)
    roc_auc = roc_auc_score(y_test, y_probs)
    precision, recall, _ = precision_recall_curve(y_test, y_probs)
    pr_auc = auc(recall, precision)

    final_results[name] = {
        "Accuracy": round(acc, 4),
        "Precision": round(prec, 4),
        "Recall": round(rec, 4),
        "F1": round(f1, 4),
        "ROC-AUC": round(roc_auc, 4),
        "PR-AUC": round(pr_auc, 4),
        "Threshold": round(best_thresh, 2)
    }

final_df = pd.DataFrame(final_results).T
print(final_df)


               Accuracy  Precision  Recall      F1  ROC-AUC  PR-AUC  Threshold
Decision Tree    0.8174     0.2041  0.1974  0.2007   0.5482  0.2474       0.10
Random Forest    0.7999     0.2820  0.4675  0.3518   0.7342  0.3002       0.18
XGBoost          0.8014     0.2847  0.4697  0.3545   0.7408  0.3102       0.61
LightGBM         0.8074     0.2985  0.4881  0.3705   0.7557  0.3270       0.63


In [120]:
# If X_train is your original DataFrame
X_train_res = pd.DataFrame(X_train_res, columns=X_train.columns)


wrapping preproccessing and Model pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMClassifier
import joblib
import os
import pandas as pd


numeric_features = X_train_res.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train_res.select_dtypes(include=['object', 'category']).columns.tolist()


if 'LoanDefault' in numeric_features:  
    numeric_features.remove('LoanDefault')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LGBMClassifier(n_estimators=300, learning_rate=0.1))
])

pipeline.fit(X_train_res, y_train_res)

os.makedirs("models", exist_ok=True)
joblib.dump(pipeline, "models/loan_default_pipeline.pkl")

print("Pipeline trained and saved successfully!")


Pipeline trained and saved successfully!


In [122]:
X_train.columns

Index(['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
       'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 'Education',
       'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents',
       'LoanPurpose', 'HasCoSigner'],
      dtype='object')

In [None]:
import joblib

# Load the saved pipeline
pipeline = joblib.load("models/loan_default_pipeline.pkl")

#predictions
preds = pipeline.predict(X_test) 
X_test.iloc[6,:]


Age                   19.00
Income            102696.00
LoanAmount        239913.00
CreditScore          422.00
MonthsEmployed        37.00
NumCreditLines         3.00
InterestRate          16.63
LoanTerm              24.00
DTIRatio               0.87
Education              2.00
EmploymentType         1.00
MaritalStatus          0.00
HasMortgage            1.00
HasDependents          1.00
LoanPurpose            3.00
HasCoSigner            0.00
Name: 127976, dtype: float64

In [136]:
import joblib
import pandas as pd

pipeline = joblib.load("models/loan_default_pipeline.pkl")


applicant_data = pd.DataFrame([{
    "Age": 50.00,
    "Income": 1696.00,
    "LoanAmount": 2399183.00,
    "CreditScore": 102.00,
    "MonthsEmployed": 37.00,
    "NumCreditLines": 3.00,
    "InterestRate": 16.63,
    "LoanTerm": 24.00,
    "DTIRatio": 0.87,
    "Education": 2.00,
    "EmploymentType": 1.00,
    "MaritalStatus": 0.00,
    "HasMortgage": 1.00,
    "HasDependents": 1.00,
    "LoanPurpose": 3.00,
    "HasCoSigner": 0.00
}])

pred = pipeline.predict(applicant_data)[0]
pred_proba = pipeline.predict_proba(applicant_data)[0][1]  # probability of default


if pred == 0:
    print("The applicant is likely to repay the loan.")
else:
    print("The applicant may default on the loan.")


The applicant may default on the loan.


In [137]:
print(y_train_res.unique())


[0 1]
