<a href="https://www.kaggle.com/code/suehuynh/kaggle-s4e10-loan-approval-prediction?scriptVersionId=214336659" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Preparation

## Import libraries

In [None]:
import pandas as pd              # For data manipulation and analysis
import numpy as np               # For numerical computing
from datetime import datetime
import scipy.stats as stats      # For statistical analysis
import math
import matplotlib                # For plotting and visualization
import matplotlib.pyplot as plt  
from pandas.plotting import parallel_coordinates
import seaborn as sns            # For statistical data visualization
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [None]:
# For machine learning
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool

from lightgbm import early_stopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, roc_auc_score,
                             f1_score, confusion_matrix, classification_report)
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
import optuna

## Load dataset

In [None]:
# Read the competition CSVs; the first column of each file becomes the index.
df_train = pd.read_csv(
    '/kaggle/input/playground-series-s4e10/train.csv',
    index_col=0,
)
df_test = pd.read_csv(
    '/kaggle/input/playground-series-s4e10/test.csv',
    index_col=0,
)

In [None]:
# Stack train on top of test so the exploratory plots cover both files.
df = pd.concat((df_train, df_test))

# Exploratory Data Analysis

In [None]:
# Column dtypes and non-null counts of the combined train + test frame.
df.info()

In [None]:
# Peek at the first rows of the combined frame.
df.head()

In [None]:
# Class balance of the target; value_counts() leaves missing values out by default.
df['loan_status'].value_counts()

## Univariate Analysis
Perform distribution analysis on numerical features and Target - Loan Status

In [None]:
# Split the columns by dtype. select_dtypes('number') matches every integer and
# float width, whereas comparing against the aliases 'int' / 'float' only matches
# the platform's default width (and would miss e.g. int32 or float32 columns).
num_cols = df.select_dtypes(include='number').columns.tolist()
cat_cols = [col for col in df.columns if col not in num_cols]
print(num_cols, cat_cols)

In [None]:
# One histogram per numerical column on a two-column grid sized to fit num_cols.
n_grid_cols = 2
n_grid_rows = max(1, math.ceil(len(num_cols) / n_grid_cols))
fig, axes = plt.subplots(nrows=n_grid_rows, ncols=n_grid_cols, figsize=(20, 15), squeeze=False)

for col, ax in zip(num_cols, axes.flat):
    sns.histplot(data=df,
                 x=col,
                 bins=20,
                 ax=ax)
    # Label the bars of the most recently added container. A fixed index of 1
    # raises IndexError whenever the axes holds a single container.
    # NOTE(review): presumably index 1 was used because some seaborn/matplotlib
    # versions leave a placeholder container in front of the real bars.
    ax.bar_label(ax.containers[-1])

# Hide grid cells that did not receive a plot.
for ax in axes.flat[len(num_cols):]:
    ax.set_visible(False)

fig.tight_layout(h_pad=2)
plt.subplots_adjust(top=0.92)
plt.suptitle('Numerical Feature Distributions', fontsize=16)
plt.show()

Observations
- The `person_age` and `person_emp_length` columns each contain an extreme outlier that should be removed

Analyze proportion of each categorical feature

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 15))

# Work with a flat list of axes so each column maps to one position.
axes = axes.flatten()

for idx in range(len(cat_cols)):
    col = cat_cols[idx]
    # Frequency of every category in this column
    counts = df[col].value_counts()
    panel = axes[idx]
    # Pie chart of the category shares, drawn clockwise from the top
    panel.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=90, counterclock=False)
    # Title each panel with the column name
    panel.set_title(f'{col} Distribution', fontsize=14)

fig.tight_layout(h_pad=2)
plt.subplots_adjust(top=0.92)
plt.suptitle('Categorical Feature Distributions', fontsize=16)
plt.show()

Observations
- The `loan_intent` column has a fairly even distribution across its categories, so we can use One-Hot Encoding for this feature.
- For the majority of loan applicants, `person_home_ownership` is Rent or Mortgage. We will look into the correlation between this feature and the loan status to decide how to engineer it.
- `cb_person_default_on_file` can be binary coded
- `loan_grade` column can be simplified by grouping (A,B), (C,D), and (E,F,G)
- `person_home_ownership`

## Multivariate Analysis

In [None]:
fig, axes = plt.subplots(2, 4, figsize=(20, 15))

for i, col in enumerate(num_cols):
    ax = sns.histplot(data=df,
                      x=col,
                      hue='loan_status',
                      bins=20,
                      ax=axes[i // 4, i % 4])
    # Show counts in thousands through a formatter, which follows the ticks
    # wherever the final layout places them. Freezing the label text with
    # set_yticklabels() would leave stale labels if the tick positions change
    # afterwards (e.g. after tight_layout resizes the axes).
    ax.yaxis.set_major_formatter(
        matplotlib.ticker.FuncFormatter(lambda value, _pos: '{:,.0f}K'.format(value / 1000))
    )

fig.tight_layout(h_pad=2)
plt.subplots_adjust(top=0.92)
plt.suptitle('Numerical Feature Distributions by Target', fontsize=16)
plt.show()

Observations
- Distribution of numerical features between Approved (1) and Disapproved (0) is similar for `person_age`, `person_income`, `person_emp_length`, `cb_person_cred_hist_length`, `loan_amnt`
- `loan_int_rate` and `loan_percent_income` are more left-tailed for Approved applicants than for Disapproved ones.

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(20, 15))

# Walk the 2x2 grid row by row, one categorical column per panel.
for position, col in enumerate(cat_cols):
    grid_row, grid_col = divmod(position, 2)
    plt.subplots_adjust(top=0.85)
    ax = sns.histplot(data=df,
                      x=col,
                      hue='loan_status',
                      ax=axes[grid_row, grid_col])

fig.tight_layout(h_pad=2)
plt.subplots_adjust(top=0.92)
plt.suptitle('Categorical Feature Distributions by Target', fontsize=16)
plt.show()

# Feature Engineering

In [None]:
# Keep the target out of the feature list. Filtering (instead of list.remove)
# makes the cell safe to re-run: remove() raises ValueError on the second run.
num_cols = [col for col in num_cols if col != 'loan_status']

In [None]:
def feature_engineer(df):
    """Return a copy of ``df`` extended with bucketed, ratio and interaction features.

    The input frame is left untouched. It must provide the raw columns read below:
    ``person_age``, ``person_income``, ``person_emp_length``, ``loan_amnt``,
    ``loan_int_rate``, ``loan_percent_income``, ``loan_grade``, ``loan_intent``,
    ``cb_person_default_on_file`` and ``cb_person_cred_hist_length``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw train or test data.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus the engineered ones.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()
    open_end = float('inf')

    def bucket(column, edges, labels):
        # include_lowest=True keeps a value equal to the first edge (for example
        # 0 years of employment) in the first bucket instead of mapping it to NaN.
        return pd.cut(df[column], bins=edges, labels=labels, include_lowest=True)

    # 1. Age Group (open-ended top bucket so ages above 100 still land in '55+')
    df['age_group'] = bucket('person_age', [0, 24, 34, 44, 54, open_end],
                             ['<25', '25-34', '35-44', '45-54', '55+'])

    # 2. Income Level
    df['income_level'] = bucket('person_income', [0, 30000, 60000, 100000, open_end],
                                ['Low', 'Medium', 'High', 'Very High'])

    # 3. Employment Stability
    df['employment_stability'] = bucket('person_emp_length', [0, 1, 5, 10, open_end],
                                        ['<1 year', '1-5 years', '6-10 years', '10+ years'])

    # 4. Loan Amount Range
    df['loan_amnt_range'] = bucket('loan_amnt', [0, 5000, 10000, 20000, open_end],
                                   ['<5k', '5-10k', '10-20k', '20k+'])

    # 5. Interest Rate Buckets
    df['loan_int_rate_range'] = bucket('loan_int_rate', [0, 5, 10, 15, 20, open_end],
                                       ['<5%', '5-10%', '10-15%', '15-20%', '20%+'])

    # 6. Loan-to-Income Ratio Buckets
    df['loan_percent_income_range'] = bucket('loan_percent_income', [0, 0.2, 0.4, 0.6, open_end],
                                             ['<20%', '20-40%', '40-60%', '60%+'])

    # 7. Encode Default History
    df['default_history'] = df['cb_person_default_on_file'].map({'N': 0, 'Y': 1})

    # 8. Credit History Length Category
    df['credit_hist_length_cat'] = bucket('cb_person_cred_hist_length', [0, 5, 10, 15, open_end],
                                          ['<5 years', '5-10 years', '10-15 years', '15+ years'])

    # 9. Interaction Feature: Income Level & Employment Stability
    df['income_emp_interaction'] = df['income_level'].astype(str) + '_' + df['employment_stability'].astype(str)

    # 10. Interaction Feature: Loan Grade & Interest Rate
    df['grade_int_rate'] = df['loan_grade'] + '_' + df['loan_int_rate_range'].astype(str)

    # 11. Debt Burden (Loan Amount Relative to Income)
    df['debt_burden'] = df['loan_amnt'] / df['person_income']

    # 12. Credit History per Age
    df['credit_hist_per_age'] = df['cb_person_cred_hist_length'] / df['person_age']

    # 13. Risk Score (Combining multiple risk factors)
    df['risk_score'] = (
        df['loan_int_rate'] * df['loan_percent_income'] / (df['cb_person_cred_hist_length'] + 1)
    )

    # 14. Aggregated Default Risk: any one of the three conditions raises the flag
    df['high_risk_flag'] = ((df['default_history'] == 1) |
                            (df['loan_int_rate'] > 15) |
                            (df['loan_percent_income'] > 0.5)).astype(int)

    # 15. Age-Adjusted Employment Length
    df['emp_length_age_ratio'] = df['person_emp_length'] / df['person_age']

    # 16. Loan Amount per Year of Employment (+1 avoids dividing by zero years)
    df['loan_per_year_emp'] = df['loan_amnt'] / (df['person_emp_length'] + 1)

    # 17. Age-to-Income Ratio
    df['age_to_income_ratio'] = df['person_age'] / df['person_income']

    # 18. Adjusted Interest Rate (Interest Rate adjusted by Loan Grade)
    loan_grade_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
    df['loan_grade_numeric'] = df['loan_grade'].map(loan_grade_map)
    df['adjusted_interest_rate'] = df['loan_int_rate'] / (df['loan_grade_numeric'] + 1)

    # 19. Credit Age Relative to Employment Length
    df['credit_to_emp_ratio'] = df['cb_person_cred_hist_length'] / (df['person_emp_length'] + 1)

    # 20. Financial Stability Index (FSI)
    # A composite score combining income, loan amount, and interest rate
    df['financial_stability_index'] = df['person_income'] / (df['loan_amnt'] * df['loan_int_rate'])

    # 21. Loan Amortization Speed (Hypothetical)
    # The ratio of income to loan amount
    df['loan_amortization_speed'] = df['person_income'] / df['loan_amnt']

    # 22. Loan Purpose Risk Weight (hand-assigned weight per loan intent)
    loan_intent_risk_map = {'EDUCATION': 0.5, 'MEDICAL': 0.8, 'PERSONAL': 1.0, 'HOMEIMPROVEMENT': 0.6,
                            'VENTURE': 1.5, 'DEBTCONSOLIDATION': 0.7}
    df['loan_intent_risk'] = df['loan_intent'].map(loan_intent_risk_map)

    # 23. Employment to Income Ratio (income expressed in thousands)
    df['emp_income_ratio'] = df['person_emp_length'] / (df['person_income'] / 1000)

    return df

# Build the engineered feature set for the training and the test data alike.
df_train_processed, df_test_processed = (
    feature_engineer(frame) for frame in (df_train, df_test)
)

In [None]:
# Separate the target from the feature matrix.
y = df_train_processed['loan_status']
X = df_train_processed.drop(columns='loan_status')

# Hold out 20% of the rows as a validation set (fixed seed for a repeatable split).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only ...
scaler = StandardScaler()
scaler.fit(X_train[num_cols])

# ... and apply that same transformation to every frame.
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_valid[num_cols] = scaler.transform(X_valid[num_cols])
df_test_processed[num_cols] = scaler.transform(df_test_processed[num_cols])

from sklearn.preprocessing import LabelEncoder

def prepare_data_for_xgb(df, encoders=None):
    """Integer-encode the object and category columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place and also returned.
    encoders : dict, optional
        Maps a column name to its fitted ``LabelEncoder``. Columns missing from
        the dict are fitted on ``df`` and stored in it; columns already present
        reuse the stored encoder, so several frames share one label -> integer
        mapping. With the default ``None`` every column is fitted on ``df`` alone.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with encoded columns. A label the stored encoder
        has never seen is encoded as -1.
    """
    if encoders is None:
        encoders = {}

    # Object columns: sorted-label encoding, fitted once and then reused.
    for col in df.select_dtypes(include='object').columns:
        if col not in encoders:
            encoders[col] = LabelEncoder().fit(df[col])
        # get_indexer gives the position of each value in the fitted classes
        # and -1 for values that were not present when the encoder was fitted.
        df[col] = pd.Index(encoders[col].classes_).get_indexer(df[col])

    # Category columns already carry a fixed category list; use its codes.
    for col in df.select_dtypes(include=['category']).columns:
        df[col] = df[col].cat.codes

    return df

# Fit the encoders on the training split and reuse them for validation and test,
# so one category always maps to the same integer in all three frames.
label_encoders = {}
X_train = prepare_data_for_xgb(X_train, label_encoders)
X_valid = prepare_data_for_xgb(X_valid, label_encoders)
df_test_processed_xgb = prepare_data_for_xgb(df_test_processed, label_encoders)

In [None]:
# import library
from imblearn.over_sampling import SMOTE
import collections
from collections import Counter

# Seed the oversampler so the synthetic minority rows are the same on every run.
smote = SMOTE(random_state=42)

# fit predictor and target variable
# NOTE(review): the models in the visible cells below are fitted on
# X_train / y_train, not on these resampled arrays — confirm whether that is intended.
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print('Original dataset shape', Counter(y_train))
print('Resample dataset shape', Counter(y_train_smote))

# Machine Learning

## XGBoost

In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score

# Hold-out split used to score the trials. It is the same for every trial,
# so it is built once here instead of inside the objective.
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0)

# Define objective function for Optuna
def objective(trial):
    """Train an XGBoost classifier with the trial's hyperparameters.

    Returns the ROC AUC on the hold-out split, which the study maximizes.
    """
    # Define hyperparameters to search
    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 15),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 10.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 12000),
        'device': 'cuda',  # needs a CUDA-capable XGBoost runtime
        'random_state': 0
    }

    # Train XGBoost model with current hyperparameters
    clf = XGBClassifier(**params)
    clf.fit(X_train_split, y_train_split)

    # Score the positive-class probabilities on the hold-out split
    y_pred_proba = clf.predict_proba(X_valid_split)[:, 1]
    return roc_auc_score(y_valid_split, y_pred_proba)

# Optimize hyperparameters using Optuna; the sampler is seeded so a rerun
# proposes the same sequence of trials.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=50)

# Get best hyperparameters (only the tuned values, not the fixed settings above)
best_params = study.best_params
print("Best Hyperparameters:", best_params)

In [None]:
# Split the training data to include a validation set for early stopping
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

# Create the XGBoost model from the Optuna result. study.best_params contains
# only the tuned values, so the fixed settings used during tuning are restated.
# Early stopping is configured on the estimator: passing early_stopping_rounds
# to fit() is deprecated and no longer accepted by newer XGBoost releases.
xgb_1 = XGBClassifier(**{
    'objective': 'binary:logistic',
    'random_state': 0,
    **best_params,
    'eval_metric': 'auc',
    'early_stopping_rounds': 50,
})

# Fit the model; training stops once the validation AUC has not improved for 50 rounds
xgb_1.fit(X_train_split, y_train_split,
          eval_set=[(X_valid_split, y_valid_split)],
          verbose=500)

# Predict probabilities on validation data
y_pred_proba_xgb_1 = xgb_1.predict_proba(X_valid)[:, 1]

# Calculate ROC AUC on validation data
roc_auc = roc_auc_score(y_valid, y_pred_proba_xgb_1)
print("ROC AUC on Validation Data:", roc_auc)

## LGBM

In [None]:
# Hold-out split used to score the trials. It is the same for every trial,
# so it is built once here instead of inside the objective.
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0)

# Define objective function for Optuna
def objective(trial):
    """Train a LightGBM classifier with the trial's hyperparameters.

    Returns the ROC AUC on the hold-out split, which the study maximizes.
    """
    # Define hyperparameters to search
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'random_state': 0,
        'verbose': -1
    }

    # Train LightGBM model with current hyperparameters
    clf = LGBMClassifier(**params)
    clf.fit(X_train_split, y_train_split)

    # Score the positive-class probabilities on the hold-out split
    y_pred_proba = clf.predict_proba(X_valid_split)[:, 1]
    return roc_auc_score(y_valid_split, y_pred_proba)

# Optimize hyperparameters using Optuna; the sampler is seeded so a rerun
# proposes the same sequence of trials.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=50)

# Get best hyperparameters (only the tuned values, not the fixed settings above)
best_params = study.best_params
print("Best Hyperparameters:", best_params)

In [None]:
from lightgbm import log_evaluation

# Split the training data to include a validation set for early stopping
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

# Create the LightGBM model from the Optuna result. study.best_params contains
# only the tuned values, so the fixed settings used during tuning are restated;
# with metric='auc' early stopping monitors the same score that was tuned.
lgb_1 = LGBMClassifier(**{
    'objective': 'binary',
    'random_state': 0,
    **best_params,
    'metric': 'auc',
})

# Fit the model with early stopping (patience 100 rounds, log every 100 rounds)
lgb_1.fit(
    X_train_split, y_train_split,
    eval_set=[(X_valid_split, y_valid_split)],
    callbacks=[early_stopping(100), log_evaluation(100)]
)

# Predict probabilities on validation data
y_pred_proba_lgb_1 = lgb_1.predict_proba(X_valid)[:, 1]

# Calculate ROC AUC on validation data
roc_auc = roc_auc_score(y_valid, y_pred_proba_lgb_1)
print("ROC AUC on Validation Data:", roc_auc) #0.9597419019693103
## CatBoost

In [None]:
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score

# Hold-out split used to score the trials. It is the same for every trial,
# so it is built once here instead of inside the objective.
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0)

# Define objective function for Optuna
def objective(trial):
    """Train a CatBoost classifier with the trial's hyperparameters.

    Returns the ROC AUC on the hold-out split, which the study maximizes.
    """
    # Define hyperparameters to search
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'random_strength': trial.suggest_float('random_strength', 0.1, 10.0),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'border_count': trial.suggest_int('border_count', 1, 255),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 100),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.01, 1.0),
        'eval_metric': 'AUC',
        'random_state': 0
    }

    # Train CatBoost model with current hyperparameters.
    # NOTE(review): the split that drives early stopping is also the one scored
    # below, so the reported AUC may be slightly optimistic.
    clf = CatBoostClassifier(**params)
    clf.fit(X_train_split, y_train_split, eval_set=(X_valid_split, y_valid_split), verbose=0, early_stopping_rounds=50)

    # Score the positive-class probabilities on the hold-out split
    y_pred_proba = clf.predict_proba(X_valid_split)[:, 1]
    return roc_auc_score(y_valid_split, y_pred_proba)

# Optimize hyperparameters using Optuna; the sampler is seeded so a rerun
# proposes the same sequence of trials.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=50)

# Get best hyperparameters (only the tuned values, not the fixed settings above)
best_params = study.best_params
print("Best Hyperparameters:", best_params)

In [None]:
# Split the training data to include a validation set for early stopping
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

# Create the CatBoost model from the Optuna result. study.best_params contains
# only the tuned values, so the fixed settings used during tuning are restated;
# with eval_metric='AUC' early stopping monitors the same score that was tuned.
cat_1 = CatBoostClassifier(**{
    'random_state': 0,
    **best_params,
    'eval_metric': 'AUC',
})

# Fit the model with early stopping
cat_1.fit(X_train_split, y_train_split,
          eval_set=[(X_valid_split, y_valid_split)],
          early_stopping_rounds=50,
          verbose=500)

# Predict probabilities on validation data
y_pred_proba_cat_1 = cat_1.predict_proba(X_valid)[:, 1]

# Calculate ROC AUC on validation data
roc_auc = roc_auc_score(y_valid, y_pred_proba_cat_1)
print("ROC AUC on Validation Data:", roc_auc)

# Light AutoML

In [None]:
!pip install -U lightautoml[all]

In [None]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
import torch
import os

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Integer columns are cast to the narrowest of int8/int16/int32/int64 whose
    range strictly contains the column's min and max. Float columns are cast to
    float32 when their range fits, otherwise float64. float16 is never chosen:
    it keeps only about three significant decimal digits, so fitting the value
    range says nothing about preserving the values. Boolean columns are left
    unchanged and every non-numeric column is cast to ``object``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object after the dtype changes.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        if pd.api.types.is_bool_dtype(df[col]):
            # Already one byte per value; nothing to gain.
            continue
        if not pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].astype('object')
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.api.types.is_integer_dtype(df[col]):
            candidates, limits, fallback = (np.int8, np.int16, np.int32), np.iinfo, np.int64
        else:
            candidates, limits, fallback = (np.float32,), np.finfo, np.float64

        # First candidate whose range strictly contains [c_min, c_max]. Comparisons
        # with NaN (all-missing column) are False, which selects the fallback.
        target = next(
            (t for t in candidates if c_min > limits(t).min and c_max < limits(t).max),
            fallback,
        )
        df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero starting size.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
# Reuse the scaler that was already fitted on the training split. Refitting it
# here would put this frame on a different scale than df_test_processed, which
# was transformed earlier with that first fit.
# NOTE(review): df_test_processed was integer-encoded in place by
# prepare_data_for_xgb, while this frame still holds the raw categorical
# values — confirm AutoML should receive differently encoded train/test frames.
df_train_processed[num_cols] = scaler.transform(df_train_processed[num_cols])
df_train_processed = reduce_mem_usage(df_train_processed)
df_train_processed.info()

In [None]:
# Downcast the test frame's dtypes as well.
# NOTE(review): this same frame is later passed to the boosted models'
# predict_proba calls — confirm the downcast is intended for them too.
df_test_processed = reduce_mem_usage(df_test_processed)
df_test_processed.info()

In [None]:
def map_class(x, task, reader):
    """Translate ``x`` through ``reader`` for multiclass tasks; otherwise return ``x`` unchanged."""
    return reader[x] if task.name == 'multiclass' else x

# Element-wise version of map_class for array inputs.
mapped = np.vectorize(map_class)

def score(task, y_true, y_pred):
    """Score predictions with the metric that belongs to ``task.name``.

    ``'binary'`` uses ROC AUC, ``'multiclass'`` uses accuracy of the arg-max
    class, and ``'reg'`` / ``'multi:reg'`` use the median absolute error.

    Raises
    ------
    ValueError
        If ``task.name`` is none of the supported task names.
    """
    if task.name == 'binary':
        return roc_auc_score(y_true, y_pred)
    elif task.name == 'multiclass':
        return accuracy_score(y_true, np.argmax(y_pred, 1))
    elif task.name == 'reg' or task.name == 'multi:reg':
        # Imported here because the notebook's import cells do not bring it in.
        from sklearn.metrics import median_absolute_error
        return median_absolute_error(y_true, y_pred)
    else:
        # Raising a plain string is itself a TypeError in Python 3.
        raise ValueError('Task is not correct.')
        
def take_pred_from_task(pred, task):
    """Return the part of a 2-D prediction array that is relevant for ``task``.

    For ``'binary'`` and ``'reg'`` the first column is returned; for
    ``'multiclass'`` and ``'multi:reg'`` the array is returned unchanged.

    Raises
    ------
    ValueError
        If ``task.name`` is none of the supported task names.
    """
    if task.name == 'binary' or task.name == 'reg':
        return pred[:, 0]
    elif task.name == 'multiclass' or task.name == 'multi:reg':
        return pred
    else:
        # Raising a plain string is itself a TypeError in Python 3.
        raise ValueError('Task is not correct.')
        
def use_plr(USE_PLR):
    """Return the string "plr" when ``USE_PLR`` is truthy and "cont" otherwise."""
    return "plr" if USE_PLR else "cont"

In [None]:
# Seed for NumPy's global RNG and the thread count handed to torch
# (one thread per CPU reported by the OS).
RANDOM_STATE = 42
N_THREADS = os.cpu_count()

In [None]:
# Seed NumPy's global random generator and cap torch's CPU thread pool.
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [None]:
# Binary classification task for the LightAutoML preset.
task = Task('binary')

# Settings forwarded to the preset's neural-network models.
nn_settings = {
    "n_epochs": 10,
    "bs": 1024,
    "num_workers": 0,
    "path_to_save": None,
    "freeze_defaults": True,
    "cont_embedder": 'plr',
    'cat_embedder': 'weighted',
    "hidden_size": 64,
    'hid_factor': [3, 3],
    'block_config': [3, 3],
    'embedding_size': 64,
    'stop_by_metric': True,
    'verbose_bar': True,
    "snap_params": { 'k': 2, 'early_stopping': True, 'patience': 2, 'swa': True }
}

# Settings forwarded to the data reader (10 folds, fixed seed).
reader_settings = {'n_jobs': N_THREADS, 'cv': 10, 'random_state': RANDOM_STATE, 'advanced_roles': True}

automl = TabularAutoML(
    task=task,
    timeout=9 * 3600,  # nine hours, in seconds
    cpu_limit=N_THREADS,
    nn_params=nn_settings,
    nn_pipeline_params={"use_qnt": False, "use_te": False},
    reader_params=reader_settings,
)

In [None]:
# Fit the AutoML preset on the processed training frame, telling it which
# column is the target. The result is kept as out-of-fold predictions
# (per the variable name — confirm against the LightAutoML documentation).
out_of_fold_predictions = automl.fit_predict(
    df_train_processed,
    roles = {
        'target': 'loan_status',
    }, 
    verbose = 3
)

In [None]:
# The value shown is a ROC AUC, not an accuracy, so label it accordingly.
# Column 0 of the prediction array is selected, as done for the test predictions.
print("Light AutoML ROC AUC:")
roc_auc_score(df_train_processed.loan_status, out_of_fold_predictions.data[:, 0])

# Ensemble

## Soft Voting

In [None]:
# Soft voting: unweighted mean of the three models' validation probabilities
model_probas = (y_pred_proba_xgb_1, y_pred_proba_lgb_1, y_pred_proba_cat_1)
y_pred_softvoting = sum(model_probas[1:], model_probas[0]) / 3

# ROC AUC of the averaged probabilities on the validation data
roc_auc_softvoting = roc_auc_score(y_valid, y_pred_softvoting)
print("ROC AUC on Validation Data (Soft Voting):", roc_auc_softvoting)

## Weighted Average

In [None]:
# Relative weights for XGBoost, LightGBM and CatBoost (in that order). The raw
# values add up to 1.05, so they are normalized to sum to 1; otherwise the blend
# is scaled by 1.05 and can leave the [0, 1] probability range.
raw_weights = [0.4, 0.5, 0.15]
weights = [w / sum(raw_weights) for w in raw_weights]

# Combine predictions using weighted average
y_pred_weighted = (weights[0] * y_pred_proba_xgb_1 +
                   weights[1] * y_pred_proba_lgb_1 +
                   weights[2] * y_pred_proba_cat_1)

# Calculate ROC AUC on validation data
roc_auc_weighted = roc_auc_score(y_valid, y_pred_weighted)
print("ROC AUC on Validation Data (Weighted Average):", roc_auc_weighted)

# Evaluate

In [None]:
# Validation ROC AUC of each ensembling strategy, keyed by its display name.
roc_auc_ensemble = {
    'Soft Voting': roc_auc_softvoting,
    'Weighted Average': roc_auc_weighted
}

# The values are ROC AUC scores, so the heading says so rather than "Accuracy".
print("Ensemble ROC AUC:")
for method, roc in roc_auc_ensemble.items():
    print(f"{method}: {roc:.5f}")

In [None]:
from sklearn.metrics import roc_curve, auc
# ROC curve points and area for the soft-voting ensemble
fpr, tpr, thresholds = roc_curve(y_valid, y_pred_softvoting)
roc_auc = auc(fpr, tpr)

# Draw the curve against the diagonal "no skill" baseline
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve (area = %0.4f)' % roc_auc)
ax.plot([0, 1], [0, 1], 'k--', label='No Skill')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve for Loan Type Classification - Soft Voting')
ax.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc
# ROC curve points and area for the weighted-average ensemble
fpr, tpr, thresholds = roc_curve(y_valid, y_pred_weighted)
roc_auc = auc(fpr, tpr)

# Draw the curve against the diagonal "no skill" baseline
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve (area = %0.4f)' % roc_auc)
ax.plot([0, 1], [0, 1], 'k--', label='No Skill')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve for Loan Type Classification - Weighted')
ax.legend()
plt.show()

# Submission

In [None]:
# Predict positive-class probabilities on the test data with every model
y_pred_test_xgb = xgb_1.predict_proba(df_test_processed)[:, 1]
y_pred_test_lgb = lgb_1.predict_proba(df_test_processed)[:, 1]
y_pred_test_cat = cat_1.predict_proba(df_test_processed)[:, 1]
# The AutoML prediction is computed but is not part of the blend below.
y_pred_test_aml = automl.predict(df_test_processed).data[:, 0]

# Combine predictions using weighted average. Dividing by the weight total keeps
# the result a proper probability even when the weights do not sum to exactly 1.
y_pred_test_ensemble = (weights[0] * y_pred_test_xgb +
                        weights[1] * y_pred_test_lgb +
                        weights[2] * y_pred_test_cat) / sum(weights)

# Create submission file
# NOTE(review): assumes sample_submission.csv lists its rows in the same order
# as test.csv — confirm.
df_sub = pd.read_csv('/kaggle/input/playground-series-s4e10/sample_submission.csv')
df_sub['loan_status'] = y_pred_test_ensemble
df_sub.to_csv('submission_ensemble.csv', index=False)
df_sub.head()