In [6]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import PowerTransformer, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, classification_report, precision_recall_curve

# Load the data
file_path = 'real_data_r3.xlsx'
df = pd.read_excel(file_path)

# Separate numerical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Apply KNN Imputer for the numerical columns
knn_imputer = KNNImputer(n_neighbors=5)
df[num_cols] = knn_imputer.fit_transform(df[num_cols])

# Fill missing values for categorical columns
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna('missing')

# Separate features and target variable
X = df.drop(['unique_identifier', 'appl_month', 'default_ind'], axis=1)
y = df['default_ind']

# List of skewed columns and their handling strategy
skewed_cols = {
    'debt_cap': 'power',
    'income_incons': 'power',
    'home_value': 'power',
    'return_payments': 'sqrt',
    'decline_txn': 'sqrt'
}

# Apply Power and Square Root Transformations
for col, method in skewed_cols.items():
    if col in X.columns:
        if method == 'power':
            pt = PowerTransformer(method='yeo-johnson')
            X[col] = pt.fit_transform(X[[col]])
        elif method == 'sqrt':
            X[col] = np.sqrt(X[col] + 1)

# OneHotEncode categorical columns
encoder = OneHotEncoder(sparse_output=False, drop='first')
columns_to_encode = X.select_dtypes(exclude=['number']).columns
encoded_features = encoder.fit_transform(X[columns_to_encode])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(columns_to_encode))

# Update X with encoded features
X = X.drop(columns=columns_to_encode)
X = pd.concat([X, encoded_df], axis=1)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Best parameters found previously
best_params = {
    'subsample': 1,
    'reg_lambda': 0.5,
    'reg_alpha': 1,
    'n_estimators': 200,
    'max_depth': 4,
    'learning_rate': 0.1,
    'gamma': 0,
    'colsample_bytree': 0.8,
    'use_label_encoder': False
}

# Create the XGBClassifier models with the best parameters
xgb1 = XGBClassifier(**best_params)
xgb2 = XGBClassifier(**best_params)
xgb3 = XGBClassifier(**best_params)

# Base models with optimized XGB classifiers
models = [
    ('xgb1', xgb1),
    ('xgb2', xgb2),
    ('xgb3', xgb3)
]

# Meta-model
stacking_clf = StackingClassifier(estimators=models, final_estimator=LogisticRegression())

# Train the stacking classifier
stacking_clf.fit(X_train, y_train)

# Calibrate the model
calibrated_clf = CalibratedClassifierCV(stacking_clf, method='isotonic')
calibrated_clf.fit(X_train, y_train)

# Predict probabilities
y_probs = calibrated_clf.predict_proba(X_test)[:, 1]

# Tune the threshold
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
optimal_threshold = thresholds[np.argmax(f1_scores)]

# Apply the optimal threshold
y_pred = (y_probs >= optimal_threshold).astype(int)

# Evaluate the performance
print("Stacking Classifier Performance on Test Set after Calibration and Threshold Tuning:")
print(f"Optimal Threshold: {optimal_threshold}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Stacking Classifier Performance on Test Set after Calibration and Threshold Tuning:
Optimal Threshold: 0.1816642843529636
Accuracy: 0.9955
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     12427
         1.0       0.61      0.57      0.59        70

    accuracy                           1.00     12497
   macro avg       0.80      0.78      0.79     12497
weighted avg       1.00      1.00      1.00     12497



In [8]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import PowerTransformer, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, classification_report, precision_recall_curve

# Load the data
file_path = 'real_data_r3.xlsx'
df = pd.read_excel(file_path)

# Separate numerical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Identify columns with more than 30% missing data
threshold = 0.3
missing_percent = df.isnull().mean()
cols_to_impute = missing_percent[missing_percent > threshold].index

# Check if the datatype of these columns is a string and apply label encoding if necessary
for col in cols_to_impute:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))

# Apply KNN Imputer for columns with more than 30% missing data
knn_imputer = KNNImputer(n_neighbors=5)
df[cols_to_impute] = knn_imputer.fit_transform(df[cols_to_impute])

# Apply KNN Imputer for the remaining numerical columns
df[num_cols] = knn_imputer.fit_transform(df[num_cols])

# Fill missing values for categorical columns
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna('missing')

# Separate features and target variable
X = df.drop(['unique_identifier', 'appl_month', 'default_ind'], axis=1)
y = df['default_ind']

# List of skewed columns and their handling strategy
skewed_cols = {
    'debt_cap': 'power',
    'income_incons': 'power',
    'home_value': 'power',
    'return_payments': 'sqrt',
    'decline_txn': 'sqrt'
}

# Apply Power and Square Root Transformations
for col, method in skewed_cols.items():
    if col in X.columns:
        if method == 'power':
            pt = PowerTransformer(method='yeo-johnson')
            X[col] = pt.fit_transform(X[[col]])
        elif method == 'sqrt':
            X[col] = np.sqrt(X[col] + 1)

# OneHotEncode categorical columns
encoder = OneHotEncoder(sparse_output=False, drop='first')
columns_to_encode = X.select_dtypes(exclude=['number']).columns
encoded_features = encoder.fit_transform(X[columns_to_encode])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(columns_to_encode))

# Update X with encoded features
X = X.drop(columns=columns_to_encode)
X = pd.concat([X, encoded_df], axis=1)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base models
models = [
    ('xgb', XGBClassifier(colsample_bytree=0.8, learning_rate=0.15, max_depth=4, n_estimators=130, subsample=0.9, use_label_encoder=False)),
    ('rf', RandomForestClassifier(n_estimators=100)),
    ('ada', AdaBoostClassifier(n_estimators=100))
]

# Meta-model
stacking_clf = StackingClassifier(estimators=models, final_estimator=LogisticRegression())

# Train the stacking classifier
stacking_clf.fit(X_train, y_train)

# Calibrate the model
calibrated_clf = CalibratedClassifierCV(stacking_clf, method='isotonic')
calibrated_clf.fit(X_train, y_train)

# Predict probabilities
y_probs = calibrated_clf.predict_proba(X_test)[:, 1]

# Tune the threshold
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
optimal_threshold = thresholds[np.argmax(f1_scores)]

# Apply the optimal threshold
y_pred = (y_probs >= optimal_threshold).astype(int)

# Evaluate the performance
print("Stacking Classifier Performance on Test Set after Calibration and Threshold Tuning:")
print(f"Optimal Threshold: {optimal_threshold}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Stacking Classifier Performance on Test Set after Calibration and Threshold Tuning:
Optimal Threshold: 0.21190913261116404
Accuracy: 0.9961
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     12427
         1.0       0.70      0.53      0.60        70

    accuracy                           1.00     12497
   macro avg       0.85      0.76      0.80     12497
weighted avg       1.00      1.00      1.00     12497



In [12]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.5-cp312-cp312-win_amd64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.5-cp312-cp312-win_amd64.whl (101.1 MB)
   ---------------------------------------- 0.0/101.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.1 MB 330.3 kB/s eta 0:05:06
   ---------------------------------------- 0.1/101.1 MB 1.1 MB/s eta 0:01:37
   ---------------------------------------- 0.3/101.1 MB 2.1 MB/s eta 0:00:48
   ---------------------------------------- 0.4/101.1 MB 2.5 MB/s eta 0:00:41
   ---------------------------------------- 0.6/101.1 MB 2.7 MB/s eta 0:00:38
   ---------------------------------------- 0.7/101.1 MB 2.7 MB/s eta 0:00:38
   ---------------------------------------- 0.8/101.1 MB 2.6 MB/s eta 0:00:39
   ---------------------------------------- 1.0/101.1 MB 2.7 MB/s eta 0:00:37
   ---------------------------------------- 1.

In [13]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import PowerTransformer, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, classification_report, precision_recall_curve
from imblearn.over_sampling import SMOTE

# Load the data
file_path = 'real_data_r3.xlsx'
df = pd.read_excel(file_path)

# Separate numerical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Apply KNN Imputer for the numerical columns
knn_imputer = KNNImputer(n_neighbors=10)
df[num_cols] = knn_imputer.fit_transform(df[num_cols])

# Fill missing values for categorical columns
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna('missing')

# Separate features and target variable
X = df.drop(['unique_identifier', 'appl_month', 'default_ind'], axis=1)
y = df['default_ind']

# List of skewed columns and their handling strategy
skewed_cols = {
    'debt_cap': 'power',
    'income_incons': 'power',
    'home_value': 'power',
    'return_payments': 'sqrt',
    'decline_txn': 'sqrt'
}

# Apply Power and Square Root Transformations
for col, method in skewed_cols.items():
    if col in X.columns:
        if method == 'power':
            pt = PowerTransformer(method='yeo-johnson')
            X[col] = pt.fit_transform(X[[col]])
        elif method == 'sqrt':
            X[col] = np.sqrt(X[col] + 1)

# OneHotEncode categorical columns
encoder = OneHotEncoder(sparse_output=False, drop='first')
columns_to_encode = X.select_dtypes(exclude=['number']).columns
encoded_features = encoder.fit_transform(X[columns_to_encode])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(columns_to_encode))

# Update X with encoded features
X = X.drop(columns=columns_to_encode)
X = pd.concat([X, encoded_df], axis=1)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=4)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=4)

# Adjust class weights for CatBoost and XGB
class_weights = {0: 1, 1: 10}  # Adjust the weights depending on the performance

# Create the XGBClassifier models with the best parameters
xgb1 = XGBClassifier(**best_params, scale_pos_weight=class_weights[1])
xgb2 = XGBClassifier(**best_params, scale_pos_weight=class_weights[1])

# Create a CatBoostClassifier model with class weights
catboost_model = CatBoostClassifier(iterations=200, depth=4, learning_rate=0.1, class_weights=class_weights, silent=True)

# Base models with optimized XGB classifiers and CatBoost
models = [
    ('xgb1', xgb1),
    ('xgb2', xgb2),
    ('catboost', catboost_model)
]

# Meta-model
stacking_clf = StackingClassifier(estimators=models, final_estimator=LogisticRegression())

# Train the stacking classifier
stacking_clf.fit(X_train, y_train)

# Calibrate the model
calibrated_clf = CalibratedClassifierCV(stacking_clf, method='isotonic')
calibrated_clf.fit(X_train, y_train)

# Predict probabilities
y_probs = calibrated_clf.predict_proba(X_test)[:, 1]

# Tune the threshold
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
optimal_threshold = thresholds[np.argmax(f1_scores)]

# Apply the optimal threshold
y_pred = (y_probs >= optimal_threshold).astype(int)

# Evaluate the performance
print("Stacking Classifier Performance on Test Set after Calibration and Threshold Tuning:")
print(f"Optimal Threshold: {optimal_threshold}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Stacking Classifier Performance on Test Set after Calibration and Threshold Tuning:
Optimal Threshold: 0.3976389935978024
Accuracy: 0.9982
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     12457
         1.0       1.00      1.00      1.00     12392

    accuracy                           1.00     24849
   macro avg       1.00      1.00      1.00     24849
weighted avg       1.00      1.00      1.00     24849



In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import PowerTransformer, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, classification_report, precision_recall_curve

# Load the data
file_path = 'real_data_r3.xlsx'
df = pd.read_excel(file_path)

# Separate numerical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Identify columns with more than 30% missing data
threshold = 0.3
missing_percent = df.isnull().mean()
cols_to_impute = missing_percent[missing_percent > threshold].index

# Check if the datatype of these columns is a string and apply label encoding if necessary
for col in cols_to_impute:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))

# Apply KNN Imputer for columns with more than 30% missing data
knn_imputer = KNNImputer(n_neighbors=5)
df[cols_to_impute] = knn_imputer.fit_transform(df[cols_to_impute])

# Apply KNN Imputer for the remaining numerical columns
df[num_cols] = knn_imputer.fit_transform(df[num_cols])

# Fill missing values for categorical columns
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna('missing')

# Separate features and target variable
X = df.drop(['unique_identifier', 'appl_month', 'default_ind'], axis=1)
y = df['default_ind']

# List of skewed columns and their handling strategy
skewed_cols = {
    'debt_cap': 'power',
    'income_incons': 'power',
    'home_value': 'power',
    'return_payments': 'sqrt',
    'decline_txn': 'sqrt'
}

# Apply Power and Square Root Transformations
for col, method in skewed_cols.items():
    if col in X.columns:
        if method == 'power':
            pt = PowerTransformer(method='yeo-johnson')
            X[col] = pt.fit_transform(X[[col]])
        elif method == 'sqrt':
            X[col] = np.sqrt(X[col] + 1)

# OneHotEncode categorical columns
encoder = OneHotEncoder(sparse_output=False, drop='first')
columns_to_encode = X.select_dtypes(exclude=['number']).columns
encoded_features = encoder.fit_transform(X[columns_to_encode])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(columns_to_encode))

# Update X with encoded features
X = X.drop(columns=columns_to_encode)
X = pd.concat([X, encoded_df], axis=1)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base models
models = [
    ('xgb', XGBClassifier(colsample_bytree=0.8, learning_rate=0.15, max_depth=4, n_estimators=130, subsample=0.9, use_label_encoder=False)),
    ('rf', RandomForestClassifier(n_estimators=100)),
    ('ada', AdaBoostClassifier(n_estimators=100))
]

# Meta-model
stacking_clf = StackingClassifier(estimators=models, final_estimator=LogisticRegression())

# Train the stacking classifier
stacking_clf.fit(X_train, y_train)

# Calibrate the model
calibrated_clf = CalibratedClassifierCV(stacking_clf, method='isotonic')
calibrated_clf.fit(X_train, y_train)

# Predict probabilities
# Predict probabilities on the entire dataset
y_probs_full = calibrated_clf.predict_proba(X)[:, 1]

# Tune the threshold
precisions, recalls, thresholds = precision_recall_curve(y, y_probs_full)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
optimal_threshold = thresholds[np.argmax(f1_scores)]

# Apply the optimal threshold to get final predictions
y_pred_full = (y_probs_full >= optimal_threshold).astype(int)

# Evaluate the performance
print("Stacking Classifier Performance on Entire Dataset after Calibration and Threshold Tuning:")
print(f"Optimal Threshold: {optimal_threshold}")
print(f"Accuracy: {accuracy_score(y, y_pred_full):.4f}")
print("Classification Report:")
print(classification_report(y, y_pred_full))


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Stacking Classifier Performance on Entire Dataset after Calibration and Threshold Tuning:
Optimal Threshold: 0.4629652574195594
Accuracy: 0.9992
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     62121
         1.0       0.98      0.88      0.93       363

    accuracy                           1.00     62484
   macro avg       0.99      0.94      0.97     62484
weighted avg       1.00      1.00      1.00     62484



In [9]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import PowerTransformer, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, classification_report, precision_recall_curve

# Load the data
file_path = 'real_data_r3.xlsx'
df = pd.read_excel(file_path)

# Separate numerical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Identify columns with more than 30% missing data
threshold = 0.3
missing_percent = df.isnull().mean()
cols_to_impute = missing_percent[missing_percent > threshold].index

# Check if the datatype of these columns is a string and apply label encoding if necessary
for col in cols_to_impute:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))

# Apply KNN Imputer for columns with more than 30% missing data
knn_imputer = KNNImputer(n_neighbors=5)
df[cols_to_impute] = knn_imputer.fit_transform(df[cols_to_impute])

# Apply KNN Imputer for the remaining numerical columns
df[num_cols] = knn_imputer.fit_transform(df[num_cols])

# Fill missing values for categorical columns
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna('missing')

# Separate features and target variable
X = df.drop(['unique_identifier', 'appl_month', 'default_ind'], axis=1)
y = df['default_ind']

# List of skewed columns and their handling strategy
skewed_cols = {
    'debt_cap': 'power',
    'income_incons': 'power',
    'home_value': 'power',
    'return_payments': 'sqrt',
    'decline_txn': 'sqrt'
}

# Apply Power and Square Root Transformations
for col, method in skewed_cols.items():
    if col in X.columns:
        if method == 'power':
            pt = PowerTransformer(method='yeo-johnson')
            X[col] = pt.fit_transform(X[[col]])
        elif method == 'sqrt':
            X[col] = np.sqrt(X[col] + 1)

# OneHotEncode categorical columns
encoder = OneHotEncoder(sparse_output=False, drop='first')
columns_to_encode = X.select_dtypes(exclude=['number']).columns
encoded_features = encoder.fit_transform(X[columns_to_encode])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(columns_to_encode))

# Update X with encoded features
X = X.drop(columns=columns_to_encode)
X = pd.concat([X, encoded_df], axis=1)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base models
models = [
    ('xgb', XGBClassifier(colsample_bytree=0.8, learning_rate=0.15, max_depth=4, n_estimators=130, subsample=0.9, use_label_encoder=False)),
    ('rf', RandomForestClassifier(n_estimators=100)),
    ('ada', AdaBoostClassifier(n_estimators=100))
]

# Meta-model
stacking_clf = StackingClassifier(estimators=models, final_estimator=LogisticRegression())

# Train the stacking classifier
stacking_clf.fit(X_train, y_train)

# Calibrate the model
calibrated_clf = CalibratedClassifierCV(stacking_clf, method='isotonic')
calibrated_clf.fit(X_train, y_train)

# Predict probabilities
# Predict probabilities on the entire dataset
y_probs_full = calibrated_clf.predict_proba(X)[:, 1]

# Tune the threshold
precisions, recalls, thresholds = precision_recall_curve(y, y_probs_full)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
optimal_threshold = thresholds[np.argmax(f1_scores)]

# Apply the optimal threshold to get final predictions
y_pred_full = (y_probs_full >= optimal_threshold).astype(int)

# Evaluate the performance
submission_df = df[['unique_identifier']].copy()
submission_df['strategy_hit'] = y_pred_full

# Calculate the number of unique variables used
no_of_vars = len(X.columns)

# Add the no_of_vars column to the submission DataFrame
submission_df['no_of_vars'] = no_of_vars

# Save the submission file as a CSV
submission_df.to_csv('submission_4th.csv',index=False)
print("Stacking Classifier Performance on Entire Dataset after Calibration and Threshold Tuning:")
print(f"Optimal Threshold: {optimal_threshold}")
print(f"Accuracy: {accuracy_score(y, y_pred_full):.4f}")
print("Classification Report:")
print(classification_report(y, y_pred_full))


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Stacking Classifier Performance on Entire Dataset after Calibration and Threshold Tuning:
Optimal Threshold: 0.4088533988533989
Accuracy: 0.9992
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     62121
         1.0       0.98      0.88      0.93       363

    accuracy                           1.00     62484
   macro avg       0.99      0.94      0.97     62484
weighted avg       1.00      1.00      1.00     62484



In [17]:
import pandas as pd
import numpy as np

# Initialize a DataFrame to hold the aggregate feature importance
final_feature_importance = pd.DataFrame({'Feature': X.columns})

# Extract and add feature importances from XGBoost
xgb_importances = stacking_clf.named_estimators_['xgb'].feature_importances_
final_feature_importance['XGBoost Importance'] = xgb_importances

# Extract and add feature importances from RandomForest
rf_importances = stacking_clf.named_estimators_['rf'].feature_importances_
final_feature_importance['RandomForest Importance'] = rf_importances

# Extract and add feature importances from AdaBoost (if used)
ada_importances = stacking_clf.named_estimators_['ada'].feature_importances_
final_feature_importance['AdaBoost Importance'] = ada_importances

# Calculate the mean importance of each feature across all models
final_feature_importance['Mean Importance'] = final_feature_importance[['XGBoost Importance', 'RandomForest Importance', 'AdaBoost Importance']].mean(axis=1)

# Sort the DataFrame by the mean importance
final_feature_importance = final_feature_importance.sort_values(by='Mean Importance', ascending=False)

# Save the feature importances to an Excel file
file_path = 'final_feature_importances.xlsx'
final_feature_importance.to_excel(file_path, index=False)

print(f"Final feature importance has been saved to {file_path}")


Final feature importance has been saved to final_feature_importances.xlsx


In [19]:
X_train

Unnamed: 0,risk_score_1,bureau_score,limit,income,old_open_trd_ms,no_open_trd,no_trd,debt_cap,basic_max_trd_tnr,basic_no_trd,...,state_code_VI,state_code_VT,state_code_WA,state_code_WI,state_code_WV,state_code_WY,bureau_src_Bureau 2,bureau_src_No Bureau Hit,addr_mismatch_Y,addr_mismatch_missing
920,0.0,750.0,4000.0,75000.0,0.0,0.0,0.0,-0.043968,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
40257,0.0,775.0,8000.0,320000.0,0.0,0.0,0.0,0.140665,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
42201,2.9,750.0,6000.0,180000.0,19.0,2.0,2.0,0.110008,16.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29352,0.0,725.0,5000.0,70000.0,0.0,0.0,0.0,-2.039703,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
26833,5.4,750.0,1000.0,95000.0,219.0,13.0,18.0,0.018423,219.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54343,0.0,750.0,5000.0,30000.0,0.0,0.0,0.0,-0.095299,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
38158,0.0,800.0,14000.0,120000.0,0.0,0.0,0.0,-0.025540,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
860,2.1,725.0,4000.0,40000.0,97.0,1.0,1.0,-0.064334,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15795,0.1,800.0,30000.0,150000.0,270.0,15.0,28.0,0.045950,216.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import PowerTransformer, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, classification_report, precision_recall_curve
from imblearn.over_sampling import SMOTE

# Load the data
file_path = 'real_data_r3.xlsx'
df = pd.read_excel(file_path)

# Separate numerical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Identify columns with more than 30% missing data
threshold = 0.3
missing_percent = df.isnull().mean()
cols_to_impute = missing_percent[missing_percent > threshold].index

# Check if the datatype of these columns is a string and apply label encoding if necessary
for col in cols_to_impute:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))

# Apply KNN Imputer for columns with more than 30% missing data
knn_imputer = KNNImputer(n_neighbors=5)
df[cols_to_impute] = knn_imputer.fit_transform(df[cols_to_impute])

# Apply KNN Imputer for the remaining numerical columns
df[num_cols] = knn_imputer.fit_transform(df[num_cols])

# Fill missing values for categorical columns
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna('missing')

# Separate features and target variable
X = df.drop(['unique_identifier', 'appl_month', 'default_ind'], axis=1)
y = df['default_ind']

# List of skewed columns and their handling strategy
skewed_cols = {
    'debt_cap': 'power',
    'income_incons': 'power',
    'home_value': 'power',
    'return_payments': 'sqrt',
    'decline_txn': 'sqrt'
}

# Apply Power and Square Root Transformations
for col, method in skewed_cols.items():
    if col in X.columns:
        if method == 'power':
            pt = PowerTransformer(method='yeo-johnson')
            X[col] = pt.fit_transform(X[[col]])
        elif method == 'sqrt':
            X[col] = np.sqrt(X[col] + 1)

# OneHotEncode categorical columns
encoder = OneHotEncoder(sparse_output=False, drop='first')
columns_to_encode = X.select_dtypes(exclude=['number']).columns
encoded_features = encoder.fit_transform(X[columns_to_encode])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(columns_to_encode))

# Update X with encoded features
X = X.drop(columns=columns_to_encode)
X = pd.concat([X, encoded_df], axis=1)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Base models
models = [
    ('xgb', XGBClassifier(colsample_bytree=0.8, learning_rate=0.15, max_depth=4, n_estimators=130, subsample=0.9, use_label_encoder=False)),
    ('rf', RandomForestClassifier(n_estimators=100)),
    ('ada', AdaBoostClassifier(n_estimators=100))
]

# Meta-model
stacking_clf = StackingClassifier(estimators=models, final_estimator=LogisticRegression())

# Train the stacking classifier
stacking_clf.fit(X_train_resampled, y_train_resampled)

# Calibrate the model
calibrated_clf = CalibratedClassifierCV(stacking_clf, method='isotonic')
calibrated_clf.fit(X_train_resampled, y_train_resampled)

# Predict probabilities
y_probs = calibrated_clf.predict_proba(X_test)[:, 1]

# Tune the threshold
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
optimal_threshold = thresholds[np.argmax(f1_scores)]

# Apply the optimal threshold
y_pred = (y_probs >= optimal_threshold).astype(int)

# Evaluate the performance
print("Stacking Classifier Performance on Test Set after Calibration, Threshold Tuning, and SMOTE:")
print(f"Optimal Threshold: {optimal_threshold}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Stacking Classifier Performance on Test Set after Calibration, Threshold Tuning, and SMOTE:
Optimal Threshold: 0.1688888888888889
Accuracy: 0.9955
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     12427
         1.0       0.73      0.31      0.44        70

    accuracy                           1.00     12497
   macro avg       0.86      0.66      0.72     12497
weighted avg       0.99      1.00      0.99     12497



In [3]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import PowerTransformer, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, classification_report, precision_recall_curve
from imblearn.over_sampling import SMOTE

# Load the data
file_path = 'real_data_r3.xlsx'
df = pd.read_excel(file_path)

# Separate numerical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Apply KNN Imputer for the numerical columns
knn_imputer = KNNImputer(n_neighbors=10)
df[num_cols] = knn_imputer.fit_transform(df[num_cols])

# Fill missing values for categorical columns
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna('missing')

# Separate features and target variable
X = df.drop(['unique_identifier', 'appl_month', 'default_ind'], axis=1)
y = df['default_ind']

# List of skewed columns and their handling strategy
skewed_cols = {
    'debt_cap': 'power',
    'income_incons': 'power',
    'home_value': 'power',
    'return_payments': 'sqrt',
    'decline_txn': 'sqrt'
}

# Apply Power and Square Root Transformations
for col, method in skewed_cols.items():
    if col in X.columns:
        if method == 'power':
            pt = PowerTransformer(method='yeo-johnson')
            X[col] = pt.fit_transform(X[[col]])
        elif method == 'sqrt':
            X[col] = np.sqrt(X[col] + 1)

# OneHotEncode categorical columns
encoder = OneHotEncoder(sparse_output=False, drop='first')
columns_to_encode = X.select_dtypes(exclude=['number']).columns
encoded_features = encoder.fit_transform(X[columns_to_encode])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(columns_to_encode))

# Update X with encoded features
X = X.drop(columns=columns_to_encode)
X = pd.concat([X, encoded_df], axis=1)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

# Apply SMOTE to the training data only
smote = SMOTE(random_state=4)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Adjust class weights for CatBoost and XGB
class_weights = {0: 1, 1: 15}  # Adjust the weights depending on the performance
# Best parameters (provided in earlier context or tune accordingly)
best_params = {
    'subsample': 1,
    'reg_lambda': 0.5,
    'reg_alpha': 1,
    'n_estimators': 200,
    'max_depth': 4,
    'learning_rate': 0.1,
    'gamma': 0,
    'colsample_bytree': 0.8,
    'use_label_encoder': False
}

# Create the XGBClassifier models with the best parameters
xgb1 = XGBClassifier(**best_params, scale_pos_weight=class_weights[1])
xgb2 = XGBClassifier(**best_params, scale_pos_weight=class_weights[1])

# Create a CatBoostClassifier model with class weights
catboost_model = CatBoostClassifier(iterations=200, depth=4, learning_rate=0.1, class_weights=class_weights, silent=True)

# Base models with optimized XGB classifiers and CatBoost
models = [
    ('xgb1', xgb1),
    ('xgb2', xgb2),
    ('catboost', catboost_model)
]

# Meta-model
stacking_clf = StackingClassifier(estimators=models, final_estimator=LogisticRegression())

# Train the stacking classifier
stacking_clf.fit(X_train_resampled, y_train_resampled)

# Calibrate the model
calibrated_clf = CalibratedClassifierCV(stacking_clf, method='isotonic')
calibrated_clf.fit(X_train_resampled, y_train_resampled)

# Predict probabilities
y_probs = calibrated_clf.predict_proba(X_test)[:, 1]

# Tune the threshold
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
optimal_threshold = thresholds[np.argmax(f1_scores)]

# Apply the optimal threshold
y_pred = (y_probs >= optimal_threshold).astype(int)

# Evaluate the performance
print("Stacking Classifier Performance on Test Set after Calibration and Threshold Tuning:")
print(f"Optimal Threshold: {optimal_threshold}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Stacking Classifier Performance on Test Set after Calibration and Threshold Tuning:
Optimal Threshold: 0.5045346978916958
Accuracy: 0.9966
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     12439
         1.0       0.69      0.47      0.56        58

    accuracy                           1.00     12497
   macro avg       0.84      0.73      0.78     12497
weighted avg       1.00      1.00      1.00     12497

