In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import recall_score, precision_score, f1_score, classification_report
import warnings

In [22]:
df = pd.read_csv('/content/drive/MyDrive/wetransfer_credit_card_behaviour_scores-docx_2025-01-07_0850/Dev_data_to_be_shared.csv')

In [23]:
def recall_optimized_train(df, target_col='bad_flag', test_size=0.2, random_state=42, min_precision=0.015):

    print("Starting processing...")

    # Separate features and target
    X = df.drop(target_col, axis=1)
    y = df[target_col]

    # Focus on bureau features and top transaction features
    bureau_cols = [col for col in X.columns if 'bureau' in col]
    transaction_cols = [col for col in X.columns if 'transaction' in col]

    # Select top transaction features based on variance
    X_trans = X[transaction_cols]
    variances = X_trans.var()
    top_trans_features = variances.nlargest(20).index

    # Combine selected features
    selected_features = list(bureau_cols) + list(top_trans_features)
    X = X[selected_features]

    # Fill missing values
    X = X.fillna(-999)

    print(f"Selected {len(selected_features)} features")

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    print("Converting to DMatrix format...")
    dtrain = xgb.DMatrix(X_train, label=y_train, nthread=-1)
    dtest = xgb.DMatrix(X_test, label=y_test, nthread=-1)

    # Parameters optimized for recall
    params = {
        'objective': 'binary:logistic',
        'eval_metric': ['auc', 'aucpr'],
        'scale_pos_weight': (len(y_train[y_train==0]) / len(y_train[y_train==1]))*2,
        'max_depth': 4,
        'learning_rate': 0.08,
        'subsample': 0.8,
        'colsample_bytree': 0.6,
        'tree_method': 'hist',
        'max_bin': 256,
        'grow_policy': 'lossguide',
        'max_leaves': 32,
        'min_child_weight': 2,
        'nthread': -1
    }

    print("\nTraining model...")
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=50,
        evals=[(dtrain, 'train'), (dtest, 'eval')],
        early_stopping_rounds=5,
        verbose_eval=5
    )

    # Try many thresholds to find optimal recall while maintaining minimum precision
    thresholds = np.arange(0.1, 0.5, 0.02)
    best_threshold = 0.5
    best_recall = 0
    predictions = model.predict(dtest)

    print("\nFinding optimal threshold for recall while maintaining minimum precision...")
    for threshold in thresholds:
        y_pred = (predictions > threshold).astype(int)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)

        # Only consider thresholds that maintain minimum precision
        if precision >= min_precision and recall > best_recall:
            best_recall = recall
            best_threshold = threshold

    # Final predictions with best threshold
    y_pred = (predictions > best_threshold).astype(int)

    print(f"\nBest threshold found: {best_threshold:.3f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Calculate cost metrics
    n_false_negatives = sum((y_test == 1) & (y_pred == 0))
    n_false_positives = sum((y_test == 0) & (y_pred == 1))

    print("\nBusiness Impact Metrics:")
    print(f"Number of missed defaults (False Negatives): {n_false_negatives}")
    print(f"Number of incorrectly flagged good customers (False Positives): {n_false_positives}")
    print(f"Percentage of actual defaults caught: {best_recall*100:.2f}%")

    # Feature importance
    importance_scores = model.get_score(importance_type='gain')
    feature_importance = pd.DataFrame({
        'feature': list(importance_scores.keys()),
        'importance': list(importance_scores.values())
    }).sort_values('importance', ascending=False)

    print("\nTop 10 Important Features:")
    print(feature_importance.head(10))

    return {
        'model': model,
        'selected_features': list(X.columns),
        'feature_importance': feature_importance,
        'best_threshold': best_threshold,
        'recall': best_recall
    }

In [24]:
results = recall_optimized_train(df)

Starting processing...
Selected 522 features
Converting to DMatrix format...

Training model...
[0]	train-auc:0.75211	train-aucpr:0.03918	eval-auc:0.72790	eval-aucpr:0.03820
[5]	train-auc:0.80248	train-aucpr:0.05500	eval-auc:0.76820	eval-aucpr:0.04449
[10]	train-auc:0.81324	train-aucpr:0.05984	eval-auc:0.77126	eval-aucpr:0.04841
[11]	train-auc:0.81395	train-aucpr:0.06028	eval-auc:0.77023	eval-aucpr:0.04876

Finding optimal threshold for recall while maintaining minimum precision...

Best threshold found: 0.340

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.10      0.17     19088
           1       0.02      0.99      0.03       274

    accuracy                           0.11     19362
   macro avg       0.51      0.54      0.10     19362
weighted avg       0.98      0.11      0.17     19362


Business Impact Metrics:
Number of missed defaults (False Negatives): 2
Number of incorrectly flagged good customers (False Positive

In [25]:
validation_df = pd.read_csv("/content/drive/MyDrive/wetransfer_credit_card_behaviour_scores-docx_2025-01-07_0850/validation_data_to_be_shared.csv")

In [26]:
def predict_probabilities(validation_df, trained_results):

    # Select only the features used in training
    features = trained_results['selected_features']

    # Check if all features exist in validation set
    missing_features = set(features) - set(validation_df.columns)
    if missing_features:
        print(f"Warning: Missing features in validation set: {missing_features}")
        # Only use features that exist in validation set
        features = list(set(features) - missing_features)

    # Prepare the data
    X_val = validation_df[features]
    X_val = X_val.fillna(-999)  # Same as in training

    # Convert to DMatrix
    dval = xgb.DMatrix(X_val)

    # Get raw probabilities
    probabilities = trained_results['model'].predict(dval)

    # Add predictions to a copy of the validation data
    results_df = validation_df.copy()
    results_df['default_probability'] = probabilities

    # Sort by probability in descending order to see highest risk first
    results_df = results_df.sort_values('default_probability', ascending=False)

    print("\nPrediction Summary:")
    print(f"Number of customers: {len(results_df)}")
    print("\nProbability Distribution:")
    print(results_df['default_probability'].describe())

    return results_df

In [27]:
predictions_df = predict_probabilities(validation_df, results)


Prediction Summary:
Number of customers: 41792

Probability Distribution:
count    41792.000000
mean         0.559904
std          0.150351
min          0.214761
25%          0.416065
50%          0.585929
75%          0.689913
max          0.806705
Name: default_probability, dtype: float64


In [14]:
final_predictions = predictions_df[['account_number', 'default_probability']]

In [15]:
output_path = "/content/drive/MyDrive/wetransfer_credit_card_behaviour_scores-docx_2025-01-07_0850/predictions.csv"
final_predictions.to_csv(output_path, index=False)
print(f"\nPredictions saved to {output_path}")


Predictions saved to /content/drive/MyDrive/wetransfer_credit_card_behaviour_scores-docx_2025-01-07_0850/predictions.csv


In [31]:
final_predictions_df = pd.read_csv("/content/drive/MyDrive/wetransfer_credit_card_behaviour_scores-docx_2025-01-07_0850/predictions.csv")

In [37]:
final_predictions_df.head()

Unnamed: 0,account_number,default_probability
0,125136,0.806705
1,130026,0.806705
2,105244,0.806705
3,129382,0.806705
4,139583,0.806705
