# **Introduction**

This project involves building a predictive model to calculate a "Behaviour Score" for Bank A's credit card customers. The Behaviour Score predicts the likelihood of a customer defaulting on their credit card payments. The following steps outline the approach used for data preprocessing, feature selection, and model training.

In [13]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np

In [15]:
# Read the training and validation CSVs into DataFrames.
# Both paths are relative to the notebook's working directory.
train_file = 'training_dataset.csv'
val_file = 'validation_dataset.csv'

train_data, val_data = (pd.read_csv(path) for path in (train_file, val_file))

In [16]:
# Preprocessing: split off the target, standardize the features, and drop
# low-variance columns. Produces X_train / X_val as NumPy arrays, y_train /
# y_val as Series (y_val is None for unlabeled validation data), and
# feature_names holding the names of the columns that survive selection.

# Separate features and target variable
target_col = 'bad_flag'
X_train = train_data.drop(columns=[target_col])
y_train = train_data[target_col]

# Separate features and target variable for validation dataset.
# The validation file may be unlabeled, in which case y_val stays None.
y_val = val_data[target_col] if target_col in val_data.columns else None
X_val = val_data.drop(columns=[target_col]) if target_col in val_data.columns else val_data

# NOTE(review): val_data carries an 'account_number' column (it is written to
# the predictions file later), so it is part of X_val here and is scaled and
# fed to the model like any other feature. Confirm whether an identifier
# should be excluded from the feature matrix.

# Capture the column order BEFORE scaling: StandardScaler returns a bare NumPy
# array, so feature names can only be mapped to column positions from here.
# (The original code looked names up in `feature_names` before it was defined.)
all_feature_names = np.array(X_train.columns)

# Standardize the features (statistics are fitted on the training data only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Get indices of features to normalize; names missing from the data are skipped
features_to_normalize = ['credit_activity_intensity', 'credit_limit_utilization']
column_order = list(all_feature_names)
normalize_indices = [column_order.index(f) for f in features_to_normalize if f in column_order]

# Normalize using indices.
# NOTE(review): these columns were already standardized above, so this second
# pass leaves them unchanged up to floating-point rounding. It is kept to
# preserve the original pipeline; consider removing it.
scaler = StandardScaler()
for idx in normalize_indices:
    X_train[:, idx] = scaler.fit_transform(X_train[:, [idx]]).flatten()
    X_val[:, idx] = scaler.transform(X_val[:, [idx]]).flatten()

# Feature selection (remove low-variance features).
# NOTE(review): after standardization every non-constant column has variance
# ~1, so this threshold can only drop constant columns. If the intent was to
# filter on raw variance, the selector should run before scaling.
selector = VarianceThreshold(threshold=0.1)
X_train = selector.fit_transform(X_train)
X_val = selector.transform(X_val)
selected_features = selector.get_support(indices=True)
feature_names = all_feature_names[selected_features]

# Train XGBoost with a fixed, hand-picked set of hyperparameters.

# Ratio of negative to positive labels, passed to the classifier as
# scale_pos_weight to compensate for class imbalance.
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
best_score = 0  # not read anywhere in the visible code; kept in case later cells use it

# Early stopping needs an evaluation set, so only enable it when the
# validation data is labeled. It is configured on the estimator (next to
# eval_metric) because passing it to fit() was deprecated in XGBoost 1.6 and
# is rejected by later releases.
early_stopping_rounds = 10 if y_val is not None else None

xgb_model = XGBClassifier(
    learning_rate=0.3,
    max_depth=6,
    n_estimators=300,
    scale_pos_weight=scale_pos_weight,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42,
    eval_metric='logloss',
    early_stopping_rounds=early_stopping_rounds,
)

if y_val is not None:
    xgb_model.fit(
        X_train, y_train,
        # Both sets are logged; XGBoost uses the LAST entry (validation) as
        # the early-stopping criterion.
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=True,
    )
else:
    # Unlabeled validation data: train for the full n_estimators rounds.
    xgb_model.fit(
        X_train, y_train,
        verbose=True,
    )

# Score the fitted model on the same data it was trained on.
# These are in-sample metrics, not an estimate of generalization.
y_train_pred_proba = xgb_model.predict_proba(X_train)[:, 1]  # P(bad_flag == 1)
y_train_pred = xgb_model.predict(X_train)

# Label-based metrics use the hard predictions; ROC AUC uses the probabilities.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)
train_roc_auc = roc_auc_score(y_train, y_train_pred_proba)

report_lines = [
    "XGBoost Training Set Performance:",
    f"Accuracy: {train_accuracy:.2f}",
    f"Precision: {train_precision:.2f}",
    f"Recall: {train_recall:.2f}",
    f"F1 Score: {train_f1:.2f}",
    f"ROC AUC Score: {train_roc_auc:.2f}",
]
print("\n".join(report_lines))

# Predict the positive-class probability for every validation row.
y_pred_proba = xgb_model.predict_proba(X_val)[:, 1]

# Attach the scores to the validation frame and export one row per account.
val_data['predicted_probability'] = y_pred_proba
output_columns = ['account_number', 'predicted_probability']
output_data = val_data[output_columns]
output_data.to_csv('xgboost_predictions.csv', index=False)
print("Predictions saved to 'xgboost_predictions.csv'")


XGBoost Training Set Performance:
Accuracy: 1.00
Precision: 0.94
Recall: 1.00
F1 Score: 0.97
ROC AUC Score: 1.00
Predictions saved to 'xgboost_predictions.csv'
