In [2]:
!pip install catboost


^C


In [1]:
import joblib
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score, average_precision_score
from scipy.stats import uniform, randint


ModuleNotFoundError: No module named 'catboost'

In [None]:

# Load data
data = joblib.load("/content/without_smote_preprocessed_data.pkl")
X_train = data["X_train_scaled"]
X_test = data["X_test_scaled"]
y_train = data["y_train"]
y_test = data["y_test"]


In [None]:

# Step 1: Define model
cat_model = CatBoostClassifier(
    scale_pos_weight=100,
    eval_metric='PRAUC',
    early_stopping_rounds=10,
    verbose=0
)

In [None]:
# Step 2: Define hyperparameter search space
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],  # Discrete values work better than uniform
    'depth': [4, 5, 6],                   # Focus near XGBoost's best depth (5)
    'l2_leaf_reg': [0.1, 0.5, 1, 5],      # Wider regularization range
    'subsample': [0.6, 0.8],               # Test higher subsampling
    'iterations': [100, 200],              # Fewer trees for faster tuning
    'grow_policy': ['SymmetricTree', 'Depthwise']  # Tree growth strategies
}


In [None]:
# Step 3: StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Step 4: Randomized Search
random_search = RandomizedSearchCV(
    estimator=cat_model,
    param_distributions=param_grid,
    n_iter=30,
    scoring='average_precision',
    cv=cv,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

#Step 5: Fit RandomizedSearchCV to the training data
random_search.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)]
)

# Best model
best_cat_model = random_search.best_estimator_
print("\nBest hyperparameters:", random_search.best_params_)


Fitting 5 folds for each of 30 candidates, totalling 150 fits

Best hyperparameters: {'subsample': 0.6, 'learning_rate': 0.1, 'l2_leaf_reg': 5, 'iterations': 200, 'grow_policy': 'Depthwise', 'depth': 6}


In [None]:
print(best_cat_model)

<catboost.core.CatBoostClassifier object at 0x7c5e8f2f02d0>


In [None]:
# Make predictions on the test set
y_pred = best_cat_model.predict(X_test)
y_train_pred=best_cat_model.predict(X_train)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix,auc

# Train metrics
print("Train metrics:")
print("Accuracy Score:", accuracy_score(y_train, y_train_pred))  # Corrected to y_train
print("Recall Score:", recall_score(y_train, y_train_pred, average="macro"))  # Corrected to y_train
print("Precision Score:", precision_score(y_train, y_train_pred, average="macro"))  # Corrected to y_train
print("F1 Score:", f1_score(y_train, y_train_pred, average="macro"))  # Corrected to y_train
print("Confusion matrix:", confusion_matrix(y_train, y_train_pred))  # Corrected to y_train

from sklearn.metrics import precision_recall_curve, auc

# Get predicted probabilities for the positive class (fraud)
y_proba = best_cat_model.predict_proba(X_train)[:, 1]

# Calculate precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_train, y_proba)

# Compute PR-AUC
pr_auc = auc(recall, precision)
print(f"PR-AUC: {pr_auc:.4f}")



# Test metrics
print("\nTest metrics:")
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Recall Score:", recall_score(y_test, y_pred, average="macro"))
print("Precision Score:", precision_score(y_test, y_pred, average="macro"))
print("F1 Score:", f1_score(y_test, y_pred, average="macro"))
print("Confusion matrix:", confusion_matrix(y_test, y_pred))

from sklearn.metrics import precision_recall_curve, auc

# Get predicted probabilities for the positive class (fraud)
y_proba = best_cat_model.predict_proba(X_test)[:, 1]

# Calculate precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)

# Compute PR-AUC
pr_auc = auc(recall, precision)
print(f"PR-AUC: {pr_auc:.4f}")


Train metrics:
Accuracy Score: 0.999052779980615
Recall Score: 0.9334983594058976
Precision Score: 0.8325468192000525
F1 Score: 0.8763413850061352
Confusion matrix: [[226437    165]
 [    50    328]]
PR-AUC: 0.8221

Test metrics:
Accuracy Score: 0.9990131463010609
Recall Score: 0.8839104433516758
Precision Score: 0.8409272828586623
F1 Score: 0.8611389852905318
Confusion matrix: [[56617    34]
 [   22    73]]
PR-AUC: 0.7244


Threshold Tuning

In [None]:
y_proba = best_cat_model.predict_proba(X_test)[:, 1]
new_threshold = 0.6 # Lower = higher recall
y_pred = (y_proba >= new_threshold).astype(int)

In [None]:

# Train metrics
print("Train metrics:")
print("Accuracy Score:", accuracy_score(y_train, y_train_pred))  # Corrected to y_train
print("Recall Score:", recall_score(y_train, y_train_pred, average="macro"))  # Corrected to y_train
print("Precision Score:", precision_score(y_train, y_train_pred, average="macro"))  # Corrected to y_train
print("F1 Score:", f1_score(y_train, y_train_pred, average="macro"))  # Corrected to y_train
print("Confusion matrix:", confusion_matrix(y_train, y_train_pred))  # Corrected to y_train

from sklearn.metrics import precision_recall_curve, auc

# Get predicted probabilities for the positive class (fraud)
y_proba = best_cat_model.predict_proba(X_train)[:, 1]

# Calculate precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_train, y_proba)

# Compute PR-AUC
pr_auc = auc(recall, precision)
print(f"PR-AUC: {pr_auc:.4f}")



# Test metrics
print("\nTest metrics:")
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Recall Score:", recall_score(y_test, y_pred, average="macro"))
print("Precision Score:", precision_score(y_test, y_pred, average="macro"))
print("F1 Score:", f1_score(y_test, y_pred, average="macro"))
print("Confusion matrix:", confusion_matrix(y_test, y_pred))

from sklearn.metrics import precision_recall_curve, auc

# Get predicted probabilities for the positive class (fraud)
y_proba = best_cat_model.predict_proba(X_test)[:, 1]

# Calculate precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)

# Compute PR-AUC
pr_auc = auc(recall, precision)
print(f"PR-AUC: {pr_auc:.4f}")

Train metrics:
Accuracy Score: 0.999052779980615
Recall Score: 0.9334983594058976
Precision Score: 0.8325468192000525
F1 Score: 0.8763413850061352
Confusion matrix: [[226437    165]
 [    50    328]]
PR-AUC: 0.8221

Test metrics:
Accuracy Score: 0.9992774821132767
Recall Score: 0.8840428328946671
Precision Score: 0.896544969387019
F1 Score: 0.8901934039659647
Confusion matrix: [[56632    19]
 [   22    73]]
PR-AUC: 0.7244


In [None]:
joblib.dump(best_cat_model, 'cat_model.pkl')

['cat_model.pkl']