In [None]:
# This notebook implements a binary classification pipeline using XGBoost. The goal is to accurately classify observations in the dataset based on feature input.

In [None]:
!pip install xgboost




In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc
from xgboost import XGBClassifier, plot_importance
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Only prompt for an upload when the dataset is not already in the working
# directory, so re-running the notebook does not ask for the file again.
import os

if os.path.exists('creditcard.csv'):
    uploaded = {}  # nothing new was uploaded; keep the name defined
else:
    # google.colab is imported lazily because it is only needed for the upload.
    from google.colab import files
    uploaded = files.upload()

Saving creditcard.csv to creditcard.csv


In [None]:
# Read creditcard.csv and report how many rows fall into each `Class` value.
df = pd.read_csv('creditcard.csv')
class_counts = df['Class'].value_counts()
print(class_counts)


Class
0    284315
1       492
Name: count, dtype: int64


In [None]:
# Target vector is the `Class` column; the feature matrix is every other column.
y = df['Class']
X = df.drop(columns='Class')


In [None]:
# Data Preprocessing

# We split the dataset into stratified training and testing sets and standardize every feature column (not only `Amount`) with `StandardScaler`. Since the dataset is highly imbalanced, we take care to evaluate the model with appropriate metrics later.


In [None]:
# Split first, then fit the scaler on the training rows only. Fitting the
# scaler on the full dataset would leak test-set statistics (mean/variance)
# into the data the model is trained on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn scaling parameters from train only
X_test = scaler.transform(X_test_raw)        # reuse the train-fitted parameters

# Kept so the `X_scaled` name stays defined: all rows, transformed with the
# train-fitted scaler (no test-set statistics involved).
X_scaled = scaler.transform(X)


In [None]:
# Ratio of negative to positive examples, computed from the training labels
# only so that no test-set information is used when configuring the model.
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()


In [None]:
# Model Training with XGBoost

# We train an `XGBClassifier` and tune its hyperparameters with `GridSearchCV`, selecting the parameter combination with the best cross-validated ROC AUC.


In [None]:
# Base estimator that is handed to the grid search.
# `use_label_encoder` is deliberately not passed: the installed XGBoost reports
# it as an unused parameter and emits a warning on every fit.
xgb_model = XGBClassifier(
    scale_pos_weight=scale_pos_weight,  # weight applied to the positive class
    eval_metric='logloss',
    random_state=42
)


In [None]:
# Hyperparameter search space: 2 x 2 x 2 x 2 = 16 candidate combinations.
param_grid = dict(
    max_depth=[3, 5],
    learning_rate=[0.01, 0.1],
    n_estimators=[100, 200],
    subsample=[0.8, 1.0],
)


In [None]:
# Exhaustive search over `param_grid`; each candidate is scored by ROC AUC
# using 3-fold cross-validation on the training split.
search_options = {'scoring': 'roc_auc', 'cv': 3, 'verbose': 1}
grid = GridSearchCV(xgb_model, param_grid, **search_options)

grid.fit(X_train, y_train)


Fitting 3 folds for each of 16 candidates, totalling 48 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

In [None]:
# Keep the best estimator found by the grid search; it is used for evaluation
# and the feature-importance plot.
best_model = grid.best_estimator_


In [None]:
# Evaluation Metrics

# To assess model performance, we use precision, recall, F1-score, ROC AUC, the confusion matrix, and the precision-recall curve. These metrics are critical because of the class imbalance (fraud cases are < 0.2% of the data).


In [None]:
# Score the held-out test set: per-class probabilities first, then hard labels.
test_probabilities = best_model.predict_proba(X_test)
y_proba = test_probabilities[:, 1]  # column 1 = probability of class 1
y_pred = best_model.predict(X_test)


In [None]:
# Text summary of test-set performance: confusion matrix, per-class
# precision/recall/F1 report, and ROC AUC computed from the class-1 probabilities.
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

for summary in (cm, report):
    print(summary)
print("ROC AUC Score:", roc_auc)


In [None]:
# Confusion matrix as an annotated heatmap (integer counts in each cell).
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    confusion_matrix(y_test, y_pred),
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=ax,
)
ax.set(title="Confusion Matrix", xlabel="Predicted", ylabel="Actual")
plt.show()


In [None]:
# Precision-recall curve over all decision thresholds of the class-1 probability.
precision, recall, _ = precision_recall_curve(y_test, y_proba)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(recall, precision, label='PR Curve')
ax.set(xlabel='Recall', ylabel='Precision', title='Precision-Recall Curve')
ax.legend()
ax.grid()
plt.show()


In [None]:
# Feature Importance

# Using XGBoost’s built-in importance scores, we visualize which features contribute most to the classification decisions.


In [None]:
# plot_importance creates its own figure unless an Axes is passed in, so the
# Axes is created here with the desired size and handed over explicitly.
# Otherwise an empty 10x6 figure is shown and the plot uses the default size.
fig, ax = plt.subplots(figsize=(10, 6))
plot_importance(best_model, max_num_features=10, ax=ax)
ax.set_title("Top 10 Important Features")
plt.show()
