In [22]:
# necessary libraries
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# evaluation matrices
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# machine learning models
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

import cudf as df
import cuml
from cuml.ensemble import RandomForestClassifier
# from cuml.linear_model import LogisticRegression


# import tensorflow as tf

In [23]:
# Load the datasets
X_resampled = pd.read_csv("X_resampled.csv")
y_resampled = pd.read_csv("y_resampled.csv")
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")

In [24]:
# Check the shape of the datasets
print("Shape of X_resampled:", X_resampled.shape)
print("Shape of y_resampled:", y_resampled.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_resampled: (2461812, 8)
Shape of y_resampled: (2461812, 1)
Shape of X_test: (555719, 8)
Shape of y_test: (555719, 1)


In [4]:
# Load the datasets
X_resampled_float = X_resampled.astype(np.float32)
y_resampled_float = y_resampled.astype(np.float32)
X_test_float = X_test.astype(np.float32)
y_test_float = y_test.astype(np.float32)

In [5]:
X_resampled_float.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2461812 entries, 0 to 2461811
Data columns (total 8 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   category               float32
 1   amt                    float32
 2   city                   float32
 3   state                  float32
 4   zip                    float32
 5   merch_lat              float32
 6   merch_long             float32
 7   merchant_mean_encoded  float32
dtypes: float32(8)
memory usage: 75.1 MB


#### Standardization - The process of transforming data features to have a mean of zero and a standard deviation of one

In [14]:
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

In [15]:
y_resampled

Unnamed: 0,is_fraud
0,0
1,0
2,0
3,0
4,0
...,...
2461807,1
2461808,1
2461809,1
2461810,1


#### Normalization - Scaling individual data features to a specific range, typically [0, 1] or [-1, 1], to ensure uniformity

In [16]:
scaler = MinMaxScaler()
X_resampled_scaledN = scaler.fit_transform(X_resampled)
X_test_scaledN = scaler.transform(X_test)

---
## 1) Logistic Regression (LG)

In [17]:
# Define hyperparameter grids
param_grid_lr = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'],
    # 'solver': ['liblinear', 'saga']
}

#### a) LG with standardize data

In [18]:
# Start the timer
start_time = time.time()

In [19]:
grid_search_lr_scaled = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid_lr, cv=3, scoring='accuracy', verbose=2, n_jobs=2)
grid_search_lr_scaled.fit(X_resampled_scaled, y_resampled.values.ravel())
best_params_lr_scaled = grid_search_lr_scaled.best_params_
best_score_lr_scaled = grid_search_lr_scaled.best_score_

print("Best parameters for Logistic Regression with Standardized data: ", best_params_lr_scaled)
print("Best score for Logistic Regression with Standardized data: ", best_score_lr_scaled)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best parameters for Logistic Regression with Standardized data:  {'C': 0.001, 'penalty': 'l2'}
Best score for Logistic Regression with Standardized data:  0.8668549832399876


In [20]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"Hyperparameter tuning for Logistic regression with Standardization execution completed in {execution_time:.2f} minutes.")

Hyperparameter tuning for Logistic regression with Standardization execution completed in 0.30 minutes.


#### b) LG with normalized data

In [21]:
# Start the timer
start_time = time.time()

In [22]:
grid_search_lr_normalized = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid_lr, cv=3, scoring='accuracy', verbose=2, n_jobs=2)
grid_search_lr_normalized.fit(X_resampled_scaledN, y_resampled.values.ravel())
best_params_lr_normalized = grid_search_lr_normalized.best_params_
best_score_lr_normalized = grid_search_lr_normalized.best_score_

print("Best parameters for Logistic Regression with Normalized data: ", best_params_lr_normalized)
print("Best score for Logistic Regression with Normalized data: ", best_score_lr_normalized)

[CV] END ................................C=0.001, penalty=l2; total time=   2.9s
[CV] END ................................C=0.001, penalty=l2; total time=   0.9s
[CV] END .................................C=0.01, penalty=l2; total time=   1.0s
[CV] END ..................................C=0.1, penalty=l2; total time=   1.0s
[CV] END ..................................C=0.1, penalty=l2; total time=   1.0s
[CV] END ....................................C=1, penalty=l2; total time=   1.0s
[CV] END ...................................C=10, penalty=l2; total time=   1.0s
[CV] END ...................................C=10, penalty=l2; total time=   0.9s
[CV] END ..................................C=100, penalty=l2; total time=   0.9s
[CV] END ................................C=0.001, penalty=l2; total time=   1.2s
[CV] END ................................C=0.001, penalty=l2; total time=   1.2s
[CV] END .................................C=0.01, penalty=l2; total time=   1.2s
[CV] END ...................

In [23]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"Hyperparameter tuning for Logistic regression with Normalized execution completed in {execution_time:.2f} minutes.")

Hyperparameter tuning for Logistic regression with Normalized execution completed in 0.34 minutes.


In [42]:
# Evaluate Logistic Regression (Standardized)
best_lr_scaled = grid_search_lr_scaled.best_estimator_
y_pred_lr_scaled = best_lr_scaled.predict(X_test_scaled)

print(f"Logistic Regression (Standardized) Accuracy: {accuracy_score(y_test, y_pred_lr_scaled)*100:.2f}%")
print("Logistic Regression (Standardized) Classification Report:\n", classification_report(y_test, y_pred_lr_scaled))
print("Logistic Regression (Standardized) Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr_scaled))

Logistic Regression (Standardized) Accuracy: 94.85%
Logistic Regression (Standardized) Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97    553574
           1       0.05      0.75      0.10      2145

    accuracy                           0.95    555719
   macro avg       0.53      0.85      0.54    555719
weighted avg       1.00      0.95      0.97    555719

Logistic Regression (Standardized) Confusion Matrix:
 [[525491  28083]
 [   528   1617]]


In [40]:
# Evaluate Logistic Regression (Normalized)
best_lr_normalized = grid_search_lr_normalized.best_estimator_
y_pred_lr_normalized = best_lr_normalized.predict(X_test_scaledN)

print(f"Logistic Regression (Normalized) Accuracy: {accuracy_score(y_test, y_pred_lr_normalized)*100:.2f}%")
print("Logistic Regression (Normalized) Classification Report:\n", classification_report(y_test, y_pred_lr_normalized))
print("Logistic Regression (Normalized) Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr_normalized))

Logistic Regression (Normalized) Accuracy: 94.78%
Logistic Regression (Normalized) Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97    553574
           1       0.05      0.75      0.10      2145

    accuracy                           0.95    555719
   macro avg       0.53      0.85      0.54    555719
weighted avg       1.00      0.95      0.97    555719

Logistic Regression (Normalized) Confusion Matrix:
 [[525074  28500]
 [   528   1617]]


---
## 2) Decision Trees

In [26]:
param_grid_dt = {
    'max_depth': [5, 10, 20, 50, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'max_features': [None, 'sqrt', 'log2'],
    'criterion': ['gini', 'entropy']
}

In [27]:
# Start the timer
start_time = time.time()

In [28]:
# with tf.device('/GPU:0'):
grid_search_dt = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid_dt, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search_dt.fit(X_resampled, y_resampled.values.ravel())
best_params_dt = grid_search_dt.best_params_
best_score_dt = grid_search_dt.best_score_

print("Best parameters for Decision Tree: ", best_params_dt)
print("Best score for Decision Tree: ", best_score_dt)

Fitting 3 folds for each of 270 candidates, totalling 810 fits
Best parameters for Decision Tree:  {'criterion': 'entropy', 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2}
Best score for Decision Tree:  0.9915241293811224


In [29]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"Hyperparameter tuning for Decision Tree execution completed in {execution_time:.2f} minutes.")

Hyperparameter tuning for Decision Tree execution completed in 16.34 minutes.


In [41]:
# Evaluate Decision Tree
best_dt = grid_search_dt.best_estimator_
y_pred_dt = best_dt.predict(X_test)

print(f"Decision Tree Accuracy: {accuracy_score(y_test, y_pred_dt)*100:.2f}%")
print("Decision Tree Classification Report:\n", classification_report(y_test, y_pred_dt))
print("Decision Tree Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))

Decision Tree Accuracy: 97.28%
Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.99    553574
           1       0.09      0.67      0.16      2145

    accuracy                           0.97    555719
   macro avg       0.54      0.82      0.57    555719
weighted avg       1.00      0.97      0.98    555719

Decision Tree Confusion Matrix:
 [[539173  14401]
 [   702   1443]]


---
## 3) Random Forest

In [16]:
param_grid_rf = {
    'n_estimators': [10, 50, 100],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
    'split_criterion': [0,1]
}

In [17]:
# Start the timer
start_time = time.time()

In [18]:
# import tensorflow as tf

# with tf.device('/CPU:0'):
# grid_search_rf = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid_rf, cv=3, scoring='accuracy', verbose=2, n_jobs=2)
grid_search_rf = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid_rf, verbose=1, n_jobs=2)
grid_search_rf.fit(X_resampled_float, y_resampled_float.values.ravel())
best_params_rf = grid_search_rf.best_params_
best_score_rf = grid_search_rf.best_score_

print("Best parameters for Random Forest: ", best_params_rf)
print("Best score for Random Forest: ", best_score_rf)

Fitting 5 folds for each of 648 candidates, totalling 3240 fits
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=10, split_criterion=0; total time=   2.5s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=10, split_criterion=0; total time=   1.8s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=10, split_criterion=0; total time=   1.8s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=10, split_criterion=1; total time=   1.9s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=10, split_criterion=1; total time=   1.9s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50, split_criterion=0; total time=   8.6s
[CV] END bootstrap



[CV] END bootstrap=True, max_depth=5, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100, split_criterion=0; total time=  19.2s
[CV] END bootstrap=True, max_depth=5, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100, split_criterion=0; total time=  19.1s
[CV] END bootstrap=True, max_depth=5, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100, split_criterion=0; total time=  19.1s
[CV] END bootstrap=True, max_depth=5, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100, split_criterion=1; total time=  18.8s
[CV] END bootstrap=True, max_depth=5, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100, split_criterion=1; total time=  18.8s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=10, split_criterion=0; total time=   2.5s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_l

In [19]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"Hyperparameter tuning for Random Forest execution completed in {execution_time:.2f} minutes.")

Hyperparameter tuning for Random Forest execution completed in 362.61 minutes.


In [20]:
# Evaluate Random Forest
best_rf = grid_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test_float)

print("Random Forest Accuracy: ", accuracy_score(y_test_float, y_pred_rf))
print("Random Forest Classification Report:\n", classification_report(y_test_float, y_pred_rf))
print("Random Forest Confusion Matrix:\n", confusion_matrix(y_test_float, y_pred_rf))

Random Forest Accuracy:  0.9787806427349074
Random Forest Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      0.98      0.99    553574
         1.0       0.10      0.58      0.17      2145

    accuracy                           0.98    555719
   macro avg       0.55      0.78      0.58    555719
weighted avg       0.99      0.98      0.99    555719

Random Forest Confusion Matrix:
 [[542680  10894]
 [   898   1247]]
[CV] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100, split_criterion=0; total time=  34.4s
[CV] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100, split_criterion=0; total time=  30.8s
[CV] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100, split_criterion=0; total time=  34.7s
[CV] END bootstrap=False, max_depth=20, max_features

---
## 4) Gradient Boosting Machines (GBM)

In [6]:
param_grid_gbm = {
    'n_estimators': [10, 50, 100],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'subsample': [0.8, 1.0]
}

In [7]:
# Start the timer
start_time = time.time()

In [8]:
grid_search_gbm = GridSearchCV(estimator=GradientBoostingClassifier(), param_grid=param_grid_gbm, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search_gbm.fit(X_resampled, y_resampled.values.ravel())
best_params_gbm = grid_search_gbm.best_params_
best_score_gbm = grid_search_gbm.best_score_

print("Best parameters for Gradient Boosting Machine: ", best_params_gbm)
print("Best score for Gradient Boosting Machine: ", best_score_gbm)

Fitting 3 folds for each of 1296 candidates, totalling 3888 fits
[CV] END learning_rate=0.01, max_depth=3, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.8; total time= 4.1min
[CV] END learning_rate=0.01, max_depth=3, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=10, subsample=0.8; total time=  23.3s
[CV] END learning_rate=0.01, max_depth=3, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=50, subsample=0.8; total time= 2.1min
[CV] END learning_rate=0.01, max_depth=3, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=10, subsample=0.8; total time=  27.1s
[CV] END learning_rate=0.01, max_depth=3, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=10, subsample=1.0; total time=  32.7s
[CV] END learning_rate=0.01, max_depth=3, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50, subsample=1.0; total time= 2.4min
[CV] END learn

In [9]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"Hyperparameter tuning for GBM execution completed in {execution_time:.2f} minutes.")

Hyperparameter tuning for GBM execution completed in 1404.78 minutes.


In [10]:
# Evaluate Gradient Boosting Machine
best_gbm = grid_search_gbm.best_estimator_
y_pred_gbm = best_gbm.predict(X_test)

print(f"Gradient Boosting Machine Accuracy: {accuracy_score(y_test, y_pred_gbm)*100:.2f}%")
print("Gradient Boosting Machine Classification Report:\n", classification_report(y_test, y_pred_gbm))
print("Gradient Boosting Machine Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gbm))

Gradient Boosting Machine Accuracy: 98.83%
Gradient Boosting Machine Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99    553574
           1       0.21      0.74      0.33      2145

    accuracy                           0.99    555719
   macro avg       0.61      0.86      0.66    555719
weighted avg       1.00      0.99      0.99    555719

Gradient Boosting Machine Confusion Matrix:
 [[547651   5923]
 [   560   1585]]


---
## 5) XGBoost Classifier

In [10]:
param_grid_xgb = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3]
}

In [11]:
# Start the timer
start_time = time.time()

In [12]:
grid_search_xgb = GridSearchCV(estimator=xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'), param_grid=param_grid_xgb, cv=3, scoring='accuracy', verbose=2, n_jobs=2)
grid_search_xgb.fit(X_resampled, y_resampled.values.ravel())
best_params_xgb = grid_search_xgb.best_params_
best_score_xgb = grid_search_xgb.best_score_

print("Best parameters for XGBoost: ", best_params_xgb)
print("Best score for XGBoost: ", best_score_xgb)

Fitting 3 folds for each of 1728 candidates, totalling 5184 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   4.5s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=1.0; total time=   3.7s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=1.0; total time=   3.8s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200, subsample=0.8; total time=   6.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200, subsample=0.8; total time=   6.5s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200, subsample=1.0; total time=   7.9s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_d



[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=7, min_child_weight=1, n_estimators=200, subsample=0.8; total time=  10.2s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=7, min_child_weight=1, n_estimators=200, subsample=1.0; total time=   9.6s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=7, min_child_weight=1, n_estimators=200, subsample=1.0; total time=   9.7s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=7, min_child_weight=1, n_estimators=500, subsample=0.8; total time=  44.6s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=7, min_child_weight=1, n_estimators=500, subsample=0.8; total time=  24.5s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=7, min_child_weight=1, n_estimators=500, subsample=1.0; total time=  30.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=7, min_child_weight=5, n_estimators=100, subsample=0.8; tota

In [13]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"Hyperparameter tuning for XGBoost execution completed in {execution_time:.2f} minutes.")

Hyperparameter tuning for XGBoost execution completed in 513.26 minutes.


In [14]:
# Evaluate XGBoost
best_xgb = grid_search_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)

print(f"XGBoost Accuracy: {accuracy_score(y_test, y_pred_xgb)*100:.2f}%")
print("XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))
print("XGBoost Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))

XGBoost Accuracy: 99.21%
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00    553574
           1       0.29      0.71      0.41      2145

    accuracy                           0.99    555719
   macro avg       0.64      0.85      0.70    555719
weighted avg       1.00      0.99      0.99    555719

XGBoost Confusion Matrix:
 [[549799   3775]
 [   628   1517]]
[CV] END colsample_bytree=1.0, gamma=0.3, learning_rate=0.2, max_depth=7, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   5.8s
[CV] END colsample_bytree=1.0, gamma=0.3, learning_rate=0.2, max_depth=7, min_child_weight=1, n_estimators=100, subsample=1.0; total time=   5.4s
[CV] END colsample_bytree=1.0, gamma=0.3, learning_rate=0.2, max_depth=7, min_child_weight=1, n_estimators=200, subsample=0.8; total time=   9.9s
[CV] END colsample_bytree=1.0, gamma=0.3, learning_rate=0.2, max_depth=7, min_child_weight=1, n_estimators=200, subs

In [11]:
print('hello')

hello


#### XGBoost with GPU

In [25]:
param_grid_xgb_gpu = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'tree_method': ['hist'],
    'device': ['cuda:0']
}

In [26]:
# Start the timer
start_time = time.time()

In [None]:
grid_search_xgb_gpu = GridSearchCV(estimator=xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'), param_grid=param_grid_xgb_gpu, cv=3, scoring='accuracy', verbose=2, n_jobs=2)
grid_search_xgb_gpu.fit(X_resampled, y_resampled.values.ravel())
best_params_xgb_gpu = grid_search_xgb_gpu.best_params_
best_score_xgb_gpu = grid_search_xgb_gpu.best_score_

print("Best parameters for XGBoost: ", best_params_xgb_gpu)
print("Best score for XGBoost: ", best_score_xgb_gpu)

Fitting 3 folds for each of 1728 candidates, totalling 5184 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END colsample_bytree=0.8, device=cuda:0, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8, tree_method=hist; total time=   2.7s
[CV] END colsample_bytree=0.8, device=cuda:0, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=1.0, tree_method=hist; total time=   2.2s
[CV] END colsample_bytree=0.8, device=cuda:0, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=1.0, tree_method=hist; total time=   2.1s
[CV] END colsample_bytree=0.8, device=cuda:0, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200, subsample=0.8, tree_method=hist; total time=   4.2s
[CV] END colsample_bytree=0.8, device=cuda:0, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200, subsample=1.0, tree_method=hist; total time=   3.8s
[CV] END colsample_bytree=0.8, device=cuda:0, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [None]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"Hyperparameter tuning for XGBoost with GPU execution completed in {execution_time:.2f} minutes.")

In [None]:
# Evaluate XGBoost
best_xgb_gpu = grid_search_xgb_gpu.best_estimator_
y_pred_xgb_gpu = best_xgb_gpu.predict(X_test_df)

print(f"XGBoost Accuracy: {accuracy_score(y_test_df, y_pred_xgb_gpu)*100:.2f}%")
print("XGBoost Classification Report:\n", classification_report(y_test_df, y_pred_xgb_gpu))
print("XGBoost Confusion Matrix:\n", confusion_matrix(y_test_df, y_pred_xgb_gpu))