# step 1 : Import libraries


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, auc, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE


# Step 2: Load the dataset

In [2]:
data = pd.read_csv("creditcard.csv")

# Step 3: Data preprocessing

## Check for missing values

In [3]:
missing_values = data.isnull().sum()
print("Missing values per feature:")
print(missing_values)


Missing values per feature:
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


## Check for and handle duplicates

In [4]:
# Check for duplicates
duplicates = data.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Drop duplicates (if needed)
data.drop_duplicates(inplace=True)

#Check for duplicate after handling
duplicates = data.duplicated().sum()
print(f"Number of duplicate rows after handling: {duplicates}")

Number of duplicate rows: 1081
Number of duplicate rows after handling: 0


## Identify and handle outliers

In [5]:
# Z-score method for detecting outliers
from scipy import stats
z_scores = np.abs(stats.zscore(data))
outliers = (z_scores > 3).any(axis=1)
print(f"Number of outlier rows: {outliers.sum()}")

# In the context of fraud detection I decide to keep the outliers because they are more likely to be fraudulent transactions


Number of outlier rows: 37930


## Standardize numerical features

In [6]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Standardize the 'Time' and 'Amount' columns in the dataset
# This is done to ensure that these numerical features are on the same scale, which can help improve the performance of the model.
# StandardScaler transforms the data by subtracting the mean and dividing by the standard deviation.
data[['Time', 'Amount']] = scaler.fit_transform(data[['Time', 'Amount']])



# Step 4: Data splitting

In [7]:
# Separate the features (X) and the target variable (y)
# In this case, we are dropping the 'Class' column, which is the target variable, from the dataset to create the feature matrix (X)
# 'Class' column is used as the target variable (y)
X = data.drop('Class', axis=1)
y = data['Class']

# Split the dataset into training and testing sets
# The test set size is 20% of the total dataset, and the train set size is 80%
# 'stratify=y' ensures that the distribution of the target variable (fraud vs. non-fraud) is preserved in both the train and test sets
# 'random_state=42' sets a seed for reproducibility of the random split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)



# Step 5: Handling imbalance data

In [8]:
# Initialize the SMOTE (Synthetic Minority Over-sampling Technique) algorithm with a random seed for reproducibility
smote = SMOTE(random_state=42)

# Use SMOTE to resample the training dataset by generating synthetic samples for the minority class (fraud cases)
# This helps to balance the class distribution in the training data, which can improve the model's performance on the minority class
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


# Step 6: Model Selection and Training

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# 1. Logistic Regression
# Reason: It's a simple, fast, and interpretable linear model that works well when the relationship between the features and the target is approximately linear.
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_resampled, y_train_resampled)
print("Finish training LR")

# 2. Random Forest
# Reason: It's an ensemble method that works well with high-dimensional data and can capture complex patterns by combining multiple decision trees.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_resampled, y_train_resampled)
print("Finish training RF")

# 3. K-Nearest Neighbors (KNN)
# Reason: It's a non-parametric method that can capture non-linear relationships in the data by considering the "neighborhood" of data points.
knn = KNeighborsClassifier()
knn.fit(X_train_resampled, y_train_resampled)
print("Finisn training KNN")

# 4. XGBoost
# Reason: It's an efficient gradient boosting algorithm that can handle a wide range of data and is known for its high performance and speed.
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train_resampled, y_train_resampled)
print("Finish training XGBoost")

# 5. LightGBM
# Reason: It's a gradient boosting framework that uses tree-based learning algorithms and is designed for large datasets, offering better efficiency and speed than other gradient boosting methods.
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train_resampled, y_train_resampled)
print("Finsih training lightBGM")


  from pandas import MultiIndex, Int64Index


Finish training LR
Finish training RF
Finisn training KNN


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Finish training XGBoost
Finsih training lightBGM


# Step 7: Model evaluation

In [11]:
# Define a function to evaluate the performance of a given model on the test dataset
def evaluate_model(model, X_test, y_test):
    # Predict the target class for the test dataset
    y_pred = model.predict(X_test)
    
    # Calculate the precision-recall curve and the area under the curve (AUPRC)
    precision, recall, _ = precision_recall_curve(y_test, y_pred)
    auprc = auc(recall, precision)
    print(f"AUPRC: {auprc}")
    
    # Display the confusion matrix and classification report for the model's performance
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Evaluate and display the performance of each trained model on the test dataset
print("Logistic Regression Evaluation:")
evaluate_model(log_reg, X_test, y_test)

print("Random Forest Evaluation:")
evaluate_model(rf, X_test, y_test)

print("KNN Evaluation:")
evaluate_model(knn, X_test, y_test)

print("XGBoost Evaluation:")
evaluate_model(xgb, X_test, y_test)

print("LightGBM Evaluation:")
evaluate_model(lgbm, X_test, y_test)


Logistic Regression Evaluation:
AUPRC: 0.4634823664422642
Confusion Matrix:
[[55170  1481]
 [   12    83]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     56651
           1       0.05      0.87      0.10        95

    accuracy                           0.97     56746
   macro avg       0.53      0.92      0.54     56746
weighted avg       1.00      0.97      0.99     56746

Random Forest Evaluation:
AUPRC: 0.8348462284087304
Confusion Matrix:
[[56644     7]
 [   23    72]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56651
           1       0.91      0.76      0.83        95

    accuracy                           1.00     56746
   macro avg       0.96      0.88      0.91     56746
weighted avg       1.00      1.00      1.00     56746

KNN Evaluation:
AUPRC: 0.6456158651192072
Confusion Matrix:
[[56563    88]
 [   17    78]]
C

Based on the given results, the Random Forest model performs the best among the selected models. The reasoning behind this conclusion is as follows:

1. AUPRC (Area Under the Precision-Recall Curve): Random Forest has the highest AUPRC (0.8348) among all models, indicating better overall performance in distinguishing between the classes when dealing with imbalanced data.

2. Precision, Recall, and F1-score: Random Forest has the highest precision (0.91) for the positive class (fraud), which means it has the lowest false positive rate among the models. It also has a good recall (0.76), which means it can detect a considerable proportion of the actual fraud cases. The F1-score (0.83) for the positive class in Random Forest is also the highest, indicating a good balance between precision and recall.

3. Confusion Matrix: The confusion matrix of the Random Forest model shows the smallest number of false positives (7) and a relatively low number of false negatives (23) compared to other models.

Although the accuracy is high for all the models, it is not a reliable metric in this case due to the highly imbalanced nature of the dataset. The other metrics mentioned above provide a better perspective on the model's performance.

Considering all these factors, the Random Forest model seems to be the best performer among the selected models for this imbalanced dataset.

# Step 8: Parameter Tunning

In [13]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Create the base model
rf_base = RandomForestClassifier(random_state=42)

# Create the grid search object
grid_search = GridSearchCV(estimator=rf_base, param_grid=param_grid, scoring='recall', cv=5, verbose=2, n_jobs=-1)

# Fit the grid search object to the resampled data
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
rf_tuned = RandomForestClassifier(**best_params, random_state=42)
rf_tuned.fit(X_train_resampled, y_train_resampled)


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 4.5min
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 4.6min
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 4.6min
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 4.6min
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 4.6min
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 9.1min
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 9.1min
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; tot

# Comparing result of pre-tuned and after-tuned model

In [15]:
# Make predictions on the test set
y_pred_tuned = rf_tuned.predict(X_test)
y_pred_proba_tuned = rf_tuned.predict_proba(X_test)[:, 1]

# Calculate the AUPRC
auprc_tuned = average_precision_score(y_test, y_pred_proba_tuned)

# Print the AUPRC
print("Tuned Random Forest Evaluation:")
print(f"AUPRC: {auprc_tuned}")

# Calculate and print the confusion matrix
conf_matrix_tuned = confusion_matrix(y_test, y_pred_tuned)
print("Confusion Matrix:")
print(conf_matrix_tuned)

# Calculate and print the classification report
class_report_tuned = classification_report(y_test, y_pred_tuned)
print("Classification Report:")
print(class_report_tuned)


Tuned Random Forest Evaluation:
AUPRC: 0.8077696451432539
Confusion Matrix:
[[56644     7]
 [   23    72]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56651
           1       0.91      0.76      0.83        95

    accuracy                           1.00     56746
   macro avg       0.96      0.88      0.91     56746
weighted avg       1.00      1.00      1.00     56746



Comparing the pre-tuned and after-tuned results, we see that the AUPRC, confusion matrix, and classification report values are very similar.

Pre-tuned Random Forest:

AUPRC: 0.8348462284087304

Confusion Matrix: [[56644, 7], [23, 72]]

Precision (class 1): 0.91

Recall (class 1): 0.76

F1-score (class 1): 0.83

After-tuned Random Forest:

AUPRC: 0.8077696451432539

Confusion Matrix: [[56644, 7], [23, 72]]

Precision (class 1): 0.91

Recall (class 1): 0.76

F1-score (class 1): 0.83

The metrics show that the performance of the tuned Random Forest model is almost the same as the base model, with a slightly lower AUPRC. The precision, recall, and F1-score for the minority class (class 1) are identical for both models.

In this case, it appears that hyperparameter tuning did not lead to a significant improvement in the model's performance.