In [None]:
import time

import gdown
import numpy as np
import optuna
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [None]:
# Download the datasets
# Two Google Drive files are fetched with gdown (imported in the first cell).
# No output path is passed, so gdown picks the local file names itself; the
# following cells read the CSVs by relative name.
DATASET_URLS = (
    'https://drive.google.com/uc?id=1WY6FU7bmZitRwdho89yrtgMGoOGEi9N5',
    'https://drive.google.com/uc?id=1o-3j2Ugp_Btmw3hMIbRcuQoqhGTY_gBz',
)
for dataset_url in DATASET_URLS:
    gdown.download(dataset_url)

In [None]:
# Raw frame used for fitting the models further down.
train_df = pd.read_csv(
    filepath_or_buffer='Fraudulent_E-Commerce_Transaction_Data.csv',
)

In [None]:
# Raw frame used for scoring the fitted models further down.
test_df = pd.read_csv(
    filepath_or_buffer='Fraudulent_E-Commerce_Transaction_Data_2.csv',
)

In [14]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned, feature-engineered copy of a raw transaction frame.

    The input frame is not modified, so the function can be called repeatedly
    on the same raw data.

    Steps:
      * parse "Transaction Date" and derive "Transaction Day",
        "Transaction DOW" (0 = Monday) and "Transaction Month";
      * repair "Customer Age": values <= -9 are replaced by their absolute
        value, then any value still below 9 is replaced by the rounded mean of
        the valid (>= 9) ages;
      * add "Is Address Match" (1 when shipping and billing address are equal);
      * drop identifier, location, IP, date and address columns;
      * downcast integer and float columns to the smallest fitting dtype.

    Args:
        df: Frame containing at least the columns "Transaction Date",
            "Customer Age", "Shipping Address", "Billing Address",
            "Transaction ID", "Customer ID", "Customer Location" and
            "IP Address".

    Returns:
        A new DataFrame with the transformations above applied.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Work on a copy: mutating the caller's frame made re-running a cell fail
    # with a KeyError on the already-dropped "Transaction Date" column.
    df = df.copy()

    df["Transaction Date"] = pd.to_datetime(df["Transaction Date"])
    df['Transaction Day'] = df["Transaction Date"].dt.day
    df["Transaction DOW"] = df["Transaction Date"].dt.day_of_week
    df["Transaction Month"] = df["Transaction Date"].dt.month

    # Ages at or below -9 are presumably sign-entry errors (TODO confirm with
    # the data owner); flip them back to positive first.
    age = df['Customer Age']
    age = pd.Series(np.where(age <= -9, np.abs(age), age), index=df.index)

    # Compute the imputation value from valid ages only. Taking the mean before
    # the corrections let negative and implausibly small ages drag it down, so
    # the value written into the column could itself be an invalid age.
    mean_value = np.round(age[age >= 9].mean(), 0)
    df['Customer Age'] = np.where(age < 9, mean_value, age)

    df["Is Address Match"] = (df["Shipping Address"] == df["Billing Address"]).astype(int)

    df = df.drop(columns=["Transaction ID", "Customer ID", "Customer Location",
                          "IP Address", "Transaction Date", "Shipping Address", "Billing Address"])

    # Shrink numeric columns to the smallest dtype that holds their values.
    int_col = df.select_dtypes(include="int").columns
    float_col = df.select_dtypes(include="float").columns
    df[int_col] = df[int_col].apply(pd.to_numeric, downcast='integer')
    df[float_col] = df[float_col].apply(pd.to_numeric, downcast='float')
    return df

In [15]:
# clean_data removes "Transaction Date", so the column's presence marks a frame
# that has not been cleaned yet. Because this cell rebinds train_df to the
# cleaned frame, the guard keeps a second execution of the cell from failing
# with a KeyError on the already-removed column.
if "Transaction Date" in train_df.columns:
    train_df = clean_data(train_df)

In [30]:
# Split the cleaned training frame into predictors and the fraud label.
train_data = train_df.drop(columns=["Is Fraudulent"])
train_label = train_df["Is Fraudulent"]

# Clean a copy of the raw test frame so test_df itself stays untouched and this
# cell can be executed again without a KeyError from already-dropped columns.
clean_test_df = clean_data(test_df.copy())
test_data = clean_test_df.drop(columns=["Is Fraudulent"])
test_label = clean_test_df["Is Fraudulent"]

In [31]:
# Object-dtype columns are the ones that get one-hot encoded.
cat_col = train_data.select_dtypes(include="O").columns

# Every remaining column is scaled, except 'Is Address Match', which is left
# out of both lists.
num_col = [
    column_name
    for column_name in train_data.columns
    if column_name != 'Is Address Match' and column_name not in cat_col
]

In [32]:
# Preprocessing applied in front of every model:
#   * categorical columns (cat_col) are one-hot encoded,
#   * numeric columns (num_col) are standardised,
#   * columns in neither list are passed through unchanged.
transformer = ColumnTransformer(transformers=[
    # The models are scored on a separately loaded file; with the default
    # handle_unknown='error' a category that was absent during fit would make
    # predict raise. 'ignore' encodes such a value as an all-zero row instead.
    ('encoding', OneHotEncoder(handle_unknown='ignore'), cat_col),
    ('scaling', StandardScaler(), num_col)
], remainder='passthrough')

In [33]:
# Fixed seed for the estimators that use randomness, so that accuracy figures
# are reproducible across kernel restarts.
RANDOM_STATE = 42

# Candidate models, keyed by the label used in the printed reports.
classifiers = {
    "Logistic Regression" : LogisticRegression(),
    "Bernoulli NB" : BernoulliNB(),
    "Decision Tree" : DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest" : RandomForestClassifier(random_state=RANDOM_STATE),
    "XGB": XGBClassifier(random_state=RANDOM_STATE)
}

In [None]:
# Fit every candidate model on the training data, score it on the test data and
# print a short report. The two lists are filled in parallel: position i of
# `accuracy` belongs to position i of `classifier_name`.
classifier_name = []
accuracy = []
for name, classifier in classifiers.items():

    # Chain preprocessing and estimator so both are fitted by one fit() call.
    model = Pipeline(steps=[
        ('transformer',transformer),
        ('classifier',classifier)
    ])

    # perf_counter is a monotonic clock meant for measuring durations; the
    # wall clock returned by time.time() can jump if the system time changes.
    start_time = time.perf_counter()
    model.fit(train_data, train_label)
    training_time = time.perf_counter() - start_time

    test_pred = model.predict(test_data)

    acc_score = accuracy_score(test_label, test_pred)
    conf_mat = confusion_matrix(test_label, test_pred)
    class_report = classification_report(test_label, test_pred)
    print(f"Classifier name : {name}")
    print(f"Accuracy score : {acc_score}")
    print(f"Confusion matrix : \n{conf_mat}")
    print(f"Classification report :\n{class_report}")
    print(f"{name} Training time: {training_time:.2f} seconds")
    print("="*55)

    classifier_name.append(name)
    accuracy.append(acc_score)

In [None]:
# Leaderboard of the evaluated models, best accuracy first.
(
    pd.DataFrame({"Classifier name": classifier_name, "Accuracy": accuracy})
    .sort_values("Accuracy", ascending=False)
)

In [None]:
def objective(trial):
    """Optuna objective: accuracy of an XGBoost pipeline for one parameter draw.

    Samples a set of XGBoost hyper-parameters from `trial`, fits the shared
    preprocessing + XGBClassifier pipeline on `train_data` / `train_label` and
    returns the accuracy of its predictions on `test_data` / `test_label`.

    Args:
        trial: The optuna trial used to sample the hyper-parameters.

    Returns:
        Accuracy of the fitted pipeline, to be maximised by the study.
    """
    # NOTE(review): each trial is scored on the test set, so the best value
    # reported by the study is tuned to that set and likely optimistic —
    # consider scoring on a validation split of the training data instead.
    params = {
        # NOTE(review): "gpu_hist" needs a CUDA-enabled XGBoost build; recent
        # XGBoost releases appear to prefer tree_method="hist" with
        # device="cuda" — confirm against the installed version.
        "tree_method": "gpu_hist",
        # NOTE(review): the upper bound of 3.0 is well above the [0, 1] range
        # XGBoost documents for the learning rate — possibly meant to be 0.3.
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 3.0),
        "n_estimators": trial.suggest_int("n_estimators", 100, 700),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 1.0),
        "subsample": trial.suggest_float("subsample", 0.01, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.01, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
    }

    # These are XGBoost parameters; RandomForestClassifier rejects keywords
    # such as tree_method / learning_rate / gamma with a TypeError, so the
    # estimator being tuned has to be XGBClassifier.
    classifier = XGBClassifier(**params)

    model = Pipeline(steps=[
        ('transformer',transformer),
        ('classifier',classifier)
    ])
    model.fit(train_data, train_label)

    preds = model.predict(test_data)
    return accuracy_score(test_label, preds)


# Run the hyper-parameter search. TPE is the sampler create_study uses by
# default; passing it explicitly with a seed makes the sequence of proposed
# trials repeatable between runs.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=70)

# Keep the winning configuration around for later cells.
best_params = study.best_params
best_trial = study.best_trial

print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_trial.value)
