In [1]:
import pandas as pd 
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC, OneClassSVM
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt

In [2]:
class DataProcessor:
    """Load, clean, encode and split the tabular dataset stored in a CSV file.

    Intended call order::

        load_data() -> knn_impute() -> scale_data() -> one_hot_encode() -> split_data()

    After ``split_data`` the train / validation / test partitions are
    available as ``X_train``, ``X_val``, ``X_test`` and ``y_train``,
    ``y_val``, ``y_test``.
    """

    def __init__(self, csv_file_path):
        """Store the CSV path and the column configuration; no I/O happens here.

        Parameters
        ----------
        csv_file_path : str or path-like
            Location of the CSV file read by ``load_data``.
        """
        self.csv_file_path = csv_file_path
        self.data = None
        # Columns to min-max scale. When left empty, ``scale_data`` falls back
        # to every numeric feature column (see ``_columns_to_scale``).
        self.to_scale = []
        self.categorical_cols = ['device_os', 'source', 'payment_type', 'employment_status']
        self.cols_to_del = ['housing_status', 'device_fraud_count', 'bank_branch_count_8w','month', 'prev_address_months_count', 'bank_months_count', 'days_since_request', 'proposed_credit_limit']
        # Columns in which the value -1 is replaced by NaN and then imputed.
        self.col_with_nan = ['current_address_months_count']
        # Every split attribute exists from construction on, so reading one
        # before ``split_data`` yields None instead of raising AttributeError.
        self.X_train = None
        self.X_val = None
        self.X_test = None
        self.y_train = None
        self.y_val = None
        self.y_test = None

    def load_data(self):
        """Read the CSV, drop ``cols_to_del`` and replace -1 with NaN in ``col_with_nan``.

        Raises
        ------
        KeyError
            If any column listed in ``cols_to_del`` is absent from the file.
        """
        raw = pd.read_csv(self.csv_file_path)
        self.data = raw.drop(columns=self.cols_to_del)
        # Whole-column assignment (rather than ``.loc[:, cols] = ...``) lets an
        # integer column be replaced by a float one that can hold NaN, instead
        # of relying on an in-place dtype upcast.
        self.data[self.col_with_nan] = self.data[self.col_with_nan].replace(-1, np.nan)

    def knn_impute(self):
        """Fill the NaNs in ``col_with_nan`` with ``KNNImputer`` and cast the result to int."""
        # NOTE(review): only ``col_with_nan`` is handed to the imputer, so it has
        # no other features to measure neighbours with -- confirm this is intended.
        knn_imputer = KNNImputer()
        self.data[self.col_with_nan] = knn_imputer.fit_transform(self.data[self.col_with_nan])
        self.data[self.col_with_nan] = self.data[self.col_with_nan].astype(int)

    def _columns_to_scale(self):
        """Return the columns ``scale_data`` should rescale, without modifying state.

        Uses ``to_scale`` when it is non-empty, otherwise every numeric column
        of ``data``. The target ``fraud_bool`` and the categorical columns are
        always excluded.
        """
        excluded = set(self.categorical_cols) | {'fraud_bool'}
        if self.to_scale:
            candidates = self.to_scale
        else:
            candidates = self.data.select_dtypes(include='number').columns
        return [col for col in candidates if col not in excluded]

    def scale_data(self):
        """Min-max scale the selected feature columns of ``data`` to the range [0, 1].

        ``to_scale`` is overwritten with the list of columns actually scaled.
        The scaler is fitted on every row currently in ``data``.
        """
        self.to_scale = self._columns_to_scale()
        if not self.to_scale:
            # Nothing to scale; fitting a scaler on zero columns would fail.
            return
        scl = MinMaxScaler()
        scaled = scl.fit_transform(self.data[self.to_scale])
        # Wrapping in a DataFrame makes pandas replace each column outright, so
        # integer columns become float instead of being cast in place.
        self.data[self.to_scale] = pd.DataFrame(scaled, columns=self.to_scale, index=self.data.index)

    def  one_hot_encode(self):
        """Replace each column in ``categorical_cols`` by one indicator column per category."""
        self.data = pd.get_dummies(self.data, columns=self.categorical_cols, prefix=self.categorical_cols)

    def split_data(self, holdout_size=0.2, test_share=0.2, random_state=42, stratify=False):
        """Split ``data`` into train, validation and test partitions.

        Parameters
        ----------
        holdout_size : float, default 0.2
            Fraction of all rows withheld from training.
        test_share : float, default 0.2
            Fraction of the withheld rows that becomes the test set; the
            remainder is the validation set.
        random_state : int, default 42
            Seed passed to both ``train_test_split`` calls.
        stratify : bool, default False
            When True, both splits are stratified on ``fraud_bool``.

        With the defaults the partitions are 80% train, 16% validation and
        4% test.
        """
        X = self.data.drop(columns='fraud_bool')
        y = self.data['fraud_bool'].copy()

        X_train, X_rest, y_train, y_rest = train_test_split(
            X, y,
            test_size=holdout_size,
            random_state=random_state,
            stratify=y if stratify else None,
        )
        X_val, X_test, y_val, y_test = train_test_split(
            X_rest, y_rest,
            test_size=test_share,
            random_state=random_state,
            stratify=y_rest if stratify else None,
        )

        self.X_train = X_train
        self.X_val = X_val
        self.X_test = X_test
        self.y_train = y_train
        self.y_val = y_val
        self.y_test = y_test

In [3]:
data_processor = DataProcessor('Base.csv')

# Run the preprocessing steps in order: load -> impute -> scale -> encode -> split.
data_processor.load_data()
data_processor.knn_impute()
data_processor.scale_data()
data_processor.one_hot_encode()

data_processor.split_data()

# Expose the three partitions as notebook-level names for the cells below.
X_train = data_processor.X_train
X_val = data_processor.X_val
X_test = data_processor.X_test
y_train = data_processor.y_train
y_val = data_processor.y_val
# Must be the test labels: pairing X_test with y_train would score test
# predictions against training labels of a different length.
y_test = data_processor.y_test

In [4]:
# Support Vector Classifier with per-class misclassification weights:
# label 1 is weighted ten times as heavily as label 0.
class_weight = {0: 1, 1: 10}

svc_clf = SVC(class_weight=class_weight)
svc_clf.fit(X=X_train, y=y_train)

In [None]:
# Train One-Class SVM (unsupervised: fitted on the features only, no labels).
# scikit-learn kernel names are lowercase; 'RBF' is rejected as an invalid kernel.
oneclass_svm_clf = OneClassSVM(kernel='rbf')
oneclass_svm_clf.fit(X_train)

In [None]:
# Random Forest of 100 trees; label 1 is weighted ten times as heavily as label 0.
class_weight = {0: 1, 1: 10}

rf_clf = RandomForestClassifier(
    n_estimators=100,
    class_weight=class_weight,
    random_state=42,
)
rf_clf.fit(X=X_train, y=y_train)

In [None]:
# XGBoost classifier; the positive class is up-weighted by the ratio of
# label-0 rows to label-1 rows in the training set.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()

xgb_clf = XGBClassifier(scale_pos_weight=n_negative / n_positive)
xgb_clf.fit(X_train, y_train)

In [None]:
# Score the test set with each model. Every array is oriented so that a
# larger value means "more likely label 1", which is what roc_curve expects.

# The SVC is built without probability=True, so predict_proba is unavailable;
# the signed margin from decision_function ranks samples equally well for ROC.
svc_probs = svc_clf.decision_function(X_test)

# One-Class SVM's decision_function is positive for inliers and negative for
# outliers, so negate it to turn it into an outlier (label 1) score.
oneclass_svm_probs = -oneclass_svm_clf.decision_function(X_test)

rf_probs = rf_clf.predict_proba(X_test)[:, 1]
xgb_probs = xgb_clf.predict_proba(X_test)[:, 1]

In [None]:
# ROC curve points and area under the curve for each model.
def _roc_points_and_auc(y_true, scores):
    """Return ``(fpr, tpr, area)`` for the given labels and scores."""
    fpr, tpr, _thresholds = roc_curve(y_true, scores)
    return fpr, tpr, auc(fpr, tpr)


fpr_svc, tpr_svc, roc_auc_svc = _roc_points_and_auc(y_test, svc_probs)

fpr_oneclass_svm, tpr_oneclass_svm, roc_auc_oneclass_svm = _roc_points_and_auc(y_test, oneclass_svm_probs)

fpr_rf, tpr_rf, roc_auc_rf = _roc_points_and_auc(y_test, rf_probs)

fpr_xgb, tpr_xgb, roc_auc_xgb = _roc_points_and_auc(y_test, xgb_probs)

In [None]:
# Calculate F1-score, precision, and recall.
# Each model predicts once and the three metrics reuse that prediction.
svc_pred = svc_clf.predict(X_test)
f1_svc = f1_score(y_test, svc_pred)
precision_svc = precision_score(y_test, svc_pred)
recall_svc = recall_score(y_test, svc_pred)

# OneClassSVM.predict returns +1 for inliers and -1 for outliers, while y_test
# holds 0/1 labels. Map outlier -> 1 and inlier -> 0 so the binary metrics are
# computed on a shared label set.
oneclass_svm_pred = np.where(oneclass_svm_clf.predict(X_test) == -1, 1, 0)
f1_oneclass_svm = f1_score(y_test, oneclass_svm_pred)
precision_oneclass_svm = precision_score(y_test, oneclass_svm_pred)
recall_oneclass_svm = recall_score(y_test, oneclass_svm_pred)

rf_pred = rf_clf.predict(X_test)
f1_rf = f1_score(y_test, rf_pred)
precision_rf = precision_score(y_test, rf_pred)
recall_rf = recall_score(y_test, rf_pred)

xgb_pred = xgb_clf.predict(X_test)
f1_xgb = f1_score(y_test, xgb_pred)
precision_xgb = precision_score(y_test, xgb_pred)
recall_xgb = recall_score(y_test, xgb_pred)

In [None]:
# Draw all four ROC curves on one set of axes, with the chance diagonal.
roc_curves = [
    (fpr_svc, tpr_svc, 'SVC (AUC = %0.2f)' % roc_auc_svc),
    (fpr_oneclass_svm, tpr_oneclass_svm, 'One-Class SVM (AUC = %0.2f)' % roc_auc_oneclass_svm),
    (fpr_rf, tpr_rf, 'Random Forest (AUC = %0.2f)' % roc_auc_rf),
    (fpr_xgb, tpr_xgb, 'XGBoost (AUC = %0.2f)' % roc_auc_xgb),
]

fig, ax = plt.subplots()
for curve_fpr, curve_tpr, curve_label in roc_curves:
    ax.plot(curve_fpr, curve_tpr, label=curve_label)
# Dashed black diagonal from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Report F1-score, precision and recall for every model, one group per model.
metric_table = [
    ("SVC:", f1_svc, precision_svc, recall_svc),
    ("One-Class SVM:", f1_oneclass_svm, precision_oneclass_svm, recall_oneclass_svm),
    ("Random Forest:", f1_rf, precision_rf, recall_rf),
    ("XGBoost:", f1_xgb, precision_xgb, recall_xgb),
]

for heading, f1_value, precision_value, recall_value in metric_table:
    print(heading)
    print("F1-score:", f1_value)
    print("Precision:", precision_value)
    print("Recall:", recall_value)