# Libraries

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import TreeSearch, BayesianEstimator
from pgmpy.inference import VariableElimination
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [13]:
def load_and_preprocess(
    path='/Users/raya/Desktop/fraud-detection/european-dataset/data/raw/european_data.csv',
):
    """Load the transactions CSV and keep only the modelling columns.

    Args:
        path: Location of the CSV file (anything ``pandas.read_csv`` accepts).
            Defaults to the original hard-coded location so existing
            zero-argument calls keep working.

    Returns:
        A DataFrame with the selected predictor columns followed by the
        ``Class`` label column, in the order listed below.

    Raises:
        KeyError: If any of the selected columns is missing from the file.
    """
    features = ['V3', 'V4', 'V7', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'Amount', 'Class']
    df = pd.read_csv(path)
    return df[features]

def discretize_data(df, n_bins=5):
    """Return an integer-coded copy of ``df`` with every predictor binned.

    Every column except ``Class`` is replaced by its ordinal bin index from a
    quantile-strategy ``KBinsDiscretizer`` fitted on ``df`` itself. The input
    frame is not modified.

    Args:
        df: Frame holding the predictor columns and a ``Class`` column.
        n_bins: Number of bins requested per predictor.

    Returns:
        A new DataFrame with the same columns, all cast to ``int``.
    """
    predictor_cols = [name for name in df.columns if name != 'Class']
    binner = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile')

    binned = df.copy()
    # fit_transform yields float bin indices; the final cast makes them ints.
    bin_codes = binner.fit_transform(df[predictor_cols])
    binned[predictor_cols] = bin_codes
    return binned.astype(int)

def main():
    """Train a TAN Bayesian network on the discretized data and print test metrics.

    Pipeline: load the selected columns, discretize them, hold out 20% of the
    rows (``random_state=42``), learn a tree-augmented naive Bayes structure
    with ``Class`` as the class node, fit the CPDs with a Dirichlet prior
    (``pseudo_counts=1``), score each held-out row with variable elimination,
    and print the classification report, confusion matrix and AUC-ROC.
    """
    # Probability above which a row is labelled as class 1.
    decision_threshold = 0.35
    # Score used when inference fails for a row. It is above the decision
    # threshold, so such rows end up predicted as class 1.
    fallback_prob = 0.5

    df = load_and_preprocess()
    # NOTE(review): the bin edges are fitted on the full dataset before the
    # split, so held-out rows influence the discretization. Consider fitting
    # the discretizer on the training rows only.
    df_disc = discretize_data(df)

    train_df, test_df = train_test_split(df_disc, test_size=0.2, random_state=42)
    # Own the held-out frame explicitly: prediction columns are added to it
    # below, and writing into a frame derived from another one can trigger
    # pandas' chained-assignment warning.
    test_df = test_df.copy()

    est = TreeSearch(train_df, root_node='V14')
    tan_model = est.estimate(estimator_type='tan', class_node='Class')

    model = DiscreteBayesianNetwork(tan_model.edges())
    model.fit(train_df, estimator=BayesianEstimator, prior_type='dirichlet', pseudo_counts=1)

    infer = VariableElimination(model)
    failures = []

    def predict(row):
        """Return the value at index 1 of the 'Class' posterior for one row."""
        evidence = row.drop('Class').to_dict()
        try:
            # One query per row: silence the per-query progress bar.
            query = infer.query(variables=['Class'], evidence=evidence, show_progress=False)
            # Presumably index 1 is the Class == 1 state (labels are 0/1) —
            # verify against the fitted model's state names.
            return query.values[1]
        except Exception as exc:
            # Best-effort scoring is kept, but KeyboardInterrupt/SystemExit are
            # no longer swallowed and every failure is recorded for reporting.
            failures.append(exc)
            return fallback_prob

    test_df['Predicted_Prob'] = test_df.apply(predict, axis=1)
    test_df['Predicted_Class'] = (test_df['Predicted_Prob'] > decision_threshold).astype(int)

    if failures:
        print(
            f"Warning: inference failed for {len(failures)} of {len(test_df)} test rows; "
            f"used fallback probability {fallback_prob}. First error: {failures[0]!r}"
        )

    y_true = test_df['Class']
    y_pred = test_df['Predicted_Class']

    print("Classification Report:")
    print(classification_report(y_true, y_pred))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

    print("\nAUC-ROC:", roc_auc_score(y_true, test_df['Predicted_Prob']))

# Run the pipeline only when executed as a script or notebook cell, so that
# importing this module does not trigger data loading and model training.
if __name__ == "__main__":
    main()

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'V3': 'N', 'V4': 'N', 'V7': 'N', 'V10': 'N', 'V11': 'N', 'V12': 'N', 'V14': 'N', 'V16': 'N', 'V17': 'N', 'Amount': 'N', 'Class': 'N'}
Building tree: 100%|██████████| 55/55.0 [00:00<00:00, 342.33it/s]
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'V3': 'N', 'V4': 'N', 'V7': 'N', 'V10': 'N', 'V11': 'N', 'V12': 'N', 'V14': 'N', 'V16': 'N', 'V17': 'N', 'Amount': 'N', 'Class': 'N'}


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.54      0.82      0.65        98

    accuracy                           1.00     56962
   macro avg       0.77      0.91      0.82     56962
weighted avg       1.00      1.00      1.00     56962


Confusion Matrix:
[[56796    68]
 [   18    80]]

AUC-ROC: 0.9608769365934331
