In [2]:
import pandas as pd

# Load the dataset to understand its structure and contents

dataset = pd.read_csv('transaction_dataset.csv')

# Display the first few rows of the dataset to understand its structure
dataset.head(), dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9841 entries, 0 to 9840
Data columns (total 51 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   Unnamed: 0                                            9841 non-null   int64  
 1   Index                                                 9841 non-null   int64  
 2   Address                                               9841 non-null   object 
 3   FLAG                                                  9841 non-null   int64  
 4   Avg min between sent tnx                              9841 non-null   float64
 5   Avg min between received tnx                          9841 non-null   float64
 6   Time Diff between first and last (Mins)               9841 non-null   float64
 7   Sent tnx                                              9841 non-null   int64  
 8   Received Tnx                                          9841

(   Unnamed: 0  Index                                     Address  FLAG  \
 0           0      1  0x00009277775ac7d0d59eaad8fee3d10ac6c805e8     0   
 1           1      2  0x0002b44ddb1476db43c868bd494422ee4c136fed     0   
 2           2      3  0x0002bda54cb772d040f779e88eb453cac0daa244     0   
 3           3      4  0x00038e6ba2fd5c09aedb96697c8d7b8fa6632e5e     0   
 4           4      5  0x00062d1dd1afb6fb02540ddad9cdebfe568e0d89     0   
 
    Avg min between sent tnx  Avg min between received tnx  \
 0                    844.26                       1093.71   
 1                  12709.07                       2958.44   
 2                 246194.54                       2434.02   
 3                  10219.60                      15785.09   
 4                     36.61                      10707.77   
 
    Time Diff between first and last (Mins)  Sent tnx  Received Tnx  \
 0                                704785.63       721            89   
 1                              

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_curve, auc, RocCurveDisplay
import numpy as np

# Defining models
models_extended = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=500),
    "SVM": SVC(probability=True, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "KNN": KNeighborsClassifier()
}

# Storing performance metrics and AUC-ROC data
performance_metrics = {}
roc_data = {}

# Training and evaluating models
for name, model in models_extended.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    confusion_matrices[name] = cm

    # Classification Metrics
    report = classification_report(y_test, y_pred, output_dict=True)
    performance_metrics[name] = {
        "Precision": report["1"]["precision"],
        "Recall": report["1"]["recall"],
        "F1-Score": report["1"]["f1-score"],
        "Accuracy": report["accuracy"]
    }

    # ROC-AUC
    if y_proba is not None:
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        roc_auc = auc(fpr, tpr)
        roc_data[name] = (fpr, tpr, roc_auc)

# Bar graph for accuracy comparison
accuracies = [metrics["Accuracy"] for metrics in performance_metrics.values()]
model_names = list(performance_metrics.keys())

# Plot accuracies as bar graph
plt.figure(figsize=(10, 6))
sns.barplot(x=model_names, y=accuracies, palette="viridis")
plt.ylabel("Accuracy")
plt.title("Comparison of Accuracy Across Algorithms")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("/mnt/data/accuracy_comparison.png", dpi=300)
plt.show()

# Plotting AUC-ROC curves
plt.figure(figsize=(10, 8))
for name, (fpr, tpr, roc_auc) in roc_data.items():
    plt.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], 'k--', lw=2, label="Random Chance")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("AUC-ROC Curve for Various Algorithms")
plt.legend(loc="lower right")
plt.grid()
plt.tight_layout()
plt.savefig("/mnt/data/roc_curve_comparison.png", dpi=300)
plt.show()

# Pairwise distribution plot
sns.pairplot(pd.concat([X_train, y_train.rename("FLAG")], axis=1), hue="FLAG", diag_kind="kde", palette="husl")
plt.tight_layout()
plt.savefig("/mnt/data/pairwise_distribution.png", dpi=300)

# Export confusion matrices and metrics for review
confusion_matrices, performance_metrics


NameError: name 'SVC' is not defined