In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import StackingClassifier
import joblib

In [11]:
def load_data(file_path):
    if file_path.endswith('.csv'):
        return pd.read_csv(file_path)
    elif file_path.endswith('.pkl'):
        return pd.read_pickle(file_path)
    else:
        raise ValueError("Unsupported file format")



In [13]:

data = load_data("processed_data.csv")

In [15]:
label_col = 'TX_FRAUD'
y = data[label_col]
X = data.drop(columns=[label_col])

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [19]:
### Cell 4: Handle Imbalance (Using Class Weights & SMOTE)
# Compute class weights
class_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Apply SMOTE if needed (Optional)
smote = SMOTE(sampling_strategy=0.2, random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

In [21]:
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
xgb = XGBClassifier(n_estimators=100, random_state=42, scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]))


In [23]:
base_models = [('random_forest', rf), ('xgboost', xgb)]
meta_model = LogisticRegression()

stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)


In [25]:
stacking_clf.fit(X_train_sm, y_train_sm)


In [26]:
y_pred = stacking_clf.predict(X_test)
y_prob = stacking_clf.predict_proba(X_test)[:,1]

print(classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1914
           1       1.00      1.00      1.00        16

    accuracy                           1.00      1930
   macro avg       1.00      1.00      1.00      1930
weighted avg       1.00      1.00      1.00      1930

ROC AUC Score: 1.0


In [34]:
joblib.dump(stacking_clf, 'stacked_model.pkl')

['stacked_model.pkl']

In [36]:
loaded_model = joblib.load('stacked_model.pkl')
def predict_fraud(new_data):
    return loaded_model.predict(new_data)


In [38]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import joblib

# Load the test data (Make sure you split data earlier)
X_test = pd.read_pickle("processed_data.pkl")  # Adjust if needed
y_test = X_test["TX_FRAUD"]  # Label column
X_test = X_test.drop(columns=["TX_FRAUD"])  # Features only

# Load the trained model
stacked_model = joblib.load("stacked_model.pkl")

# Make predictions
y_pred = stacked_model.predict(X_test)
y_pred_proba = stacked_model.predict_proba(X_test)[:, 1]

# 🔹 Confusion Matrix
plt.figure(figsize=(6,4))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Blues", xticklabels=["No Fraud", "Fraud"], yticklabels=["No Fraud", "Fraud"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# 🔹 Classification Report
print("📌 Classification Report:\n", classification_report(y_test, y_pred))

# 🔹 ROC-AUC Curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(6,4))
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})", color="darkorange")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC-AUC Curve")
plt.legend()
plt.show()


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:TX_TIME_SECONDS: object, TX_TIME_DAYS: object