In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Load the dataset
file_path = r"D:\ML_proj\creditcard.csv"  # Use raw string to handle backslashes in the path
df = pd.read_csv(file_path)

# Ensure there are no extra spaces in the column names
df.columns = df.columns.str.strip()

# 1. Handle missing values: Impute numerical columns with the mean, and categorical columns with the most frequent value
imputer = SimpleImputer(strategy='most_frequent')  # For categorical and numerical columns
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# 2. Encode categorical variables if necessary using LabelEncoder (only for categorical variables)
label_encoder = LabelEncoder()
for col in df.select_dtypes(include=[object]).columns:  # Only apply to categorical columns
    df[col] = label_encoder.fit_transform(df[col])

# 3. Feature Scaling: Standardize numerical columns
scaler = StandardScaler()
df[df.select_dtypes(include=[np.number]).columns] = scaler.fit_transform(df.select_dtypes(include=[np.number]))

# 4. Check if the target variable ('Class') exists
if "Class" not in df.columns:
    raise KeyError("❌ 'Class' column not found in dataset!")

# Separate features (X) and target (y)
X = df.drop(columns=["Class"])  # All columns except 'Class'
y = df["Class"]  # 'Class' column is the target

# 5. Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save the preprocessed dataset (optional)
df.to_csv(r"D:\ML_proj\creditcard_processed.csv", index=False)  # Save preprocessed data

print("Preprocessing completed successfully!")


Preprocessing completed successfully!


In [25]:
import numpy as np
import pandas as pd
from scipy.stats import dirichlet
from collections import Counter
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import ADASYN

# Load preprocessed dataset
file_path = r"D:\ML_proj\creditcard_processed.csv"

# Ensure correct file path and clean column names
df = pd.read_csv(file_path)
df.columns = df.columns.str.strip()  # Remove extra spaces from column names

# Check if 'Class' column exists
if "Class" not in df.columns:
    raise KeyError("❌ 'Class' column not found in dataset! Please check the file.")

# Separate features and target
X = df.drop(columns=["Class"])
y = df["Class"]

# Ensure the target variable is discrete (binary classification in this case)
# If the target is continuous, convert to binary (for example, thresholding at 0.5 for a binary classification task)
if y.dtype != 'int64' and y.dtype != 'object':  # If the target is continuous
    y = y.apply(lambda x: 1 if x >= 0.5 else 0)

# Handle class imbalance using ADASYN
adasyn = ADASYN(n_neighbors=3, random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X, y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Function to generate class prior using Pitman-Yor Process (PYP)
def pitman_yor_prior(alpha, d, num_classes):
    base_measure = np.ones(num_classes)
    prior_distribution = dirichlet(alpha * base_measure)
    return prior_distribution.rvs()[0]

# Self-Adaptive Bayesian Decision Tree
class BayesianDecisionTree:
    def __init__(self, alpha=0.5, d=0.1):
        self.alpha = alpha  # Strength of prior
        self.d = d  # Discount parameter
        self.tree = GradientBoostingClassifier(n_estimators=100, learning_rate=0.05, max_depth=3, random_state=42)
        self.class_prior = None

    def fit(self, X, y):
        num_classes = len(np.unique(y))
        self.class_prior = pitman_yor_prior(self.alpha, self.d, num_classes)

        # Compute class weight mapping (based on priors)
        class_counts = np.bincount(y)
        class_weights = {cls: self.class_prior[i] for i, cls in enumerate(np.unique(y))}
        sample_weights = np.array([class_weights[label] for label in y])

        self.tree.fit(X, y, sample_weight=sample_weights)

    def predict(self, X):
        return self.tree.predict(X)

    def predict_proba(self, X):
        return self.tree.predict_proba(X)

# Train the Bayesian Decision Tree
model = BayesianDecisionTree()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.77      0.99      0.87     56736
           1       0.99      0.71      0.82     56988

    accuracy                           0.85    113724
   macro avg       0.88      0.85      0.85    113724
weighted avg       0.88      0.85      0.85    113724



In [27]:
import pandas as pd
import numpy as np

# Select random samples from the test set for qualitative analysis
num_samples = 5  # Adjust the number of samples as needed
sample_indices = np.random.choice(len(X_test), num_samples, replace=False)

# Extract the sample inputs using `.iloc`
sample_inputs = X_test.iloc[sample_indices]
sample_true_labels = y_test.iloc[sample_indices]

# Predict using the trained model
sample_predictions = model.predict(sample_inputs)

# Convert to DataFrame for better readability
sample_results = sample_inputs.copy()  # Copy feature values
sample_results["True Label"] = sample_true_labels.values
sample_results["Predicted Label"] = sample_predictions

# Display the sample inputs and their corresponding predictions
print("📌 Sample Model Predictions:")
print(sample_results)


📌 Sample Model Predictions:
            Time        V1        V2        V3        V4        V5        V6  \
451851 -0.332775 -1.885710 -1.519765 -0.708429  1.569756  5.833161 -3.549121   
67538  -0.889231 -0.777055  0.787588  0.707614  0.534474  0.961773  0.887339   
568304  1.565482  0.993772  0.054717 -1.682025  0.305488  0.767668 -0.127143   
358980 -0.913241  0.512631  0.245631  0.351688  0.346108 -0.017666 -0.321067   
374146 -0.769261 -0.739150  0.835571  0.065643  1.442747 -0.617511  0.598416   

              V7        V8        V9  ...       V22        V23       V24  \
451851 -6.438805 -0.206824  1.394375  ... -1.422179 -19.474998 -0.722030   
67538   0.399437  0.337275 -0.622844  ... -1.350870   0.008639 -2.999291   
568304  0.194557 -0.073268  0.554853  ... -0.390436  -0.130421 -0.721502   
358980  0.112549 -0.137816  0.136373  ... -0.005505  -0.244314 -0.446341   
374146 -0.957154  0.050575 -0.547493  ...  0.767255  -0.054113 -0.318567   

             V25       V26       V