In [8]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve

In [9]:
class FuzzyCMeans:

    def __init__(self, n_clusters=5, m=2.0, max_iter=300, error=1e-5, random_state=None):
        self.n_clusters = n_clusters
        self.m = m
        self.max_iter = max_iter
        self.error = error
        self.random_state = random_state
        self.centers = None
        self.membership = None
        self.history = []

    def fit(self, X):
        n_samples, n_features = X.shape

        np.random.seed(self.random_state)
        membership = np.random.rand(n_samples, self.n_clusters)
        membership = membership / np.sum(membership, axis=1, keepdims=True)

        for iteration in range(self.max_iter):
            centers = self._update_centers(X, membership)

            new_membership = self._update_membership(X, centers)

            error = np.linalg.norm(new_membership - membership)
            self.history.append(error)

            if error < self.error:
                break

            membership = new_membership

        self.centers = centers
        self.membership = membership
        return self

    def _update_centers(self, X, membership):
        membership_m = membership ** self.m
        centers = np.dot(membership_m.T, X) / np.sum(membership_m.T, axis=1, keepdims=True)
        return centers

    def _update_membership(self, X, centers):
        distances = np.zeros((X.shape[0], self.n_clusters))
        for i in range(self.n_clusters):
            distances[:, i] = np.linalg.norm(X - centers[i], axis=1)

        distances = np.fmax(distances, 1e-10)

        power = 2 / (self.m - 1)
        inv_dist = 1 / distances
        denominator = np.sum((distances[:, :, None] * inv_dist[:, None, :]) ** power, axis=2)

        return 1 / denominator

    def predict(self, X):
        distances = np.zeros((X.shape[0], self.n_clusters))
        for i in range(self.n_clusters):
            distances[:, i] = np.linalg.norm(X - self.centers[i], axis=1)

        distances = np.fmax(distances, 1e-10)

        power = 2 / (self.m - 1)
        inv_dist = 1 / distances
        denominator = np.sum((distances[:, :, None] * inv_dist[:, None, :]) ** power, axis=2)

        return 1 / denominator


In [10]:
def load_and_preprocess():
    df = pd.read_csv('/Users/raya/Desktop/fraud-detection/european-dataset/data/raw/european_data.csv')

    df.fillna(df.median(), inplace=True)

    scaler = StandardScaler()
    df[['Time', 'Amount']] = scaler.fit_transform(df[['Time', 'Amount']])

    return df

In [11]:
df = load_and_preprocess()


X = df.drop('Class', axis=1).values
y = df['Class'].values

fraud_percent = y.mean() * 100
print(f"Fraud percentage: {fraud_percent:.4f}%")
print(f"Original dataset shape: {X.shape}")

fraud_indices = np.where(y == 1)[0]
non_fraud_indices = np.where(y == 0)[0]

sample_size = min(10000, len(non_fraud_indices))
sampled_non_fraud = np.random.choice(non_fraud_indices, size=sample_size, replace=False)

sample_indices = np.concatenate([fraud_indices, sampled_non_fraud])

X_sampled = X[sample_indices]
y_sampled = y[sample_indices]

print(f"\nSampled dataset shape: {X_sampled.shape}")

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_sampled)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

print("\nFuzzy C-Means Clustering Configuration:")
print("Number of clusters: 5")
print("Fuzziness parameter (m): 2.0")
print("Maximum iterations: 300")
print("Error tolerance: 1e-5")

fcm = FuzzyCMeans(n_clusters=5, m=2.0, max_iter=300, error=1e-5, random_state=42)
fcm.fit(X_scaled)

cluster_centers = fcm.centers
membership_matrix = fcm.membership
labels = np.argmax(membership_matrix, axis=1)

silhouette = silhouette_score(X_scaled, labels)
calinski = calinski_harabasz_score(X_scaled, labels)
davies = davies_bouldin_score(X_scaled, labels)

print("\nCluster Validity Indices:")
print(f"Silhouette Score: {silhouette:.4f}")
print(f"Calinski-Harabasz Index: {calinski:.4f}")
print(f"Davies-Bouldin Index: {davies:.4f}")

anomaly_scores = 1 - np.max(membership_matrix, axis=1)

auc = roc_auc_score(y_sampled, anomaly_scores)

print("\nFraud Detection Performance:")
print(f"AUC Score: {auc:.4f}")

fpr, tpr, thresholds = roc_curve(y_sampled, anomaly_scores)
j_scores = tpr - fpr
optimal_idx = np.argmax(j_scores)
optimal_threshold = thresholds[optimal_idx]

y_pred = (anomaly_scores > optimal_threshold).astype(int)

print("\nClassification Report:")
print(classification_report(y_sampled, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_sampled, y_pred))

Fraud percentage: 0.1727%
Original dataset shape: (284807, 30)

Sampled dataset shape: (10492, 30)

Fuzzy C-Means Clustering Configuration:
Number of clusters: 5
Fuzziness parameter (m): 2.0
Maximum iterations: 300
Error tolerance: 1e-5

Cluster Validity Indices:
Silhouette Score: 0.0113
Calinski-Harabasz Index: 132.7509
Davies-Bouldin Index: 3.6328

Fraud Detection Performance:
AUC Score: 0.8783

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.84      0.91     10000
           1       0.19      0.81      0.31       492

    accuracy                           0.83     10492
   macro avg       0.59      0.82      0.61     10492
weighted avg       0.95      0.83      0.88     10492


Confusion Matrix:
[[8351 1649]
 [  95  397]]
