In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the transactions table from the CSV that sits one directory above the notebook.
data = pd.read_csv(filepath_or_buffer='../creditcard.csv')
# Preview the first five rows as the cell's rich output.
data.head(5)

In [None]:
# 'Class' is held out as the reference labels used for evaluation further down;
# every remaining column becomes a feature.
y = data['Class'].to_numpy()
X = data.drop(columns='Class').to_numpy()

In [None]:
# Standardise every feature so that no column dominates the Euclidean
# distances computed later purely because of its scale.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Two-cluster k-means on the standardised features; the seed is fixed so
# the fit is repeatable across runs.
cluster = KMeans(
    n_clusters=2,
    n_init=10,
    random_state=42,
)
cluster.fit(X=X_scaled)

In [None]:
# Pull the fitted centroids and each sample's cluster index off the model.
assigned, centers = cluster.labels_, cluster.cluster_centers_

In [None]:
# Mean of the labels, floored at 1e-6 so the quantile level below stays under 1.
# NOTE(review): this is the positive-class fraction only if 'Class' is 0/1 — confirm.
contamination = max(1e-6, y.mean())
# Euclidean distance from every sample to the centre of the cluster it was assigned to.
distances = np.linalg.norm(np.subtract(X_scaled, centers[assigned]), axis=1)
# Cut-off placed at the (1 - contamination) quantile of those distances.
threshold = np.quantile(a=distances, q=1.0 - contamination)

In [None]:
# Flag (1) every sample lying strictly farther from its centroid than the cut-off; 0 otherwise.
y_pred = np.greater(distances, threshold).astype(int)

In [None]:
# Report the label-derived fraction and the distance cut-off it produced.
for label, value in (('Fraud Fraction: ', contamination), ('Threshold: ', threshold)):
    print(label, value)

In [None]:
# Score the distance-based flags against the reference labels.
report_text = classification_report(y_true=y, y_pred=y_pred)
matrix = confusion_matrix(y_true=y, y_pred=y_pred)

print('Classification Report: \n')
print(report_text)
print('Confusion Matrix:\n', matrix)

In [None]:
# Scatter of each sample's distance to its assigned centroid, coloured by the
# predicted flag. The explicit fig/ax interface keeps the figure self-contained.
fig, ax = plt.subplots()
ax.scatter(
    range(len(distances)),
    distances,
    c=y_pred,
    cmap='coolwarm',
    # Pin the colour scale to the 0/1 flag range so that 1 always maps to red,
    # even if every prediction happens to fall in a single class.
    vmin=0,
    vmax=1,
    s=5,
)
ax.set_xlabel('Transaction Index')
ax.set_ylabel('Distance from cluster')
# Title typo fixed: 'anamoly' -> 'anomaly'.
ax.set_title('Distance of transactions (red = predicted anomaly)')
plt.show()