In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, average_precision_score

# One seed for the whole notebook: it seeds NumPy's global RNG here and is
# passed as random_state to the Isolation Forest further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [None]:
# Load the transactions table; the path is relative to the working directory.
df = pd.read_csv('creditcard.csv')
# Preview the first rows.
df.head()

In [None]:
# (rows, columns) of the raw table.
df.shape

In [None]:
# Column names, including the 'Class' label that is separated from the features later.
df.columns

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Summary statistics for the Time and Amount columns.
df[["Time", "Amount"]].describe()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Split the table into features (X) and the held-back label (y_hidden).
# The label is only printed here for context; it is not part of X.
X = df.drop(columns=['Class']).copy()
y_hidden = df['Class'].copy()

feature_cols = list(X.columns)

# Guard rails: no label leakage, matching row counts, expected feature width.
n_rows, n_features = X.shape
assert "Class" not in feature_cols, "Label leakage: 'Class' is still in features."
assert y_hidden.shape[0] == n_rows, "Row mismatch between X and y."
assert n_features == 30, f"Unexpected number of feature columns: {X.shape[1]}"

print("X shape:", X.shape)
print("y_hidden shape:", y_hidden.shape)
print("First 5 feature columns:", feature_cols[:5])
print("Last 5 feature columns:", feature_cols[-5:])
print("Label value counts (context only, not used for training):")
label_counts = y_hidden.value_counts()
print(label_counts)

# EDA

In [None]:
# Rescale Time by 3600 so it reads in hours
# (presumably Time is in seconds since the first transaction - confirm against the data source).
SECONDS_PER_HOUR = 3600
time_hours = X['Time'].div(SECONDS_PER_HOUR)
time_hours.describe()

In [None]:
# Transaction volume over the observation window, in 48 bins.
fig, ax = plt.subplots(figsize=(10, 3))
ax.hist(time_hours, bins=48)
ax.set_xlabel('Time (hours) since first transaction')
ax.set_ylabel('Number of transactions')
ax.set_title('Transaction volume over time')
plt.show()

In [None]:
# Raw Amount distribution on a linear scale.
fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(X["Amount"], bins=100)
ax.set_xlabel("Transaction Amount")
ax.set_ylabel("Frequency")
ax.set_title("Raw transaction amount distribution")
plt.show()

In [None]:
# log1p(x) = log(1 + x), so zero amounts map to 0 instead of -inf.
log_amount = np.log1p(X["Amount"])

fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(log_amount, bins=100)
ax.set_xlabel("log(Amount + 1)")
ax.set_ylabel("Frequency")
ax.set_title("Log-transformed transaction amount distribution")
plt.show()


In [None]:
# Chronological ordering and label alignment.
# A stable sort ("mergesort") keeps the original row order among transactions
# that share the same Time value. The default quicksort gives no such guarantee,
# so which tied rows fall on either side of the split boundary could otherwise
# depend on the sort implementation.
X_sorted = X.sort_values(by="Time", kind="mergesort")
# Pull the labels in the same row order as the sorted features.
y_sorted = y_hidden.loc[X_sorted.index]

# Reset both after alignment so positions and index labels coincide.
X_sorted = X_sorted.reset_index(drop=True)
y_sorted = y_sorted.reset_index(drop=True)

# First 70% of rows (earliest transactions) train the model; the rest is held out.
split_ratio = 0.7
split_index = int(len(X_sorted) * split_ratio)

X_train = X_sorted.iloc[:split_index]
X_test = X_sorted.iloc[split_index:]

y_train_hidden = y_sorted.iloc[:split_index]
y_test_hidden = y_sorted.iloc[split_index:]

# Sanity checks: nothing lost, both parts non-empty, and no test row precedes a train row.
assert len(X_train) + len(X_test) == len(X_sorted), "Split lost or duplicated rows."
assert len(X_train) > 0 and len(X_test) > 0, "Train or test split is empty."
assert X_train["Time"].max() <= X_test["Time"].min(), "Split is not chronological."
assert len(y_train_hidden) == len(X_train) and len(y_test_hidden) == len(X_test), \
    "Labels are not aligned with the feature splits."

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("Train time range:", X_train['Time'].min(), 'to', X_train['Time'].max())
print("Test time range:", X_test['Time'].min(), 'to', X_test['Time'].max())

Given the sequential nature of the data and the non-uniform transaction volume over time, 
a time-based split was used to simulate real-world deployment. 
The model was trained on earlier transactions and evaluated on later ones.

In [None]:
# Scaling
#
# The scaler is fit on the training split only and then applied unchanged to
# the test split, so no test-set statistics leak into preprocessing.
# NOTE(review): "Time" is kept as a feature. With the chronological split, every
# test Time value is at or beyond the training maximum - confirm that this is
# intended, or drop/re-encode Time before modelling.
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the arrays back into DataFrames, carrying over the original row index so
# scaled rows stay aligned with X_train / X_test and their labels.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=feature_cols, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=feature_cols, index=X_test.index)

print('Scaled training set shape:', X_train_scaled.shape)
print('Scaled test set shape:', X_test_scaled.shape)
print('Train medians after scaling (should be near 0):')
print(X_train_scaled.median().head())

Given the heavy-tailed distribution of transaction amounts, 
RobustScaler was used to reduce the influence of extreme values. 
The scaler was fit exclusively on training data and applied unchanged to test data to avoid information leakage.

In [None]:
# Isolation Forest
# Hyperparameters are collected in one place so they are easy to inspect and tune.
iso_forest_params = {
    "n_estimators": 100,
    "max_samples": 'auto',
    "contamination": 'auto',
    "random_state": RANDOM_STATE,  # reproducible tree construction
    "n_jobs": -1,                  # use all available cores
}

iso_forest = IsolationForest(**iso_forest_params)

In [None]:
# Fit on the scaled training features only; no labels are passed to the model.
iso_forest.fit(X_train_scaled)

In [None]:
# Score both splits with the fitted forest; lower values are treated as more
# anomalous by the thresholding step that follows.
train_scores = iso_forest.decision_function(X_train_scaled)
test_scores = iso_forest.decision_function(X_test_scaled)

for summary_title, score_array in (
    ("Training scores summary:", train_scores),
    ("Test scores summary:", test_scores),
):
    print(summary_title)
    print(pd.Series(score_array).describe())

In [None]:
# Implementing threshold
#
# The cut-off is taken from the *training* score distribution: the score below
# which the lowest `alert_rate` fraction of training transactions falls.

alert_rate = 0.01  # flag the 1% most anomalous (lowest-scoring) transactions
threshold = np.percentile(train_scores, 100 * alert_rate)
# The reported percentage follows alert_rate instead of being hard-coded.
print(f'Anomaly score threshold ({alert_rate:.2%}): {threshold:.4f}')

# 1 = alert (score at or below the threshold), 0 = no alert.
y_test_pred = (test_scores <= threshold).astype(int)

alert_count = y_test_pred.sum()
alert_fraction = alert_count / len(y_test_pred)

print(f'Number of alerts in test set: {alert_count}')
print(f'Alert fraction in test set: {alert_fraction:.4f}')


In [None]:
# Evaluation
# Confusion matrix of the hidden labels against the thresholded alerts.
cm = confusion_matrix(y_true=y_test_hidden, y_pred=y_test_pred)
cm

In [None]:
# Text report of the hidden labels vs. the alerts, printed with 4 decimal digits.
print(classification_report(y_test_hidden, y_test_pred, digits=4))

In [None]:
# Lower decision scores mean "more anomalous", so flip the sign once to get a
# ranking where larger values point towards the positive (fraud) class.
fraud_ranking = -test_scores

precision, recall, thresholds_pr = precision_recall_curve(y_test_hidden, fraud_ranking)
auprc = average_precision_score(y_test_hidden, fraud_ranking)

print(f"AUPRC: {auprc:.4f}")

In [None]:
# Precision-recall curve for the Isolation Forest ranking.
plt.figure(figsize=(6, 4))
plt.plot(recall, precision)
plt.xlabel("Recall")
plt.ylabel("Precision")
# The en dash is written as an escape so the title survives encoding round-trips
# (the previous literal had been corrupted into mojibake).
plt.title("Precision\u2013Recall Curve (Isolation Forest)")
plt.show()


In [None]:
# The Isolation Forest model has been successfully implemented for credit card fraud detection. 
# The Isolation Forest, trained without labels, successfully identified approximately half of fraudulent transactions while flagging around 1% of total transactions as anomalous. 
# Although precision remains low due to the extreme class imbalance, the model achieves a substantial lift over random detection, 
# as reflected in an AUPRC significantly above the baseline fraud rate. This behavior is consistent with unsupervised anomaly detection, 
# where the model captures rare and unusual patterns rather than fraud specifically.

In [None]:
# Error Analysis

# One frame for the analysis: the unscaled test features plus score, alert flag and
# true label. `assign` returns a new frame, so X_test itself is left untouched.
results = X_test.assign(
    anomaly_score=test_scores,
    predicted_anomaly=y_test_pred,
    actual_fraud=y_test_hidden.values,  # raw values: align by position, not by index
)

results.head()


In [None]:
# Boolean masks for the alert flag and the true label.
was_flagged = results["predicted_anomaly"] == 1
not_flagged = results["predicted_anomaly"] == 0
is_fraud = results["actual_fraud"] == 1
is_legit = results["actual_fraud"] == 0

tp = results[was_flagged & is_fraud]   # True Positives (caught fraud)
fp = results[was_flagged & is_legit]   # False Positives (flagged but not fraud)
fn = results[not_flagged & is_fraud]   # False Negatives (missed fraud)

for outcome_tag, outcome_rows in (("TP:", tp), ("FP:", fp), ("FN:", fn)):
    print(outcome_tag, len(outcome_rows))

In [None]:
# Caught vs. missed fraud by transaction amount.
# Both histograms use the same bin edges, computed over the combined amounts.
# Separate `bins=50` calls would give each group its own bin widths, so the
# bar heights could not be compared.
fraud_amount_bins = np.histogram_bin_edges(
    pd.concat([tp["Amount"], fn["Amount"]]), bins=50
)

plt.figure(figsize=(8, 4))
plt.hist(tp["Amount"], bins=fraud_amount_bins, alpha=0.6, label="TP (caught fraud)")
plt.hist(fn["Amount"], bins=fraud_amount_bins, alpha=0.6, label="FN (missed fraud)")
plt.xlabel("Transaction Amount")
plt.ylabel("Frequency")
plt.legend()
plt.title("Amount distribution: caught vs missed fraud")
plt.show()

In [None]:
# The model tends to catch higher-amount fraud more reliably, while low-value fraud often appears statistically normal and is therefore missed.

In [None]:
# False positives vs. all normal transactions by amount.
# False positives are a subset of the normal transactions, so bin edges taken
# from the normal amounts cover both groups. The histograms are density-normalised
# and share those edges: on a raw count axis the far larger "all normal" group
# would swamp the false positives and their shapes could not be compared.
normal_amounts = results.loc[results["actual_fraud"] == 0, "Amount"]
normal_amount_bins = np.histogram_bin_edges(normal_amounts, bins=50)

plt.figure(figsize=(8, 4))
plt.hist(
    fp["Amount"],
    bins=normal_amount_bins,
    density=True,
    alpha=0.6,
    label="False Positives"
)
plt.hist(
    normal_amounts,
    bins=normal_amount_bins,
    density=True,
    alpha=0.4,
    label="All normal transactions"
)
plt.xlabel("Transaction Amount")
plt.ylabel("Density")
plt.legend()
plt.title("False positives vs normal transactions (Amount)")
plt.show()


In [None]:
#False positives are dominated by low-value transactions.

In [None]:
# Anomaly scores of the flagged transactions, split into true frauds (tp)
# and legitimate transactions that were flagged anyway (fp).
tp_scores = tp["anomaly_score"]
fp_scores = fp["anomaly_score"]

def q(s):
    """Return selected lower/middle quantiles of a Series with readable labels.

    Computes the 1st, 5th, 10th, 25th, 50th and 75th percentiles of ``s`` and
    relabels the resulting index from the numeric levels to "p01" ... "p75".
    """
    percentile_labels = {
        0.01: "p01",
        0.05: "p05",
        0.10: "p10",
        0.25: "p25",
        0.50: "p50",
        0.75: "p75",
    }
    levels = list(percentile_labels)
    return s.quantile(levels).rename(percentile_labels)

# Quantile tables for the two alert groups, followed by their mean scores.
for quantile_heading, group_scores in (
    ("TP score quantiles:", tp_scores),
    ("\nFP score quantiles:", fp_scores),
):
    print(quantile_heading)
    print(q(group_scores))

print("\nMeans:")
for mean_tag, group_scores in (("TP mean:", tp_scores), ("FP mean:", fp_scores)):
    print(mean_tag, group_scores.mean())


In [None]:
# Density-normalised score histograms for the two alert groups, so their shapes
# are comparable even though the groups differ in size.
fig, ax = plt.subplots(figsize=(8, 4))
for density_label, alert_scores in (("TP (density)", tp_scores), ("FP (density)", fp_scores)):
    ax.hist(alert_scores, bins=40, density=True, alpha=0.6, label=density_label)
ax.set_xlabel("Anomaly score")
ax.set_ylabel("Density")
ax.set_title("Score distribution among alerts (density)")
ax.legend()
plt.show()


In [None]:
#Among flagged alerts, non-fraud transactions exhibit more extreme anomaly scores than fraud cases. 
# False positives dominate the extreme left tail of the score distribution, while true fraud cases tend to be moderately anomalous rather than extreme. 
# This indicates that extreme rarity in feature space does not uniquely correspond to fraud, and that many fraudulent transactions closely resemble normal behavior. 
# Consequently, anomaly score magnitude alone is insufficient to distinguish fraud from other rare but legitimate patterns.

In [None]:
# Caught (TP) and missed (FN) fraud, placed by time and amount.
fig, ax = plt.subplots(figsize=(8, 4))
for fraud_tag, fraud_rows in (("TP", tp), ("FN", fn)):
    ax.scatter(fraud_rows["Time"], fraud_rows["Amount"], alpha=0.6, label=fraud_tag)
ax.set_xlabel("Time")
ax.set_ylabel("Amount")
ax.legend()
ax.set_title("Fraud behavior over time")
plt.show()


In [None]:
#Fraudulent transactions occur throughout the evaluation period, with no strong temporal clustering, 
# indicating that anomaly detection performance is driven primarily by feature-space rarity rather than timing effects.

In [None]:
'''

Conclusion
1. Unsupervised anomaly detection captures fraud as a subset of rare behavior

The Isolation Forest, trained without labels, successfully identifies approximately half of fraudulent transactions. 
This indicates that a meaningful portion of fraud manifests as statistically unusual behavior relative to the broader transaction population. 
However, fraud does not constitute the most extreme anomalies in feature space.

2. Extreme anomalies are often legitimate transactions
Error analysis shows that false positives dominate the extreme left tail of the anomaly score distribution. 
Many legitimate transactions exhibit more extreme anomaly scores than actual fraud cases. 
This confirms that rarity does not imply fraud, and that unsupervised anomaly detection primarily identifies unusual structure rather than malicious intent.

3. Fraud tends to be moderately anomalous, not extreme
True fraud cases cluster closer to the anomaly threshold rather than at the most extreme scores. 
This suggests that many fraudulent transactions closely resemble normal behavior, particularly at low transaction amounts, 
which limits recall and precision when using anomaly detection alone.

4. Transaction amount influences detectability but is not decisive
Higher-value fraud is more likely to be detected, while low-amount fraud frequently blends into dense regions of normal behavior and is missed. 
However, false positives also concentrate heavily at low transaction amounts, 
indicating that amount alone does not separate fraud from non-fraud.

5. Anomaly scores are informative but not sufficient
Although anomaly score magnitude does not cleanly separate fraud from non-fraud, 
it provides useful ranking information. This supports the use of anomaly detection 
as a prioritization or filtering mechanism, rather than a standalone decision system.

'''