In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split


# Generate synthetic data (Credit card transactions scenario)

In [2]:
# Step 1: Generate synthetic data (Credit card transactions scenario)
# Generating 100 normal transactions (2 features: 'amount', 'transaction_frequency') and 10 fraudulent transactions
np.random.seed(42)

## Normal transactions

In [4]:
# Normal transactions: Amounts around $50, and frequency of transactions between 1 to 5 times per day
normal_transactions = np.random.normal(loc=50, scale=10, size=(100, 2))  # 100 normal transactions
normal_transactions

array([[53.5778736 , 55.60784526],
       [60.83051243, 60.53802052],
       [36.22330632, 40.6217496 ],
       [55.15035267, 55.13785951],
       [55.15047686, 88.52731491],
       [55.70890511, 61.3556564 ],
       [59.54001763, 56.51391251],
       [46.84730755, 57.5896922 ],
       [42.27174785, 47.63181393],
       [45.14636452, 50.81874139],
       [73.14658567, 31.32734807],
       [56.8626019 , 33.87284129],
       [45.28068134, 60.88950597],
       [50.64280019, 39.22255222],
       [42.84696291, 56.79597749],
       [42.69633368, 52.1645859 ],
       [50.4557184 , 43.48399652],
       [71.43944089, 56.33919022],
       [29.74857413, 51.86454315],
       [43.38213535, 58.52433335],
       [42.07479262, 48.85263559],
       [55.04987279, 58.65755194],
       [37.99703593, 46.65498764],
       [45.25054689, 43.46670767],
       [67.6545424 , 54.04981711],
       [37.39116046, 59.17861947],
       [71.22156197, 60.32465261],
       [34.80630034, 45.15765927],
       [62.66911149,

## Fraudulent transactions

In [27]:
# Fraudulent transactions: Randomly large amounts and high frequency (outliers)
fraudulent_transactions = np.random.uniform(low=500, high=5000, size=(10, 2))  # 10 fraudulent transactions
fraudulent_transactions

array([[1011.0091649 , 3522.07938017],
       [2841.38465407, 3975.43276281],
       [2840.735755  , 4334.81675143],
       [2983.58077449, 3024.22087191],
       [4444.94121196, 2315.67289796],
       [1103.06852803,  629.52204341],
       [3898.11765053, 3291.39298109],
       [3668.35895645, 1458.33872679],
       [1113.67164014,  565.45099551],
       [2077.64401463, 3154.62959085]])

## Combined dataset (normal + fraudulent)

In [6]:

# Combine normal and fraudulent transactions
# The first 100 rows are normal transactions, and the last 10 rows are fraudulent transactions
X = np.vstack([normal_transactions, fraudulent_transactions])

In [14]:
X

array([[  53.5778736 ,   55.60784526],
       [  60.83051243,   60.53802052],
       [  36.22330632,   40.6217496 ],
       [  55.15035267,   55.13785951],
       [  55.15047686,   88.52731491],
       [  55.70890511,   61.3556564 ],
       [  59.54001763,   56.51391251],
       [  46.84730755,   57.5896922 ],
       [  42.27174785,   47.63181393],
       [  45.14636452,   50.81874139],
       [  73.14658567,   31.32734807],
       [  56.8626019 ,   33.87284129],
       [  45.28068134,   60.88950597],
       [  50.64280019,   39.22255222],
       [  42.84696291,   56.79597749],
       [  42.69633368,   52.1645859 ],
       [  50.4557184 ,   43.48399652],
       [  71.43944089,   56.33919022],
       [  29.74857413,   51.86454315],
       [  43.38213535,   58.52433335],
       [  42.07479262,   48.85263559],
       [  55.04987279,   58.65755194],
       [  37.99703593,   46.65498764],
       [  45.25054689,   43.46670767],
       [  67.6545424 ,   54.04981711],
       [  37.39116046,   

In [25]:
dimensions = X.shape
rows, columns = dimensions

print("Rows:", rows)
print("Columns:", columns)

Rows: 110
Columns: 2


# True labels

In [30]:
# True labels (0: Normal, 1: Fraudulent)
y_true = np.array([0]*100 + [1]*10)  # 100 normal, 10 fraudulent
y_true

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [34]:
y_true_dimensions = y_true.shape
y_true_dimensions

(110,)

# Isolation Forest for Anomaly Detection

In [41]:
# Step 2: Fit Isolation Forest for anomaly detection
model = IsolationForest(contamination=0.1, random_state=42)
model

In [42]:
# Explanation of IsolationForest parameters

print("=== CONTAMINATION PARAMETER ===")
print("The 'contamination' parameter specifies the expected proportion of outliers in the dataset.")
print(f"In our model: contamination=0.1 means we expect 10% of the data to be anomalies/outliers.")
print(f"With {len(X)} total transactions, this means the model expects {int(len(X) * 0.1)} fraudulent transactions.")
print(f"Our actual fraudulent transactions: {sum(y_true)} out of {len(y_true)}")
print(f"Actual contamination rate: {sum(y_true)/len(y_true):.1%}")
print()

print("=== RANDOM_STATE PARAMETER ===")
print("The 'random_state' parameter controls the randomness of the algorithm.")
print("- IsolationForest uses random sampling to build isolation trees")
print("- Setting random_state=42 ensures reproducible results")
print("- Every time you run the code, you'll get the same predictions")
print("- Without random_state, results would vary between runs")
print()

print("=== WHY THESE VALUES? ===")
print("• contamination=0.1: Matches our synthetic data (10 fraudulent out of 110 total)")
print("• random_state=42: Common choice for reproducibility (42 is arbitrary but conventional)")

=== CONTAMINATION PARAMETER ===
The 'contamination' parameter specifies the expected proportion of outliers in the dataset.
In our model: contamination=0.1 means we expect 10% of the data to be anomalies/outliers.
With 110 total transactions, this means the model expects 11 fraudulent transactions.
Our actual fraudulent transactions: 10 out of 110
Actual contamination rate: 9.1%

=== RANDOM_STATE PARAMETER ===
The 'random_state' parameter controls the randomness of the algorithm.
- IsolationForest uses random sampling to build isolation trees
- Setting random_state=42 ensures reproducible results
- Every time you run the code, you'll get the same predictions
- Without random_state, results would vary between runs

=== WHY THESE VALUES? ===
• contamination=0.1: Matches our synthetic data (10 fraudulent out of 110 total)
• random_state=42: Common choice for reproducibility (42 is arbitrary but conventional)


## Predicted dataset
* y values are -1 for anomalies (fraudulent) and 1 for normal data

In [39]:
y_pred = model.fit_predict(X)
y_pred

array([ 1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1])

In [43]:
y_pred_dimensions = y_pred.shape
y_pred_dimensions

(110,)

## Convert predicted dataset to binary labels
* Convert the output of fit_predict to binary labels (1: Fraudulent, 0: Normal)
* IsolationForest outputs -1 for anomalies (fraudulent) and 1 for normal data

In [36]:
# Convert the output of fit_predict to binary labels (1: Fraudulent, 0: Normal)
# IsolationForest outputs -1 for anomalies (fraudulent) and 1 for normal data
y_pred = np.where(y_pred == -1, 1, 0)
y_pred 

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

# Generate confusion matrix

In [37]:
# Step 3: Generate confusion matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)



Confusion Matrix:
[[99  1]
 [ 0 10]]


In [None]:
# Explain the results of the Isolation Forest anomaly detection

print("=== ISOLATION FOREST RESULTS EXPLANATION ===\n")

print("1. CONFUSION MATRIX BREAKDOWN:")
print(f"   True Negatives (TN): {conf_matrix[0,0]} - Normal transactions correctly identified as normal")
print(f"   False Positives (FP): {conf_matrix[0,1]} - Normal transactions incorrectly flagged as fraudulent")
print(f"   False Negatives (FN): {conf_matrix[1,0]} - Fraudulent transactions missed (identified as normal)")
print(f"   True Positives (TP): {conf_matrix[1,1]} - Fraudulent transactions correctly identified as fraudulent")
print()

print("2. MODEL PERFORMANCE:")
accuracy = (conf_matrix[0,0] + conf_matrix[1,1]) / conf_matrix.sum()
precision = conf_matrix[1,1] / (conf_matrix[1,1] + conf_matrix[0,1])
recall = conf_matrix[1,1] / (conf_matrix[1,1] + conf_matrix[1,0])
f1_score = 2 * (precision * recall) / (precision + recall)

print(f"   Accuracy: {accuracy:.1%} - Overall correct predictions")
print(f"   Precision: {precision:.1%} - Of all fraud predictions, how many were actually fraud")
print(f"   Recall: {recall:.1%} - Of all actual fraud cases, how many were detected")
print(f"   F1-Score: {f1_score:.3f} - Harmonic mean of precision and recall")
print()

print("3. BUSINESS IMPACT:")
print(f"   • Successfully detected ALL {conf_matrix[1,1]} fraudulent transactions (100% recall)")
print(f"   • Only {conf_matrix[0,1]} false alarm out of {conf_matrix[0,0] + conf_matrix[0,1]} normal transactions")
print(f"   • False positive rate: {conf_matrix[0,1]/(conf_matrix[0,0] + conf_matrix[0,1]):.1%}")
print()

print("4. WHY THE MODEL WORKED WELL:")
print("   • Clear separation between normal (~$50) and fraudulent ($500-$5000) transaction amounts")
print("   • Isolation Forest effectively isolated the high-value outliers")
print("   • Contamination parameter (0.1) matched the actual fraud rate in our data")
print("   • The synthetic data had distinct patterns making anomaly detection easier")

In [38]:
# Step 4: Additional metrics
print("\nClassification Report:")
print(classification_report(y_true, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       100
           1       0.91      1.00      0.95        10

    accuracy                           0.99       110
   macro avg       0.95      0.99      0.97       110
weighted avg       0.99      0.99      0.99       110

