In [1]:
!pip install pandas numpy scikit-learn matplotlib




In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score

import matplotlib.pyplot as plt


In [3]:
# Load the raw transaction table from the CSV that sits next to this notebook.
df = pd.read_csv(filepath_or_buffer="creditcard.csv")

# Report the (rows, columns) dimensions, then preview the first five records.
print("Shape:", df.shape)
df.head(n=5)

Shape: (13954, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [4]:
# Schema overview: dtype and non-null count of every column.
df.info()

# Label distribution. NaN labels are left out of the tally (dropna=True is the
# pandas default), so the counts can sum to less than the number of rows.
df["Class"].value_counts(dropna=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13954 entries, 0 to 13953
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    13954 non-null  int64  
 1   V1      13954 non-null  float64
 2   V2      13954 non-null  float64
 3   V3      13954 non-null  float64
 4   V4      13954 non-null  float64
 5   V5      13954 non-null  float64
 6   V6      13954 non-null  float64
 7   V7      13954 non-null  float64
 8   V8      13954 non-null  float64
 9   V9      13954 non-null  float64
 10  V10     13954 non-null  float64
 11  V11     13954 non-null  float64
 12  V12     13954 non-null  float64
 13  V13     13954 non-null  float64
 14  V14     13954 non-null  float64
 15  V15     13954 non-null  float64
 16  V16     13954 non-null  float64
 17  V17     13954 non-null  float64
 18  V18     13954 non-null  float64
 19  V19     13954 non-null  float64
 20  V20     13954 non-null  float64
 21  V21     13954 non-null  float64
 22

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,13897
1.0,56


In [5]:
# Separate features and target
X = df.drop(columns=["Class"])
y_true = df["Class"]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
# Unsupervised anomaly detector: it is fitted on the scaled features only and
# never sees the `Class` labels.
iso = IsolationForest(
    n_estimators=100,
    contamination=0.01,   # assume ~1% fraud
    random_state=42       # fixed seed so reruns flag the same rows
)

# fit_predict labels every row: -1 = anomaly (fraud), 1 = normal
df["anomaly"] = iso.fit_predict(X_scaled)

# Convert to fraud label (1 = flagged as fraud, 0 = normal) with a single
# vectorised comparison instead of calling a Python lambda once per row.
df["predicted_fraud"] = (df["anomaly"] == -1).astype("int64")

df["predicted_fraud"].value_counts()

Unnamed: 0_level_0,count
predicted_fraud,Unnamed: 1_level_1
0,13814
1,140


In [7]:
# How many rows have no ground-truth label?
df["Class"].isnull().sum()

np.int64(1)

In [8]:
# Keep only the rows that carry a ground-truth label, then re-read the target
# from the filtered frame so it stays row-aligned with the prediction column.
df = df.loc[df["Class"].notna()]
y_true = df["Class"]

In [9]:
# Precision = share of flagged transactions whose true label is fraud.
precision = precision_score(y_true=y_true, y_pred=df["predicted_fraud"])
print("Precision score:", precision)

Precision score: 0.2857142857142857


In [10]:
# Collect every transaction the model flagged as fraudulent.
fraud_cases = df.loc[df["predicted_fraud"].eq(1)]

# Report how many rows were flagged, then preview the first few of them.
print("Number of detected fraud cases:", len(fraud_cases))
fraud_cases.head()

Number of detected fraud cases: 140


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,anomaly,predicted_fraud
164,103,-6.093248,-12.114213,-5.694973,3.294389,-1.413792,4.776,4.808426,-0.228197,-0.525896,...,-3.553381,1.215279,-0.406073,-0.653384,-0.711222,0.6729,3828.04,0.0,-1,1
401,290,-5.166299,-5.449369,2.988498,2.658991,1.948152,-0.85447,-0.326394,-1.017364,1.983901,...,3.150413,0.574081,1.018394,0.987099,0.658283,-1.609716,85.0,0.0,-1,1
1158,905,-6.169664,6.11894,-1.667775,-1.206991,-0.021373,-1.208458,2.184989,-1.512188,6.450992,...,0.136166,0.41338,0.666632,-0.530568,2.464962,0.365892,0.89,0.0,-1,1
1388,1077,-3.936794,-3.670519,-1.45382,2.29975,-9.74944,6.45641,10.784088,-2.160016,-0.238116,...,2.44196,0.250607,0.300908,1.157867,1.624284,-1.638647,2452.03,0.0,-1,1
1632,1264,-11.140706,-9.612726,-12.389545,6.013346,-32.092129,21.393069,34.303177,-7.520784,-1.925732,...,-2.925888,0.843551,0.746267,0.801387,3.852046,4.157934,7712.43,0.0,-1,1


In [11]:
# Keep just the columns needed to review a flagged transaction.
fraud_cases_clean = fraud_cases.loc[
    :, ["Time", "Amount", "Class", "predicted_fraud"]
]

# Write the flagged transactions to disk without the DataFrame index.
fraud_cases_clean.to_csv(path_or_buf="detected_fraud_cases.csv", index=False)

Results

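Total transactions analyzed: 13,954 loaded, of which 13,953 were scored (1 row with a missing `Class` label was dropped before evaluation)  
Detected anomalous transactions: 140  
Precision score: 0.2857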

This indicates that approximately 29% of flagged transactions (40 of 140) were actual fraud cases; those 40 hits cover roughly 71% of the 56 labeled fraud cases in the data.


Conclusion

The Isolation Forest model successfully identified anomalous credit card
transactions without requiring labeled training data. Although false positives
exist, such behavior is expected in unsupervised anomaly detection and is
acceptable for early-stage fraud screening systems.
