In [3]:
!pip install pyspark
!pip install shap
!pip install pyod
!pip install streamlit

Collecting pyod
  Downloading pyod-2.0.5-py3-none-any.whl.metadata (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.3/46.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading pyod-2.0.5-py3-none-any.whl (200 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.6/200.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyod
Successfully installed pyod-2.0.5
Collecting streamlit
  Downloading streamlit-1.50.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.50.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m88.7 MB/s[0m eta [36m0:00:00[0m
[?25hI

In [4]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

# Unsupervised ML models from PyOD
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF

# Metrics and Evaluation
from sklearn import metrics
from sklearn.metrics import (make_scorer, classification_report, confusion_matrix,
                            precision_score, recall_score, f1_score,
                            roc_auc_score, precision_recall_curve, roc_curve)

# Explainable AI
import shap

# Model storage
import joblib

# Model Deployment
import streamlit as st

# Display settings
pd.set_option('display.max_columns', None)

# Warnings
import warnings
warnings.filterwarnings('ignore')

In [7]:
import sklearn
print(sklearn.__version__)

1.6.1


### A) Data Ingestion



In [3]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"ramyavijayalayan","key":"6fbbc7bcde80b741ae1e038472f754b6"}'}

In [4]:
# Configuring Kaggle CLI
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
!kaggle datasets download -d ealaxi/paysim1
!unzip paysim1.zip

Dataset URL: https://www.kaggle.com/datasets/ealaxi/paysim1
License(s): CC-BY-SA-4.0
Downloading paysim1.zip to /content
 78% 139M/178M [00:00<00:00, 1.44GB/s]
100% 178M/178M [00:00<00:00, 1.15GB/s]
Archive:  paysim1.zip
  inflating: PS_20174392719_1491204439457_log.csv  


In [6]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('AnomalyDetectionPipeline').getOrCreate()
df_spark = spark.read.csv('/content/PS_20174392719_1491204439457_log.csv', header = True, inferSchema = True)
df_spark.createOrReplaceTempView('paysim')

### B) Exploratory Data Analysis - EDA

In [7]:
df_spark.show(10)

+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|  amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT| 9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|   1| PAYMENT| 1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
|   1|TRANSFER|   181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|   1|CASH_OUT|   181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|             0|
|   1| PAYMENT|11668.14|C2048537720|      41554.0|      29885.86|M1230701703|      

In [8]:
df_spark.count()

6362620

In [9]:
df_spark.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- isFlaggedFraud: integer (nullable = true)



In [10]:
df_spark.describe().show()

+-------+------------------+--------+-----------------+-----------+-----------------+------------------+-----------+------------------+------------------+--------------------+--------------------+
|summary|              step|    type|           amount|   nameOrig|    oldbalanceOrg|    newbalanceOrig|   nameDest|    oldbalanceDest|    newbalanceDest|             isFraud|      isFlaggedFraud|
+-------+------------------+--------+-----------------+-----------+-----------------+------------------+-----------+------------------+------------------+--------------------+--------------------+
|  count|           6362620| 6362620|          6362620|    6362620|          6362620|           6362620|    6362620|           6362620|           6362620|             6362620|             6362620|
|   mean|243.39724563151657|    NULL|179861.9035491287|       NULL|833883.1040744764| 855113.6685785812|       NULL|1100701.6665196533|1224996.3982019224|0.001290820448180152| 2.51468734577894E-6|
| stddev|142.33

In [11]:
df_spark.createOrReplaceTempView('paysim')
spark.sql("""select * from paysim group by step, type, amount, nameOrig, oldBalanceOrg, newbalanceOrig, nameDest, oldbalanceDest, newbalanceDest, isFraud, isFlaggedFraud having count(*)>1 """).show()

+----+----+------+--------+-------------+--------------+--------+--------------+--------------+-------+--------------+
|step|type|amount|nameOrig|oldbalanceOrg|newbalanceOrig|nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+----+------+--------+-------------+--------------+--------+--------------+--------------+-------+--------------+
+----+----+------+--------+-------------+--------------+--------+--------------+--------------+-------+--------------+



In [12]:
# Checking for nulls
spark.sql("""
          select sum(case when step is null then 1 else 0 end) as null_step,
               sum(case when type is null then 1 else 0 end) as null_type,
               sum(case when amount is null then 1 else 0 end) as null_amount,
               sum(case when nameOrig is null then 1 else 0 end) as null_nameOrig,
               sum(case when oldbalanceOrg is null then 1 else 0 end) as null_oldbalanceOrg,
               sum(case when newbalanceOrig is null then 1 else 0 end) as null_newbalanceOrig,
               sum(case when nameDest is null then 1 else 0 end) as null_nameDest,
               sum(case when oldbalanceDest is null then 1 else 0 end) as oldbalanceDest,
               sum(case when newbalanceDest is null then 1 else 0 end) as null_newbalanceDest,
               sum(case when isFraud is null then 1 else 0 end) as null_isFraud,
               sum(case when isFlaggedFraud is null then 1 else 0 end) as null_isFlaggedFraud
               from paysim
          """).show()

+---------+---------+-----------+-------------+------------------+-------------------+-------------+--------------+-------------------+------------+-------------------+
|null_step|null_type|null_amount|null_nameOrig|null_oldbalanceOrg|null_newbalanceOrig|null_nameDest|oldbalanceDest|null_newbalanceDest|null_isFraud|null_isFlaggedFraud|
+---------+---------+-----------+-------------+------------------+-------------------+-------------+--------------+-------------------+------------+-------------------+
|        0|        0|          0|            0|                 0|                  0|            0|             0|                  0|           0|                  0|
+---------+---------+-----------+-------------+------------------+-------------------+-------------+--------------+-------------------+------------+-------------------+



In [13]:
# Count of each category
df_spark.groupBy('type').count().orderBy('count', ascending = False).show()
df_spark.groupBy('isFlaggedFraud').count().show()
df_spark.groupBy('isFraud').count().show()

+--------+-------+
|    type|  count|
+--------+-------+
|CASH_OUT|2237500|
| PAYMENT|2151495|
| CASH_IN|1399284|
|TRANSFER| 532909|
|   DEBIT|  41432|
+--------+-------+

+--------------+-------+
|isFlaggedFraud|  count|
+--------------+-------+
|             0|6362604|
|             1|     16|
+--------------+-------+

+-------+-------+
|isFraud|  count|
+-------+-------+
|      1|   8213|
|      0|6354407|
+-------+-------+



In [14]:
# isFlaggedFraud is rule-based and doesn't help with modeling and predictions
df_spark = df_spark.drop('isFlaggedFraud')

In [15]:
df_spark.createOrReplaceTempView('paysim')

### **C) Feature Engineering**

In [16]:
from pyspark.sql.functions import col, when, lit

# Balance change for origin account
df_spark = df_spark.withColumn('balance_diff_orig', col('oldbalanceOrg') - col('newbalanceOrig'))

# Balance Change for destination account
df_spark = df_spark.withColumn('balance_diff_dest' , col('oldbalanceDest') - col('newbalanceDest'))

# Transaction Amount to Account balance ratio
df_spark = df_spark.withColumn('amount_to_balance_ratio', col('amount')/ when(col('oldbalanceOrg') !=0, col('oldbalanceOrg')).otherwise(lit(1e-6)))

# Is_Zero_balance flag
df_spark = df_spark.withColumn('is_zero_balance', when(col('oldbalanceOrg') == 0, 1).otherwise(0) )

df_spark.show(5)
df_spark.createOrReplaceTempView('paysim')

+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+------------------+-----------------+-----------------------+---------------+
|step|    type|  amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud| balance_diff_orig|balance_diff_dest|amount_to_balance_ratio|is_zero_balance|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+------------------+-----------------+-----------------------+---------------+
|   1| PAYMENT| 9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0| 9839.640000000014|              0.0|   0.057833968119621944|              0|
|   1| PAYMENT| 1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|1864.2799999999988|              0.0|    0.08773495223304625|              0|
|   1|TRANSFER|   181.0|C1305486145

In [17]:
# Time based features
from pyspark.sql.functions import floor, expr

# Hour of the day from step feature from 0 to 23
df_spark = df_spark.withColumn('hour_of_the_day', col('step') % 24)

# Day of the week from 0 to 6
df_spark = df_spark.withColumn('day_of_the_week', (floor(col('step')/24) % 7).cast('int') )

# Is_weekend
df_spark = df_spark.withColumn('is_weekend', when(col('day_of_the_week').isin([5, 6]), 1).otherwise(0) )

df_spark.createOrReplaceTempView('paysim')
df_spark.show(5)

+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+------------------+-----------------+-----------------------+---------------+---------------+---------------+----------+
|step|    type|  amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud| balance_diff_orig|balance_diff_dest|amount_to_balance_ratio|is_zero_balance|hour_of_the_day|day_of_the_week|is_weekend|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+------------------+-----------------+-----------------------+---------------+---------------+---------------+----------+
|   1| PAYMENT| 9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0| 9839.640000000014|              0.0|   0.057833968119621944|              0|              1|              0|         0|
|   1| PAYMENT| 1864.28|C1666544295|      21249.0|      

In [25]:
df_spark.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- balance_diff_orig: double (nullable = true)
 |-- balance_diff_dest: double (nullable = true)
 |-- amount_to_balance_ratio: double (nullable = true)
 |-- is_zero_balance: integer (nullable = false)
 |-- hour_of_the_day: integer (nullable = true)
 |-- day_of_the_week: integer (nullable = true)
 |-- is_weekend: integer (nullable = false)



In [39]:
from pyspark.sql import Window
from pyspark.sql.functions import col, asc, desc, lag, mean, stddev, count, min, when, lit, expr, percentile_approx

df_spark = df_spark.orderBy(asc('step'), asc('nameOrig'))
# Rolling windows
window_12 = Window.partitionBy("nameOrig").orderBy("step").rowsBetween(-12, -1)
window_6 = Window.partitionBy("nameOrig").orderBy("step").rowsBetween(-6, -1)
window_dest_12 = Window.partitionBy("nameDest").orderBy("step").rowsBetween(-12, -1)
window_dest_6 = Window.partitionBy("nameDest").orderBy("step").rowsBetween(-6, -1)

# Full group windows
group_window_orig = Window.partitionBy("nameOrig")
group_window_dest = Window.partitionBy("nameDest")


In [40]:
df = df_spark

# Rolling features
df = df.withColumn("rolling_balance_change", mean("newbalanceOrig").over(window_12))
df = df.withColumn("rolling_amount_mean_12h", mean("amount").over(window_12))
df = df.withColumn("balance_volatility_12", stddev("newbalanceOrig").over(window_12))
df = df.withColumn("dest_balance_change_12h", mean("newbalanceDest").over(window_dest_12))



In [41]:
# Account age and activity
df = df.withColumn("account_age", col("step") - min("step").over(group_window_orig))
df = df.withColumn("account_activity", count("step").over(group_window_orig))
df = df.withColumn("dest_txn_count", count("step").over(group_window_dest))

# Ratio features
df = df.withColumn("balance_recovery_ratio", col("newbalanceOrig") / (col("oldbalanceOrg") + lit(1e-6)))

# High amount flag (95th percentile)
threshold = df.approxQuantile("amount", [0.95], 0.01)[0]
df = df.withColumn("is_high_amount", (col("amount") > threshold).cast("int"))

# Transaction count in past 6 steps
df = df.withColumn("txn_count_6h", count("step").over(window_6))
df = df.withColumn("dest_txn_count_6h", count("step").over(window_dest_6))

# Flags
df = df.withColumn("is_frequent_sender", (col("txn_count_6h") > 5).cast("int"))
df = df.withColumn("is_frequent_receiver", (col("dest_txn_count_6h") > 5).cast("int"))
df = df.withColumn("is_new_account", (col("account_age") < 24).cast("int"))


In [42]:
df.cache()

DataFrame[step: int, type: string, amount: double, nameOrig: string, oldbalanceOrg: double, newbalanceOrig: double, nameDest: string, oldbalanceDest: double, newbalanceDest: double, isFraud: int, balance_diff_orig: double, balance_diff_dest: double, amount_to_balance_ratio: double, is_zero_balance: int, hour_of_the_day: int, day_of_the_week: int, is_weekend: int, rolling_balance_change: double, rolling_amount_mean_12h: double, balance_volatility_12: double, dest_balance_change_12h: double, account_age: int, account_activity: bigint, dest_txn_count: bigint, balance_recovery_ratio: double, is_high_amount: int, txn_count_6h: bigint, dest_txn_count_6h: bigint, is_frequent_sender: int, is_frequent_receiver: int, is_new_account: int]

In [43]:
df.show(5)

+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+------------------+-------------------+-----------------------+---------------+---------------+---------------+----------+----------------------+-----------------------+---------------------+-----------------------+-----------+----------------+--------------+----------------------+--------------+------------+-----------------+------------------+--------------------+--------------+
|step|    type|   amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud| balance_diff_orig|  balance_diff_dest|amount_to_balance_ratio|is_zero_balance|hour_of_the_day|day_of_the_week|is_weekend|rolling_balance_change|rolling_amount_mean_12h|balance_volatility_12|dest_balance_change_12h|account_age|account_activity|dest_txn_count|balance_recovery_ratio|is_high_amount|txn_count_6h|dest_txn_count_6h|is_frequent_sender|is_frequent_receiver|is_new_account|


In [44]:
df.select([ count(when(col(c).isNull(), c )).alias(c + "_nulls") for c in df.columns ]).show()

+----------+----------+------------+--------------+-------------------+--------------------+--------------+--------------------+--------------------+-------------+-----------------------+-----------------------+-----------------------------+---------------------+---------------------+---------------------+----------------+----------------------------+-----------------------------+---------------------------+-----------------------------+-----------------+----------------------+--------------------+----------------------------+--------------------+------------------+-----------------------+------------------------+--------------------------+--------------------+
|step_nulls|type_nulls|amount_nulls|nameOrig_nulls|oldbalanceOrg_nulls|newbalanceOrig_nulls|nameDest_nulls|oldbalanceDest_nulls|newbalanceDest_nulls|isFraud_nulls|balance_diff_orig_nulls|balance_diff_dest_nulls|amount_to_balance_ratio_nulls|is_zero_balance_nulls|hour_of_the_day_nulls|day_of_the_week_nulls|is_weekend_nulls|rollin

In [49]:
df.groupBy('nameOrig').count().filter('count < 6').count()

6353307

In [50]:
# Imputating the nulls as here they mean users with limited or fewer transactions and not noise, and they could potentially be fraudulent too
df = df.fillna({
    "rolling_balance_change": -1,
    "rolling_amount_mean_12h": -1,
    "balance_volatility_12": -1,
    "dest_balance_change_12h": -1
})

In [52]:
df.select([ count(when(col(c).isNull(), c )).alias(c + "_nulls") for c in df.columns ]).show()

+----------+----------+------------+--------------+-------------------+--------------------+--------------+--------------------+--------------------+-------------+-----------------------+-----------------------+-----------------------------+---------------------+---------------------+---------------------+----------------+----------------------------+-----------------------------+---------------------------+-----------------------------+-----------------+----------------------+--------------------+----------------------------+--------------------+------------------+-----------------------+------------------------+--------------------------+--------------------+
|step_nulls|type_nulls|amount_nulls|nameOrig_nulls|oldbalanceOrg_nulls|newbalanceOrig_nulls|nameDest_nulls|oldbalanceDest_nulls|newbalanceDest_nulls|isFraud_nulls|balance_diff_orig_nulls|balance_diff_dest_nulls|amount_to_balance_ratio_nulls|is_zero_balance_nulls|hour_of_the_day_nulls|day_of_the_week_nulls|is_weekend_nulls|rollin

#### Downsampling the dataset for modeling



In [56]:
from pyspark.sql.functions import col

fraud_df = df.filter('isFraud == 1')

# Defining quantiles for splitting the data based on time (step)
q1, q2 = df.approxQuantile("step", [0.33, 0.66], 0.01)

# Slicing non-fraud data to include data from all three quantiles
early = df.filter((col("isFraud") == 0) & (col("step") <= q1))
mid   = df.filter((col("isFraud") == 0) & (col("step") > q1) & (col("step") <= q2))
late  = df.filter((col("isFraud") == 0) & (col("step") > q2))

# Sampling from each slice
target_count = fraud_df.count() * 100
early_sample = early.sample(False, 0.33 * target_count / early.count())
mid_sample   = mid.sample(False, 0.33 * target_count / mid.count())
late_sample  = late.sample(False, 0.34 * target_count / late.count())

# Combine with frauds
nonfraud_sampled = early_sample.union(mid_sample).union(late_sample)
downsampled_df = fraud_df.union(nonfraud_sampled)


In [57]:
downsampled_df.count()

829234

In [58]:
df_pandas = downsampled_df.toPandas()
df_pandas.isnull().sum()

Unnamed: 0,0
step,0
type,0
amount,0
nameOrig,0
oldbalanceOrg,0
newbalanceOrig,0
nameDest,0
oldbalanceDest,0
newbalanceDest,0
isFraud,0


#### Categorical Encoding - One Hot Encoding
##### As the distance based algorithms such as KNN, LoF may misinterpret integer categories as weighted or ordered

In [59]:
# Using One Hot Encoding to encode categorical variables
df_encoded = pd.get_dummies(df_pandas, columns = ['type'], dtype = int)

df_encoded.head()

Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,balance_diff_orig,balance_diff_dest,amount_to_balance_ratio,is_zero_balance,hour_of_the_day,day_of_the_week,is_weekend,rolling_balance_change,rolling_amount_mean_12h,balance_volatility_12,dest_balance_change_12h,account_age,account_activity,dest_txn_count,balance_recovery_ratio,is_high_amount,txn_count_6h,dest_txn_count_6h,is_frequent_sender,is_frequent_receiver,is_new_account,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,132842.64,C13692003,4499.08,0.0,C297927961,0.0,132842.64,1,4499.08,-132842.64,29.52662,0,1,0,0,-1.0,-1.0,-1.0,115416.5,0,1,42,0.0,0,0,2,0,0,1,0,1,0,0,0
1,1,416001.33,C749981943,0.0,0.0,C667346055,102.0,9291619.62,1,0.0,-9291517.62,416001300000.0,1,1,0,0,-1.0,-1.0,-1.0,10227530.0,0,1,86,0.0,0,0,3,0,0,1,0,1,0,0,0
2,1,25071.46,C669700766,25071.46,0.0,C1384210339,0.0,0.0,1,25071.46,0.0,1.0,0,1,0,0,-1.0,-1.0,-1.0,-1.0,0,1,10,0.0,0,0,0,0,0,1,0,0,0,0,1
3,1,2806.0,C2101527076,2806.0,0.0,C1007251739,26202.0,0.0,1,2806.0,26202.0,1.0,0,1,0,0,-1.0,-1.0,-1.0,-1.0,0,1,35,0.0,0,0,0,0,0,1,0,1,0,0,0
4,1,1277212.77,C467632528,1277212.77,0.0,C716083600,0.0,2444985.19,1,1277212.77,-2444985.19,1.0,0,1,0,0,-1.0,-1.0,-1.0,2444985.0,0,1,89,0.0,1,0,2,0,0,1,0,1,0,0,0


In [60]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 829502 entries, 0 to 829501
Data columns (total 35 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   step                     829502 non-null  int32  
 1   amount                   829502 non-null  float64
 2   nameOrig                 829502 non-null  object 
 3   oldbalanceOrg            829502 non-null  float64
 4   newbalanceOrig           829502 non-null  float64
 5   nameDest                 829502 non-null  object 
 6   oldbalanceDest           829502 non-null  float64
 7   newbalanceDest           829502 non-null  float64
 8   isFraud                  829502 non-null  int32  
 9   balance_diff_orig        829502 non-null  float64
 10  balance_diff_dest        829502 non-null  float64
 11  amount_to_balance_ratio  829502 non-null  float64
 12  is_zero_balance          829502 non-null  int32  
 13  hour_of_the_day          829502 non-null  int32  
 14  day_

#### Feature Scaling - on the training data to avoid data leakage

In [61]:
X = df_encoded.drop(['nameOrig', 'nameDest', 'isFraud'], axis = 1)
y = df_encoded['isFraud']   # only for evaluation

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 42)

In [63]:
# Scale the numerical features in the training dataset
numeric_cols = [col for col in X.columns if X[col].nunique() >2]
binary_cols = [col for col in X.columns if X[col].nunique() <=2]

scaler = StandardScaler()
X_train_num = pd.DataFrame( scaler.fit_transform(X_train[numeric_cols]), columns = numeric_cols, index = X_train.index )  # as scaler returns a NumPy Array
X_train_scaled = pd.concat([X_train_num, X_train[binary_cols] ], axis = 1)

X_test_num = pd.DataFrame( scaler.transform(X_test[numeric_cols]), columns = numeric_cols, index = X_test.index )
X_test_scaled = pd.concat([X_test_num, X_test[binary_cols] ], axis = 1)

### **D) Model Training and Evaluation**

#### Unsupervised ML Algorithms used - Isolation Forest, K-Nearest Neighbours and LOF (Local Outlier Factor)

In [64]:
# Defining a Scoring function for GridSearchCV to score the model for identifying the best parameters
def anomaly_recall(y_test, y_pred):
  return recall_score(y_test, y_pred)

scorer = make_scorer(anomaly_recall)

In [65]:
# Defining hyper-parameter grids

iforest_params = {'n_estimators': [100, 200],
    'max_samples': ['auto', 0.8], 'contamination': [0.01, 0.02]}

knn_params = {'n_neighbors': [5, 10], 'method': ['largest', 'median'],
              'contamination': [0.1, 0.2] }

lof_params = { 'n_neighbors': [10, 20],
               'contamination': [0.1, 0.2],
               'metric':['euclidean']}

In [66]:
# Model Training
iforest_grid = GridSearchCV(IForest(random_state = 42, n_jobs =-1), iforest_params, scoring=scorer, cv=3)
iforest_grid.fit(X_train_scaled, y_train)
print("Best IForest Params:", iforest_grid.best_params_)

Best IForest Params: {'contamination': 0.02, 'max_samples': 0.8, 'n_estimators': 100}


#### Downsampling the data further to feed into KNN and LOF and evaluate their performance

In [76]:
fraud_downsampled = df_encoded[df_encoded['isFraud'] == 1].sample(n = 150, random_state = 42)  # To preserve the original fraud:non-fraud ratio of 1:100
nonfraud_downsampled = df_encoded[df_encoded['isFraud'] == 0].sample(n = 150000, random_state = 42)
df_reduced = pd.concat([fraud_downsampled, nonfraud_downsampled], axis = 0)

X_reduced = df_reduced.drop('isFraud', axis = 1)
y_reduced = df_reduced['isFraud']
X_train_red, X_test_red, y_train_red, y_test_red = train_test_split(X_reduced, y_reduced, test_size = 0.2, stratify= y_reduced, random_state = 42)

scaler_downsampled = StandardScaler()
X_train_red_scaled =  pd.concat( [pd.DataFrame(scaler_downsampled.fit_transform(X_train_red[numeric_cols]), columns = numeric_cols, index = X_train_red.index ), X_train_red[binary_cols]] , axis = 1)
X_test_red_scaled = pd.concat( [pd.DataFrame(scaler_downsampled.transform(X_test_red[numeric_cols]), columns = numeric_cols, index = X_test_red.index), X_test_red[binary_cols]] , axis = 1 )


In [69]:
knn_grid = GridSearchCV(KNN(n_jobs = -1), knn_params, scoring=scorer, cv=3)
knn_grid.fit(X_train_red_scaled[:10000], y_train_red[:10000])
print("Best KNN Params:", knn_grid.best_params_)

Best KNN Params: {'contamination': 0.2, 'method': 'largest', 'n_neighbors': 5}


In [70]:
lof_grid = GridSearchCV(LOF(n_jobs = -1), lof_params, scoring=scorer, cv=3)
lof_grid.fit(X_train_red_scaled[:10000], y_train_red[:10000])
print("Best LOF Params:", lof_grid.best_params_)

Best LOF Params: {'contamination': 0.2, 'metric': 'euclidean', 'n_neighbors': 10}


In [81]:
# Model Predictions on Test set

#iforest_preds = iforest_grid.predict(X_test_scaled)
iforest_anomaly_scores = iforest_grid.decision_function(X_test_scaled)
threshold = np.percentile(iforest_anomaly_scores, 75)
iforest_preds = (iforest_anomaly_scores > threshold).astype(int)

In [78]:
knn_preds = knn_grid.predict(X_test_red_scaled)
knn_anomaly_scores = knn_grid.decision_function(X_test_red_scaled)

In [79]:
lof_preds = lof_grid.predict(X_test_red_scaled)
lof_anomaly_scores = lof_grid.decision_function(X_test_red_scaled)

#### Model Evaluation and Benchmarking

In [82]:
# Classification Report and Confusion Matrix for each model

reports = {}

for grid, name in zip([iforest_grid, knn_grid, lof_grid], ['Isolation Forest', 'K-Nearest Neighbors', 'Local Outlier Factor']):
  if name == 'Isolation Forest':
    preds = iforest_preds
    y_true = y_test
    X_eval = X_test_scaled
  else:
    preds = grid.predict(X_test_red_scaled)
    y_true = y_test_red
    X_eval = X_test_red_scaled

  class_report = pd.DataFrame(classification_report(y_true, preds, output_dict=True)).transpose()
  conf_matrix = confusion_matrix(y_true, preds)
  anomaly_score = grid.decision_function(X_eval)
  reports[name] = { 'report':  class_report,
                    'confusion_matrix' : conf_matrix,
                    'anomaly_scores': anomaly_score}
  print(f'Classfication Report of {name}: \n {class_report}')
  print(f'The Confusion Matrix for {name}: \n {conf_matrix}' )
  print('='*60)





Classfication Report of Isolation Forest: 
               precision    recall  f1-score        support
0              0.998264  0.756188  0.860526  164258.000000
1              0.034406  0.868533  0.066190    1643.000000
accuracy       0.757301  0.757301  0.757301       0.757301
macro avg      0.516335  0.812361  0.463358  165901.000000
weighted avg   0.988718  0.757301  0.852659  165901.000000
The Confusion Matrix for Isolation Forest: 
 [[124210  40048]
 [   216   1427]]
Classfication Report of K-Nearest Neighbors: 
               precision    recall  f1-score      support
0              0.999959  0.803567  0.891070  30000.00000
1              0.004897  0.966667  0.009745     30.00000
accuracy       0.803730  0.803730  0.803730      0.80373
macro avg      0.502428  0.885117  0.450407  30030.00000
weighted avg   0.998964  0.803730  0.890189  30030.00000
The Confusion Matrix for K-Nearest Neighbors: 
 [[24107  5893]
 [    1    29]]
Classfication Report of Local Outlier Factor: 
       

### **E) Saving the models and the features for deployment on Streamlit**

In [85]:
joblib.dump(iforest_grid, "iforest_model.pkl")
joblib.dump(knn_grid, "knn_model.pkl")
joblib.dump(lof_grid, "lof_model.pkl")

['lof_model.pkl']

In [86]:
# Download each model file
files.download("iforest_model.pkl")
files.download("knn_model.pkl")
files.download("lof_model.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [87]:
# Downloading the test data for displaying the top N fraud cases
X_test_display = X_test.drop(columns = [col for col in X_test.columns if 'type' in col])
X_test_display["true_label"] = y_test
X_test_display["nameOrig"] = df_pandas.loc[X_test.index, "nameOrig"]
X_test_display["nameDest"] = df_pandas.loc[X_test.index, "nameDest"]
X_test_display["type"] = df_pandas.loc[X_test.index, "type"]

# Save for Streamlit
X_test_display.to_csv("X_test_display.csv", index=False)


In [91]:
# downloading the numpy scaled test and training data to serve the model on streamlit

np.save('X_test_scaled.npy', X_test_scaled)
np.save('X_train_scaled.npy', X_train_scaled)
np.save('y_test.npy', y_test)
np.save('y_train.npy', y_train)

In [92]:
files.download("X_test_display.csv")
files.download("X_test_scaled.npy")
files.download('X_train_scaled.npy')
files.download('y_test.npy')
files.download('y_train.npy')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

---

_This notebook was created and authored by Ramya Vijayalayan for educational and portfolio use only._  
© 2025 Ramya | [github portfolio](https://github.com/ramyavijayalayan10)