In [None]:
# pandas and numpy handle data loading and manipulation;
# matplotlib and seaborn handle visualization.
# These are foundational tools for loading, manipulating, summarizing, and visualizing datasets in almost every ML workflow.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
# Silence every Python warning for the rest of the session (note: this also hides
# deprecation notices) and apply seaborn's "whitegrid" theme to all later plots.
warnings.simplefilter("ignore")
sns.set_theme(style="whitegrid")

In [None]:
# Read the transactions CSV into `df`, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer="./data/AIML Dataset.csv")
df.head(n=5)

In [None]:
# Print the DataFrame summary: one line per column with its dtype and non-null count,
# plus overall memory usage.
# Use it to spot missing values and columns whose dtype is not what you expect
# (e.g. numbers parsed as object).
df.info()

In [None]:
# Show the column labels of `df` so later cells can reference them by exact name.
df.columns

In [None]:
# Tally how many rows fall into each value of the target column 'isFraud'.
# The gap between the two counts is the class imbalance, which drives the choice of
# model, evaluation metrics, and any resampling strategy.
df.loc[:, "isFraud"].value_counts()

In [None]:
# Tally the values of the 'isFlaggedFraud' column to see how often that flag is set.
df.loc[:, "isFlaggedFraud"].value_counts()

In [None]:
# Total number of missing cells across the entire DataFrame (per-column counts, summed).
# A non-zero result means imputation or row/column removal must be decided before modeling.
df.isna().sum().sum()

In [None]:
# Dataset dimensions as (rows, columns).
# Size affects training time, memory use, and how stable the statistics are.
(len(df.index), len(df.columns))

In [None]:
# Fraud prevalence: percentage of rows with isFraud == 1, rounded to 2 decimals.
# The mean of the boolean mask equals (number of fraud rows) / (total rows), and it
# returns 0.0 instead of raising KeyError when the data contains no fraud rows.
# The figure quantifies the imbalance and informs resampling and metric choices
# (e.g. precision/recall rather than accuracy).
round((df["isFraud"] == 1).mean() * 100, 2)

In [None]:
# Bar chart of how many transactions there are of each type.
# Dominant and rare types are visible at a glance, which helps when deciding how to
# encode or select this feature.
df["type"].value_counts().plot.bar(title="Transaction Types", color="skyblue")
plt.ylabel("Count")
plt.xlabel("Transaction Type")
plt.show()

In [None]:
# Fraud rate per transaction type: the mean of isFraud within each type, highest first.
# The bar chart shows which transaction types carry the most risk.
fraud_by_type = df["isFraud"].groupby(df["type"]).mean().sort_values(ascending=False)
fraud_by_type.plot.bar(title="Fraud Rate by Transaction Type", color="salmon")
plt.ylabel("Fraud Rate")
plt.show()

In [None]:
# Summary statistics for `amount` (count, mean, std, min, quartiles, max), cast to
# integers so the large values are easier to read.
# Use it to judge scale and outliers before choosing transforms (log, clipping) or scaling.
df.loc[:, "amount"].describe().astype(int)

In [None]:
# Histogram (100 bins) with a KDE overlay of log1p(amount).
# Transaction amounts are typically right-skewed; the log scale compresses the long
# tail so the bulk of the distribution is visible.
log_amount = np.log1p(df["amount"])
sns.histplot(log_amount, bins=100, kde=True, color="purple")
plt.xlabel("Log(Amount + 1)")
plt.title("Log-Transformed Transaction Amount Distribution")
plt.show()

In [None]:
# Boxplot of amount by fraud status, restricted to amounts below AMOUNT_CAP so that
# extreme outliers do not flatten the boxes.
# The cap is defined once and interpolated into the title, so the filter and the
# label always agree.
# Compares typical amounts for fraud vs non-fraud transactions.
AMOUNT_CAP = 50000
sns.boxplot(data=df[df["amount"] < AMOUNT_CAP], x="isFraud", y="amount")
plt.title(f"Transaction Amount Distribution by Fraud Status (Amount < {AMOUNT_CAP:,})")
plt.ylabel("Amount")
plt.show()

In [None]:
# Add two engineered columns describing how each side's balance moved:
#   balanceDiffOrig = old origin balance - new origin balance (positive when money left)
#   balanceDiffDest = new destination balance - old destination balance (positive when money arrived)
# NOTE(review): these columns are not listed in `numeric` further down, so the
# pipeline's remainder="drop" discards them before training — confirm that is intended.
df["balanceDiffOrig"] = df["oldbalanceOrg"].sub(df["newbalanceOrig"])
df["balanceDiffDest"] = df["newbalanceDest"].sub(df["oldbalanceDest"])

In [None]:
# Number of rows where balanceDiffOrig is negative, i.e. the origin account's balance
# went UP after the transaction.
# Such rows point to data inconsistencies or unusual behaviour worth cleaning or flagging.
df["balanceDiffOrig"].lt(0).sum()

In [None]:
# Number of rows where balanceDiffDest is negative, i.e. the destination account's
# balance went DOWN after the transaction.
# A consistency check that surfaces suspicious records.
df["balanceDiffDest"].lt(0).sum()

In [None]:
# Preview the first two rows to confirm the balanceDiffOrig / balanceDiffDest columns were added.
df.head(2)

In [None]:
# Line plot of the number of fraud rows at each value of `step`, in step order.
# Peaks or trends here can motivate time-based features or a time-aware train/test split.
frauds_per_step = df.loc[df["isFraud"] == 1, "step"].value_counts().sort_index()
plt.plot(frauds_per_step.index, frauds_per_step.to_numpy(), label="Frauds per Step")
plt.xlabel("Time Step")
plt.ylabel("Number of Frauds")
plt.title("Number of Frauds Over Time Steps")
plt.grid(True)
plt.show()

In [None]:
# `step` was only needed for the temporal plot above; the model below does not list it
# among its features, so drop it here.
# Reassigning with errors="ignore" keeps this cell idempotent: re-running it after
# `step` is already gone is a no-op instead of a KeyError.
df = df.drop(columns="step", errors="ignore")

In [None]:
# Preview the first five rows to confirm `step` is no longer among the columns.
df.head()

In [None]:
# The ten sender IDs (nameOrig) that appear most often, with their transaction counts.
# Heavy senders may be hubs, bots, or anomalies; the counts also help decide whether to
# aggregate these high-cardinality string IDs rather than encode them directly.
top_senders = df["nameOrig"].value_counts().iloc[:10]

In [None]:
# Display the ten most frequent sender IDs computed in the previous cell.
top_senders

In [None]:

# The ten receiver IDs (nameDest) that appear most often, with their transaction counts.
top_receivers = df["nameDest"].value_counts().iloc[:10]
top_receivers

In [None]:
# Purpose: the ten sender IDs that appear most often among fraud rows (isFraud == 1).
# Why it's needed: highlights accounts disproportionately involved in fraud, which could
# feed a categorical feature or a blacklist.
fraud_users = df.loc[df["isFraud"] == 1, "nameOrig"].value_counts().iloc[:10]
fraud_users

In [None]:
# Purpose: keep only rows whose type is TRANSFER or CASH_OUT.
# Why it's needed: these types are commonly associated with fraud.
# Note: despite its name, `fraud_types` holds both fraud and non-fraud rows of those types.
fraud_types = df.loc[(df["type"] == "TRANSFER") | (df["type"] == "CASH_OUT")]

In [None]:
# Preview the first five TRANSFER / CASH_OUT rows.
fraud_types.head()

In [None]:
# Purpose: count fraud vs non-fraud rows for each of the selected transaction types,
# with one bar per isFraud value.
# Why it's needed: shows how the fraud share differs between TRANSFER and CASH_OUT.
sns.countplot(x="type", hue="isFraud", data=fraud_types)
plt.title("Fraud Counts by Transaction Type")
plt.show()

In [None]:
# Purpose: pairwise Pearson correlations between the amount/balance columns and the target.
# Why it's needed: highlights which numeric features move linearly with isFraud and which
# features are strongly correlated with each other (multicollinearity).
corr = df.loc[
    :,
    ["amount", "oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest", "isFraud"],
].corr(method="pearson")
corr

In [None]:
# Purpose: draw the correlation matrix as an annotated heatmap (values shown to 2 decimals).
# Why it's needed: strong positive or negative correlations are easier to spot by colour
# than in the raw table.
sns.heatmap(data=corr, fmt=".2f", cmap="coolwarm", annot=True)
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Purpose: select TRANSFER / CASH_OUT rows where the origin account held a positive
# balance before the transaction and exactly zero afterwards.
# Why it's needed: a fully drained account is a suspicious pattern; a rule like this can
# be turned into a binary flag feature.
zero_after_transfer = df.loc[
    df["oldbalanceOrg"].gt(0)
    & df["newbalanceOrig"].eq(0)
    & df["type"].isin(["TRANSFER", "CASH_OUT"])
]

In [None]:
# Number of rows matching the drained-account pattern.
len(zero_after_transfer)

In [None]:
# Preview the first five drained-account rows.
zero_after_transfer.head()

In [None]:
# Re-display the target class counts before moving on to modeling.
df["isFraud"].value_counts()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Preview the current columns of `df` before choosing which ones go into the model.
df.head()

In [None]:
# Build the modeling frame: drop the two account-ID columns and isFlaggedFraud.
# `columns=` already selects the axis, so no separate `axis` argument is passed.
df_model = df.drop(columns=["nameOrig", "nameDest", "isFlaggedFraud"])

In [None]:
# Preview the first five rows of the modeling frame.
df_model.head()

In [None]:
# Column groups consumed by the preprocessing step: `categorical` is one-hot encoded,
# `numeric` is standardized.
categorical = [
    "type",
]
numeric = [
    "amount",
    "oldbalanceOrg",
    "newbalanceOrig",
    "oldbalanceDest",
    "newbalanceDest",
]


In [None]:
# Separate the feature matrix X (every column except the target) from the target y (isFraud).
X = df_model.drop(columns=["isFraud"])
y = df_model.loc[:, "isFraud"]

In [None]:
# Hold out 30% of the rows for testing; stratify on y so both splits keep the same
# fraud ratio, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Preprocessing: standardize the `numeric` columns and one-hot encode the `categorical`
# columns (dropping the first level of each).
# remainder="drop" discards every other column of X, which includes the engineered
# balanceDiffOrig / balanceDiffDest columns.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric),
        ("cat", OneHotEncoder(drop="first"), categorical),
    ],
    remainder="drop",
)

In [None]:
# Full model: preprocessing followed by a logistic-regression classifier.
# class_weight="balanced" reweights the classes to offset the fraud/non-fraud imbalance;
# the seed is fixed for reproducibility.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "classifier",
            LogisticRegression(
                max_iter=1000,
                random_state=42,
                class_weight="balanced",
                solver="liblinear",
            ),
        ),
    ]
)

In [None]:
# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test split.
y_pred = pipeline.predict(X_test)

In [None]:
# Print the per-class classification report for the test predictions.
# print() is used so the report's text layout is rendered with its line breaks.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of true vs. predicted labels on the test split.
confusion_matrix(y_test, y_pred)

In [None]:
# Pipeline score on the test split, expressed as a percentage.
# NOTE(review): presumably this is mean accuracy (the usual classifier default); with a
# heavily imbalanced target, read it alongside the classification report rather than alone.
100 * pipeline.score(X_test, y_test)

In [None]:
import joblib
# Persist the fitted pipeline (preprocessing + classifier) to disk for later reuse.
joblib.dump(value=pipeline, filename='fraud_detection_model.pkl')