# ML. Dataset: 'Fraudulent Transactions Data' from Kaggle

This Jupyter notebook performs preliminary training and testing of ML models for fraud detection on dataset 1, the Kaggle 'Fraudulent Transactions Data' set.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
# Load the raw transaction table. The path is relative to the kernel's working directory.
# NOTE: the CSV's saved row index is read as an ordinary column named "Unnamed: 0".
fraud_csv_path = "../datasets/Fraud.csv"
df = pd.read_csv(fraud_csv_path)
print(f"rows: {len(df)}")
df.head()

rows: 6362620


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [3]:
# Replace the transaction-type strings with integer category codes
# (categories are ordered by sorted label, e.g. CASH_OUT -> 1, PAYMENT -> 3, TRANSFER -> 4).
# NOTE(review): the codes impose an arbitrary numeric order on a nominal feature;
# that is harmless for tree models but may mislead linear ones — confirm this is intended.
type_codes = pd.Categorical(df["type"]).codes
df["type"] = type_codes

# Flag destination accounts whose ID starts with "M" (merchants, going by the column name).
dest_starts_with_m = df["nameDest"].str.startswith("M")
df["isMerchant"] = dest_starts_with_m.astype(int)

df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,isMerchant
0,1,3,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,1
1,1,3,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,1
2,1,4,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,0
3,1,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,0
4,1,3,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,1


In [4]:
# Target: the binary fraud label.
y = df["isFraud"]

# Columns that must not reach the models:
#   isFraud                       - the target itself
#   isFlaggedFraud, nameOrig, nameDest - excluded from the features (IDs / flag column)
non_feature_cols = ["isFraud", "isFlaggedFraud", "nameOrig", "nameDest"]

# "Unnamed: 0" is the row index that was saved into the CSV (0, 1, 2, ... in file order).
# It describes the file layout, not the transaction, and leaving it in lets models key
# on row position. Drop it when present; the check keeps this cell working if the
# loader is later changed to consume that column as the index.
if "Unnamed: 0" in df.columns:
    non_feature_cols.append("Unnamed: 0")

X = df.drop(columns=non_feature_cols)

# Stratified 80/20 split so both parts keep the same fraud rate; the seed makes it repeatable.
# NOTE: metrics saved before the index column was removed were computed with it included;
# re-run the model cells to refresh them.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=2025
)

In [22]:
# Baseline 1: a small (10-tree) random forest with a fixed seed.
rf = RandomForestClassifier(random_state=2025, n_estimators=10).fit(X_train, y_train)

# Score the held-out split. Precision and recall use the library's default
# binary averaging; the F1 score is support-weighted across both classes, so it is
# dominated by the majority class.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
f1_weighted = f1_score(y_true=y_test, y_pred=y_pred, average="weighted")

print(f"Accuracy: {accuracy:.5f}")
print(f"Recall: {recall:.5f}")
print(f"Precision: {precision:.5f}")
print(f"Weighted F1 Score: {f1_weighted:.5f}")

Accuracy: 0.99969
Recall: 0.77906
Precision: 0.97264
Weighted F1 Score: 0.99967


In [21]:
from sklearn.linear_model import LogisticRegression

# Baseline 2: logistic regression with library-default settings.
# NOTE(review): the features are passed unscaled and the default iteration cap is used;
# check this cell's stderr for a ConvergenceWarning — TODO confirm.
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out split. Precision and recall use the library's default
# binary averaging; the F1 score is support-weighted across both classes.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
f1_weighted = f1_score(y_true=y_test, y_pred=y_pred, average="weighted")

print(f"Accuracy: {accuracy:.5f}")
print(f"Recall: {recall:.5f}")
print(f"Precision: {precision:.5f}")
print(f"Weighted F1 Score: {f1_weighted:.5f}")

Accuracy: 0.99823
Recall: 0.44492
Precision: 0.35212
Weighted F1 Score: 0.99833


In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC  # SVC stays imported so any later cell relying on it still runs

# Baseline 3: linear SVM with balanced class weights.
#
# SVC(kernel="linear") is backed by libsvm, whose fit time grows at least quadratically
# with the number of samples, so it does not finish in practical time on the
# multi-million-row training split. LinearSVC (liblinear) fits the same family of
# linear max-margin classifiers and scales to large sample counts.
#
# Differences from SVC(kernel="linear"): LinearSVC uses squared hinge loss by default
# and regularizes the intercept, so the fitted boundary is close but not identical.
model = make_pipeline(
    StandardScaler(),  # the features span very different magnitudes; linear SVM solvers need comparable scales
    LinearSVC(class_weight="balanced", dual=False),  # primal solver: preferred when n_samples >> n_features
)
model.fit(X_train, y_train)

# Score the held-out split. Precision and recall use the library's default
# binary averaging; the F1 score is support-weighted across both classes.
y_pred = model.predict(X_test)
f1_weighted = f1_score(y_test, y_pred, average="weighted")
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.5f}")
print(f"Recall: {recall:.5f}")
print(f"Precision: {precision:.5f}")
print(f"Weighted F1 Score: {f1_weighted:.5f}")

In [5]:
from sklearn.naive_bayes import GaussianNB

# Baseline 4: Gaussian naive Bayes with library-default settings.
model = GaussianNB().fit(X_train, y_train)

# Score the held-out split. Precision and recall use the library's default
# binary averaging; the F1 score is support-weighted across both classes.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
f1_weighted = f1_score(y_true=y_test, y_pred=y_pred, average="weighted")

print(f"Accuracy: {accuracy:.5f}")
print(f"Recall: {recall:.5f}")
print(f"Precision: {precision:.5f}")
print(f"Weighted F1 Score: {f1_weighted:.5f}")

Accuracy: 0.99366
Recall: 0.18381
Precision: 0.04300
Weighted F1 Score: 0.99562


In [6]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline 5: gradient-boosted trees with library-default hyperparameters.
# The seed matches the one used for the train/test split and the random forest:
# the underlying trees permute features at every split, so without a fixed
# random_state tied splits can resolve differently from run to run.
model = GradientBoostingClassifier(random_state=2025)
model.fit(X_train, y_train)

# Score the held-out split. Precision and recall use the library's default
# binary averaging; the F1 score is support-weighted across both classes.
y_pred = model.predict(X_test)
f1_weighted = f1_score(y_test, y_pred, average="weighted")
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.5f}")
print(f"Recall: {recall:.5f}")
print(f"Precision: {precision:.5f}")
print(f"Weighted F1 Score: {f1_weighted:.5f}")

Accuracy: 0.99919
Recall: 0.41205
Precision: 0.90508
Weighted F1 Score: 0.99903
