# ML. Dataset: 'Credit Card Transactions Synthetic Data Generation' from Kaggle

This Jupyter notebook is for preliminary training and testing of ML models for fraud detection on dataset 3.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
df = pd.read_csv("../datasets/3/transactions_df.csv")
print(f"rows: {len(df)}")
df.head()

rows: 1785308


Unnamed: 0,transaction_id,post_ts,customer_id,bin,terminal_id,amt,entry_mode,fraud,fraud_scenario
0,OyWUo6ruReKft-P_QtjJbQ,2023-02-01 00:00:30,C00005143,424208,T001014,38.97,Contactless,0,0
1,rrgYMZWnRK6kKtWqlGN6HA,2023-02-01 00:00:54,C00002570,364329,T001023,84.07,Contactless,0,0
2,H4G-WgpGQluYWIT17jdN8Q,2023-02-01 00:01:13,C00005507,455573,T001024,113.18,Contactless,0,0
3,_4WehzsiRCK2WA8LTBkvsA,2023-02-01 00:01:57,C00003688,552755,T001064,28.96,Chip,0,0
4,81or3lX-Q9-2EEOfOgLCEQ,2023-02-01 00:01:57,C00003353,465808,T001097,65.45,Chip,0,0


In [3]:
df["entry_mode"] = df["entry_mode"].astype("category").cat.codes
df.head()

Unnamed: 0,transaction_id,post_ts,customer_id,bin,terminal_id,amt,entry_mode,fraud,fraud_scenario
0,OyWUo6ruReKft-P_QtjJbQ,2023-02-01 00:00:30,C00005143,424208,T001014,38.97,1,0,0
1,rrgYMZWnRK6kKtWqlGN6HA,2023-02-01 00:00:54,C00002570,364329,T001023,84.07,1,0,0
2,H4G-WgpGQluYWIT17jdN8Q,2023-02-01 00:01:13,C00005507,455573,T001024,113.18,1,0,0
3,_4WehzsiRCK2WA8LTBkvsA,2023-02-01 00:01:57,C00003688,552755,T001064,28.96,0,0,0
4,81or3lX-Q9-2EEOfOgLCEQ,2023-02-01 00:01:57,C00003353,465808,T001097,65.45,0,0,0


In [4]:
y = df["fraud"]
X = df.drop(
    columns=["fraud", "transaction_id", "post_ts", "customer_id", "terminal_id"]
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=2025
)

In [5]:
rf = RandomForestClassifier(n_estimators=10, random_state=2025)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
f1_weighted = f1_score(y_test, y_pred, average="weighted")
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.5f}")
print(f"Recall: {recall:.5f}")
print(f"Precision: {precision:.5f}")
print(f"Weighted F1 Score: {f1_weighted:.5f}")

Accuracy: 1.00000
Recall: 1.00000
Precision: 1.00000
Weighted F1 Score: 1.00000


In [6]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
f1_weighted = f1_score(y_test, y_pred, average="weighted")
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.5f}")
print(f"Recall: {recall:.5f}")
print(f"Precision: {precision:.5f}")
print(f"Weighted F1 Score: {f1_weighted:.5f}")

Accuracy: 0.96999
Recall: 0.00000
Precision: 0.00000
Weighted F1 Score: 0.95522


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.svm import SVC

model = SVC(class_weight="balanced", kernel="linear")
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
f1_weighted = f1_score(y_test, y_pred, average="weighted")
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.5f}")
print(f"Recall: {recall:.5f}")
print(f"Precision: {precision:.5f}")
print(f"Weighted F1 Score: {f1_weighted:.5f}")

In [5]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
f1_weighted = f1_score(y_test, y_pred, average="weighted")
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.5f}")
print(f"Recall: {recall:.5f}")
print(f"Precision: {precision:.5f}")
print(f"Weighted F1 Score: {f1_weighted:.5f}")

Accuracy: 0.96985
Recall: 0.10090
Precision: 0.48848
Weighted F1 Score: 0.96012


In [6]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
f1_weighted = f1_score(y_test, y_pred, average="weighted")
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.5f}")
print(f"Recall: {recall:.5f}")
print(f"Precision: {precision:.5f}")
print(f"Weighted F1 Score: {f1_weighted:.5f}")

Accuracy: 1.00000
Recall: 1.00000
Precision: 1.00000
Weighted F1 Score: 1.00000
