# ML. Dataset: 'Fraudulent Transactions Data' from Kaggle

This Jupyter notebook is for preliminary training and testing of ML models for fraud detection on dataset 1.

In [18]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

In [1]:
df = pd.read_csv("../datasets/Fraud.csv")
print(f"rows: {len(df)}")
df.head()

rows: 6362620


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [9]:
def preprocess_row_1(row):
    """
    Data preprocessing for a single row in dataset 1 ('Fraudulent Transactions Data' from Kaggle)

    Parameters:
      raw (dict): A single unprocessed row in the dataset

    Returns:
      dict: Preprocessed row with keys: 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
        'isMerchant', 'isFlaggedFraud', 'isFraud' transformed to float/int values only.
    """
    transaction_types = {
        "CASH-IN": 1,
        "CASH-OUT": 2,
        "DEBIT": 3,
        "PAYMENT": 4,
        "TRANSFER": 5,
    }
    return pd.Series({
        "type": transaction_types.get(row["type"], 0),
        "amount": row["amount"],
        "oldbalanceOrg": row["oldbalanceOrg"],
        "newbalanceOrig": row["newbalanceOrig"],
        "isMerchant": row["nameDest"].startswith("M"),
        "isFlaggedFraud": row["isFlaggedFraud"],
        "isFraud": row["isFraud"],
    })

In [10]:
preprocessed = df.apply(preprocess_row_1, axis=1)
preprocessed.head()

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,isMerchant,isFlaggedFraud,isFraud
0,4,9839.64,170136.0,160296.36,True,0,0
1,4,1864.28,21249.0,19384.72,True,0,0
2,5,181.0,181.0,0.0,False,0,1
3,0,181.0,181.0,0.0,False,0,1
4,4,11668.14,41554.0,29885.86,True,0,0


In [36]:
y = preprocessed['isFraud']
X = preprocessed.drop(columns=['isFraud'])
X_train, X_test, y_train, y_test = train_test_split(X, preprocessed['isFraud'], test_size=0.9999, stratify=y, random_state=2025)

In [37]:
len(X_train)

636

In [24]:
rf = RandomForestClassifier(n_estimators=10, random_state=2025)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
f1_weighted = f1_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(f"Weighted F1 Score: {f1_weighted:.2f}")

Accuracy: 1.00
Weighted F1 Score: 1.00


In [26]:
df.to_dict(orient='records')

[{'step': 1,
  'type': 'PAYMENT',
  'amount': 9839.64,
  'nameOrig': 'C1231006815',
  'oldbalanceOrg': 170136.0,
  'newbalanceOrig': 160296.36,
  'nameDest': 'M1979787155',
  'oldbalanceDest': 0.0,
  'newbalanceDest': 0.0,
  'isFraud': 0,
  'isFlaggedFraud': 0},
 {'step': 1,
  'type': 'PAYMENT',
  'amount': 1864.28,
  'nameOrig': 'C1666544295',
  'oldbalanceOrg': 21249.0,
  'newbalanceOrig': 19384.72,
  'nameDest': 'M2044282225',
  'oldbalanceDest': 0.0,
  'newbalanceDest': 0.0,
  'isFraud': 0,
  'isFlaggedFraud': 0},
 {'step': 1,
  'type': 'TRANSFER',
  'amount': 181.0,
  'nameOrig': 'C1305486145',
  'oldbalanceOrg': 181.0,
  'newbalanceOrig': 0.0,
  'nameDest': 'C553264065',
  'oldbalanceDest': 0.0,
  'newbalanceDest': 0.0,
  'isFraud': 1,
  'isFlaggedFraud': 0},
 {'step': 1,
  'type': 'CASH_OUT',
  'amount': 181.0,
  'nameOrig': 'C840083671',
  'oldbalanceOrg': 181.0,
  'newbalanceOrig': 0.0,
  'nameDest': 'C38997010',
  'oldbalanceDest': 21182.0,
  'newbalanceDest': 0.0,
  'isFraud