3 fraud scenarios to simulate different types of fraudulent activities in transaction data are as follows:
1. go-big-or-go-home fraud:
   - desc: fraudster uses stolen credit card to make a few large purchases.
   - implementation: every day, select 1 customer at random and alter 1-3 of their transactions to be high 
        amount transactions at random terminals.
2. under-wraps-rapid-fire fraud:
    - desc: fraudster makes small purchases to avoid detection.
    - implementation: every day, select 3-5 customer at random and inject 10 daily transactions for the next 
        3 days per customer. each transaction amount is low (e.g., between $1 and $5). 
3. night-owl fraud:
    - desc: fraudster makes transactions during odd hours.
    - implementation: every 4 days, select 2 customers at random and inject 4 transactions between 12 AM and 
        4 AM for each customer for the following 2 days.

In [41]:
import numpy as np
import pandas as pd

In [None]:
SEED = 11

cust_df = pd.read_csv("../../data/intermediary/customer_df.csv")
term_df = pd.read_csv("../../data/intermediary/terminal_df.csv")
txn_df = pd.read_csv("../../data/no_fraud_txn_df.csv")

In [43]:
def go_big_or_go_home(txn_df, customer_df, terminal_df, starting_date, seed=SEED):
    df = txn_df.copy()
    day_col = 'TXN_DAYS_ELAPSED'
    amt_col = 'TRANSACTION_AMOUNT'
    cust_col = 'CUSTOMER_ID'
    term_col = 'TERMINAL_ID'
    dt_col = 'TXN_DATETIME'

    start_dt = pd.to_datetime(starting_date)
    min_day = int(df[day_col].min())
    max_day = int(df[day_col].max())

    all_customers = customer_df[cust_col].unique()
    all_terminals = terminal_df[term_col].unique()

    for current_day in range(min_day, max_day + 1):
        rng = np.random.default_rng(seed + current_day)
        customer_id = int(rng.choice(all_customers))

        rng2 = np.random.default_rng(seed + 1000 + current_day)
        cust_row = customer_df.loc[customer_df[cust_col] == customer_id].iloc[0]
        mean_amt = cust_row['MEAN_PURCHASE_AMOUNT']

        high_amt = float(rng2.uniform(5 * mean_amt, 10 * mean_amt))
        term_id = int(rng2.choice(all_terminals))
        seconds = int(rng2.integers(0, 24 * 3600))
        txn_dt = start_dt + pd.to_timedelta(current_day, unit='D') + pd.to_timedelta(seconds, unit='s')

        new_row = {
            cust_col: customer_id,
            term_col: term_id,
            amt_col: high_amt,
            day_col: current_day,
            dt_col: txn_dt,
            'IS_FRAUD': 1
        }
        df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

    return df


In [44]:
def under_wraps_rapid_fire(txn_df, customer_df, terminal_df, starting_date, seed=SEED):
    df = txn_df.copy()

    day_col = 'TXN_DAYS_ELAPSED'
    amt_col = 'TRANSACTION_AMOUNT'
    cust_col = 'CUSTOMER_ID'
    term_col = 'TERMINAL_ID'
    dt_col = 'TXN_DATETIME'

    start_dt = pd.to_datetime(starting_date)

    min_day = int(df[day_col].min())
    max_day = int(df[day_col].max())

    all_customers = customer_df[cust_col].to_numpy()
    all_terminals = terminal_df[term_col].to_numpy()

    rng = np.random.default_rng(seed)

    new_rows = []  # collect dictionaries here

    for current_day in range(min_day, max_day + 1):
        if len(all_customers) == 0:
            break

        # number of customers to target today: 3â€“5
        n_customers = int(rng.integers(3, 6))
        n_customers = min(n_customers, len(all_customers))

        chosen_customers = rng.choice(all_customers, size=n_customers, replace=False)

        for customer_id in chosen_customers:
            # current day and next 2 days
            for offset in range(3):
                d = current_day + offset
                if d > max_day:
                    continue

                # 10 injected transactions for this day and customer
                seconds = rng.integers(0, 24 * 3600, size=10)  # rand time within the day
                tx_datetimes = start_dt + pd.to_timedelta(d, unit='D') + pd.to_timedelta(seconds, unit='s')
                amts = rng.uniform(1, 5, size=10)
                terms = rng.choice(all_terminals, size=10, replace=True)

                for i in range(10):
                    new_rows.append({
                        cust_col: int(customer_id),
                        term_col: int(terms[i]),
                        amt_col: float(amts[i]),
                        day_col: int(d),
                        dt_col: tx_datetimes[i],
                        'IS_FRAUD': 1
                    })

    if new_rows:
        fraud_df = pd.DataFrame(new_rows)
        df = pd.concat([df, fraud_df], ignore_index=True)

    return df

In [45]:
def night_owl_fraud(txn_df, customer_df, terminal_df, starting_date, seed=SEED):
    df = txn_df.copy()
    day_col = 'TXN_DAYS_ELAPSED'
    amt_col = 'TRANSACTION_AMOUNT'
    cust_col = 'CUSTOMER_ID'
    term_col = 'TERMINAL_ID'
    dt_col = 'TXN_DATETIME'

    start_dt = pd.to_datetime(starting_date)
    min_day = int(df[day_col].min())
    max_day = int(df[day_col].max())

    all_customers = customer_df[cust_col].unique()
    all_terminals = terminal_df[term_col].unique()

    if 'IS_FRAUD' not in df.columns:
        df['IS_FRAUD'] = 0

    # every 4 days
    for current_day in range(min_day, max_day + 1, 4):
        rng = np.random.default_rng(seed + 20_000 + current_day)

        n_customers = min(2, len(all_customers))
        chosen_customers = rng.choice(all_customers, size=n_customers, replace=False)

        for customer_id in chosen_customers:
            for offset in range(0, 2):  # current day & following
                d = current_day + offset
                if d > max_day:
                    continue

                for _ in range(4):  # 4 night txn / day
                    seconds = int(rng.integers(0, 4 * 3600))  # 00:00 to 04:00
                    txn_dt = start_dt + pd.to_timedelta(d, unit='D') + pd.to_timedelta(seconds, unit='s')

                    cust_row = customer_df.loc[customer_df[cust_col] == customer_id].iloc[0]
                    mean_amt = cust_row['MEAN_PURCHASE_AMOUNT']
                    std_amt = cust_row['STD_PURCHASE_AMOUNT']

                    amt = float(np.clip(np.random.normal(mean_amt, std_amt), 1, None))
                    term_id = int(rng.choice(all_terminals))

                    new_row = {
                        cust_col: int(customer_id),
                        term_col: term_id,
                        amt_col: amt,
                        day_col: d,
                        dt_col: txn_dt,
                        'IS_FRAUD': 1
                    }
                    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

    return df


In [46]:
df = go_big_or_go_home(txn_df, cust_df, term_df, "2011-01-01", 11)
n_go_big_or_go_home = df['IS_FRAUD'].sum()
print("num of fraudulent transactions after go-big-or-go-home:", n_go_big_or_go_home)



num of fraudulent transactions after go-big-or-go-home: 183


In [47]:
def apply_all_frauds(txn_df, cust_df, term_df, starting_date, seed=SEED):
    df = go_big_or_go_home(txn_df, cust_df, term_df, starting_date, seed)
    n_go_big_or_go_home = df['IS_FRAUD'].sum()
    print("num of fraudulent transactions after go-big-or-go-home:", n_go_big_or_go_home)
    df = under_wraps_rapid_fire(df, cust_df, term_df, starting_date, seed)
    n_under_wraps_rapid_fire = df['IS_FRAUD'].sum()
    print("num of fraudulent transactions after under-wraps-rapid-fire:", n_under_wraps_rapid_fire)
    df = night_owl_fraud(df, cust_df, term_df, starting_date, seed)
    n_night_owl_fraud = df['IS_FRAUD'].sum() - n_go_big_or_go_home - n_under_wraps_rapid_fire
    print("num of fraudulent transactions after night-owl-fraud:", n_night_owl_fraud)
    return df

txn_df_fraud = apply_all_frauds(txn_df, cust_df, term_df, "2011-01-01")
txn_df_fraud.head()

num of fraudulent transactions after go-big-or-go-home: 183
num of fraudulent transactions after under-wraps-rapid-fire: 22423
num of fraudulent transactions after night-owl-fraud: 553


Unnamed: 0.1,Unnamed: 0,CUSTOMER_ID,TERMINAL_ID,TRANSACTION_AMOUNT,TXN_DAYS_ELAPSED,TXN_DATETIME,IS_FRAUD
0,0.0,0.0,8542,304.39,0,2011-01-01 03:12:40,0
1,1.0,0.0,4452,364.53,1,2011-01-02 03:05:08,0
2,2.0,0.0,1410,428.43,2,2011-01-03 19:07:47,0
3,3.0,0.0,518,472.21,3,2011-01-04 11:58:57,0
4,4.0,0.0,4768,414.32,4,2011-01-05 14:09:38,0


In [48]:
print(txn_df_fraud.head())
print(txn_df_fraud['IS_FRAUD'].mean())
print(txn_df_fraud['IS_FRAUD'].sum())

   Unnamed: 0  CUSTOMER_ID  TERMINAL_ID  TRANSACTION_AMOUNT  TXN_DAYS_ELAPSED  \
0         0.0          0.0         8542              304.39                 0   
1         1.0          0.0         4452              364.53                 1   
2         2.0          0.0         1410              428.43                 2   
3         3.0          0.0          518              472.21                 3   
4         4.0          0.0         4768              414.32                 4   

          TXN_DATETIME  IS_FRAUD  
0  2011-01-01 03:12:40         0  
1  2011-01-02 03:05:08         0  
2  2011-01-03 19:07:47         0  
3  2011-01-04 11:58:57         0  
4  2011-01-05 14:09:38         0  
0.024685581015584778
23159


In [49]:
txn_df_fraud.to_csv("txn_df_with_fraud.csv", index=False)