In [None]:
%pip install kaggle
%pip install pandarallel

In [10]:
import numpy as np
import pandas as pd

SEED = 11

In [11]:
def generate_customers(nc):
    rng = np.random.default_rng(SEED)

    for i in range(nc):
        df = pd.DataFrame({            
            'CUSTOMER_ID': np.arange(nc),
            'X_CUSTOMER': rng.uniform(0, 100, nc),
            'Y_CUSTOMER': rng.uniform(0, 100, nc),
            'MEAN_PURCHASE_AMOUNT': rng.uniform(5, 500, nc),
            'STD_PURCHASE_AMOUNT': rng.uniform(5, 50, nc),
            'MEAN_PURCHASE_FREQUENCY': rng.integers(0, 10, nc)         
        }) 
    return df

In [12]:
n_customers = 5
cust_df = generate_customers(n_customers)
cust_df

Unnamed: 0,CUSTOMER_ID,X_CUSTOMER,Y_CUSTOMER,MEAN_PURCHASE_AMOUNT,STD_PURCHASE_AMOUNT,MEAN_PURCHASE_FREQUENCY
0,0,24.509405,69.227911,76.092614,6.593369,9
1,1,84.522498,78.105482,224.349816,21.173543,2
2,2,74.18071,92.75026,394.21097,12.337557,8
3,3,54.579398,14.973517,447.875408,49.946112,3
4,4,66.147551,62.613016,380.817936,11.480691,7


In [13]:
def generate_terminals(nt):
    rng = np.random.default_rng(SEED)
    
    df = pd.DataFrame({
        'TERMINAL_ID': np.arange(nt),
        'X_TERMINAL': rng.uniform(0, 100, nt),
        'Y_TERMINAL': rng.uniform(0, 100, nt),
        'TERMINAL_CATEGORY': rng.choice(['GROCERY', 'ELECTRONICS', 'CLOTHING', 'FUEL', 'OTHER'], nt)
    })

    online_terminal = pd.Series({
        'TERMINAL_ID': -1,
        'X_TERMINAL': np.nan,
        'Y_TERMINAL': np.nan,
        'TERMINAL_CATEGORY': 'ONLINE'})

    df.loc[len(df)] = online_terminal

    return df

In [14]:
n_terminals = 5
term_df = generate_terminals(n_terminals)
term_df

Unnamed: 0,TERMINAL_ID,X_TERMINAL,Y_TERMINAL,TERMINAL_CATEGORY
0,0,12.85702,92.821102,OTHER
1,1,49.927786,7.042058,ELECTRONICS
2,2,60.149836,12.977395,GROCERY
3,3,2.868901,94.832845,CLOTHING
4,4,14.792608,62.188359,CLOTHING
5,-1,,,ONLINE


In [15]:
def get_x_y_terminals(term_df):
    return term_df[['X_TERMINAL', 'Y_TERMINAL']].to_numpy()

def terms_within_radius(customer, term_df, r=50):
    # coordinates array (NaNs allowed for online terminal)
    x_y_terminals = get_x_y_terminals(term_df)

    # customer's coordinates
    cx, cy = customer['X_CUSTOMER'], customer['Y_CUSTOMER']

    # compute euclidean distances and get indices within radius
    distances = np.linalg.norm(x_y_terminals - np.array([cx, cy]), axis=1)
    within_r = np.where(distances <= r)[0]

    return np.append(within_r,-1)

In [16]:
print(terms_within_radius(cust_df.iloc[0], term_df, r=50))
print(terms_within_radius(cust_df.iloc[1], term_df, r=50))
print(terms_within_radius(cust_df.iloc[2], term_df, r=50))

[ 0  3  4 -1]
[-1]
[-1]


In [17]:
def generate_customers_txn(cust, term_df, starting_date, n_days, fraud_prob=0.005):
    rng = np.random.default_rng(SEED)

    start_dt = pd.to_datetime(starting_date)

    # Simulate days between transactions using a Poisson model.
    # We treat customer's 'MEAN_PURCHASE_FREQUENCY' as the mean days between purchases.
    lam = max(0.1, float(cust.get('MEAN_PURCHASE_FREQUENCY', 1)))
    interarrival = rng.poisson(lam=lam, size=n_days)

    # make first transaction occur on the starting date (days elapsed = 0),
    if n_days > 0:
        days_elapsed = np.concatenate(([0], np.cumsum(interarrival)[:-1]))
    else:
        days_elapsed = np.array([], dtype=int)

    # Build transaction datetimes by adding days_elapsed and a random time-of-day
    seconds_in_day = rng.integers(0, 24 * 3600, size=n_days) if n_days > 0 else np.array([])
    txn_datetimes = start_dt + pd.to_timedelta(days_elapsed, unit='D') + pd.to_timedelta(seconds_in_day, unit='s')

    # Transaction amounts (clip to non-negative)
    txn_amt = rng.normal(cust['MEAN_PURCHASE_AMOUNT'], cust['STD_PURCHASE_AMOUNT'], n_days).clip(min=0)

    # Choose terminals within radius (include online terminal -1).
    terminal_choices = terms_within_radius(cust, term_df, r=50)
    terminals = rng.choice(terminal_choices, size=n_days)

    # Simple fraud flag (Bernoulli) - can be replaced with a rule-based generator later
    is_fraud = (rng.random(n_days) < fraud_prob).astype(int)

    df = pd.DataFrame({
        'CUSTOMER_ID': np.repeat(cust['CUSTOMER_ID'], n_days),
        'TXN_DATETIME': pd.to_datetime(txn_datetimes),
        'TERMINAL_ID': terminals,
        'TXN_AMT': txn_amt,
        'TXN_DAYS_ELAPSED': days_elapsed,
        'IS_FRAUD': is_fraud,
    })

    return df


In [18]:
def generate_customer_txns(cust, term_df, starting_date, n_days, r=50):
    rng = np.random.default_rng(SEED)
    start_dt = pd.to_datetime(starting_date)

    base_dates = pd.date_range(start=start_dt, periods=n_days, freq='D')
    seconds = rng.integers(0, 24*3600, n_days)  # random time within each day
    txn_datetimes = base_dates + pd.to_timedelta(seconds, unit='s')

    txn_dt = pd.Series(txn_datetimes)
    days_elapsed = (txn_dt - start_dt).dt.days.values
    txn_amt = rng.normal(cust['MEAN_PURCHASE_AMOUNT'], cust['STD_PURCHASE_AMOUNT'], n_days).clip(min=0)
    terminals = rng.choice(terms_within_radius(cust, term_df, r), n_days)

    df = pd.DataFrame({
        'CUSTOMER_ID': np.repeat(cust['CUSTOMER_ID'], n_days),   
        'TERMINAL_ID': terminals,
        'TRANSACTION_AMOUNT': np.round(txn_amt, decimals=2),
        'TXN_DAYS_ELAPSED': days_elapsed,
        'TXN_DATETIME': txn_dt,
        'IS_FRAUD': 0   # for now, all non-fraudulent
    })
    return df

In [19]:
all_txns = pd.concat(
    cust_df.apply(lambda cust: generate_customer_txns(cust, term_df, '2018-04-01', 55,50), axis=1).tolist(),
    ignore_index=True
)
all_txns

Unnamed: 0,CUSTOMER_ID,TERMINAL_ID,TRANSACTION_AMOUNT,TXN_DAYS_ELAPSED,TXN_DATETIME,IS_FRAUD
0,0.0,-1,66.25,0,2018-04-01 03:12:40,0
1,0.0,-1,76.33,1,2018-04-02 03:05:08,0
2,0.0,3,82.01,2,2018-04-03 19:07:47,0
3,0.0,-1,74.56,3,2018-04-04 11:58:57,0
4,0.0,3,71.19,4,2018-04-05 14:09:38,0
...,...,...,...,...,...,...
270,4.0,-1,386.16,50,2018-05-21 22:16:47,0
271,4.0,2,379.74,51,2018-05-22 14:11:53,0
272,4.0,2,369.26,52,2018-05-23 17:17:55,0
273,4.0,-1,395.25,53,2018-05-24 05:38:50,0


In [20]:
def generate_all_txns(cust_df, term_df, starting_date, n_days, r):
    rng = np.random.default_rng(SEED)

    df= pd.concat(
        cust_df.apply(
            lambda cust: generate_customer_txns(cust, term_df, starting_date, n_days, r),
            axis=1
        ).to_list()
    )

    return df

In [21]:
def generate_all_dfs(nc, nt, starting_date, n_days, r):
    cust_df = generate_customers(nc)
    term_df = generate_terminals(nt)
    txn_df = generate_all_txns(cust_df, term_df, starting_date, n_days, r)
    return (cust_df, term_df, txn_df)

In [22]:
(cust_df, term_df, txn_df)=\
    generate_all_dfs(nc = 5000, 
                     nt = 10000, 
                     starting_date="2011-01-01",
                     n_days=183,
                     r=5)

In [None]:
cust_df.to_csv("../../data/intermediary/customer_df.csv")
term_df.to_csv("../../data/intermediary/terminal_df.csv")
txn_df.to_csv("../../data/no_fraud_txn_df.csv")