<a href="https://colab.research.google.com/github/rpujala/machine_learning/blob/main/Fraud_Detection_with_Transaction_%2B_User_profile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fraud Detection with Transaction + User profile

* The compliance and enforcement team monitors procurement transactions to detect potential fraud such as inflated invoices, collusion or abnormal purchasing behavior.

* Fraud patterns cannot be identified using transaction data alone. Some suppliers consistently show risky behavior due to poor compliance history, while others appear normal until their profile is combined with suspicious transaction patterns.

* To improve detection accuracy, the team wants a machine learning system that jointly analyzes transaction behavior and supplier profile signals to predict the probability of fraud, enabling risk-based investigation and enforcement actions.

* Since fraud is rare and costly to miss, ROC-AUC is the primary evaluation metric


**Transaction Features:**
* transaction_amount
* transactions_7d
* transactions_30d
* avg_transaction_amount_30d
* amount_deviation_ratio

**Supplier Profile Features:**
* supplier_country
* supplier_tenure_years
* past_fraud_flag
* audit_fail_rate
* supplier_risk_score

In [None]:
import pandas as pd
import numpy as np

# Fix NumPy's global seed so the synthetic dataset is identical on every run.
np.random.seed(42)
n_samples = 7000

# Synthetic procurement data. Every column is drawn independently from the
# global NumPy RNG, in the order written here -- reordering the entries would
# change the generated values.
data = dict(
    # Transaction behaviour
    transaction_amount=np.random.uniform(low=100, high=50000, size=n_samples),
    transactions_7d=np.random.randint(low=1, high=20, size=n_samples),    # upper bound exclusive
    transactions_30d=np.random.randint(low=5, high=100, size=n_samples),  # upper bound exclusive
    avg_transaction_amount_30d=np.random.uniform(low=200, high=40000, size=n_samples),
    # Supplier profile
    supplier_country=np.random.choice(a=['US', 'IN', 'CN', 'DE'], size=n_samples),
    supplier_tenure_years=np.random.uniform(low=0.5, high=15, size=n_samples),
    past_fraud_flag=np.random.choice(a=[0, 1], size=n_samples, p=[0.9, 0.1]),  # 1 with probability 0.1
    audit_fail_rate=np.random.uniform(low=0, high=0.6, size=n_samples),
    supplier_risk_score=np.random.uniform(low=0, high=1, size=n_samples),
)

df = pd.DataFrame(data)
df.head()

Unnamed: 0,transaction_amount,transactions_7d,transactions_30d,avg_transaction_amount_30d,supplier_country,supplier_tenure_years,past_fraud_flag,audit_fail_rate,supplier_risk_score
0,18789.55193,11,19,3123.089912,DE,11.147439,0,0.081954,0.410778
1,47540.64389,17,55,16142.073,US,13.890538,1,0.193094,0.774784
2,36626.497696,5,57,1914.622606,CN,8.053387,0,0.144859,0.422042
3,29973.058361,7,38,8129.212418,CN,10.322715,0,0.221856,0.067515
4,7885.330158,19,81,36317.346868,IN,5.858476,0,0.085846,0.394263


In [None]:
# Size of the current transaction relative to the supplier's 30-day average
# amount (a value of 2 means twice the average).
df['amount_deviation_ratio'] = df['transaction_amount'].div(
    df['avg_transaction_amount_30d']
)

In [None]:
# Binary risk indicators that drive the simulated label. Only these three
# thresholds plus `past_fraud_flag` enter the score; the other columns
# (e.g. supplier_risk_score, supplier_country, supplier_tenure_years) have no
# influence on the generated label.
unusually_large = df['amount_deviation_ratio'].gt(2).astype(int)
frequent_audit_failures = df['audit_fail_rate'].gt(0.3).astype(int)
busy_week = df['transactions_7d'].gt(10).astype(int)

# Weighted sum in [0, 1] (the weights add up to 1.0).
risk_score = (
    0.4 * unusually_large
    + 0.3 * df['past_fraud_flag']
    + 0.2 * frequent_audit_failures
    + 0.1 * busy_week
)

# Logistic squashing centred on a score of 0.5, then one Bernoulli draw per
# row: a row is labelled fraud when its uniform draw falls below `prob`.
prob = 1 / (1 + np.exp(-4 * (risk_score - 0.5)))
uniform_draws = np.random.rand(n_samples)
df['fraud_label'] = (uniform_draws < prob).astype(int)

In [None]:
# Preview the frame again: it now also carries the derived
# `amount_deviation_ratio` column and the simulated `fraud_label`.
df.head()

Unnamed: 0,transaction_amount,transactions_7d,transactions_30d,avg_transaction_amount_30d,supplier_country,supplier_tenure_years,past_fraud_flag,audit_fail_rate,supplier_risk_score,amount_deviation_ratio,fraud_label
0,18789.55193,11,19,3123.089912,DE,11.147439,0,0.081954,0.410778,6.016334,1
1,47540.64389,17,55,16142.073,US,13.890538,1,0.193094,0.774784,2.945139,1
2,36626.497696,5,57,1914.622606,CN,8.053387,0,0.144859,0.422042,19.129878,0
3,29973.058361,7,38,8129.212418,CN,10.322715,0,0.221856,0.067515,3.68708,1
4,7885.330158,19,81,36317.346868,IN,5.858476,0,0.085846,0.394263,0.217123,0


# Check blank values

In [None]:
import numpy as np

# Treat empty strings as missing values, then count missing entries per column.
# The result is assigned back instead of using `inplace=True`, so the step is
# an explicit rebinding of `df` rather than a hidden in-place mutation.
df = df.replace('', np.nan)
df.isna().sum()

transaction_amount            0
transactions_7d               0
transactions_30d              0
avg_transaction_amount_30d    0
supplier_country              0
supplier_tenure_years         0
past_fraud_flag               0
audit_fail_rate               0
supplier_risk_score           0
amount_deviation_ratio        0
fraud_label                   0
dtype: int64

# Feature / Target Split

In [None]:
# Preview the frame once more before separating features from the target.
df.head()

Unnamed: 0,transaction_amount,transactions_7d,transactions_30d,avg_transaction_amount_30d,supplier_country,supplier_tenure_years,past_fraud_flag,audit_fail_rate,supplier_risk_score,amount_deviation_ratio,fraud_label
0,18789.55193,11,19,3123.089912,DE,11.147439,0,0.081954,0.410778,6.016334,1
1,47540.64389,17,55,16142.073,US,13.890538,1,0.193094,0.774784,2.945139,1
2,36626.497696,5,57,1914.622606,CN,8.053387,0,0.144859,0.422042,19.129878,0
3,29973.058361,7,38,8129.212418,CN,10.322715,0,0.221856,0.067515,3.68708,1
4,7885.330158,19,81,36317.346868,IN,5.858476,0,0.085846,0.394263,0.217123,0


In [None]:
# The target is the simulated fraud indicator; every remaining column is kept
# as a candidate model input.
y = df['fraud_label']
X = df.drop(columns='fraud_label')

In [None]:
# First two feature rows -- the target column should no longer be present.
X[:2]

Unnamed: 0,transaction_amount,transactions_7d,transactions_30d,avg_transaction_amount_30d,supplier_country,supplier_tenure_years,past_fraud_flag,audit_fail_rate,supplier_risk_score,amount_deviation_ratio
0,18789.55193,11,19,3123.089912,DE,11.147439,0,0.081954,0.410778,6.016334
1,47540.64389,17,55,16142.073,US,13.890538,1,0.193094,0.774784,2.945139


In [None]:
# First two target values, for comparison with the feature rows shown above.
y[:2]

0    1
1    1
Name: fraud_label, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Re-seed NumPy's global RNG; the split itself is pinned by `random_state`.
np.random.seed(42)

# Hold out 20% of the rows for testing. `stratify=y` keeps the proportion of
# fraud labels the same in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Sanity check: training features and labels must have the same number of rows.
X_train.shape, y_train.shape

((5600, 10), (5600,))

In [None]:
# Sanity check: test features and labels must have the same number of rows.
X_test.shape, y_test.shape

((1400, 10), (1400,))

# Preprocessing Pipelines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
# Reference view of the column names used to build the feature lists below.
df.head()

Unnamed: 0,transaction_amount,transactions_7d,transactions_30d,avg_transaction_amount_30d,supplier_country,supplier_tenure_years,past_fraud_flag,audit_fail_rate,supplier_risk_score,amount_deviation_ratio,fraud_label
0,18789.55193,11,19,3123.089912,DE,11.147439,0,0.081954,0.410778,6.016334,1
1,47540.64389,17,55,16142.073,US,13.890538,1,0.193094,0.774784,2.945139,1
2,36626.497696,5,57,1914.622606,CN,8.053387,0,0.144859,0.422042,19.129878,0
3,29973.058361,7,38,8129.212418,CN,10.322715,0,0.221856,0.067515,3.68708,1
4,7885.330158,19,81,36317.346868,IN,5.858476,0,0.085846,0.394263,0.217123,0


In [None]:
# Columns routed to the transaction branch of the model.
txn_features = [
    'transaction_amount',
    'transactions_7d',
    'transactions_30d',
    'avg_transaction_amount_30d',
    'amount_deviation_ratio'
]

# Numeric preprocessing for the transaction columns: fill missing values with
# the column median, then standardise each column.
# This inner pipeline has its own name so that `txn_pipeline` is bound exactly
# once below and never refers to two different kinds of object.
txn_num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# ColumnTransformer that applies the numeric pipeline to `txn_features` only;
# any other column passed in is dropped (default `remainder='drop'`).
# The transformer label 'txn_pipeline' becomes the prefix of the output
# feature names, so it is kept unchanged.
txn_pipeline = ColumnTransformer([
    ('txn_pipeline', txn_num_pipeline, txn_features)
])

txn_pipeline

0,1,2
,transformers,"[('txn_pipeline', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [None]:
# Continuous supplier attributes (imputed with the median and standardised).
supplier_num = [
    'supplier_tenure_years',
    'audit_fail_rate',
    'supplier_risk_score'
]

# Supplier attributes treated as categories (one-hot encoded). Note that the
# binary `past_fraud_flag` is encoded too, so it yields two indicator columns.
supplier_cat = [
    'supplier_country',
    'past_fraud_flag'
]

# Every column routed to the supplier branch. Derived from the two lists above
# (numeric first, then categorical) so the three lists cannot drift apart.
supplier_features = supplier_num + supplier_cat


# Numeric columns: median imputation followed by standardisation.
supplier_num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. `handle_unknown='ignore'` encodes categories unseen during fitting
# as an all-zero row instead of raising.
supplier_cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own pipeline; columns not listed are
# dropped (default `remainder='drop'`).
# NOTE(review): the encoder emits sparse output by default; later cells rely
# on the combined result being a dense NumPy array -- confirm this still holds
# if the column mix changes.
supplier_pipeline = ColumnTransformer([
    ('supplier_num_pipeline', supplier_num_pipeline, supplier_num),
    ('supplier_cat_pipeline', supplier_cat_pipeline, supplier_cat)
])

supplier_pipeline

0,1,2
,transformers,"[('supplier_num_pipeline', ...), ('supplier_cat_pipeline', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


# Transform Inputs

In [None]:
# Learn the imputation and scaling statistics from the training rows only,
# then apply the already-fitted transformer to the test rows.
X_txn_train = txn_pipeline.fit_transform(X_train.loc[:, txn_features])
X_txn_test = txn_pipeline.transform(X_test.loc[:, txn_features])

In [None]:
# Same procedure for the supplier branch: fit on training rows, reuse the
# fitted transformer for the test rows.
X_sup_train = supplier_pipeline.fit_transform(X_train.loc[:, supplier_features])
X_sup_test = supplier_pipeline.transform(X_test.loc[:, supplier_features])

In [None]:
# Check the container type of the transformed transaction matrices (later
# cells slice and shuffle their columns in place).
type(X_txn_train), type(X_txn_test)

(numpy.ndarray, numpy.ndarray)

In [None]:
# Same check for the supplier matrices, whose pipeline includes a one-hot
# encoder.
type(X_sup_train), type(X_sup_test)

(numpy.ndarray, numpy.ndarray)

# Functional API Model

In [None]:
import os

# Must be set BEFORE TensorFlow is imported: the native logging level is taken
# from this environment variable, and messages are already emitted while the
# library loads. '2' hides INFO and WARNING messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf

In [None]:
# Seed TensorFlow's global RNG so weight initialisation and dropout are
# repeatable across "Restart & Run All" (NumPy is seeded earlier in the
# notebook; TensorFlow was not). Some GPU kernels may still be
# non-deterministic.
tf.random.set_seed(42)

# Transaction branch: one hidden layer with dropout over the preprocessed
# transaction features.
txn_input = tf.keras.layers.Input(shape=(X_txn_train.shape[1], ))
txn_x = tf.keras.layers.Dense(32, activation='relu')(txn_input)
txn_x = tf.keras.layers.Dropout(0.3)(txn_x)

# Supplier branch: same layer layout over the preprocessed supplier profile.
sup_input = tf.keras.layers.Input(shape=(X_sup_train.shape[1], ))
sup_x = tf.keras.layers.Dense(32, activation='relu')(sup_input)
sup_x = tf.keras.layers.Dropout(0.3)(sup_x)

# Fuse both branches and map them to a single sigmoid output, i.e. the
# predicted fraud probability.
combined = tf.keras.layers.Concatenate()([txn_x, sup_x])
x = tf.keras.layers.Dense(64, activation='relu')(combined)
x = tf.keras.layers.Dense(32, activation='relu')(x)
output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

# Inputs must be passed in this order: [transaction matrix, supplier matrix].
model = tf.keras.Model(
    inputs=[txn_input, sup_input],
    outputs=output,
    name="rajiv_model"
)

# Step-wise learning-rate decay: multiply the rate by 0.96 every 1000
# optimizer steps.
# NOTE(review): the recorded training run uses 140 steps per epoch for 5
# epochs (700 steps), so the first decay step is never reached -- confirm
# whether `decay_steps` should be smaller.
initial_learning_rate = 0.001
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=1000,
    decay_rate=0.96,
    staircase=True
)

# Binary cross-entropy is the training loss; AUC is tracked under the name
# 'auc' (and 'val_auc' on validation data).
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule),
    metrics=[tf.keras.metrics.AUC(name='auc')]
)

model.summary()

Model: "rajiv_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_16 (InputLayer)          [(None, 5)]          0           []                               
                                                                                                  
 input_17 (InputLayer)          [(None, 9)]          0           []                               
                                                                                                  
 dense_35 (Dense)               (None, 32)           192         ['input_16[0][0]']               
                                                                                                  
 dense_36 (Dense)               (None, 32)           320         ['input_17[0][0]']               
                                                                                        

In [None]:
import os
from datetime import datetime

# Every run logs to its own timestamped TensorBoard directory.
run_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
log_dir = f"logs/fit/{run_stamp}"

# Save the model whenever the validation loss improves. The epoch number is
# part of the file name, so each improvement is written to a separate file.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath="myModel_{epoch:02d}.keras",
    monitor='val_loss',
    save_best_only=True,
    verbose=2,
)

# Stop once the validation loss has failed to improve by at least `min_delta`
# for `patience` consecutive epochs.
# NOTE(review): in the recorded run the per-epoch val_loss gains are smaller
# than 1e-2, and `patience` equals the number of epochs trained, so this
# callback looks unable to trigger there -- confirm the intended thresholds.
early_stop_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=1e-2,
    patience=5,
    verbose=2,
)

# TensorBoard logging, including weight histograms every epoch.
tensorboard_cb = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

callbacks = [checkpoint_cb, early_stop_cb, tensorboard_cb]

In [None]:
# Train on both branches at once. The returned History object is kept in
# `history` (per-epoch loss / AUC live in `history.history`) instead of being
# discarded and echoed as a bare object repr.
# `validation_split=0.2` holds back part of the training arrays for
# validation; the held-out test set is not touched here.
history = model.fit(
    [X_txn_train, X_sup_train],
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks,
    verbose=2
)

Epoch 1/5

Epoch 1: val_loss improved from inf to 0.58795, saving model to myModel_01.keras
140/140 - 3s - loss: 0.6127 - auc: 0.6371 - val_loss: 0.5880 - val_auc: 0.6907 - 3s/epoch - 19ms/step
Epoch 2/5

Epoch 2: val_loss improved from 0.58795 to 0.58291, saving model to myModel_02.keras
140/140 - 1s - loss: 0.5918 - auc: 0.6837 - val_loss: 0.5829 - val_auc: 0.6991 - 585ms/epoch - 4ms/step
Epoch 3/5

Epoch 3: val_loss improved from 0.58291 to 0.57988, saving model to myModel_03.keras
140/140 - 1s - loss: 0.5857 - auc: 0.6956 - val_loss: 0.5799 - val_auc: 0.7029 - 580ms/epoch - 4ms/step
Epoch 4/5

Epoch 4: val_loss improved from 0.57988 to 0.57868, saving model to myModel_04.keras
140/140 - 1s - loss: 0.5812 - auc: 0.7014 - val_loss: 0.5787 - val_auc: 0.7064 - 583ms/epoch - 4ms/step
Epoch 5/5

Epoch 5: val_loss did not improve from 0.57868
140/140 - 1s - loss: 0.5809 - auc: 0.7026 - val_loss: 0.5826 - val_auc: 0.7024 - 553ms/epoch - 4ms/step


<keras.callbacks.History at 0x7f6ae05f5900>

# Evaluation

In [None]:
from sklearn.metrics import roc_auc_score

# Score the held-out test set: flatten the model output to one probability per
# row and compare it with the true labels.
probs = model.predict([X_txn_test, X_sup_test]).flatten()
test_auc = roc_auc_score(y_test, probs)
print(f"ROC AUC: {test_auc}")

ROC AUC: 0.6959381477697866


# Feature Importance (Branch-Wise Permutation)

In [None]:
import numpy as np

def auc_fn(txn, sup, y):
    """Return the ROC-AUC of the notebook-level ``model`` for one input pair.

    Args:
        txn: Input for the model's first (transaction) branch.
        sup: Input for the model's second (supplier) branch.
        y: True binary labels, aligned row for row with ``txn`` and ``sup``.

    Returns:
        The value of ``roc_auc_score`` computed from the flattened predictions.
    """
    scores = model.predict([txn, sup])
    return roc_auc_score(y, scores.flatten())


# Reference score on the unmodified test inputs; the permutation importances
# below are measured as drops relative to this value.
baseline_auc = auc_fn(X_txn_test, X_sup_test, y_test)
baseline_auc



0.6959381477697866

In [None]:
# Permutation importance for the transaction branch: shuffle one transformed
# column at a time (supplier inputs untouched) and record how far the AUC
# falls below the baseline. Larger values mean the model relies more on that
# column; a negative value means the shuffled AUC came out above the baseline.
txn_importance = []
for col_idx in range(X_txn_test.shape[1]):
    shuffled_txn = X_txn_test.copy()
    column_view = shuffled_txn[:, col_idx]
    np.random.shuffle(column_view)  # in place, through the view into the copy

    auc_drop = baseline_auc - auc_fn(shuffled_txn, X_sup_test, y_test)
    txn_importance.append(auc_drop)
txn_importance



[0.03060158744306496,
 0.006453986776640441,
 0.0012501171270618006,
 0.11062279601150937,
 0.0016432069732311705]

In [None]:
# Output column names of the transaction transformer, for matching against the
# per-column importances computed above.
txn_pipeline.get_feature_names_out()

array(['txn_pipeline__transaction_amount',
       'txn_pipeline__transactions_7d', 'txn_pipeline__transactions_30d',
       'txn_pipeline__avg_transaction_amount_30d',
       'txn_pipeline__amount_deviation_ratio'], dtype=object)

In [None]:
# Permutation importance for the supplier branch: shuffle one transformed
# column at a time (transaction inputs untouched) and record how far the AUC
# falls below the baseline.
# NOTE(review): the one-hot indicator columns are shuffled individually, so a
# shuffled row may no longer have exactly one active indicator per original
# category -- consider permuting all indicators of a feature together.
supp_importance = []
for col_idx in range(X_sup_test.shape[1]):
    shuffled_sup = X_sup_test.copy()
    column_view = shuffled_sup[:, col_idx]
    np.random.shuffle(column_view)  # in place, through the view into the copy

    auc_drop = baseline_auc - auc_fn(X_txn_test, shuffled_sup, y_test)
    supp_importance.append(auc_drop)
supp_importance



[-0.0016363507549838063,
 0.03162316396188858,
 -0.0013986685224164352,
 0.0001348389588605725,
 0.00026282169947366985,
 -0.003215566357908317,
 -0.0008341732200685126,
 0.00657968411117138,
 0.00040908768874603485]

In [None]:
# Output column names of the supplier transformer (numeric columns first, then
# the one-hot indicators), for matching against the importances above.
supplier_pipeline.get_feature_names_out()

array(['supplier_num_pipeline__supplier_tenure_years',
       'supplier_num_pipeline__audit_fail_rate',
       'supplier_num_pipeline__supplier_risk_score',
       'supplier_cat_pipeline__supplier_country_CN',
       'supplier_cat_pipeline__supplier_country_DE',
       'supplier_cat_pipeline__supplier_country_IN',
       'supplier_cat_pipeline__supplier_country_US',
       'supplier_cat_pipeline__past_fraud_flag_0',
       'supplier_cat_pipeline__past_fraud_flag_1'], dtype=object)