<a href="https://colab.research.google.com/github/rpujala/machine_learning/blob/main/Transaction_Anomaly_Detection_using_AutoEncoders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transaction Anomaly Detection using AutoEncoders

* The procurement compliance team monitors thousands of purchase transactions daily across vendors, categories, and reguons. While most transactions follow normal procurement patterns, a small fraction may indicate **fraud, policy violations, or process abuse,** such as unusually high amounts, abnormal vendor behavior, or atypica buying frequency.

* However, labeled data is extremely limited and unreliable. To address this, the team wants an unsupervised anamoly detection system that learns normal transaction behavior and flgas **abnormal procurement transactions** for further investigation.

* The solution must work without labels, adapt to new pattersn, and support **enforcement workflows**

In [None]:
import pandas as pd
import numpy as np

np.random.seed(42)

n_samples = 8000

data = {
    'vendor_id': np.random.choice([f"v_{i}" for i in range(50)], n_samples),
    'category': np.random.choice(
        ['IT', 'Logistics', 'Marketing', 'Facilities'], n_samples),
    'payment_terms_days': np.random.choice([15, 30, 45, 60], n_samples),
    'unit_price': np.random.normal(100, 20, n_samples).clip(10),
    'quantity': np.random.randint(1, 100, n_samples),
    'buyer_tenure_months': np.random.randint(1, 120, n_samples),
    'country':  np.random.choice(
                ['US', 'IN', 'DE', 'VN', 'CN'], n_samples),
    'is_weekend': np.random.choice([0, 1], n_samples, p=[0.9, 0.1])
}

df = pd.DataFrame(data=data)
df.head()

Unnamed: 0,vendor_id,category,payment_terms_days,unit_price,quantity,buyer_tenure_months,country,is_weekend
0,v_38,IT,15,127.600632,87,12,IN,0
1,v_28,IT,60,104.01394,56,48,VN,0
2,v_14,IT,45,117.785717,54,76,VN,0
3,v_42,Marketing,15,101.613464,17,53,IN,0
4,v_7,Marketing,30,102.540122,45,74,CN,0


In [None]:
df['total_amount'] = df['unit_price'] * df['quantity']

In [None]:
df.head()

Unnamed: 0,vendor_id,category,payment_terms_days,unit_price,quantity,buyer_tenure_months,country,is_weekend,total_amount
0,v_38,IT,15,127.600632,87,12,IN,0,11101.254956
1,v_28,IT,60,104.01394,56,48,VN,0,5824.780624
2,v_14,IT,45,117.785717,54,76,VN,0,6360.428717
3,v_42,Marketing,15,101.613464,17,53,IN,0,1727.428887
4,v_7,Marketing,30,102.540122,45,74,CN,0,4614.305477


# Inject Anomalies (For Evaluation only)

In [None]:
anomaly_index = np.random.choice(df.index, size=150, replace=False)
anomaly_index

array([5031, 3078, 3516, 6766, 1306, 1602, 2204, 2593, 3832,  749, 2625,
       3532, 1046, 4908, 2639, 1627, 1097, 1303, 3428, 1566, 2918, 3835,
       7514, 3498, 6410, 1856, 5554, 5170, 6366, 3335, 6193, 1984, 5489,
       7606, 5988, 4741, 3300, 6578, 4798, 1394, 2608, 3267, 3131, 1054,
        378, 5750, 7045, 4272, 1002, 4205, 2383, 7799, 7856, 1105, 4755,
       7346, 5192, 2011, 3683, 1444, 6478, 2688, 3092, 2391, 2322, 7198,
       1803, 3174,  906, 4767, 2481, 5369, 4193, 5109, 3255, 5863,  799,
       6886, 6081,  880, 3195,  195, 6932,  337, 6030,  105, 7391, 3579,
       4459, 7350, 1269, 7984, 3359, 6537, 4415, 4981, 7073, 4963, 6471,
       2915, 4441, 7403, 5460, 3033,   61, 3847, 1446, 1840, 2150, 5175,
        501, 7139, 1232,   56, 1040, 2866,  308, 7372, 2266, 6041, 5428,
        685, 5707, 2877, 6263, 3343, 7107, 2861, 7038, 2973, 7123,  890,
       5210,  691, 5357, 4151, 1062, 1939, 1863, 1095, 5891, 5207,   27,
       6290, 1779, 6760, 3615, 2683, 1480, 2531])

In [None]:
df.loc[anomaly_index, 'unit_price'] *= 5
df.loc[anomaly_index, 'quantity'] *= 3
df.loc[anomaly_index, 'is_weekend'] = 1

**Note: In real life, the anamolies are unknown, we inject them only to validate the model**

In [None]:
import numpy as np

df.replace('', np.nan, inplace=True)
df.isna().sum()

vendor_id              0
category               0
payment_terms_days     0
unit_price             0
quantity               0
buyer_tenure_months    0
country                0
is_weekend             0
total_amount           0
dtype: int64

# Feature Selection (No Labels Used)

In [None]:
df.head()

Unnamed: 0,vendor_id,category,payment_terms_days,unit_price,quantity,buyer_tenure_months,country,is_weekend,total_amount
0,v_38,IT,15,127.600632,87,12,IN,0,11101.254956
1,v_28,IT,60,104.01394,56,48,VN,0,5824.780624
2,v_14,IT,45,117.785717,54,76,VN,0,6360.428717
3,v_42,Marketing,15,101.613464,17,53,IN,0,1727.428887
4,v_7,Marketing,30,102.540122,45,74,CN,0,4614.305477


In [None]:
X = df.drop(columns=['total_amount', 'vendor_id'], axis=1)

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

np.random.seed(42)

X_train, X_test = train_test_split(X,
                                  test_size=0.2,
                                  random_state=42)

In [None]:
X_train.shape

(6400, 7)

In [None]:
X_test.shape

(1600, 7)

**Note: We intentionally exclude target labels (unsupervised)**

# Preprocessing Pipeline

In [None]:
X[:2]

Unnamed: 0,category,payment_terms_days,unit_price,quantity,buyer_tenure_months,country,is_weekend
0,IT,15,127.600632,87,12,IN,0
1,IT,60,104.01394,56,48,VN,0


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
num_features = [
    'payment_terms_days',
    'unit_price',
    'quantity',
    'buyer_tenure_months',
    'is_weekend'
]

cat_features = [
    'category',
    'country'
]

num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, num_features),
    ('cat_pipeline', cat_pipeline, cat_features)
])

preprocessor

0,1,2
,transformers,"[('num_pipeline', ...), ('cat_pipeline', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [None]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [None]:
X_train_processed = X_train_processed.astype('float')
X_test_processed = X_test_processed.astype('float')

# Dense AutoEncoder model

In [None]:
input = tf.keras.layers.Input(shape=(X_train_processed.shape[1],))
encoded = tf.keras.layers.Dense(64, activation='relu')(input)
encoded = tf.keras.layers.Dense(32, activation='relu')(encoded)

decoded = tf.keras.layers.Dense(64, activation='relu')(encoded)
output = tf.keras.layers.Dense(X_train_processed.shape[1],
                              activation='linear')(decoded)

autoencoder = tf.keras.Model(
    inputs=input,
    outputs=output,
    name='AutoEncoder_Model'
)

autoencoder.summary()

Model: "AutoEncoder_Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_20 (InputLayer)       [(None, 14)]              0         
                                                                 
 dense_59 (Dense)            (None, 64)                960       
                                                                 
 dense_60 (Dense)            (None, 32)                2080      
                                                                 
 dense_61 (Dense)            (None, 64)                2112      
                                                                 
 dense_62 (Dense)            (None, 14)                910       
                                                                 
Total params: 6,062
Trainable params: 6,062
Non-trainable params: 0
_________________________________________________________________


In [None]:
initial_learning_rate = 0.001
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=1000,
    decay_rate=0.96,
    staircase=True
)

autoencoder.compile(
    loss='mse',
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule)
)

import os
from datetime import datetime

log_dir = "logs/fit/" + datetime.now().strftime('%Y%m%d-%H%M%S')

callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        filepath="myModel{epoch:02d}.keras",
        save_best_only=True,
        monitor='val_loss',
        verbose=2),

    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        min_delta=1e-2,
        patience=5,
        verbose=2),

    tf.keras.callbacks.TensorBoard(
        log_dir = log_dir,
        histogram_freq = 1)
]

autoencoder.fit(
    X_train_processed,
    X_train_processed,
    epochs=5,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks,
    verbose=2
)

Epoch 1/5

Epoch 1: val_loss improved from inf to 0.03261, saving model to myModel01.keras
160/160 - 2s - loss: 0.2045 - val_loss: 0.0326 - 2s/epoch - 11ms/step
Epoch 2/5

Epoch 2: val_loss improved from 0.03261 to 0.00844, saving model to myModel02.keras
160/160 - 0s - loss: 0.0154 - val_loss: 0.0084 - 444ms/epoch - 3ms/step
Epoch 3/5

Epoch 3: val_loss improved from 0.00844 to 0.00501, saving model to myModel03.keras
160/160 - 0s - loss: 0.0064 - val_loss: 0.0050 - 420ms/epoch - 3ms/step
Epoch 4/5

Epoch 4: val_loss improved from 0.00501 to 0.00375, saving model to myModel04.keras
160/160 - 0s - loss: 0.0043 - val_loss: 0.0037 - 440ms/epoch - 3ms/step
Epoch 5/5

Epoch 5: val_loss improved from 0.00375 to 0.00277, saving model to myModel05.keras
160/160 - 0s - loss: 0.0032 - val_loss: 0.0028 - 433ms/epoch - 3ms/step


<keras.callbacks.History at 0x7fe9456d47c0>

# Reconstruction Error & Anamoly Score

In [None]:

reconstructions = autoencoder.predict(X_test_processed)
recon_error = np.mean(np.square((X_test_processed - reconstructions)))

recon_error



0.002715529988871575

In [None]:
threshold = np.percentile(recon_error, 95)
threshold

0.002715529988871575

In [None]:
anamolies = recon_error > threshold
anamolies

False

# Feature Importance (Permutation Based for AutoEncoder)

* Shuffle one feature --> reconstruction error increases --> Important Feature

In [None]:
baseline_error = recon_error.mean()

feature_importance = []

for i in range(X_test_processed.shape[1]):
    X_perm = X_test_processed.copy()
    np.random.shuffle(X_perm[:, i])

    recon_perm = autoencoder.predict(X_perm)
    error_perm = np.mean(np.square(X_perm - recon_perm))

    feature_importance.append(error_perm.mean() - baseline_error)

sorted(feature_importance, reverse=True)



[0.006952976874068057,
 0.00674612624663961,
 0.006607242228964458,
 0.006437596719559962,
 0.005525145845085823,
 0.0052026477738758604,
 0.005046405409767318,
 0.004770463200675494,
 0.0047182977599405426,
 0.0026203084070379836,
 0.0012227377584087804,
 0.001040996018504522,
 -1.3083036787702763e-06,
 -8.547536218798338e-06]