<a href="https://colab.research.google.com/github/rpujala/machine_learning/blob/main/Supply_Chain_Compliance_Violation_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Supply Chain Compliance Violation Prediction

* The supply chain compliance team enforces hundreds of procurement and logistics policies covering pricing limits, supplier eligibility, shipment rules, and approval workflows.

* Today, most violations are detected after the fact, leading to corrective actions, audits, and penalities. The team wants to shift from reactive enforcement to proactive prevention by predicting whether a transaction or supplier action is likely to violate policy before it occurs.

* Compliance risk depends on:
    * Policy rules (textual, complex and evolving)
    * Historical violation behavior (patterns over time)
    
* A machine learning model that jointly learns from policy rule embeddings and historical violation signals can enable early intervention,r reduce audt costs, and improve policy adherence - but only if the predictions are explainable

* Policy Rules (Text):
    * Supplier must be ISO-Certified
    * Max order value <= 50k
    * Restricted country shipping prohibited
    
* Historical Violation Signals:
    * past_violation_count
    * violation_rate_90d
    * days_since_last_violation
    * supplier_risk_score
    * audit_fail_rate

In [None]:
import pandas as pd
import numpy as np

np.random.seed(42)

n_samples = 6000

policy_texts = [
    "max order value limit exceeded",
    "restricted supplier used",
    "missing compliance certification",
    "unauthorized shipment route",
    "pricing policy violation"
]

data = {
    'policy_text': np.random.choice(policy_texts, n_samples),
    'past_violation_count': np.random.poisson(1.5, n_samples),
    'violation_rate_90d': np.random.uniform(0, 0.6, n_samples),
    'days_since_last_violation': np.random.randint(1, 1000, n_samples),
    'supplier_risk_score': np.random.uniform(0, 1, n_samples),
    'audit_fail_rate': np.random.uniform(0, 0.5, n_samples)
}

df = pd.DataFrame(data=data)
df.head()

Unnamed: 0,policy_text,past_violation_count,violation_rate_90d,days_since_last_violation,supplier_risk_score,audit_fail_rate
0,unauthorized shipment route,1,0.03369,401,0.199359,0.359354
1,pricing policy violation,1,0.546613,146,0.827391,0.09765
2,missing compliance certification,1,0.193451,298,0.676576,0.204073
3,pricing policy violation,0,0.255651,145,0.471379,0.403046
4,pricing policy violation,2,0.496406,16,0.915624,0.305965


In [None]:
risk = (
    0.4 * (df['violation_rate_90d'] < 0.3).astype(int) +
    0.3 * (df['past_violation_count'] > 2).astype(int) +
    0.2 * (df['supplier_risk_score'] > 0.6).astype(int) +
    0.1 * (df['audit_fail_rate'] < 0.25).astype(int)
)

prob = 1 / (1 + np.exp(-3 * (risk - 0.5)))
df['violation_label'] = (np.random.rand(n_samples) < prob).astype(int)

In [None]:
df.head()

Unnamed: 0,policy_text,past_violation_count,violation_rate_90d,days_since_last_violation,supplier_risk_score,audit_fail_rate,violation_label
0,unauthorized shipment route,1,0.03369,401,0.199359,0.359354,0
1,pricing policy violation,1,0.546613,146,0.827391,0.09765,1
2,missing compliance certification,1,0.193451,298,0.676576,0.204073,1
3,pricing policy violation,0,0.255651,145,0.471379,0.403046,1
4,pricing policy violation,2,0.496406,16,0.915624,0.305965,0


# Check for Null / Blank values

In [None]:
import numpy as np

df.replace('', np.nan, inplace=True)
df.isna().sum()

policy_text                  0
past_violation_count         0
violation_rate_90d           0
days_since_last_violation    0
supplier_risk_score          0
audit_fail_rate              0
violation_label              0
dtype: int64

# Train / Test split

In [None]:
X = df.drop(columns=['violation_label'], axis=1)
y = df['violation_label']

In [None]:
X[:2]

Unnamed: 0,policy_text,past_violation_count,violation_rate_90d,days_since_last_violation,supplier_risk_score,audit_fail_rate
0,unauthorized shipment route,1,0.03369,401,0.199359,0.359354
1,pricing policy violation,1,0.546613,146,0.827391,0.09765


In [None]:
y[:2]

0    0
1    1
Name: violation_label, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

np.random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   test_size=0.2,
                                                   random_state=42,
                                                   stratify=y)

In [None]:
X_train.shape, y_train.shape

((4800, 6), (4800,))

In [None]:
X_test.shape, y_test.shape

((1200, 6), (1200,))

# Text Processing (Policy Rules --> Embedding)

In [None]:
max_words = 3000
max_len = 12

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_words,
                                                 oov_token="<OOV>")
tokenizer.fit_on_texts(X_train['policy_text'])

X_train_text = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(X_train['policy_text']),
    maxlen=max_len)

X_test_text = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(X_test['policy_text']),
    maxlen=max_len
)

# Structured Feature Processing

In [None]:
df.head(2)

Unnamed: 0,policy_text,past_violation_count,violation_rate_90d,days_since_last_violation,supplier_risk_score,audit_fail_rate,violation_label
0,unauthorized shipment route,1,0.03369,401,0.199359,0.359354,0
1,pricing policy violation,1,0.546613,146,0.827391,0.09765,1


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
num_features = [
    'past_violation_count',
    'violation_rate_90d',
    'days_since_last_violation',
    'supplier_risk_score',
    'audit_fail_rate'
]

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

X_train_num = num_pipeline.fit_transform(X_train[num_features])
X_test_num = num_pipeline.transform(X_test[num_features])

In [None]:
type(X_train_num), type(X_test_num)

(numpy.ndarray, numpy.ndarray)

# Functional API

In [None]:
import tensorflow as tf
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [None]:
# Policy text branch

text_input = tf.keras.layers.Input(shape=(max_len,))
emb = tf.keras.layers.Embedding(max_words, 64)(text_input)
text_feat = tf.keras.layers.GlobalAveragePooling1D()(emb)

# Historical signal branch

num_input = tf.keras.layers.Input(shape=(X_train_num.shape[1], ))
num_feat = tf.keras.layers.Dense(32, )(num_input)

merged = tf.keras.layers.Concatenate()([text_feat, num_feat])
x = tf.keras.layers.Dense(64, activation='relu')(merged)
x = tf.keras.layers.Dense(32, activation='relu')(x)
output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(
    inputs=[text_input, num_input],
    outputs=output,
    name="rajiv_model"
)

initial_learning_rate = 0.001
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=1000,
    decay_rate=0.96,
    staircase=True
)

model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule),
    metrics=[tf.keras.metrics.AUC(name='auc')]
)

model.summary()

Model: "rajiv_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_18 (InputLayer)          [(None, 12)]         0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 12, 64)       192000      ['input_18[0][0]']               
                                                                                                  
 input_19 (InputLayer)          [(None, 5)]          0           []                               
                                                                                                  
 global_average_pooling1d_1 (Gl  (None, 64)          0           ['embedding_1[0][0]']            
 obalAveragePooling1D)                                                                  

In [None]:
import os
from datetime import datetime

log_dir = "logs/fit/" + datetime.now().strftime('%Y%m%d-%H%M%S')

callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        filepath="myModel_{epoch:02d}.keras",
        save_best_only=True,
        monitor='val_loss',
        verbose=2),

    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        min_delta=1e-2,
        patience=5,
        verbose=2),

    tf.keras.callbacks.TensorBoard(
        log_dir = log_dir,
        histogram_freq = 1)
]

print(callbacks)

[<keras.callbacks.ModelCheckpoint object at 0x7f6a4832b490>, <keras.callbacks.EarlyStopping object at 0x7f6a4832a7a0>, <keras.callbacks.TensorBoard object at 0x7f6a4832b970>]


In [None]:
# Model Training

model.fit(
    [X_train_text, X_train_num],
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks,
    verbose=2
)

Epoch 1/5

Epoch 1: val_loss improved from inf to 0.63318, saving model to myModel_01.keras
120/120 - 2s - loss: 0.6485 - auc: 0.6502 - val_loss: 0.6332 - val_auc: 0.6795 - 2s/epoch - 19ms/step
Epoch 2/5

Epoch 2: val_loss did not improve from 0.63318
120/120 - 1s - loss: 0.6379 - auc: 0.6703 - val_loss: 0.6354 - val_auc: 0.6747 - 508ms/epoch - 4ms/step
Epoch 3/5

Epoch 3: val_loss improved from 0.63318 to 0.63224, saving model to myModel_03.keras
120/120 - 1s - loss: 0.6362 - auc: 0.6720 - val_loss: 0.6322 - val_auc: 0.6795 - 530ms/epoch - 4ms/step
Epoch 4/5

Epoch 4: val_loss improved from 0.63224 to 0.63133, saving model to myModel_04.keras
120/120 - 1s - loss: 0.6340 - auc: 0.6774 - val_loss: 0.6313 - val_auc: 0.6823 - 689ms/epoch - 6ms/step
Epoch 5/5

Epoch 5: val_loss did not improve from 0.63133
120/120 - 0s - loss: 0.6328 - auc: 0.6806 - val_loss: 0.6344 - val_auc: 0.6767 - 489ms/epoch - 4ms/step


<keras.callbacks.History at 0x7f6af4498310>

# Evaluation

In [None]:
from sklearn.metrics import roc_auc_score

probs = model.predict([X_test_text, X_test_num]).flatten()
print(f"ROC AUC: {roc_auc_score(y_test, probs)}")

ROC AUC: 0.6412083651680447


# Feature Importance (Permutation - Structured Branch)

In [None]:
import numpy as np

def auc_fn(X_text, X_num, y):
    probs = model.predict([X_text, X_num]).flatten()
    return roc_auc_score(y, probs)

baseline_auc = auc_fn(X_test_text, X_test_num, y_test)
baseline_auc



0.6412083651680447

In [None]:
importance = []
for i in range(X_test_num.shape[1]):
    x_perm = X_test_num.copy()
    np.random.shuffle(x_perm[:, i])
    importance.append(baseline_auc - auc_fn(X_test_text, x_perm, y_test))

importance



[0.006595110756392852,
 0.10299677061114132,
 -0.002769264852490405,
 0.011088420496536355,
 0.0016217951084842586]

In [None]:
num_pipeline.get_feature_names_out()

array(['past_violation_count', 'violation_rate_90d',
       'days_since_last_violation', 'supplier_risk_score',
       'audit_fail_rate'], dtype=object)