<a href="https://colab.research.google.com/github/rpujala/machine_learning/blob/main/Vendor_Risk_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Vendor Risk Classification (Multi-Source Inputs)

* Inputs:
    * Vendor metadata
    * Transaction behaviour
    * Compliance history
    
* The procurement compliance team manages thousands of vendors globally. While most vendors operate within policy, a subset poses elevated risk due to inconsistent transaction behavior, poor delivery performance, or prior compliance violations.

* Auditing all vendors equally is costly and inefficient. Instead, the team wants a risk-based audit system that classifies vendors into High / Medium / Low risk tiers, allowing enforcement teams to focus audits where they matter most.

* Vendor risk is not driven by a single signal but by multiple independent data sources: vendor metadata, transactional behavior, and historical compliance records

* A multi-input deep learning model is required to learn each signal independently and combine them into a unified risk score.

* Vendor Metadata:
    * vendor country
    * vendor tenure years
    * vendor size
    
* Transaction Behavior
    * avg order value
    * order volatility
    * late delivery rate
    * transaction_90d
    
* Compliance History
    * past_violations
    * audit fail rate
    * days since last violation

In [None]:
import numpy as np
import pandas as pd

# Fix the global NumPy seed so the synthetic vendor table is reproducible.
np.random.seed(42)
n_samples = 5000

# Columns are drawn one at a time, in this exact order: every draw advances the
# shared global RNG, so reordering the statements would change the generated values.
data = {}

# --- Vendor metadata ---
data['vendor_country'] = np.random.choice(['US', 'IN', 'CN', 'DE'], size=n_samples)
data['vendor_tenure_years'] = np.random.uniform(low=0.5, high=15, size=n_samples)
data['vendor_size'] = np.random.choice(['SMALL', 'MEDIUM', 'LARGE'], size=n_samples)

# --- Transaction behaviour ---
data['avg_order_value'] = np.random.uniform(low=500, high=5000, size=n_samples)
data['order_volatility'] = np.random.uniform(low=0.1, high=2.5, size=n_samples)
data['late_delivery_rate'] = np.random.uniform(low=0, high=0.6, size=n_samples)
# randint's upper bound is exclusive, so counts fall in 1..199.
data['transaction_90d'] = np.random.randint(low=1, high=200, size=n_samples)

# --- Compliance history ---
data['past_violations'] = np.random.poisson(lam=1.2, size=n_samples)
data['audit_fail_rate'] = np.random.uniform(low=0, high=0.5, size=n_samples)
data['days_since_last_violation'] = np.random.randint(low=1, high=1000, size=n_samples)

df = pd.DataFrame(data)
df.head()

Unnamed: 0,vendor_country,vendor_tenure_years,vendor_size,avg_order_value,order_volatility,late_delivery_rate,transaction_90d,past_violations,audit_fail_rate,days_since_last_violation
0,CN,12.499024,MEDIUM,3771.292409,2.196076,0.133792,52,2,0.341301,248
1,DE,11.585653,SMALL,4996.984647,1.200158,0.169675,75,1,0.377531,375
2,US,8.81617,SMALL,4956.195379,0.491733,0.256537,73,1,0.486226,396
3,CN,14.362684,SMALL,3849.515793,2.092683,0.075078,29,2,0.265222,576
4,CN,3.40688,SMALL,3828.139204,0.87904,0.548023,171,0,0.082967,959


In [None]:
# Synthetic ground truth: a weighted blend of two continuous rates and two
# thresholded flags. The terms are summed left to right in the same order as the
# weights are listed so the floating-point result is unchanged.
late_delivery_term = 0.4 * df['late_delivery_rate']
audit_failure_term = 0.3 * df['audit_fail_rate']
repeat_offender_term = 0.2 * (df['past_violations'] > 2).astype(int)    # flag: more than 2 violations
high_volatility_term = 0.1 * (df['order_volatility'] > 1.5).astype(int)  # flag: volatility above 1.5

risk_score = late_delivery_term + audit_failure_term + repeat_offender_term + high_volatility_term

# Bucket the continuous score into tiers: (-1, 0.25] -> LOW, (0.25, 0.5] -> MEDIUM,
# (0.5, 1] -> HIGH. Note the column is called 'risk_score' but stores the tier label.
tier_edges = [-1, 0.25, 0.5, 1]
tier_names = ['LOW', 'MEDIUM', 'HIGH']
df['risk_score'] = pd.cut(risk_score, bins=tier_edges, labels=tier_names)

In [None]:
# Preview the first five rows, now including the derived 'risk_score' tier column.
df.head(n=5)

Unnamed: 0,vendor_country,vendor_tenure_years,vendor_size,avg_order_value,order_volatility,late_delivery_rate,transaction_90d,past_violations,audit_fail_rate,days_since_last_violation,risk_score
0,CN,12.499024,MEDIUM,3771.292409,2.196076,0.133792,52,2,0.341301,248,MEDIUM
1,DE,11.585653,SMALL,4996.984647,1.200158,0.169675,75,1,0.377531,375,LOW
2,US,8.81617,SMALL,4956.195379,0.491733,0.256537,73,1,0.486226,396,LOW
3,CN,14.362684,SMALL,3849.515793,2.092683,0.075078,29,2,0.265222,576,LOW
4,CN,3.40688,SMALL,3828.139204,0.87904,0.548023,171,0,0.082967,959,LOW


# Encode target

In [None]:
# Split the frame into the feature matrix and the target tier labels.
target_column = 'risk_score'
X = df.drop(columns=[target_column])
y = df[target_column]

In [None]:
from sklearn.preprocessing import LabelEncoder
# `tf` is used from this cell onwards but was never imported anywhere in the
# notebook, so a fresh "Restart & Run All" failed here with a NameError.
import tensorflow as tf

le = LabelEncoder()

# Encode straight from the source column rather than from `y` itself: the
# original `y = le.fit_transform(y)` overwrote its own input, so re-running the
# cell tried to label-encode an already one-hot 2-D array and failed.
# LabelEncoder assigns integer codes in sorted label order (see `le.classes_`).
y_encoded = le.fit_transform(df['risk_score'])

# One-hot encode for CategoricalCrossentropy; the width is pinned to the number
# of classes the encoder actually saw.
y = tf.keras.utils.to_categorical(y_encoded, num_classes=len(le.classes_))

In [None]:
# Sanity check: features are still a DataFrame, the encoded target is an ndarray.
tuple(type(obj) for obj in (X, y))

(pandas.core.frame.DataFrame, numpy.ndarray)

In [None]:
# Peek at the first two one-hot encoded target rows.
y[0:2]

array([[0., 0., 1.],
       [0., 1., 0.]], dtype=float32)

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Re-seed NumPy's global RNG; later cells that call np.random.* start from this state.
np.random.seed(42)

# Hold out 20% of the rows for testing. The split itself is driven by
# `random_state`, and `stratify=y` keeps the class proportions equal in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Shapes of the training features and one-hot targets.
tuple(part.shape for part in (X_train, y_train))

((4000, 10), (4000, 3))

In [None]:
# Shapes of the held-out test features and one-hot targets.
tuple(part.shape for part in (X_test, y_test))

((1000, 10), (1000, 3))

# Preprocessing Pipelines

In [None]:
# Reminder of the raw column layout before the per-branch preprocessing pipelines are built.
df.head(n=5)

Unnamed: 0,vendor_country,vendor_tenure_years,vendor_size,avg_order_value,order_volatility,late_delivery_rate,transaction_90d,past_violations,audit_fail_rate,days_since_last_violation,risk_score
0,CN,12.499024,MEDIUM,3771.292409,2.196076,0.133792,52,2,0.341301,248,MEDIUM
1,DE,11.585653,SMALL,4996.984647,1.200158,0.169675,75,1,0.377531,375,LOW
2,US,8.81617,SMALL,4956.195379,0.491733,0.256537,73,1,0.486226,396,LOW
3,CN,14.362684,SMALL,3849.515793,2.092683,0.075078,29,2,0.265222,576,LOW
4,CN,3.40688,SMALL,3828.139204,0.87904,0.548023,171,0,0.082967,959,LOW


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
# Vendor Meta pipeline

# Numeric metadata columns (imputed + standardised).
meta_num = [
    'vendor_tenure_years'
]

# Categorical metadata columns (imputed + one-hot encoded).
meta_cat = [
    'vendor_country',
    'vendor_size'
]

# Derived from the two lists above so the three can never drift apart.
meta_features = meta_num + meta_cat


meta_num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

meta_cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # sparse_output=False: downstream cells copy this matrix, shuffle individual
    # columns in place and feed it to Keras, all of which need a dense ndarray.
    # With the default sparse encoder the result was dense only because the
    # ColumnTransformer's density heuristic (sparse_threshold) happened to pick it.
    # handle_unknown='ignore' encodes categories unseen during fit as all zeros.
    ('one_hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

meta_pipeline = ColumnTransformer([
    ('meta_num_pipeline', meta_num_pipeline, meta_num),
    ('meta_cat_pipeline', meta_cat_pipeline, meta_cat)
])

meta_pipeline


0,1,2
,transformers,"[('meta_num_pipeline', ...), ('meta_cat_pipeline', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [None]:
# Transactional Behavior Pipeline

# Every transaction-behaviour feature is numeric.
txn_num = ['avg_order_value', 'order_volatility', 'late_delivery_rate', 'transaction_90d']

# Median-impute missing values, then standardise each column.
txn_imputer = SimpleImputer(strategy='median')
txn_scaler = StandardScaler()
txn_num_pipeline = Pipeline(steps=[
    ('imputer', txn_imputer),
    ('scaler', txn_scaler),
])

# Single-transformer ColumnTransformer: selects the txn columns by name and
# drops anything else (remainder defaults to 'drop').
txn_pipeline = ColumnTransformer(transformers=[
    ('txn_num_pipeline', txn_num_pipeline, txn_num),
])

txn_pipeline

0,1,2
,transformers,"[('txn_num_pipeline', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [None]:
# Compliance History Pipeline

# Every compliance-history feature is numeric.
comp_num = ['past_violations', 'audit_fail_rate', 'days_since_last_violation']

# Median-impute missing values, then standardise each column.
comp_imputer = SimpleImputer(strategy='median')
comp_scaler = StandardScaler()
comp_num_pipeline = Pipeline(steps=[
    ('imputer', comp_imputer),
    ('scaler', comp_scaler),
])

# Single-transformer ColumnTransformer: selects the compliance columns by name
# and drops anything else (remainder defaults to 'drop').
comp_pipeline = ColumnTransformer(transformers=[
    ('comp_num_pipeline', comp_num_pipeline, comp_num),
])

comp_pipeline

0,1,2
,transformers,"[('comp_num_pipeline', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


# Transform Inputs

In [None]:
# Show the first two rows of the raw metadata columns before transformation.
X[meta_features].head(2)

Unnamed: 0,vendor_tenure_years,vendor_country,vendor_size
0,12.499024,CN,MEDIUM
1,11.585653,DE,SMALL


In [None]:
# Fit the metadata transformer on the training rows only, then apply the already
# fitted transformer to the test rows so no test statistics leak into training.
meta_train_frame = X_train[meta_features]
meta_test_frame = X_test[meta_features]

X_meta_train = meta_pipeline.fit_transform(meta_train_frame)
X_meta_test = meta_pipeline.transform(meta_test_frame)

In [None]:
# Fit the transaction transformer on the training rows only, then apply the
# already fitted transformer to the test rows.
txn_train_frame = X_train[txn_num]
txn_test_frame = X_test[txn_num]

X_txn_train = txn_pipeline.fit_transform(txn_train_frame)
X_txn_test = txn_pipeline.transform(txn_test_frame)

In [None]:
# Fit the compliance transformer on the training rows only, then apply the
# already fitted transformer to the test rows.
comp_train_frame = X_train[comp_num]
comp_test_frame = X_test[comp_num]

X_comp_train = comp_pipeline.fit_transform(comp_train_frame)
X_comp_test = comp_pipeline.transform(comp_test_frame)

In [None]:
# Confirm the metadata branch inputs came out as dense arrays.
tuple(type(arr) for arr in (X_meta_train, X_meta_test))

(numpy.ndarray, numpy.ndarray)

In [None]:
# Confirm the transaction branch inputs came out as dense arrays.
tuple(type(arr) for arr in (X_txn_train, X_txn_test))

(numpy.ndarray, numpy.ndarray)

In [None]:
# Confirm the compliance branch inputs came out as dense arrays.
tuple(type(arr) for arr in (X_comp_train, X_comp_test))

(numpy.ndarray, numpy.ndarray)

# Functional API Model

In [None]:
# Seed Python, NumPy and TensorFlow together so weight initialisation (and hence
# the training run) is repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# --- One input + one hidden layer per data source -------------------------
# `shape` must be a tuple. The original wrote `shape=<int>, )`, where the
# trailing comma belonged to the call rather than forming a 1-tuple, so a bare
# int was passed. Legacy Keras tolerated that; a real tuple works everywhere.
meta_input = tf.keras.layers.Input(shape=(X_meta_train.shape[1],))
meta_x = tf.keras.layers.Dense(32, activation='relu')(meta_input)

txn_input = tf.keras.layers.Input(shape=(X_txn_train.shape[1],))
txn_x = tf.keras.layers.Dense(32, activation='relu')(txn_input)

comp_input = tf.keras.layers.Input(shape=(X_comp_train.shape[1],))
comp_x = tf.keras.layers.Dense(32, activation='relu')(comp_input)

# --- Fuse the three branch embeddings and classify ------------------------
combined = tf.keras.layers.Concatenate()([meta_x, txn_x, comp_x])
x = tf.keras.layers.Dense(64, activation='relu')(combined)
x = tf.keras.layers.Dense(32, activation='relu')(x)
# One softmax unit per one-hot target column.
output = tf.keras.layers.Dense(y.shape[1], activation='softmax')(x)

# The input order here fixes the order in which arrays must be passed to
# fit / evaluate / predict: [meta, txn, comp].
model = tf.keras.Model(
    inputs=[meta_input, txn_input, comp_input],
    outputs=output,
    name="Rajiv_Model"
)

# Stepwise exponential decay: lr = 0.001 * 0.96 ** floor(step / 1000).
# NOTE(review): the training run recorded in this notebook is 5 epochs of 100
# steps, i.e. 500 steps in total, so with staircase=True the rate never actually
# decays. Lower decay_steps or train longer if decay is intended.
initial_learning_rate = 0.001
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=1000,
    decay_rate=0.96,
    staircase=True
)

# Categorical (not sparse) loss and metric because the targets are one-hot encoded.
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule),
    metrics=[tf.keras.metrics.CategoricalAccuracy()]
)

model.summary()

Model: "Rajiv_Model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_13 (InputLayer)          [(None, 8)]          0           []                               
                                                                                                  
 input_14 (InputLayer)          [(None, 4)]          0           []                               
                                                                                                  
 input_15 (InputLayer)          [(None, 3)]          0           []                               
                                                                                                  
 dense_29 (Dense)               (None, 32)           288         ['input_13[0][0]']               
                                                                                        

In [None]:
import os
from datetime import datetime

# Timestamped TensorBoard directory so successive runs do not overwrite each other.
run_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
log_dir = "logs/fit/" + run_stamp

# Write a checkpoint whenever validation loss improves; the epoch number is part
# of the file name, so each improvement produces its own file.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath="myModel_{epoch:02d}.keras",
    monitor='val_loss',
    save_best_only=True,
    verbose=2,
)

# Stop once validation loss has failed to improve by at least 0.01 for 5 epochs in a row.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=1e-2,
    patience=5,
    verbose=2,
)

# Log scalars every epoch plus weight histograms (histogram_freq=1).
tensorboard_cb = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

callbacks = [checkpoint_cb, early_stopping_cb, tensorboard_cb]

callbacks

[<keras.callbacks.ModelCheckpoint at 0x7f6a585d48e0>,
 <keras.callbacks.EarlyStopping at 0x7f6a585d5ba0>,
 <keras.callbacks.TensorBoard at 0x7f6a585d6b90>]

In [None]:
# Branch inputs must be listed in the same order as the model's `inputs`: meta, txn, comp.
train_inputs = [X_meta_train, X_txn_train, X_comp_train]

# validation_split=0.2 holds back part of the training arrays for the
# val_loss-driven callbacks; the separate test set stays untouched.
model.fit(
    x=train_inputs,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_split=0.2,
    callbacks=callbacks,
    verbose=2,
)

Epoch 1/5

Epoch 1: val_loss improved from inf to 0.32892, saving model to myModel_01.keras
100/100 - 3s - loss: 0.5527 - categorical_accuracy: 0.7841 - val_loss: 0.3289 - val_categorical_accuracy: 0.8587 - 3s/epoch - 25ms/step
Epoch 2/5

Epoch 2: val_loss improved from 0.32892 to 0.24202, saving model to myModel_02.keras
100/100 - 0s - loss: 0.2737 - categorical_accuracy: 0.8794 - val_loss: 0.2420 - val_categorical_accuracy: 0.8975 - 431ms/epoch - 4ms/step
Epoch 3/5

Epoch 3: val_loss improved from 0.24202 to 0.21301, saving model to myModel_03.keras
100/100 - 0s - loss: 0.2208 - categorical_accuracy: 0.8991 - val_loss: 0.2130 - val_categorical_accuracy: 0.9087 - 437ms/epoch - 4ms/step
Epoch 4/5

Epoch 4: val_loss improved from 0.21301 to 0.18292, saving model to myModel_04.keras
100/100 - 0s - loss: 0.1839 - categorical_accuracy: 0.9234 - val_loss: 0.1829 - val_categorical_accuracy: 0.9275 - 417ms/epoch - 4ms/step
Epoch 5/5

Epoch 5: val_loss improved from 0.18292 to 0.16704, saving 

<keras.callbacks.History at 0x7f6a585f3ee0>

# Evaluation

In [None]:
# Score the trained model on the held-out test split. evaluate() returns the
# loss followed by the single compiled metric (categorical accuracy).
test_inputs = [X_meta_test, X_txn_test, X_comp_test]
loss, acc = model.evaluate(x=test_inputs, y=y_test)

print(f"Test Loss: {loss:.2f}")
print(f"Test Acc: {acc:.2f}")

Test Loss: 0.19
Test Acc: 0.92


# Feature Importance (Branch-wise permutation)

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score

def acc_fn(meta, txn, comp, y_true):
    """Return the accuracy of the global `model` on one set of branch inputs.

    Args:
        meta: metadata-branch feature array.
        txn: transaction-branch feature array.
        comp: compliance-branch feature array.
        y_true: one-hot encoded true labels, one row per sample.

    Returns:
        Fraction of samples whose highest-probability predicted class equals
        the class marked in `y_true`.
    """
    class_probabilities = model.predict([meta, txn, comp])
    # Collapse one-hot truth and predicted probabilities to class indices.
    true_classes = y_true.argmax(axis=1)
    predicted_classes = class_probabilities.argmax(axis=1)
    return accuracy_score(true_classes, predicted_classes)

# Reference accuracy on the unpermuted test set; the permutation cells measure drops against it.
baseline_acc = acc_fn(
    meta=X_meta_test,
    txn=X_txn_test,
    comp=X_comp_test,
    y_true=y_test,
)
print(baseline_acc)

0.919


In [None]:
# Permutation importance for the compliance branch: shuffle one column at a time
# (other branches untouched) and record how far accuracy falls below the baseline.
# Column order follows `comp_num`.
importance_comp = []

for col_idx in range(X_comp_test.shape[1]):
    shuffled_comp = X_comp_test.copy()            # never mutate the real test matrix
    np.random.shuffle(shuffled_comp[:, col_idx])  # in-place shuffle of just this column
    permuted_acc = acc_fn(X_meta_test, X_txn_test, shuffled_comp, y_test)
    importance_comp.append(baseline_acc - permuted_acc)

importance_comp



[0.134, 0.14800000000000002, 0.0050000000000000044]

In [None]:
# Permutation importance for the metadata branch: shuffle one transformed column
# at a time and record how far accuracy falls below the baseline.
# NOTE(review): most of these columns are one-hot indicators produced by the
# metadata pipeline's OneHotEncoder, so shuffling a single indicator on its own
# can create rows that are no longer valid one-hot vectors. Treat the per-column
# numbers as rough; small negative values are within noise.
importance_meta = []

for col_idx in range(X_meta_test.shape[1]):
    shuffled_meta = X_meta_test.copy()            # never mutate the real test matrix
    np.random.shuffle(shuffled_meta[:, col_idx])  # in-place shuffle of just this column
    permuted_acc = acc_fn(shuffled_meta, X_txn_test, X_comp_test, y_test)
    importance_meta.append(baseline_acc - permuted_acc)

importance_meta



[0.0010000000000000009,
 0.0040000000000000036,
 0.0030000000000000027,
 -0.0020000000000000018,
 0.0050000000000000044,
 -0.0030000000000000027,
 0.008000000000000007,
 0.006000000000000005]