In [2]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [3]:
import os
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torchinfo import summary
import time
import numpy as np
import pandas as pd

In [4]:
class SimpleMLP(nn.Module):
    def __init__(self, input_feature_dim, hidden_dim_1, hidden_dim_2, output_size=1): 
        super(SimpleMLP, self).__init__()
        self.fc1 = nn.Linear(input_feature_dim, hidden_dim_1)
        self.relu1 = nn.ReLU()                        
        self.fc2 = nn.Linear(hidden_dim_1, hidden_dim_2)
        self.relu2 = nn.ReLU()
        self.output_logits = nn.Linear(hidden_dim_2, output_size)

    def forward(self, x):
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        logits = self.output_logits(x)
        
        # Scale sigmoid output to be between 0 and 5
        rating = 5.0 * torch.sigmoid(logits)
        return rating

In [16]:
INPUT_DIM = 422
H1, H2 = 256, 128

model_path = "./models/simple_mlp_ratings_statedict.pth"
device = torch.device("cpu")

model = SimpleMLP(INPUT_DIM, H1, H2).to(device)
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()
summary(model)

  model.load_state_dict(torch.load(model_path, map_location=device))


Layer (type:depth-idx)                   Param #
SimpleMLP                                --
├─Linear: 1-1                            108,288
├─ReLU: 1-2                              --
├─Linear: 1-3                            32,896
├─ReLU: 1-4                              --
├─Linear: 1-5                            129
Total params: 141,313
Trainable params: 141,313
Non-trainable params: 0

# Pytorch Model

### Model Size

In [17]:
model_size = os.path.getsize(model_path) 
print(f"Model Size on Disk: {model_size/ (1e6) :.2f} MB")

Model Size on Disk: 0.57 MB


In [18]:
data_dir = os.getenv("DATA_DIR", "/mnt/object")
df_test = pd.read_csv(data_dir+"/df_test.csv")

In [21]:
all_cols = df_test.columns.tolist()
col_exclude = ['stars', 'Unnamed: 0']
model_feature_columns = [col for col in all_cols if col not in col_exclude]

X_test = df_test[model_feature_columns]
y_test = df_test['stars']  

print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

X_test shape: (139332, 422), y_test shape: (139332,)


In [22]:
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

In [23]:
batch_size = 2048 # Adjust based on V100 VRAM and dataset size
num_data_workers = 4 # Good starting point for V100

test_loader = DataLoader( # Test loader
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_data_workers,
    pin_memory=True
)

### Test Accuracy

In [27]:
model.eval() # Ensure model is in evaluation mode
total_test_loss_mse = 0
correct_test_predictions = 0
total_test_samples = 0
criterion = nn.MSELoss()
with torch.no_grad():
    for features, labels in test_loader: # Use test_loader here
        features, labels = features.to(device), labels.to(device)
        predictions = model(features) # Model outputs continuous values (0-5)
        test_loss = criterion(predictions, labels)
        total_test_loss_mse += test_loss.item()

        # Accuracy calculation for test set
        rounded_predictions = torch.round(predictions)
        correct_test_predictions += (rounded_predictions == labels).sum().item()
        total_test_samples += labels.size(0)

avg_test_loss_mse = total_test_loss_mse / len(test_loader)
avg_test_loss_rmse = np.sqrt(avg_test_loss_mse)
test_accuracy = (correct_test_predictions / total_test_samples) * 100 if total_test_samples > 0 else 0

print(f'\nFINAL TEST SET PERFORMANCE:')
print(f'Test Loss (MSE): {avg_test_loss_mse:.4f}')
print(f'Test RMSE: {avg_test_loss_rmse:.4f}')
print(f'Test Accuracy (Rounded): {test_accuracy:.2f}%')


FINAL TEST SET PERFORMANCE:
Test Loss (MSE): 1.2751
Test RMSE: 1.1292
Test Accuracy (Rounded): 33.93%


### Inference Latency

In [28]:
num_trials = 100  # Number of trials

# Get a single sample from the test data

single_sample, _ = next(iter(test_loader))  
single_sample = single_sample[0].unsqueeze(0)  

# Warm-up run 
with torch.no_grad():
    model(single_sample)

latencies = []
with torch.no_grad():
    for _ in range(num_trials):
        start_time = time.time()
        _ = model(single_sample)
        latencies.append(time.time() - start_time)

In [29]:
print(f"Inference Latency (single sample, median): {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 95th percentile): {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 99th percentile): {np.percentile(latencies, 99) * 1000:.2f} ms")
print(f"Inference Throughput (single sample): {num_trials/np.sum(latencies):.2f} FPS")

Inference Latency (single sample, median): 0.15 ms
Inference Latency (single sample, 95th percentile): 0.22 ms
Inference Latency (single sample, 99th percentile): 0.28 ms
Inference Throughput (single sample): 6339.64 FPS


### Batch Throughput

In [30]:
num_batches = 50  # Number of trials

# Get a batch from the test data
batch_input, _ = next(iter(test_loader))  

# Warm-up run 
with torch.no_grad():
    model(batch_input)

batch_times = []
with torch.no_grad():
    for _ in range(num_batches):
        start_time = time.time()
        _ = model(batch_input)
        batch_times.append(time.time() - start_time)

In [31]:
batch_fps = (batch_input.shape[0] * num_batches) / np.sum(batch_times) 
print(f"Batch Throughput: {batch_fps:.2f} FPS")

Batch Throughput: 794569.74 FPS


### Summary Of Results

In [33]:
print(f"Model Size on Disk: {model_size/ (1e6) :.2f} MB")
print(f"Accuracy: {test_accuracy:.2f}% ({correct_test_predictions}/{total_test_samples} correct)")
print(f"Inference Latency (single sample, median): {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 95th percentile): {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 99th percentile): {np.percentile(latencies, 99) * 1000:.2f} ms")
print(f"Inference Throughput (single sample): {num_trials/np.sum(latencies):.2f} FPS")
print(f"Batch Throughput: {batch_fps:.2f} FPS")

Model Size on Disk: 0.57 MB
Accuracy: 33.93% (47280/139332 correct)
Inference Latency (single sample, median): 0.15 ms
Inference Latency (single sample, 95th percentile): 0.22 ms
Inference Latency (single sample, 99th percentile): 0.28 ms
Inference Throughput (single sample): 6339.64 FPS
Batch Throughput: 794569.74 FPS


# ONNX Model

In [34]:
import onnx
import onnxruntime as ort

In [36]:
onnx_model_path = "./models/simple_mlp_ratings_statedict.onnx"

dummy = torch.randn(1, INPUT_DIM)
torch.onnx.export(
    model, dummy, onnx_model_path,
    input_names=["features"],
    output_names=["rating"],
    opset_version=13, dynamic_axes={"features": {0: "batch_size"}, "rating": {0: "batch_size"}}
)

print(f"ONNX model saved to {onnx_model_path}")

ONNX model saved to ./models/simple_mlp_ratings_statedict.onnx


In [37]:
onnx_model = onnx.load(onnx_model_path)
onnx.checker.check_model(onnx_model)

In [38]:
ort_session = ort.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])
ort_session.get_providers()

['CPUExecutionProvider']

### Test Accuracy

In [64]:
total_sq_err = 0.0
total_samples = 0
correct_rounded = 0

for features, labels in test_loader:
    # 1) Move to CPU‐numpy
    feats_np  = features.cpu().numpy().astype(np.float32)
    labels_np = labels.cpu().numpy().astype(np.float32)
    
    # 2) ONNX inference
    preds_np = ort_session.run(None, {"features": feats_np})[0].squeeze(1)  # shape (batch,)
    labels_np = labels_np.reshape(-1)
    
    
    # 3) Accumulate squared errors
    errs = preds_np - labels_np
    total_sq_err    += np.sum(errs ** 2)
    total_samples   += labels_np.shape[0]
    
    # 4) “Accuracy” by rounding to nearest star
    rounded = np.rint(preds_np)
    correct_rounded += np.sum(rounded == labels_np)

# 5) Final metrics
mse  = total_sq_err / total_samples
rmse = np.sqrt(mse)
acc  = 100.0 * correct_rounded / total_samples

print(f'\nFINAL TEST SET PERFORMANCE:')
print(f'Test Loss (MSE): {mse:.4f}')
print(f'Test RMSE: {rmse:.4f}')
print(f'Test Accuracy (Rounded): {acc:.2f}%')


FINAL TEST SET PERFORMANCE:
Test Loss (MSE): 1.2719
Test RMSE: 1.1278
Test Accuracy (Rounded): 33.93%


### Model Size

In [65]:
model_size = os.path.getsize(onnx_model_path) 
print(f"Model Size on Disk: {model_size/ (1e6) :.2f} MB")

Model Size on Disk: 0.57 MB


### Inference Latency

In [66]:
num_trials = 100  # Number of trials

# Get a single sample from the test data

single_sample, _ = next(iter(test_loader))  
single_sample = single_sample[:1].numpy()

# Warm-up run
ort_session.run(None, {ort_session.get_inputs()[0].name: single_sample})

latencies = []
for _ in range(num_trials):
    start_time = time.time()
    ort_session.run(None, {ort_session.get_inputs()[0].name: single_sample})
    latencies.append(time.time() - start_time)

In [67]:
print(f"Inference Latency (single sample, median): {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 95th percentile): {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 99th percentile): {np.percentile(latencies, 99) * 1000:.2f} ms")
print(f"Inference Throughput (single sample): {num_trials/np.sum(latencies):.2f} FPS")


Inference Latency (single sample, median): 0.05 ms
Inference Latency (single sample, 95th percentile): 0.08 ms
Inference Latency (single sample, 99th percentile): 0.15 ms
Inference Throughput (single sample): 16532.53 FPS


### Batch Throughput

In [68]:
num_batches = 50  # Number of trials

# Get a batch from the test data
batch_input, _ = next(iter(test_loader))  
batch_input = batch_input.numpy()

# Warm-up run
ort_session.run(None, {ort_session.get_inputs()[0].name: batch_input})

batch_times = []
for _ in range(num_batches):
    start_time = time.time()
    ort_session.run(None, {ort_session.get_inputs()[0].name: batch_input})
    batch_times.append(time.time() - start_time)

In [69]:
batch_fps = (batch_input.shape[0] * num_batches) / np.sum(batch_times) 
print(f"Batch Throughput: {batch_fps:.2f} FPS")


Batch Throughput: 784385.82 FPS


In [72]:
print(f"Accuracy: {acc:.2f}% ({correct_rounded}/{total_samples} correct)")
print(f"Model Size on Disk: {model_size/ (1e6) :.2f} MB")
print(f"Inference Latency (single sample, median): {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 95th percentile): {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 99th percentile): {np.percentile(latencies, 99) * 1000:.2f} ms")
print(f"Inference Throughput (single sample): {num_trials/np.sum(latencies):.2f} FPS")
print(f"Batch Throughput: {batch_fps:.2f} FPS")

Accuracy: 33.93% (47280/139332 correct)
Model Size on Disk: 0.57 MB
Inference Latency (single sample, median): 0.05 ms
Inference Latency (single sample, 95th percentile): 0.08 ms
Inference Latency (single sample, 99th percentile): 0.15 ms
Inference Throughput (single sample): 16532.53 FPS
Batch Throughput: 784385.82 FPS


# Optimizations

In [90]:
def benchmark_session(ort_session):

    print(f"Execution provider: {ort_session.get_providers()}")

    ## Benchmark accuracy

    correct = 0
    total = 0
    for features, labels in test_loader:
        features  = features.cpu().numpy().astype(np.float32)
        labels = labels.cpu().numpy().astype(np.float32)
        predicted = ort_session.run(None, {"features": features})[0].squeeze(1)
        labels = labels.reshape(-1)
        total += labels.shape[0]
        rounded = np.rint(predicted)
        correct += np.sum(rounded == labels)
    accuracy = (correct / total) * 100

    print(f"Accuracy: {accuracy:.2f}% ({correct}/{total} correct)")

    ## Benchmark inference latency for single sample

    num_trials = 100  # Number of trials

    # Get a single sample from the test data

    single_sample, _ = next(iter(test_loader))  
    single_sample = single_sample[:1].numpy()

    # Warm-up run
    ort_session.run(None, {ort_session.get_inputs()[0].name: single_sample})

    latencies = []
    for _ in range(num_trials):
        start_time = time.time()
        ort_session.run(None, {ort_session.get_inputs()[0].name: single_sample})
        latencies.append(time.time() - start_time)

    print(f"Inference Latency (single sample, median): {np.percentile(latencies, 50) * 1000:.2f} ms")
    print(f"Inference Latency (single sample, 95th percentile): {np.percentile(latencies, 95) * 1000:.2f} ms")
    print(f"Inference Latency (single sample, 99th percentile): {np.percentile(latencies, 99) * 1000:.2f} ms")
    print(f"Inference Throughput (single sample): {num_trials/np.sum(latencies):.2f} FPS")

    ## Benchmark batch throughput

    num_batches = 50  # Number of trials

    # Get a batch from the test data
    batch_input, _ = next(iter(test_loader))  
    batch_input = batch_input.numpy()

    # Warm-up run
    ort_session.run(None, {ort_session.get_inputs()[0].name: batch_input})

    batch_times = []
    for _ in range(num_batches):
        start_time = time.time()
        ort_session.run(None, {ort_session.get_inputs()[0].name: batch_input})
        batch_times.append(time.time() - start_time)

    batch_fps = (batch_input.shape[0] * num_batches) / np.sum(batch_times) 
    print(f"Batch Throughput: {batch_fps:.2f} FPS")

### Basic graph optimizations

In [112]:
optimized_model_path = "models/simple_mlp_optimized.onnx"

session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED # apply graph optimizations
session_options.optimized_model_filepath = optimized_model_path 

ort_session = ort.InferenceSession(onnx_model_path, sess_options=session_options, providers=['CPUExecutionProvider'])

In [113]:
benchmark_session(ort_session)

Execution provider: ['CPUExecutionProvider']
Accuracy: 33.93% (47280/139332 correct)
Inference Latency (single sample, median): 0.05 ms
Inference Latency (single sample, 95th percentile): 0.07 ms
Inference Latency (single sample, 99th percentile): 0.10 ms
Inference Throughput (single sample): 20344.90 FPS
Batch Throughput: 457573.44 FPS


### Dynamic quantization

In [104]:
!pip uninstall -y neural_compressor

[0m

In [105]:
!pip install numpy scipy scikit-learn



In [106]:
!pip install onnxruntime onnxruntime-tools



In [107]:
from onnxruntime.quantization import quantize_dynamic, QuantType

In [114]:
quant_model =  "models/simple_mlp_quantized_dynamic.onnx"

quantize_dynamic(
    model_input=onnx_model_path,
    model_output=quant_model,
    weight_type=QuantType.QInt8,
    per_channel=True    # set True to apply ONNX graph optimizations before quant
)

print(f"Quantized model written to: {quant_model}")



Quantized model written to: models/simple_mlp_quantized_dynamic.onnx


In [115]:
model_size = os.path.getsize(quant_model) 
print(f"Model Size on Disk: {model_size/ (1e6) :.2f} MB")

Model Size on Disk: 0.15 MB


In [116]:
ort_session = ort.InferenceSession(quant_model, providers=['CPUExecutionProvider'])
benchmark_session(ort_session)

Execution provider: ['CPUExecutionProvider']
Accuracy: 33.92% (47263/139332 correct)
Inference Latency (single sample, median): 0.07 ms
Inference Latency (single sample, 95th percentile): 0.10 ms
Inference Latency (single sample, 99th percentile): 0.13 ms
Inference Throughput (single sample): 13161.90 FPS
Batch Throughput: 764079.50 FPS
