# Testing and optimizing agents

## Practice activity: Designing test cases for ML systems

In [None]:
!pip install pytest
!Pip install ipytest

In [33]:
def checker(f):  
    """Measures execution time."""  
    def wrap(*args, **kwargs):  
        print("Before")  # Pre-execution message  
        f(*args, **kwargs)  
        print("Before")  # Pre-execution message  
    return wrap

In [37]:
import ipytest
import pytest
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Load the dataset
iris = load_iris()
X, y = iris.data, iris.target

# Set up the model
model = DecisionTreeClassifier()
model.fit(X, y)


In [50]:
def predict_model(*args, **kwargs):
    return model.predict(*args, **kwargs)

model.predict = predict_model

In [51]:
def test_typical_case():
    input_data = np.array([[4.5, 2.3, 1.3, 0.3]])  # Example input for a flower classification model
    expected_output = 0  # Expected output for typical case (Setosa class index)
    result = model.predict(input_data)[0]
    assert result == expected_output, f"Expected {expected_output}, but got {result}"

In [52]:
def test_edge_case_extreme_values():
    input_data = np.array([[1000, 1000, 1000, 1000]])  # Extreme values for flower classification
    try:
        model.predict(input_data)
    except ValueError:
        assert True  # The model should raise a ValueError for extreme inputs
    else:
        assert False, "Expected ValueError for extreme values, but no error was raised"

In [48]:
def test_error_handling_missing_values():
    input_data = np.array([[None, None, None, None]])  # Missing values in input
    try:
        model.predict(input_data)
    except ValueError:
        assert True  # The model should raise a ValueError for missing inputs
    else:
        assert False, "Expected ValueError for missing values, but no error was raised"

In [49]:
# Run tests using ipytest
ipytest.run('-v')

platform win32 -- Python 3.13.7, pytest-8.4.2, pluggy-1.6.0 -- C:\Users\Shahram\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe
cachedir: .pytest_cache
rootdir: g:\Downloads\Organised\Video\Online Courses\Microsoft AI ML\Building Intelligent Troubleshooting Agents
[1mcollecting ... [0mcollected 3 items

t_76afb62dd1e94f36929c977f3791daa6.py::test_typical_case [31mFAILED[0m[31m                              [ 33%][0m
t_76afb62dd1e94f36929c977f3791daa6.py::test_error_handling_missing_values [31mFAILED[0m[31m             [ 66%][0m
t_76afb62dd1e94f36929c977f3791daa6.py::test_edge_case_extreme_values [31mFAILED[0m[31m                  [100%][0m

[31m[1m________________________________________ test_typical_case ________________________________________[0m

    [0m[94mdef[39;49;00m[90m [39;49;00m[92mtest_typical_case[39;49;00m():[90m[39;49;00m
        input_data = np.array([[[94m4.5[39;49;00m, [94m2.3[39;49;00m, [9

<ExitCode.TESTS_FAILED: 1>

## Practice activity: Evaluating agent effectiveness

In [1]:
import tensorflow as tf
from tensorflow.keras.datasets import mnist

# Load dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

# Build a simple model
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train, epochs=5, validation_data=(x_test, y_test))

  super().__init__(**kwargs)


Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9157 - loss: 0.2908 - val_accuracy: 0.9582 - val_loss: 0.1389
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9579 - loss: 0.1403 - val_accuracy: 0.9683 - val_loss: 0.0998
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9687 - loss: 0.1023 - val_accuracy: 0.9742 - val_loss: 0.0834
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9733 - loss: 0.0843 - val_accuracy: 0.9765 - val_loss: 0.0771
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9779 - loss: 0.0718 - val_accuracy: 0.9774 - val_loss: 0.0728


<keras.src.callbacks.history.History at 0x2224a7497f0>

In [5]:
from sklearn.metrics import accuracy_score, precision_score
import numpy as np

# Make predictions on the test set
y_pred_probs = model.predict(x_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Calculate precision (average='weighted' to handle multiple classes)
precision = precision_score(y_test, y_pred, average='weighted')
print(f'Precision: {precision:.4f}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Accuracy: 0.9774
Precision: 0.9774


In [3]:
import time

# Measure response time for multiple iterations
start_time = time.time()
for _ in range(25):
    model.predict(x_test)
end_time = time.time()

average_response_time = (end_time - start_time) / 25
print(f"Average Response Time: {average_response_time:.4f} seconds")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m313/313[0m [32m━━━━

In [11]:
import psutil

# Monitor resource usage
cpu_usage = psutil.cpu_percent()
memory_usage = psutil.virtual_memory().percent

# for better results, measure CPU usage while inference is active,
# and measure memory usage against a baseline before the model is loaded
print(f"CPU Usage: {cpu_usage}%")
print(f"Memory Usage: {memory_usage}%")

CPU Usage: 36.9%
Memory Usage: 67.0%


In [12]:
import numpy as np
import time

# Ensure correct shape before repeating
print("Original x_test shape:", x_test.shape)  # Expected: (10000, 28, 28)

# Properly duplicate test data along batch axis
large_input = np.repeat(x_test, 10, axis=0)  # Expands batch size only

# Verify new shape
print("Large input shape after fix:", large_input.shape)  # Should be (100000, 28, 28)

# Measure performance under stress
start_time = time.time()
model.predict(large_input)  # Now matches model input (batch_size, 28, 28)
end_time = time.time()

print(f"Response Time under Stress (Reduced Size): {end_time - start_time:.4f} seconds")

Original x_test shape: (10000, 28, 28)
Large input shape after fix: (100000, 28, 28)
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step
Response Time under Stress (Reduced Size): 5.9649 seconds


In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# Example data generation for demonstration (replace with actual data)
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
agent_model = RandomForestClassifier()  # Replace with your actual model

# Perform 5-fold cross-validation
cv_scores = cross_val_score(agent_model, X, y, cv=5)

# Print the cross-validation scores for each fold
print(f'Cross-Validation Scores: {cv_scores}')

# Print the mean and standard deviation of the scores
print(f'Mean CV Score: {cv_scores.mean():.4f}')
print(f'Standard Deviation of CV Scores: {cv_scores.std():.4f}')

Cross-Validation Scores: [0.935 0.9   0.905 0.89  0.855]
Mean CV Score: 0.8970
Standard Deviation of CV Scores: 0.0258
