# Debugging with Evidently test suites and reports

In [1]:
import datetime
import pandas as pd

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

from evidently.test_suite import TestSuite
from evidently.test_preset import DataDriftTestPreset

from joblib import dump, load

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# Load data and model

In [2]:
# Reference dataset: passed as `reference_data` when the drift test suite runs.
ref_data = pd.read_parquet(path="data/reference.parquet")

In [3]:
# Current dataset: the problematic batch is selected from this frame by
# pickup timestamp.
current_data = pd.read_parquet(path="data/green_tripdata_2022-02.parquet")

In [4]:
# Load the serialized model from disk with joblib's `load`.
# Presumably a fitted linear regression, judging by the file name — confirm.
# NOTE(review): joblib's `load` unpickles the file, which can execute arbitrary
# code; only load model files that come from a trusted source.
with open('models/lin_reg.bin', 'rb') as f_in:
    model = load(f_in)

In [5]:
# Column roles used throughout the notebook.

# Name of the target column.
target = "duration_min"

# Numerical input features.
num_features = [
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "total_amount",
]

# Categorical input features.
cat_features = [
    "PULocationID",
    "DOLocationID",
]

In [None]:
# Treat the trips picked up on 2022-02-02 as the problematic batch to debug.
# The bounds form the half-open interval [2022-02-02 00:00, 2022-02-03 00:00).
#
# `.copy()` makes the selection an independent DataFrame: a 'prediction'
# column is assigned to it further down, and writing into a bare `.loc`
# slice of `current_data` raises pandas' SettingWithCopyWarning.
problematic_data = current_data.loc[
    (current_data.lpep_pickup_datetime >= datetime.datetime(2022, 2, 2, 0, 0))
    & (current_data.lpep_pickup_datetime < datetime.datetime(2022, 2, 3, 0, 0))
].copy()

# Generate test suite and report

In [6]:
# Tell Evidently which columns play which role. No target column is mapped
# (target=None); the model output is read from the 'prediction' column.
column_mapping = ColumnMapping(
    target=None,
    prediction='prediction',
    numerical_features=num_features,
    categorical_features=cat_features,
)

In [None]:
# Score the problematic batch with the loaded model. Missing feature values
# are replaced with 0 before prediction; the result is stored in the
# 'prediction' column that the column mapping points to.
problematic_data['prediction'] = model.predict(
    problematic_data[[*num_features, *cat_features]].fillna(value=0)
)

In [None]:
# Run Evidently's data drift test preset, comparing the problematic batch
# (current data) against the reference dataset.
test_suite = TestSuite(tests=[DataDriftTestPreset()])
test_suite.run(
    reference_data=ref_data,
    current_data=problematic_data,
    column_mapping=column_mapping,
)