### Install the alibi and alibi_detect libraries

In [1]:
import numpy as np
# Display the NumPy version available in this kernel (shown as the cell's output).
np.__version__

'1.26.4'

In [2]:
!pip install alibi alibi_detect


Collecting alibi
  Downloading alibi-0.9.6-py3-none-any.whl.metadata (22 kB)
Collecting alibi_detect
  Downloading alibi_detect-0.12.0-py3-none-any.whl.metadata (28 kB)
Collecting scikit-image<0.23,>=0.17.2 (from alibi)
  Downloading scikit_image-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting Pillow<11.0,>=5.4.1 (from alibi)
  Downloading pillow-10.4.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (9.2 kB)
Collecting attrs<24.0.0,>=19.2.0 (from alibi)
  Downloading attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting dill<0.4.0,>=0.3.0 (from alibi)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting numba!=0.54.0,<0.60.0,>=0.50.0 (from alibi_detect)
  Downloading numba-0.59.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)
Collecting llvmlite<0.43,>=0.42.0dev0 (from numba!=0.54.0,<0.60.0,>=0.50.0->alibi_detect)
  Downloading llvmlite-0.42.0-cp310-cp310-manylinux_2_17_x86_64.manylinux201

In [1]:
# importing all necessary libraries
import alibi
from alibi_detect.cd import ChiSquareDrift, TabularDrift
from alibi_detect.saving import save_detector, load_detector
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [3]:
# Load the training and production datasets from the project's GitHub repository.
# The train/production split itself was created in the first notebook.
train_url = "https://raw.githubusercontent.com/ritikisb83/CTAssignmentgroup5/refs/heads/main/Datasets/restaurant_train_df.parquet"
prod_url = "https://raw.githubusercontent.com/ritikisb83/CTAssignmentgroup5/refs/heads/main/Datasets/restaurant_prod_df.parquet"

# Read both parquet files with the pyarrow engine (train first, then production).
train_df, prod_df = (
    pd.read_parquet(url, engine="pyarrow") for url in (train_url, prod_url)
)

In [4]:
# Full column list: the seven input columns followed by Monthly_Revenue.
features_set = [
    'Number_of_Customers',
    'Menu_Price',
    'Marketing_Spend',
    'Cuisine_Type',
    'Average_Customer_Spending',
    'Promotions',
    'Reviews',
    'Monthly_Revenue',
]

In [5]:
# Input (X) columns; the order here fixes the column positions of the feature arrays.
x_columns = [
    'Number_of_Customers',
    'Menu_Price',
    'Marketing_Spend',
    'Cuisine_Type',
    'Average_Customer_Spending',
    'Promotions',
    'Reviews',
]

In [6]:
# Names of the categorical input columns
cat_vars = ["Cuisine_Type"]

In [7]:
# Numerical features: every input column that is not listed as categorical.
# A list comprehension (instead of a set difference) keeps the columns in their
# x_columns order, so the result is the same on every run rather than depending
# on set iteration order.
num_vars = [col for col in x_columns if col not in cat_vars]

In [8]:
# Separate the input columns (X) from the Monthly_Revenue column (y)
# for both the training and the production data.
x_train, y_train = train_df[x_columns], train_df['Monthly_Revenue']
x_prod, y_prod = prod_df[x_columns], prod_df['Monthly_Revenue']

### Measure the drift

In [11]:
# The detector is given a plain NumPy array, so categorical features are identified by
# column position rather than by name. Derive the positions from x_columns instead of
# hard-coding them, and store them under their own name: overwriting the name-based
# `cat_vars` list with indices would make the num_vars cell wrongly keep the
# categorical column if it were re-run afterwards.
cat_var_indices = [x_columns.index(col) for col in cat_vars]
# A value of None asks alibi-detect to infer each feature's categories from the
# reference data (see the TabularDrift documentation).
categories_per_feature = {idx: None for idx in cat_var_indices}
categories_per_feature

{3: None}

In [12]:
# Build the tabular drift detector from the training features, with a 0.05
# p-value threshold and the categorical-column mapping defined above.
x_ref = x_train.values
cd = TabularDrift(
    x_ref,
    p_val=0.05,
    categories_per_feature=categories_per_feature,
)

In [13]:
# Location the detector is saved to; change this to store it somewhere else.
filepath = 'restaurantdrift'

# Persist the detector (legacy flag enabled) so it can be reloaded later.
save_detector(cd, filepath, legacy=True)



In [14]:
# Reload the detector from `filepath`; this replaces the in-memory `cd` with the saved copy.
cd = load_detector(filepath)



In [15]:
# Run the drift detector on the production features.
x_prod_arr = x_prod.to_numpy()
preds = cd.predict(x_prod_arr)

### Printing the test results

- Kolmogorov-Smirnov (K-S) test for the numerical features
- Chi-squared (Chi2) test for the categorical features

In [16]:
# Per-feature test statistics and p-values returned by the detector
distances = preds['data']['distance']
p_values = preds['data']['p_val']

# Report one line per feature: name, test used, statistic and p-value
for idx in range(cd.n_features):
    # Features listed in categories_per_feature are labelled Chi2, all others K-S
    test_name = 'Chi2' if idx in categories_per_feature else 'K-S'
    column = x_columns[idx]
    print(f'{column} -- {test_name} {distances[idx]:.3f} -- p-value {p_values[idx]:.3f}')

Number_of_Customers -- K-S 0.045 -- p-value 0.908
Menu_Price -- K-S 0.058 -- p-value 0.665
Marketing_Spend -- K-S 0.085 -- p-value 0.216
Cuisine_Type -- Chi2 0.743 -- p-value 0.863
Average_Customer_Spending -- K-S 0.068 -- p-value 0.465
Promotions -- K-S 0.015 -- p-value 1.000
Reviews -- K-S 0.107 -- p-value 0.061
