## Model Monitoring

### Install alibi_detect library

In [None]:
import numpy as np
# Show the NumPy version present in the runtime before alibi / alibi_detect are installed.
np.__version__

'1.26.4'

In [None]:
!pip install alibi alibi_detect



In [None]:
import alibi
from alibi_detect.cd import ChiSquareDrift, TabularDrift
from alibi_detect.saving import save_detector, load_detector

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [None]:
!mkdir datasets

In [None]:
# Download the airline dataset archive from Google Drive (by file id) to datasets/airline.zip.
!gdown 1PKSwiZuBcGzSFE3ruHbbTlGCpJ0ruP9d -O datasets/airline.zip

Downloading...
From: https://drive.google.com/uc?id=1PKSwiZuBcGzSFE3ruHbbTlGCpJ0ruP9d
To: /content/datasets/airline.zip
  0% 0.00/2.84M [00:00<?, ?B/s]100% 2.84M/2.84M [00:00<00:00, 149MB/s]


In [None]:
# List the download directory to confirm the archive is present.
!ls -al datasets/

total 2784
drwxr-xr-x 2 root root    4096 Dec 14 18:02 .
drwxr-xr-x 1 root root    4096 Dec 14 18:00 ..
-rw-r--r-- 1 root root 2841945 Dec  6 12:40 airline.zip


In [None]:
!unzip datasets/airline.zip -d datasets/

Archive:  datasets/airline.zip
  inflating: datasets/test.csv       
  inflating: datasets/train.csv      


In [None]:
# Load the extracted CSVs from the same relative datasets/ directory that the
# download and unzip cells write to, rather than the Colab-only absolute /content path.
train_df = pd.read_csv("datasets/train.csv")
test_df = pd.read_csv("datasets/test.csv")

In [None]:
# Summarize the test frame: column names, non-null counts and dtypes.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25976 entries, 0 to 25975
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Unnamed: 0                         25976 non-null  int64  
 1   id                                 25976 non-null  int64  
 2   Gender                             25976 non-null  object 
 3   Customer Type                      25976 non-null  object 
 4   Age                                25976 non-null  int64  
 5   Type of Travel                     25976 non-null  object 
 6   Class                              25976 non-null  object 
 7   Flight Distance                    25976 non-null  int64  
 8   Inflight wifi service              25976 non-null  int64  
 9   Departure/Arrival time convenient  25976 non-null  int64  
 10  Ease of Online booking             25976 non-null  int64  
 11  Gate location                      25976 non-null  int

In [None]:
# Keep only the rows that have a value in 'Arrival Delay in Minutes'.
# Reassigning (instead of inplace=True) leaves each name bound to the filtered frame.
train_df = train_df.dropna(subset=['Arrival Delay in Minutes'])
test_df = test_df.dropna(subset=['Arrival Delay in Minutes'])

In [None]:
# Columns used as model inputs. The order matters: the categorical-column
# indices defined further down refer to positions in this list.
x_features = [
    # passenger / trip description
    'Gender',
    'Customer Type',
    'Age',
    'Type of Travel',
    'Class',
    'Flight Distance',
    # service ratings
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Gate location',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Inflight service',
    'Cleanliness',
    # delays
    'Departure Delay in Minutes',
    'Arrival Delay in Minutes',
]

In [None]:
# Echo the feature list to check its contents and order.
x_features

['Gender',
 'Customer Type',
 'Age',
 'Type of Travel',
 'Class',
 'Flight Distance',
 'Inflight wifi service',
 'Departure/Arrival time convenient',
 'Ease of Online booking',
 'Gate location',
 'Food and drink',
 'Online boarding',
 'Seat comfort',
 'Inflight entertainment',
 'On-board service',
 'Leg room service',
 'Baggage handling',
 'Checkin service',
 'Inflight service',
 'Cleanliness',
 'Departure Delay in Minutes',
 'Arrival Delay in Minutes']

#### Specify the indices of the columns which are categorical features

In [None]:
# Positions within x_features of the columns treated as categorical.
# Left out: 2 (Age), 5 (Flight Distance), 20 (Departure Delay in Minutes)
# and 21 (Arrival Delay in Minutes).
cat_vars = [0, 1, 3, 4, *range(6, 20)]

In [None]:
# Training frame: input columns and the satisfaction label.
x, y = train_df[x_features], train_df['satisfaction']

# Test frame: the same input columns and label.
x_test, y_test = test_df[x_features], test_df['satisfaction']

In [None]:
# Display the training feature frame.
x

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
0,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,...,5,5,4,3,4,4,5,5,25,18.0
1,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,...,1,1,1,5,3,1,4,1,1,6.0
2,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,...,5,5,4,3,4,4,4,5,0,0.0
3,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,...,2,2,2,5,3,1,4,2,11,9.0
4,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,...,5,3,3,4,4,3,3,3,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,Female,disloyal Customer,23,Business travel,Eco,192,2,1,2,3,...,2,2,3,1,4,2,3,2,3,0.0
103900,Male,Loyal Customer,49,Business travel,Business,2347,4,4,4,4,...,5,5,5,5,5,5,5,4,0,0.0
103901,Male,disloyal Customer,30,Business travel,Business,1995,1,1,1,3,...,5,4,3,2,4,5,5,4,7,14.0
103902,Female,disloyal Customer,22,Business travel,Eco,1000,1,1,1,5,...,1,1,4,5,1,5,4,1,0,0.0


### Split the dataset into two sets

**Note**: In this example, the data is split to create train and production datasets. This is done only for the lab session. In the real world, the production data would come from the inference system.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the training rows as a stand-in for production data;
# random_state is fixed so the split is the same on every run.
# NOTE(review): the drift cells visible in this notebook use x and x_test,
# not X_train / X_prod — confirm which pair is intended.
X_train, X_prod, y_train, y_prod = train_test_split(
    x, y, train_size=0.9, random_state=23
)

In [None]:
# Map every categorical column index to None.
# Presumably None lets the detector infer each column's categories from the
# reference data — confirm against the alibi-detect TabularDrift docs.
categories_per_feature = dict.fromkeys(cat_vars)

In [None]:
# Echo the mapping to check which column indices are marked categorical.
categories_per_feature

{0: None,
 1: None,
 3: None,
 4: None,
 6: None,
 7: None,
 8: None,
 9: None,
 10: None,
 11: None,
 12: None,
 13: None,
 14: None,
 15: None,
 16: None,
 17: None,
 18: None,
 19: None}

### Measure the drift

In [None]:
# Build the drift detector with the full training feature frame as reference data.
# - p_val=.05 is the p-value threshold handed to the detector.
# - categories_per_feature marks which column positions are categorical.
# to_numpy() is used for the conversion so the reference data is produced the
# same way as the test data passed to cd.predict().
cd = TabularDrift(x.to_numpy(),
                  p_val=.05,
                  categories_per_feature=categories_per_feature)

In [None]:
# Directory the detector is written to; change it to save somewhere else.
filepath = 'airline_drift'
# NOTE(review): legacy=True presumably selects alibi-detect's older on-disk
# format — confirm against the save_detector docs.
save_detector(cd, filepath, legacy=True)

In [None]:
# Reload the detector from the directory it was just saved to.
cd = load_detector(filepath)



In [None]:
# Run the drift tests on the test-set features; the per-feature statistics
# are read from preds['data'] in the reporting cell.
preds = cd.predict(x_test.to_numpy())

### Printing the test results

- KS test for the numerical features
- chi-squared test for the categorical features

In [None]:
# Print one line per feature: the test used, its statistic and its p-value.
# The statistic / p-value arrays are looked up once instead of on every
# iteration, and membership is tested on the dict directly rather than on a
# freshly built list of its keys.
distances = preds['data']['distance']
p_values = preds['data']['p_val']

for f in range(cd.n_features):
    # Columns listed in categories_per_feature are reported as Chi2, all others as K-S.
    stat = 'Chi2' if f in categories_per_feature else 'K-S'
    fname = x_features[f]
    stat_val, p_val = distances[f], p_values[f]
    print(f'{fname} -- {stat} {stat_val:.3f} -- p-value {p_val:.3f}')

Gender -- Chi2 0.023 -- p-value 0.880
Customer Type -- Chi2 0.504 -- p-value 0.478
Age -- K-S 0.008 -- p-value 0.119
Type of Travel -- Chi2 1.977 -- p-value 0.160
Class -- Chi2 2.276 -- p-value 0.320
Flight Distance -- K-S 0.005 -- p-value 0.567
Inflight wifi service -- Chi2 4.753 -- p-value 0.447
Departure/Arrival time convenient -- Chi2 3.808 -- p-value 0.577
Ease of Online booking -- Chi2 10.290 -- p-value 0.067
Gate location -- Chi2 1.728 -- p-value 0.885
Food and drink -- Chi2 4.932 -- p-value 0.424
Online boarding -- Chi2 12.107 -- p-value 0.033
Seat comfort -- Chi2 3.401 -- p-value 0.638
Inflight entertainment -- Chi2 3.292 -- p-value 0.655
On-board service -- Chi2 4.221 -- p-value 0.518
Leg room service -- Chi2 6.408 -- p-value 0.269
Baggage handling -- Chi2 1.587 -- p-value 0.811
Checkin service -- Chi2 3.193 -- p-value 0.670
Inflight service -- Chi2 5.840 -- p-value 0.322
Cleanliness -- Chi2 3.728 -- p-value 0.589
Departure Delay in Minutes -- K-S 0.004 -- p-value 0.867
Arriv

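## Based on this result, we can conclude that there is no data drift between the train and test datasets

**Note**: `Online boarding` has a p-value of 0.033, which is below the nominal 0.05. However, `TabularDrift` applies a Bonferroni correction across features by default, so the effective per-feature threshold is 0.05 / 22 ≈ 0.0023, and this feature alone does not signal drift.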