In [2]:
from sklearn import preprocessing
# Toy categorical feature: one column, three distinct product names.
data = [
    ['Bleach'],
    ['Cereal'],
    ['Toilet Roll'],
]

# Ordinal encoding maps each category to a single numeric code.
ordinal_enc = preprocessing.OrdinalEncoder()
ordinal_enc.fit(data)

In [3]:
# Encode the same three samples the encoder was fitted on and print the codes.
print(ordinal_enc.transform(data))

[[0.]
 [1.]
 [2.]]


In [4]:
# One-hot encoding of the same toy column; fitted on `data` so the encoder
# learns the categories it will later expand into indicator columns.
onehot_enc = preprocessing.OneHotEncoder()
onehot_enc.fit(data)

In [5]:
# Transform the samples and densify with .toarray() so the cell shows a plain array.
onehot_enc.transform(data).toarray()

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

### Engineering numerical features

In [6]:
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeClassifier

from sklearn import metrics 
from sklearn.datasets import load_wine 
from sklearn.pipeline import make_pipeline 

# Load the wine dataset directly as a (features, target) pair.
X, y = load_wine(return_X_y=True)


In [7]:
# Hold out 30% of the samples for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_train.shape

(124, 13)

In [8]:
# Baseline without feature scaling: a ridge classifier fitted on the raw features.
# The "sag" solver shuffles the data while fitting, so random_state is pinned to
# make the fit (and every metric derived from y_pred_no_scale) reproducible on a
# fresh kernel. NOTE: previously recorded outputs were produced without a seed
# and may differ slightly after re-running.
no_scale_clf = make_pipeline(
    RidgeClassifier(tol=1e-2, solver="sag", random_state=42)
)
no_scale_clf.fit(X_train, y_train)
y_pred_no_scale = no_scale_clf.predict(X_test)

In [9]:
# Same classifier, but with the features standardised first inside the pipeline
# (the scaler is fitted on the training split only, via the pipeline's fit).
# The "sag" solver shuffles the data while fitting, so random_state is pinned to
# make the fit (and every metric derived from y_pred_std_scale) reproducible on a
# fresh kernel. NOTE: previously recorded outputs were produced without a seed
# and may differ slightly after re-running.
std_scale_clf = make_pipeline(
    StandardScaler(),
    RidgeClassifier(tol=1e-2, solver="sag", random_state=42),
)
std_scale_clf.fit(X_train, y_train)
y_pred_std_scale = std_scale_clf.predict(X_test)

In [10]:
# Ground-truth labels of the held-out split, shown for comparison with the predictions.
y_test

array([0, 0, 2, 0, 1, 0, 1, 2, 1, 2, 0, 2, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 2, 2, 2, 1, 1, 1, 0, 0, 1, 2, 0, 0, 0, 2, 2, 1, 2, 0, 1, 1, 1,
       2, 0, 1, 1, 2, 0, 1, 0, 0, 2])

In [11]:
# Predictions of the unscaled pipeline on the held-out split.
y_pred_no_scale

array([0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 2, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 1])

In [12]:
# Unscaled model: overall accuracy as a percentage, then the per-class breakdown.
print(f"{metrics.accuracy_score(y_test, y_pred_no_scale):.2%}")
print(metrics.classification_report(y_test, y_pred_no_scale))

75.93%
              precision    recall  f1-score   support

           0       0.90      1.00      0.95        19
           1       0.66      1.00      0.79        21
           2       1.00      0.07      0.13        14

    accuracy                           0.76        54
   macro avg       0.85      0.69      0.63        54
weighted avg       0.83      0.76      0.68        54



In [13]:
# Standardised model: overall accuracy as a percentage, then the per-class breakdown.
print(f"{metrics.accuracy_score(y_test, y_pred_std_scale):.2%}")
print(metrics.classification_report(y_test, y_pred_std_scale))

98.15%
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        19
           1       1.00      0.95      0.98        21
           2       1.00      1.00      1.00        14

    accuracy                           0.98        54
   macro avg       0.98      0.98      0.98        54
weighted avg       0.98      0.98      0.98        54



## Designing your training system

### Detecting data drift

In [14]:
from sklearn.datasets import load_wine 
from sklearn.model_selection import train_test_split 
import alibi
from alibi_detect.cd import TabularDrift

ImportError: /home/shuaizhu/anaconda3/envs/ml_eng_py/lib/python3.10/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12

In [None]:
# Reload the wine data, this time keeping the dataset object so the feature
# names can be read from it as well.
wine_data = load_wine()
feature_name = wine_data.feature_names
X = wine_data.data
y = wine_data.target

# Split half/half into a reference set and a set to test for drift.
# NOTE: this rebinds X_test and y_test from the earlier 70/30 split.
X_ref, X_test, y_ref, y_test = train_test_split(
    X,
    y,
    test_size=0.50,
    random_state=42,
)
X_ref.shape

(89, 13)

In [None]:
# Build the drift detector on the reference half, then score the other half.
cd = TabularDrift(X_ref, p_val=0.05)
preds = cd.predict(X_test)

# is_drift is used as a list index, so 0 selects 'No' and 1 selects 'Yes'.
labels = ['No', 'Yes']
print(f"Drift: {labels[preds['data']['is_drift']]}")

Drift: No




In [None]:
# Inspect the full detector result (drift flag, distances, p-values, threshold, metadata).
preds

{'data': {'is_drift': 0,
  'distance': array([0.13483146, 0.12359551, 0.11235955, 0.13483146, 0.10112359,
         0.12359551, 0.14606741, 0.13483146, 0.15730338, 0.13483146,
         0.11235955, 0.14606741, 0.11235955], dtype=float32),
  'p_val': array([0.3674914 , 0.47484735, 0.59548855, 0.3674914 , 0.7211672 ,
         0.47484735, 0.2769307 , 0.3674914 , 0.20358618, 0.3674914 ,
         0.59548855, 0.2769307 , 0.59548855], dtype=float32),
  'threshold': 0.0038461538461538464},
 'meta': {'name': 'TabularDrift',
  'online': False,
  'data_type': None,
  'version': '0.12.0',
  'detector_type': 'drift'}}

In [None]:
# Simulate a calibration error by inflating every feature value by 10%,
# then re-run the already-fitted detector on the perturbed data.
X_test_cal_error = X_test * 1.1
preds = cd.predict(X_test_cal_error)

# is_drift is used as a list index, so 0 selects 'No' and 1 selects 'Yes'.
labels = ['No', 'Yes']
print(f"Drift: {labels[preds['data']['is_drift']]}")

Drift: Yes


### Detecting concept drift

In [3]:
from alibi_detect.cd import MMDDriftOnline 
# Settings for the online MMD detector.
# (ert presumably stands for "expected run time" — confirm against the alibi-detect docs.)
ert = 50
window_size = 10

# NOTE: this rebinds `cd`, which earlier held the TabularDrift detector, and
# relies on X_ref from the reference/test split cell.
cd = MMDDriftOnline(
    X_ref,
    ert,
    window_size,
    backend="pytorch",
    n_bootstraps=2500,
)

ModuleNotFoundError: No module named 'alibi_detect'

In [2]:
import torch
# Check whether PyTorch reports CUDA as available in this environment.
torch.cuda.is_available()

True