In [1]:
import numpy as np
import os
import pandas as pd
import requests

from sklearn.model_selection import train_test_split
from tqdm import tqdm

from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

from lightautoml.report.monitoring_deco import MonitoringDeco

In [2]:
N_THREADS = 8 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
TEST_SIZE = 0.9 # Test size for metric check
TIMEOUT = 120 # Time in seconds for automl run
TARGET_NAME = 'TARGET' # Target column name

In [3]:
DATASET_PATH = '../data/sampled_app_train.csv'
data = pd.read_csv(DATASET_PATH)

In [6]:
train_data, test_data = train_test_split(data, 
                                         test_size=TEST_SIZE, 
                                         stratify=data[TARGET_NAME], 
                                         random_state=RANDOM_STATE)

Prepare simulation data for monitoring. Split test_data on subsamples to feed model one by one.

In [7]:
N = 10
test_data['N_EPOCH'] = np.linspace(0.51, N+0.5, test_data.shape[0]).round().astype(int)

n_epochs = test_data['N_EPOCH'].drop_duplicates()
test_data_split = [test_data[test_data['N_EPOCH'] == i] for i in n_epochs]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [8]:
test_data['N_EPOCH'].value_counts()

7     901
6     901
4     901
3     901
10    901
2     901
9     901
8     901
5     900
1     892
Name: N_EPOCH, dtype: int64

# MonitoringDeco usage

Define AutoML model.

In [9]:
task = Task('binary', )
automl = TabularAutoML(task = task, 
                       timeout = TIMEOUT,
                       general_params = {'nested_cv': False, 'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
                       reader_params = {'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                       tuning_params = {'max_tuning_iter': 20, 'max_tuning_time': 30},
                       lgb_params = {'default_params': {'num_threads': N_THREADS}})

Define MonitoringDeco wrap class.

In [10]:
MD = MonitoringDeco(output_path='./lama_monitoring_2',
                    report_file_name='lama_monitoring_report.html', \
                    null_threshold=0.01, feature_shift_threshold=0.05, \
                    model_shift_threshold=0.02)

automl_md = MD(automl)

Drop some columns for sanity reasons.

In [11]:
roles = {'target': TARGET_NAME,
         'drop': ['SK_ID_CURR',
                  'DAYS_EMPLOYED',
                  'AMT_INCOME_TOTAL',
                  'BASEMENTAREA_AVG',
                  'NONLIVINGAPARTMENTS_MEDI',
                 'BASEMENTAREA_AVG',
                 'ELEVATORS_AVG',
                 'ENTRANCES_AVG',
                 'LIVINGAPARTMENTS_AVG',
                 'LIVINGAREA_AVG',
                 'NONLIVINGAPARTMENTS_AVG',
                 'APARTMENTS_MODE',
                 'BASEMENTAREA_MODE',
                 'YEARS_BEGINEXPLUATATION_MODE',
                 'ELEVATORS_MODE',
                 'ENTRANCES_MODE',
                 'LANDAREA_MODE',
                 'LIVINGAPARTMENTS_MODE',
                 'NONLIVINGAPARTMENTS_MODE',
                 'APARTMENTS_MEDI',
                 'BASEMENTAREA_MEDI',
                 'ELEVATORS_MEDI',
                 'ENTRANCES_MEDI',
                 'LIVINGAPARTMENTS_MEDI',
                 'NONLIVINGAPARTMENTS_MEDI']
        }

Train AutoML model with MonitoringDeco wrapper.

In [12]:
oof_pred = automl_md.fit_predict(train_data, roles=roles)

You can easily access trained model without touching monitoring facilities:

In [13]:
preds = automl_md._model.predict(test_data)

Feed your monitoring instance with new data and see your lama_monitoring_report.html file with alerts report.

In [14]:
for i in tqdm(range(N)):
    automl_md.update(test_data_split[i])

100%|██████████| 10/10 [02:08<00:00, 12.88s/it]


Save information about alerts to json format if you need.

In [15]:
automl_md.save_data_json("info.json")