In [1]:
import numpy as np
import os
import pandas as pd
import requests

from sklearn.model_selection import train_test_split
from tqdm import tqdm

from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

from lightautoml.report.monitoring_deco import MonitoringDeco

In [2]:
N_THREADS = 8 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
TEST_SIZE = 0.9 # Test size for metric check
TIMEOUT = 120 # Time in seconds for automl run
TARGET_NAME = 'TARGET' # Target column name

In [3]:
DATASET_PATH = '../data/sampled_app_train.csv'
data = pd.read_csv(DATASET_PATH)

In [4]:
train_data, test_data = train_test_split(data, 
                                         test_size=TEST_SIZE, 
                                         stratify=data[TARGET_NAME], 
                                         random_state=RANDOM_STATE)

Split test data on chunks:

In [5]:
N = 10
test_data['N_EPOCH'] = np.linspace(0.51, N+0.5, test_data.shape[0]).round().astype(int)

n_epochs = test_data['N_EPOCH'].drop_duplicates()
test_data_split = [test_data[test_data['N_EPOCH'] == i] for i in n_epochs]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [6]:
test_data['N_EPOCH'].value_counts()

7     901
6     901
4     901
3     901
10    901
2     901
9     901
8     901
5     900
1     892
Name: N_EPOCH, dtype: int64

# MonitoringDeco usage

In [7]:
task = Task('binary', )
automl = TabularAutoML(task = task, 
                       timeout = TIMEOUT,
                       general_params = {'nested_cv': False, 'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
                       reader_params = {'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                       tuning_params = {'max_tuning_iter': 20, 'max_tuning_time': 30},
                       lgb_params = {'default_params': {'num_threads': N_THREADS}})

In [8]:
MD = MonitoringDeco(output_path='./lama_monitoring_2',
                    report_file_name='lama_monitoring_report.html', \
                    null_threshold=0.03, feature_shift_threshold=0.05, \
                    model_shift_threshold=0.02)

automl_md = MD(automl)

In [9]:
roles = {'target': TARGET_NAME
        }
oof_pred = automl_md.fit_predict(train_data, roles=roles)

In [10]:
preds = automl_md._model.predict(test_data)

In [11]:
for i in tqdm(range(N)):
    automl_md.update(test_data_split[i])

100%|██████████| 10/10 [01:26<00:00,  8.62s/it]


In [12]:
automl_md.DSD.alerts

{'SK_ID_CURR': {'psi_values': array([0.03289571, 0.0536971 , 0.05835109, 0.0634574 , 0.03979216,
         0.03052028, 0.04625072, 0.05674593, 0.05961223, 0.03949001])},
 'NAME_CONTRACT_TYPE': {},
 'CODE_GENDER': {},
 'FLAG_OWN_CAR': {},
 'FLAG_OWN_REALTY': {},
 'CNT_CHILDREN': {},
 'AMT_INCOME_TOTAL': {'new_category': [{'n_epoch': 0,
    'value': 78750.0,
    'occurance': 0.002242},
   {'n_epoch': 0, 'value': 346500.0, 'occurance': 0.002242},
   {'n_epoch': 0, 'value': 261000.0, 'occurance': 0.002242},
   {'n_epoch': 0, 'value': 229500.0, 'occurance': 0.002242},
   {'n_epoch': 0, 'value': 328500.0, 'occurance': 0.001121},
   {'n_epoch': 0, 'value': 198000.0, 'occurance': 0.001121},
   {'n_epoch': 0, 'value': 29250.0, 'occurance': 0.001121},
   {'n_epoch': 0, 'value': 173250.0, 'occurance': 0.001121},
   {'n_epoch': 0, 'value': 89550.0, 'occurance': 0.001121},
   {'n_epoch': 0, 'value': 184500.0, 'occurance': 0.001121},
   {'n_epoch': 0, 'value': 2070000.0, 'occurance': 0.001121},
   {'