In [1]:
import numpy as np
import pandas as pd

In [2]:
from lightautoml_gpu.reader.gpu.cudf_reader import CudfReader
from lightautoml_gpu.reader.base import PandasToPandasReader

from lightautoml_gpu.transformers.base import SequentialTransformer

from lightautoml_gpu.pipelines.utils import get_columns_by_role

from lightautoml_gpu.transformers.gpu import numeric_gpu, categorical_gpu, datetime_gpu
from lightautoml_gpu.transformers import numeric, categorical, datetime

from lightautoml_gpu.tasks import Task
from lightautoml_gpu.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML

import pandas as pd
import time
from sklearn.model_selection import train_test_split

from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import cudf

from lightautoml_gpu.dataset.roles import TargetRole

import os
import joblib

In [3]:
# Benchmark configuration. `args_fold` is the fold id that later cells hold out
# for prediction (`folds == args_fold`) and exclude from fitting.
# `adv_roles` is not read by any cell visible in this notebook.
key = 'airlines'
adv_roles = True
args_fold = 2

# Single place to re-point the notebook at another copy of the benchmark data.
presets_dir = "../../data/old_presets"

# NOTE(review): joblib.load unpickles these files, and unpickling can execute
# arbitrary code -- only load benchmark artefacts that come from a trusted source.
data_info = joblib.load(os.path.join(presets_dir, 'data_info.pkl'))[key]
folds = joblib.load(os.path.join(presets_dir, 'folds', '{0}.pkl'.format(key)))

# Optional per-dataset keyword arguments for pd.read_csv; copied so that the
# loaded metadata object is never aliased by this cell.
read_csv_params = {}
if 'read_csv_params' in data_info:
    read_csv_params = dict(data_info['read_csv_params'])

data = pd.read_csv(os.path.join(presets_dir, 'data', data_info['path']), **read_csv_params)

if 'drop' in data_info:
    # Re-bind instead of mutating in place, so the frame returned by read_csv
    # is never modified behind the reader's back.
    data = data.drop(data_info['drop'], axis=1)

if 'class_map' in data_info:
    target_col = data_info['target']
    # Map raw class labels to their encoded values; any label missing from the
    # map becomes NaN and is caught by the assertion below.
    data[target_col] = data[target_col].map(data_info['class_map']).values
    assert data[target_col].notnull().all(), 'Class mapping is set unproperly'

print(data.head())
print("task type:", data_info['task_type'])

# Role mapping handed to fit_predict: the target column is the only explicit role.
roles = {TargetRole(): data_info['target']}

  Airline  Flight AirportFrom AirportTo  DayOfWeek  Time  Length  Delay
0      CO     269         SFO       IAH          3    15     205      1
1      US    1558         PHX       CLT          3    15     222      1
2      AA    2400         LAX       DFW          3    20     165      1
3      AA    2466         SFO       DFW          3    20     195      1
4      AS     108         ANC       SEA          3    30     202      0
task type: binary


## Imports (for potential use)

In [4]:
# Imports from our package
from lightautoml_gpu.automl.base import AutoML

from lightautoml_gpu.automl.presets.gpu.tabular_gpu_presets import TabularAutoMLGPU, TabularUtilizedAutoMLGPU
from lightautoml_gpu.tasks import Task

from lightautoml_gpu.pipelines.features.gpu.lgb_pipeline_gpu import LGBSimpleFeaturesGPU, LGBAdvancedPipelineGPU
from lightautoml_gpu.pipelines.features.gpu.linear_pipeline_gpu import LinearFeaturesGPU

from lightautoml_gpu.pipelines.features.lgb_pipeline import LGBSimpleFeatures, LGBAdvancedPipeline
from lightautoml_gpu.pipelines.features.linear_pipeline import LinearFeatures


from lightautoml_gpu.ml_algo.gpu.boost_cb_gpu import BoostCBGPU
from lightautoml_gpu.ml_algo.gpu.boost_xgb_gpu import BoostXGB
from lightautoml_gpu.ml_algo.gpu.linear_gpu import LinearLBFGSGPU

from lightautoml_gpu.ml_algo.boost_cb import BoostCB
from lightautoml_gpu.ml_algo.linear_sklearn import LinearLBFGS


from lightautoml_gpu.pipelines.ml.base import MLPipeline
from lightautoml_gpu.pipelines.selection.importance_based import ModelBasedImportanceEstimator, ImportanceCutoffSelector

## TabularAutoML

In [5]:
# Task for the CPU preset; the task type comes from the dataset metadata.
# `task` is re-bound for the GPU and multi-GPU runs in later cells, so re-run
# this cell before re-creating the CPU preset.
task_type = data_info['task_type']
task = Task(task_type)

In [6]:
# CPU preset: 4 cores / 4 reader jobs, 3-fold CV with a fixed seed, and the
# linear, CatBoost and LightGBM model families.
cpu_reader_params = {'n_jobs': 4, 'cv': 3, 'random_state': 42}
cpu_general_params = {'use_algos': ['linear_l2', 'cb', 'lgbm']}

automl = TabularAutoML(
    task=task,
    timeout=3600,
    cpu_limit=4,
    reader_params=cpu_reader_params,
    general_params=cpu_general_params,
)

In [7]:
# CPU baseline fit is currently disabled. The fit-prediction comparison cell
# near the end of the notebook refers to `cpu_fit_pred`; uncomment the line
# below to define it.
#cpu_fit_pred = automl.fit_predict(data[folds!=args_fold].reset_index().drop(['index'],axis=1), roles = roles, verbose = 2)

In [8]:
# CPU baseline hold-out prediction is currently disabled. The prediction
# comparison cell near the end of the notebook refers to `cpu_pred`; uncomment
# the line below to define it (presumably requires the CPU fit to be enabled too).
#cpu_pred = automl.predict(data[folds==args_fold].reset_index().drop(['index'],axis=1))

In [9]:
# Task for the single-GPU preset. This re-binds `task`, replacing the object
# created for the CPU preset earlier in the notebook.
task_type = data_info['task_type']
task = Task(task_type, device='gpu')

In [10]:
# Single-GPU preset: one CPU core / one reader job, 3-fold CV with a fixed
# seed, XGBoost as the only requested algorithm.
gpu_reader_params = {'n_jobs': 1, 'cv': 3, 'random_state': 42}
gpu_general_params = {'use_algos': ['xgb']}

automl_gpu = TabularAutoMLGPU(
    task=task,
    timeout=3600,
    cpu_limit=1,
    reader_params=gpu_reader_params,
    general_params=gpu_general_params,
)

In [11]:
# Fit on every fold except the hold-out one. reset_index(drop=True) renumbers
# the rows directly instead of first materialising the old index as an
# 'index' column and then dropping it again.
train_data = data[folds != args_fold].reset_index(drop=True)
gpu_fit_pred = automl_gpu.fit_predict(train_data, roles=roles, verbose=2)

[13:45:06] Stdout logging level is INFO2.
[13:45:06] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[13:45:06] Task: binary

[13:45:06] Start automl preset with listed constraints:
[13:45:06] - time: 3600.00 seconds
[13:45:06] - CPU: 1 cores
[13:45:06] - memory: 16 GB

[13:45:06] Train data shape: (431506, 8)
[13:45:07] Feats was rejected during automatic roles guess: []
[13:45:07] Layer [1m1[0m train process start. Time left 3599.40 secs


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
Default metric period is 5 because AUC is/are not implemented for GPU


[13:45:28] [1mSelector_CatBoostGPU[0m fitting and predicting completed
[13:45:30] Start fitting [1mLvl_0_Pipe_0_Mod_0_XGB[0m ...
[13:45:30] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_XGB[0m (orig) =====
[13:45:43] ===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_0_Mod_0_XGB[0m (orig) =====
[13:45:52] ===== Start working with [1mfold 2[0m for [1mLvl_0_Pipe_0_Mod_0_XGB[0m (orig) =====
[13:46:02] Fitting [1mLvl_0_Pipe_0_Mod_0_XGB[0m finished. score = [1m0.7095164060592651[0m
[13:46:02] [1mLvl_0_Pipe_0_Mod_0_XGB[0m fitting and predicting completed
[13:46:02] Time left 3544.12 secs

[13:46:02] [1mLayer 1 training completed.[0m

[13:46:02] [1mAutoml preset training completed in 55.88 seconds[0m

[13:46:02] Model description:
Final prediction for new objects (level 0) = 
	 1.00000 * (3 averaged models Lvl_0_Pipe_0_Mod_0_XGB) 



In [12]:
# Predict on the hold-out fold with the fitted GPU preset. reset_index(drop=True)
# renumbers the rows without a temporary 'index' column.
holdout_data = data[folds == args_fold].reset_index(drop=True)
gpu_inf = automl_gpu.predict(holdout_data)

In [13]:
# Switch the fitted GPU preset to CPU before the next cell predicts with it
# again (that result is stored as `cpu_inf`).
# NOTE(review): presumably converts the fitted models in place -- confirm
# against the `to_cpu` implementation.
automl_gpu.to_cpu()

In [14]:
# Predict on the same hold-out fold again, after `automl_gpu.to_cpu()` was
# called in the previous cell. reset_index(drop=True) renumbers the rows
# without a temporary 'index' column.
holdout_data = data[folds == args_fold].reset_index(drop=True)
cpu_inf = automl_gpu.predict(holdout_data)

In [15]:
# Local Dask-CUDA cluster restricted to GPU 0, using UCX with NVLink enabled,
# RMM managed memory and an 8GB memory limit.
cluster = LocalCUDACluster(
    CUDA_VISIBLE_DEVICES="0",
    protocol="ucx",
    enable_nvlink=True,
    rmm_managed_memory=True,
    memory_limit="8GB",
)
print("dashboard:", cluster.dashboard_link)

# Client used by the multi-GPU preset below.
client = Client(cluster)

# Runs cudf.set_allocator("managed") on the workers; the per-worker results
# are the cell's displayed value, so this stays the last expression.
client.run(cudf.set_allocator, "managed")

2022-12-10 13:46:08,210 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-12-10 13:46:08,210 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize


dashboard: http://127.0.0.1:8787/status


{'ucx://127.0.0.1:58973': None}

In [16]:
# Multi-GPU run. This re-binds `task`, replacing the single-GPU task object.
task = Task(data_info['task_type'], device='mgpu')

# Same settings as the single-GPU preset, plus the Dask client and a reader
# that splits the data into 2 partitions.
automl_mgpu = TabularAutoMLGPU(
    task=task,
    timeout=3600,
    cpu_limit=1,
    reader_params={'n_jobs': 1, 'cv': 3, 'random_state': 42, 'npartitions': 2},
    general_params={'use_algos': ['xgb']},
    client=client,
)

# Fit on every fold except the hold-out one. reset_index(drop=True) renumbers
# the rows directly instead of creating and then dropping an 'index' column.
# NOTE(review): in the saved output of this cell the Dask workers fail with
# "Could not deserialize task ('_convert_datetime-...')" and the run ends with
# "Pipeline finished with 0 models" -- TODO investigate before relying on the
# multi-GPU results below.
train_data = data[folds != args_fold].reset_index(drop=True)
mgpu_fit_pred = automl_mgpu.fit_predict(train_data, roles=roles, verbose=2)

[13:46:08] Stdout logging level is INFO2.
[13:46:08] Task: binary

[13:46:08] Start automl preset with listed constraints:
[13:46:08] - time: 3600.00 seconds
[13:46:08] - CPU: 1 cores
[13:46:08] - memory: 16 GB

[13:46:08] Train data shape: (431506, 8)
[13:46:09] Feats was rejected during automatic roles guess: []
[13:46:09] Layer [1m1[0m train process start. Time left 3599.32 secs


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
Default metric period is 5 because AUC is/are not implemented for GPU


[13:46:30] [1mSelector_CatBoostGPU[0m fitting and predicting completed
[13:46:32] Start fitting [1mLvl_0_Pipe_0_Mod_0_XGB[0m ...
[13:46:32] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_XGB[0m (orig) =====


2022-12-10 13:46:40,740 - distributed.worker - ERROR - Could not deserialize task ('_convert_datetime-c03464c8957edbdad7435988d78f4cd6', 0)
Traceback (most recent call last):
  File "/home/rishat/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/distributed/worker.py", line 2200, in execute
    function, args, kwargs = await self._maybe_deserialize_task(ts)
  File "/home/rishat/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/distributed/worker.py", line 2173, in _maybe_deserialize_task
    function, args, kwargs = _deserialize(*ts.run_spec)
  File "/home/rishat/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/distributed/worker.py", line 2844, in _deserialize
    function = loads_function(function)
  File "/home/rishat/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/distributed/worker.py", line 2838, in loads_function
    return pickle.loads(bytes_object)
  File "/home/rishat/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/distributed/protocol/

[13:46:41] Model Lvl_0_Pipe_0_Mod_0_XGB failed during ml_algo.fit_predict call.

error


2022-12-10 13:46:41,097 - distributed.worker - ERROR - Could not deserialize task ('_convert_datetime-0ecfa0f6482f4b864b5aef5ea7ea34a0', 0)
Traceback (most recent call last):
  File "/home/rishat/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/distributed/worker.py", line 2200, in execute
    function, args, kwargs = await self._maybe_deserialize_task(ts)
  File "/home/rishat/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/distributed/worker.py", line 2173, in _maybe_deserialize_task
    function, args, kwargs = _deserialize(*ts.run_spec)
  File "/home/rishat/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/distributed/worker.py", line 2844, in _deserialize
    function = loads_function(function)
  File "/home/rishat/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/distributed/worker.py", line 2838, in loads_function
    return pickle.loads(bytes_object)
  File "/home/rishat/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/distributed/protocol/

AssertionError: Pipeline finished with 0 models for some reason.
Probably one or more models failed

In [None]:
# Predict on the hold-out fold with the multi-GPU preset. reset_index(drop=True)
# renumbers the rows without a temporary 'index' column.
holdout_data = data[folds == args_fold].reset_index(drop=True)
mgpu_pred = automl_mgpu.predict(holdout_data)

In [None]:
# Side-by-side dump of the hold-out predictions, in the original order.
# Some of these results come from cells that are commented out (`cpu_pred`) or
# that may have failed, so a missing name is reported instead of raising
# NameError and aborting the whole comparison.
comparison_names = ('cpu_inf', 'gpu_inf', 'cpu_pred', 'mgpu_pred')
missing_predictions = [name for name in comparison_names if name not in globals()]

for position, name in enumerate(comparison_names):
    if position:
        # Blank separator line between consecutive results.
        print()
    if name in missing_predictions:
        print('{0}: not computed in this session'.format(name))
    else:
        print(globals()[name].data.T)

In [None]:
# Side-by-side dump of the out-of-fold predictions from the fit step, in the
# original order. `cpu_fit_pred` is only assigned in a commented-out cell and
# the other runs may have failed, so a missing name is reported instead of
# raising NameError and aborting the whole comparison.
fit_pred_names = ('cpu_fit_pred', 'gpu_fit_pred', 'mgpu_fit_pred')
missing_fit_preds = [name for name in fit_pred_names if name not in globals()]

for position, name in enumerate(fit_pred_names):
    if position:
        # Blank separator line between consecutive results.
        print()
    if name in missing_fit_preds:
        print('{0}: not computed in this session'.format(name))
    else:
        print(globals()[name].data.T)

In [None]:
# Switch the multi-GPU preset to CPU before the next cell predicts with it
# again (that result is stored as `mgpu_inf`).
# NOTE(review): presumably converts the fitted models in place -- confirm
# against the `to_cpu` implementation.
automl_mgpu.to_cpu()

In [None]:
# Predict on the hold-out fold again, after `automl_mgpu.to_cpu()` was called
# in the previous cell. reset_index(drop=True) renumbers the rows without a
# temporary 'index' column.
holdout_data = data[folds == args_fold].reset_index(drop=True)
mgpu_inf = automl_mgpu.predict(holdout_data)

In [None]:
# Show the predictions made after the multi-GPU preset was moved to CPU,
# transposed for a more compact printout.
mgpu_inf_values = mgpu_inf.data
print(mgpu_inf_values.T)

In [None]:
# Feature scores of the first algorithm in the first pipeline of level 0.
# The bare final expression is kept so the notebook displays the result.
first_pipeline = automl_mgpu.levels[0][0]
first_algo = first_pipeline.ml_algos[0]
first_algo.get_features_score()