# Learning regression datasets and training model

### Imports

In [128]:
import os
from pathlib import Path
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [129]:
while "notebooks" in os.getcwd():
    os.chdir("../")

In [130]:
do_training = True

### California Housing Prices Datset

In [70]:
DATA_DIR = Path('data/housing_data')
file_name = 'housing.csv'
scaled_file_name = 'housing_scaled.csv'
train_file_name = 'train_housing_scaled.csv'
test_file_name = 'test_housing_scaled.csv'
scaler_params_file = 'housing_scaling_params.csv'

In [71]:
MODEL_PATH = Path('models/')
housing_model_name = 'housing_scaled'

In [72]:
df = pd.read_csv(DATA_DIR / file_name)

In [73]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [74]:
df.ocean_proximity.unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [75]:
df = df.drop(columns=['ocean_proximity'])

In [76]:
df.loc[:, df.columns == 'median_house_value']

Unnamed: 0,median_house_value
0,452600.0
1,358500.0
2,352100.0
3,341300.0
4,342200.0
...,...
20635,78100.0
20636,77100.0
20637,92300.0
20638,84700.0


In [77]:
def scale_split_df(_df: pd.DataFrame, prediction_column: str, test_size: float, data_dir: Path,
                   scaled_df_name: str, train_df_name: str, test_df_name: str, scaler_params_name: str):

    scaler = StandardScaler()
    features_df = df.loc[:, df.columns != prediction_column]
    prediction_df = df.loc[:, df.columns == prediction_column]
    scaled_features = pd.DataFrame(scaler.fit_transform(features_df))
    scaled_features.columns = features_df.columns.str.replace(' ', '_')
    scaled_df = pd.concat([scaled_features, prediction_df], axis=1)
    train_df, test_df = train_test_split(scaled_df, test_size=test_size)

    scaled_df.to_csv(data_dir/scaled_df_name, index=False)
    train_df.to_csv(data_dir/train_df_name, index=False)
    test_df.to_csv(data_dir/test_df_name, index=False)

    normalization_params = {
        "mean": scaler.mean_,
        "variance": scaler.var_,
    }

    normalization_params_df = pd.DataFrame.from_dict(
        normalization_params, orient="index")
    normalization_params_df.columns = features_df.columns
    normalization_params_df.to_csv(DATA_DIR/scaler_params_name)

    return scaled_df, train_df, test_df

In [78]:
df = df.dropna()

In [79]:
scaled_df, train_df, test_df = scale_split_df(
    df, 'median_house_value', 0.2, DATA_DIR, scaled_file_name, train_file_name, test_file_name, scaler_params_file)
scaled_df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,-8.457103e-16,1.535632e-15,4.4511070000000006e-17,-3.6512990000000005e-17,-7.372146e-17,-8.345826000000001e-17,-6.398466e-17,3.060136e-16,206864.413155
std,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,115435.667099
min,-2.385447,-1.447937,-2.194584,-1.205604,-1.274093,-1.254828,-1.30381,-1.775054,14999.0
25%,-1.112692,-0.797278,-0.8444662,-0.5429688,-0.5740034,-0.5629702,-0.5739976,-0.6884113,119500.0
50%,0.5393926,-0.642805,0.02913923,-0.2331597,-0.2441308,-0.2285134,-0.2365573,-0.1762077,179700.0
75%,0.7789699,0.9768213,0.6644886,0.2317829,0.2589843,0.2621409,0.2735268,0.4595713,264700.0
max,2.625711,2.956885,1.855769,16.78713,14.01871,30.23088,14.60297,5.859665,500001.0


In [80]:
train_df = pd.read_csv("data/housing_data/train_housing_scaled.csv")
test_df = pd.read_csv("data/housing_data/test_housing_scaled.csv")

In [81]:
X_train = train_df.loc[:, df.columns != 'median_house_value']
y_train = train_df.loc[:, df.columns == 'median_house_value']#/10000
X_test = test_df.loc[:, df.columns != 'median_house_value']
y_test = test_df.loc[:, df.columns == 'median_house_value']#/10000

#### Housing model training

In [82]:
def train_and_save_model(param: dict, steps: int, dtrain: xgb.DMatrix, dtest: xgb.DMatrix, model_path: Path, model_name: str):
    if do_training:
        gbdt_model = xgb.train(param, dtrain,
                               evals=[(dtest, 'test'), (dtrain, 'train')],
                               verbose_eval=50, early_stopping_rounds=1, num_boost_round=steps
                               )
    if do_training:
        gbdt_model.dump_model(
            model_path / f"{model_name}_dumped.txt", with_stats=True)
        gbdt_model.save_model(model_path / f"{model_name}_saved.json")
    else:
        gbdt_model = xgb.Booster()
        gbdt_model.load_model(model_path / f"{model_name}_saved.json")
    return gbdt_model

#### Performing grid_search

In [83]:
def grid_search(space: dict, X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame,
                num_boost_round = 300):
    clf = xgb.XGBRegressor(
        objective='reg:squarederror',
        early_stopping_rounds=10,
        #n_boost_rounds=30
        n_estimators = num_boost_round
) 
    grid_search = GridSearchCV(
        clf, param_grid=space, scoring='neg_root_mean_squared_error', n_jobs=4, cv=5, verbose = 0)
    grid_search.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose = 0)
    return grid_search

In [84]:
def search_hyperparameters_and_save_model(search_space: dict, 
                                          X_train: pd.DataFrame, 
                                          y_train: pd.DataFrame,
                                          X_test: pd.DataFrame, 
                                          y_test: pd.DataFrame,
                                          model_name: str,
                                          MODEL_PATH: Path,
                                          num_boost_round = 300):
    best = grid_search(search_space, X_train, y_train, X_test, y_test, num_boost_round=num_boost_round)

    best_params = best.best_params_
    dump = best.best_estimator_.get_booster().get_dump()
    leaves = sum([i.count('leaf') for i in dump])
    iterations = len(dump)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    name =  "different_sizes/" + model_name +  f"_leaves_{leaves}"
    train_and_save_model(best_params, iterations, dtrain, dtest, MODEL_PATH, name)

#### Training models with various sizes - depth is set to be fixed to control size of a model


In [85]:
grid_params = {
    'max_depth': [1, 2,],# 3, 4],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [86]:
for d in range(1, 10):
    grid_params['max_depth'] = [d] 
    search_hyperparameters_and_save_model(grid_params,
                                      X_train,
                                      y_train, 
                                      X_test, 
                                      y_test,
                                      housing_model_name,
                                      MODEL_PATH);

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_cate

ValueError: 
All the 500 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
42 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 1051, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 534, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 954, in _create_dmatrix
    return QuantileDMatrix(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1528, in __init__
    self._init(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1587, in _init
    it.reraise()
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 575, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 556, in _handle_exception
    return fn()
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 640, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1260, in next
    input_data(**self.kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 632, in input_data
    self.proxy.set_info(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 931, in set_info
    self.set_label(label)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1069, in set_label
    dispatch_meta_backend(self, label, "label", "float")
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1202, in dispatch_meta_backend
    _meta_from_numpy(data, name, dtype, handle)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1139, in _meta_from_numpy
    _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface_str))
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 281, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [09:16:25] /workspace/src/data/data.cc:507: Check failed: valid: Label contains NaN, infinity or a value too large.
Stack trace:
  [bt] (0) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x3581ea) [0x7490829581ea]
  [bt] (1) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x389b7d) [0x749082989b7d]
  [bt] (2) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x38a4b1) [0x74908298a4b1]
  [bt] (3) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(XGDMatrixSetInfoFromInterface+0xb0) [0x74908275e210]
  [bt] (4) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x7490b71fae2e]
  [bt] (5) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x7490b71f7493]
  [bt] (6) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0xa3e9) [0x7490b5d583e9]
  [bt] (7) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x9a00) [0x7490b5d57a00]
  [bt] (8) /home/mateusz/Documents/tmp/prediction-gap/venv/bin/python(_PyObject_MakeTpCall+0x25b) [0x5df0ad455a7b]



--------------------------------------------------------------------------------
38 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 1051, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 534, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 954, in _create_dmatrix
    return QuantileDMatrix(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1528, in __init__
    self._init(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1587, in _init
    it.reraise()
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 575, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 556, in _handle_exception
    return fn()
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 640, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1260, in next
    input_data(**self.kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 632, in input_data
    self.proxy.set_info(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 931, in set_info
    self.set_label(label)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1069, in set_label
    dispatch_meta_backend(self, label, "label", "float")
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1202, in dispatch_meta_backend
    _meta_from_numpy(data, name, dtype, handle)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1139, in _meta_from_numpy
    _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface_str))
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 281, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [09:16:25] /workspace/src/data/data.cc:507: Check failed: valid: Label contains NaN, infinity or a value too large.
Stack trace:
  [bt] (0) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x3581ea) [0x771448d581ea]
  [bt] (1) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x389b7d) [0x771448d89b7d]
  [bt] (2) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x38a4b1) [0x771448d8a4b1]
  [bt] (3) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(XGDMatrixSetInfoFromInterface+0xb0) [0x771448b5e210]
  [bt] (4) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x77147b8fae2e]
  [bt] (5) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x77147b8f7493]
  [bt] (6) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0xa3e9) [0x77147c0d33e9]
  [bt] (7) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x9a00) [0x77147c0d2a00]
  [bt] (8) /home/mateusz/Documents/tmp/prediction-gap/venv/bin/python(_PyObject_MakeTpCall+0x25b) [0x5fdafbe0ca7b]



--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 1051, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 534, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 954, in _create_dmatrix
    return QuantileDMatrix(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1528, in __init__
    self._init(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1587, in _init
    it.reraise()
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 575, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 556, in _handle_exception
    return fn()
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 640, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1260, in next
    input_data(**self.kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 632, in input_data
    self.proxy.set_info(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 931, in set_info
    self.set_label(label)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1069, in set_label
    dispatch_meta_backend(self, label, "label", "float")
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1202, in dispatch_meta_backend
    _meta_from_numpy(data, name, dtype, handle)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1139, in _meta_from_numpy
    _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface_str))
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 281, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [09:16:25] /workspace/src/data/data.cc:507: Check failed: valid: Label contains NaN, infinity or a value too large.
Stack trace:
  [bt] (0) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x3581ea) [0x74de2b9581ea]
  [bt] (1) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x389b7d) [0x74de2b989b7d]
  [bt] (2) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x38a4b1) [0x74de2b98a4b1]
  [bt] (3) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(XGDMatrixSetInfoFromInterface+0xb0) [0x74de2b75e210]
  [bt] (4) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x74de5f5f5e2e]
  [bt] (5) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x74de5f5f2493]
  [bt] (6) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0xa3e9) [0x74de5f0723e9]
  [bt] (7) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x9a00) [0x74de5f071a00]
  [bt] (8) /home/mateusz/Documents/tmp/prediction-gap/venv/bin/python(_PyObject_MakeTpCall+0x25b) [0x58d264849a7b]



--------------------------------------------------------------------------------
37 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 1051, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 534, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 954, in _create_dmatrix
    return QuantileDMatrix(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1528, in __init__
    self._init(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1587, in _init
    it.reraise()
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 575, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 556, in _handle_exception
    return fn()
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 640, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1260, in next
    input_data(**self.kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 632, in input_data
    self.proxy.set_info(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 931, in set_info
    self.set_label(label)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1069, in set_label
    dispatch_meta_backend(self, label, "label", "float")
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1202, in dispatch_meta_backend
    _meta_from_numpy(data, name, dtype, handle)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1139, in _meta_from_numpy
    _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface_str))
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 281, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [09:16:25] /workspace/src/data/data.cc:507: Check failed: valid: Label contains NaN, infinity or a value too large.
Stack trace:
  [bt] (0) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x3581ea) [0x78eb3f9581ea]
  [bt] (1) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x389b7d) [0x78eb3f989b7d]
  [bt] (2) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x38a4b1) [0x78eb3f98a4b1]
  [bt] (3) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(XGDMatrixSetInfoFromInterface+0xb0) [0x78eb3f75e210]
  [bt] (4) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x78eb72cf8e2e]
  [bt] (5) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x78eb72cf5493]
  [bt] (6) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0xa3e9) [0x78eb72d083e9]
  [bt] (7) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x9a00) [0x78eb72d07a00]
  [bt] (8) /home/mateusz/Documents/tmp/prediction-gap/venv/bin/python(_PyObject_MakeTpCall+0x25b) [0x6137d74b6a7b]



--------------------------------------------------------------------------------
101 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 1051, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 534, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 954, in _create_dmatrix
    return QuantileDMatrix(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1528, in __init__
    self._init(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1587, in _init
    it.reraise()
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 575, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 556, in _handle_exception
    return fn()
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 640, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1260, in next
    input_data(**self.kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 632, in input_data
    self.proxy.set_info(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 931, in set_info
    self.set_label(label)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1069, in set_label
    dispatch_meta_backend(self, label, "label", "float")
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1202, in dispatch_meta_backend
    _meta_from_numpy(data, name, dtype, handle)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1139, in _meta_from_numpy
    _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface_str))
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 281, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [09:16:26] /workspace/src/data/data.cc:507: Check failed: valid: Label contains NaN, infinity or a value too large.
Stack trace:
  [bt] (0) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x3581ea) [0x7490829581ea]
  [bt] (1) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x389b7d) [0x749082989b7d]
  [bt] (2) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x38a4b1) [0x74908298a4b1]
  [bt] (3) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(XGDMatrixSetInfoFromInterface+0xb0) [0x74908275e210]
  [bt] (4) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x7490b71fae2e]
  [bt] (5) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x7490b71f7493]
  [bt] (6) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0xa3e9) [0x7490b5d583e9]
  [bt] (7) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x9a00) [0x7490b5d57a00]
  [bt] (8) /home/mateusz/Documents/tmp/prediction-gap/venv/bin/python(_PyObject_MakeTpCall+0x25b) [0x5df0ad455a7b]



--------------------------------------------------------------------------------
81 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 1051, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 534, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 954, in _create_dmatrix
    return QuantileDMatrix(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1528, in __init__
    self._init(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1587, in _init
    it.reraise()
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 575, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 556, in _handle_exception
    return fn()
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 640, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1260, in next
    input_data(**self.kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 632, in input_data
    self.proxy.set_info(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 931, in set_info
    self.set_label(label)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1069, in set_label
    dispatch_meta_backend(self, label, "label", "float")
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1202, in dispatch_meta_backend
    _meta_from_numpy(data, name, dtype, handle)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1139, in _meta_from_numpy
    _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface_str))
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 281, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [09:16:26] /workspace/src/data/data.cc:507: Check failed: valid: Label contains NaN, infinity or a value too large.
Stack trace:
  [bt] (0) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x3581ea) [0x771448d581ea]
  [bt] (1) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x389b7d) [0x771448d89b7d]
  [bt] (2) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x38a4b1) [0x771448d8a4b1]
  [bt] (3) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(XGDMatrixSetInfoFromInterface+0xb0) [0x771448b5e210]
  [bt] (4) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x77147b8fae2e]
  [bt] (5) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x77147b8f7493]
  [bt] (6) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0xa3e9) [0x77147c0d33e9]
  [bt] (7) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x9a00) [0x77147c0d2a00]
  [bt] (8) /home/mateusz/Documents/tmp/prediction-gap/venv/bin/python(_PyObject_MakeTpCall+0x25b) [0x5fdafbe0ca7b]



--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 1051, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 534, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 954, in _create_dmatrix
    return QuantileDMatrix(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1528, in __init__
    self._init(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1587, in _init
    it.reraise()
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 575, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 556, in _handle_exception
    return fn()
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 640, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1260, in next
    input_data(**self.kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 632, in input_data
    self.proxy.set_info(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 931, in set_info
    self.set_label(label)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1069, in set_label
    dispatch_meta_backend(self, label, "label", "float")
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1202, in dispatch_meta_backend
    _meta_from_numpy(data, name, dtype, handle)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1139, in _meta_from_numpy
    _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface_str))
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 281, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [09:16:26] /workspace/src/data/data.cc:507: Check failed: valid: Label contains NaN, infinity or a value too large.
Stack trace:
  [bt] (0) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x3581ea) [0x78eb3f9581ea]
  [bt] (1) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x389b7d) [0x78eb3f989b7d]
  [bt] (2) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x38a4b1) [0x78eb3f98a4b1]
  [bt] (3) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(XGDMatrixSetInfoFromInterface+0xb0) [0x78eb3f75e210]
  [bt] (4) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x78eb72cf8e2e]
  [bt] (5) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x78eb72cf5493]
  [bt] (6) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0xa3e9) [0x78eb72d083e9]
  [bt] (7) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x9a00) [0x78eb72d07a00]
  [bt] (8) /home/mateusz/Documents/tmp/prediction-gap/venv/bin/python(_PyObject_MakeTpCall+0x25b) [0x6137d74b6a7b]



--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 1051, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 534, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 954, in _create_dmatrix
    return QuantileDMatrix(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1528, in __init__
    self._init(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1587, in _init
    it.reraise()
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 575, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 556, in _handle_exception
    return fn()
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 640, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1260, in next
    input_data(**self.kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 632, in input_data
    self.proxy.set_info(
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 931, in set_info
    self.set_label(label)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 1069, in set_label
    dispatch_meta_backend(self, label, "label", "float")
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1202, in dispatch_meta_backend
    _meta_from_numpy(data, name, dtype, handle)
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/data.py", line 1139, in _meta_from_numpy
    _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface_str))
  File "/home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/core.py", line 281, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [09:16:26] /workspace/src/data/data.cc:507: Check failed: valid: Label contains NaN, infinity or a value too large.
Stack trace:
  [bt] (0) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x3581ea) [0x74de2b9581ea]
  [bt] (1) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x389b7d) [0x74de2b989b7d]
  [bt] (2) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x38a4b1) [0x74de2b98a4b1]
  [bt] (3) /home/mateusz/Documents/tmp/prediction-gap/venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(XGDMatrixSetInfoFromInterface+0xb0) [0x74de2b75e210]
  [bt] (4) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x74de5f5f5e2e]
  [bt] (5) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x74de5f5f2493]
  [bt] (6) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0xa3e9) [0x74de5f0723e9]
  [bt] (7) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x9a00) [0x74de5f071a00]
  [bt] (8) /home/mateusz/Documents/tmp/prediction-gap/venv/bin/python(_PyObject_MakeTpCall+0x25b) [0x58d264849a7b]




#### Two best models - depth is included in grid search 

In [None]:
grid_params = {
    'max_depth': [1, 2, 3, 4],
    'eta': [0.01],#, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01]#, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [None]:
grid_params_big = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [None]:
%%time
best = grid_search(grid_params, X_train, y_train, X_test, y_test, num_boost_round=40)

In [None]:
%%time
best_big = grid_search(grid_params_big, X_train, y_train, X_test, y_test, num_boost_round = 20)

In [None]:
best.best_params_

{'eta': 0.01, 'max_depth': 4, 'subsample': 0.01}

In [None]:
len(best.best_estimator_.get_booster().get_dump())

40

In [None]:
best_big.best_params_

{'eta': 0.2, 'max_depth': 10, 'subsample': 0.9}

In [None]:
len(best_big.best_estimator_.get_booster().get_dump())

20

In [None]:
# training hyperparameters
param = {
    'eta': 0.1,
    'max_depth': 4,
    'objective': 'reg:squarederror',
    'seed': 42,
    'subsample': 0.01,
}
steps = 40

In [None]:
# training hyperparameters
param_big = {
    'eta': 0.2,
    'max_depth': 10,
    'objective': 'reg:squarederror',
    'seed': 42,
    'subsample': 0.9,
}
steps = 1

In [None]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [None]:
%%time
train_and_save_model(param, steps, dtrain, dtest, MODEL_PATH, housing_model_name)

[0]	test-rmse:11.25719	train-rmse:10.97680
[39]	test-rmse:6.54802	train-rmse:6.28841
CPU times: user 2.98 s, sys: 6.59 ms, total: 2.99 s
Wall time: 221 ms


<xgboost.core.Booster at 0x7575a7496620>

In [None]:
%%time
train_and_save_model(param_big, steps, dtrain, dtest, MODEL_PATH, housing_model_name + '_big')

[0]	test-rmse:100785.67308	train-rmse:96867.96499
CPU times: user 3.87 s, sys: 7.02 ms, total: 3.88 s
Wall time: 278 ms


<xgboost.core.Booster at 0x7f8dda97a080>

### Single tree

In [None]:
grid_params = {
    'max_depth': [3],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [None]:
%%time
best = grid_search(grid_params, X_train, y_train, X_test, y_test, num_boost_round = 1)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)

CPU times: user 744 ms, sys: 90 ms, total: 834 ms
Wall time: 4.27 s


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical

In [None]:
best.best_params_

{'eta': 0.9, 'max_depth': 3, 'subsample': 0.9}

In [None]:
len(best.best_estimator_.get_booster().get_dump())

1

In [None]:
# training hyperparameters
param = {
    'eta': 0.9,
    'max_depth': 3,
    'objective': 'reg:squarederror',
    'seed': 42,
    'subsample': 0.9,
}
steps = 1

In [None]:
%%time
train_and_save_model(param, steps, dtrain, dtest, MODEL_PATH, housing_model_name + '_single')

[0]	test-rmse:81710.35211	train-rmse:82611.51414
CPU times: user 992 ms, sys: 0 ns, total: 992 ms
Wall time: 76.5 ms


<xgboost.core.Booster at 0x7f99e8b2aa10>

### Red Wine Dataset

In [None]:
DATA_DIR = Path('data/wine_quality')
file_name = 'winequality_red.csv'
scaled_file_name = 'winequality_red_scaled.csv'
train_file_name = 'train_winequality_red_scaled.csv'
test_file_name = 'test_winequality_red_scaled.csv'
scaler_params_file = 'winequality_red_scaling_params.csv'
MODEL_PATH = Path('models/')
wine_model_name = 'winequality_red'

In [None]:
df = pd.read_csv(DATA_DIR / file_name, sep=';')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [None]:
#scaled_df, train_df, test_df = scale_split_df(
#    df, 'quality', 0.2, DATA_DIR, scaled_file_name, train_file_name, test_file_name, scaler_params_file)
#scaled_df.describe()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,3.554936e-16,1.733031e-16,-8.887339000000001e-17,-1.244227e-16,3.732682e-16,-6.221137e-17,4.4436690000000005e-17,-3.473172e-14,2.861723e-15,6.754377e-16,1.066481e-16,5.636023
std,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,0.807569
min,-2.137045,-2.27828,-1.391472,-1.162696,-1.603945,-1.4225,-1.230584,-3.538731,-3.700401,-1.936507,-1.898919,3.0
25%,-0.7007187,-0.7699311,-0.9293181,-0.4532184,-0.371229,-0.8487156,-0.7440403,-0.6077557,-0.6551405,-0.6382196,-0.8663789,5.0
50%,-0.2410944,-0.04368911,-0.05636026,-0.240375,-0.1799455,-0.1793002,-0.2574968,0.001760083,-0.007212705,-0.2251281,-0.2093081,6.0
75%,0.5057952,0.6266881,0.7652471,0.04341614,0.05384542,0.4901152,0.4723184,0.5768249,0.5759223,0.4240158,0.6354971,6.0
max,4.355149,5.877976,3.743574,9.195681,11.12703,5.367284,7.375154,3.680055,4.528282,7.918677,4.202453,8.0


In [None]:
train_df = pd.read_csv("data/wine_quality/train_winequality_red_scaled.csv")
test_df = pd.read_csv("data/wine_quality/test_winequality_red_scaled.csv")

In [None]:
X_train = train_df.loc[:, train_df.columns != 'quality']
y_train = train_df.loc[:, train_df.columns == 'quality']
X_test = test_df.loc[:, test_df.columns != 'quality']
y_test = test_df.loc[:, test_df.columns == 'quality']

### Wine model training


#### Grid search

In [None]:
grid_params = {
    'max_depth': [1, 2, 3, 4],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [None]:
for d in range(1, 10):
    grid_params['max_depth'] = [d] 
    search_hyperparameters_and_save_model(grid_params,
                                      X_train,
                                      y_train, 
                                      X_test, 
                                      y_test,
                                      wine_model_name,
                                      MODEL_PATH)

In [None]:
grid_params = {
    'max_depth': [1, 2, 3, 4],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [None]:
grid_params_big = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [None]:
%%time
best = grid_search(grid_params, X_train, y_train, X_test, y_test, num_boost_round=40)

In [None]:
%%time
best_big = grid_search(grid_params_big, X_train, y_train, X_test, y_test, num_boost_round =20)

In [None]:
best.best_params_

{'eta': 0.2, 'max_depth': 4, 'subsample': 0.9}

In [None]:
len(best.best_estimator_.get_booster().get_dump())

40

In [None]:
best_big.best_params_

{'eta': 0.2, 'max_depth': 6, 'subsample': 0.9}

In [None]:
len(best_big.best_estimator_.get_booster().get_dump())

20

In [None]:
# training hyperparameters
param = {
    'eta': 0.2,
    'max_depth': 4,
    'objective': 'reg:squarederror',
    'seed': 42,
    'subsample': 0.8
}
steps = 40

In [None]:
# training hyperparameters
param_big = {
    'eta': 0.2,
    'max_depth': 6,
    'objective': 'reg:squarederror',
    'seed': 42,
    'subsample': 0.9
}
steps = 20

In [None]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [None]:
train_and_save_model(param, steps, dtrain, dtest, MODEL_PATH, wine_model_name)

[0]	test-rmse:0.77840	train-rmse:0.74013
[31]	test-rmse:0.62305	train-rmse:0.45214


<xgboost.core.Booster at 0x7f8dda8b9ba0>

In [None]:
train_and_save_model(param_big, steps, dtrain, dtest, MODEL_PATH, wine_model_name + "_big")

[0]	test-rmse:0.76863	train-rmse:0.71958


[19]	test-rmse:0.59941	train-rmse:0.34475


<xgboost.core.Booster at 0x7f8dda97bca0>

### Single tree

In [None]:
grid_params = {
    'max_depth': [3],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [None]:
%%time
best = grid_search(grid_params, X_train, y_train, X_test, y_test, num_boost_round = 1)

In [None]:
best.best_params_

{'eta': 0.9, 'max_depth': 3, 'subsample': 0.9}

In [None]:
len(best.best_estimator_.get_booster().get_dump())

1

In [None]:
# training hyperparameters
param = {
    'eta': 0.9,
    'max_depth': 3,
    'objective': 'reg:squarederror',
    'seed': 42,
    'subsample': 0.9,
}
steps = 1

In [None]:
%%time
train_and_save_model(param, steps, dtrain, dtest, MODEL_PATH, wine_model_name + '_single')

[0]	test-rmse:0.65343	train-rmse:0.66162
CPU times: user 2.21 s, sys: 6.17 ms, total: 2.22 s
Wall time: 169 ms


<xgboost.core.Booster at 0x7f9a7855e0e0>

#### Mniejsze modele (maks. głębokość 4 zamiast 10) mają trochę gorsze wyniki ale mają znacznie mniej liści:
- Housing  RMSE: 51 51393.33848 vs 47937.65673, spadek liczby liści z ok 20 0000 do 1500
- Wine  RSE: 0.61376 vs 0.63269, spadek liczby liści z ok. 15 000 do 600

## Parkinsons Telemonitoring

### Dataset

In [298]:

DATA_DIR = Path('data/telemetry')
file_name = 'parkinsons_updrs.data'
scaled_file_name = 'telemetry_scaled.csv'
train_file_name = 'train_telemetry_scaled.csv'
test_file_name = 'test_telemetry_scaled.csv'
scaler_params_file = 'telemetry_scaling_params.csv'
MODEL_PATH = Path('models/')
telemetry_model_name = 'telemetry'

In [299]:
df = pd.read_csv(DATA_DIR / file_name, sep=',')
df.columns = df.columns.str.replace("[:]", "_", regex=True)
df.columns = df.columns.str.replace("[()]", "_", regex=True)
df.columns = df.columns.str.replace("[%]", "8", regex=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5875 entries, 0 to 5874
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   subject#       5875 non-null   int64  
 1   age            5875 non-null   int64  
 2   sex            5875 non-null   int64  
 3   test_time      5875 non-null   float64
 4   motor_UPDRS    5875 non-null   float64
 5   total_UPDRS    5875 non-null   float64
 6   Jitter_8_      5875 non-null   float64
 7   Jitter_Abs_    5875 non-null   float64
 8   Jitter_RAP     5875 non-null   float64
 9   Jitter_PPQ5    5875 non-null   float64
 10  Jitter_DDP     5875 non-null   float64
 11  Shimmer        5875 non-null   float64
 12  Shimmer_dB_    5875 non-null   float64
 13  Shimmer_APQ3   5875 non-null   float64
 14  Shimmer_APQ5   5875 non-null   float64
 15  Shimmer_APQ11  5875 non-null   float64
 16  Shimmer_DDA    5875 non-null   float64
 17  NHR            5875 non-null   float64
 18  HNR     

In [300]:
df.head()

Unnamed: 0,subject#,age,sex,test_time,motor_UPDRS,total_UPDRS,Jitter_8_,Jitter_Abs_,Jitter_RAP,Jitter_PPQ5,...,Shimmer_dB_,Shimmer_APQ3,Shimmer_APQ5,Shimmer_APQ11,Shimmer_DDA,NHR,HNR,RPDE,DFA,PPE
0,1,72,0,5.6431,28.199,34.398,0.00662,3.4e-05,0.00401,0.00317,...,0.23,0.01438,0.01309,0.01662,0.04314,0.01429,21.64,0.41888,0.54842,0.16006
1,1,72,0,12.666,28.447,34.894,0.003,1.7e-05,0.00132,0.0015,...,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.1081
2,1,72,0,19.681,28.695,35.389,0.00481,2.5e-05,0.00205,0.00208,...,0.181,0.00734,0.00844,0.01458,0.02202,0.02022,23.047,0.46222,0.54405,0.21014
3,1,72,0,25.647,28.905,35.81,0.00528,2.7e-05,0.00191,0.00264,...,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.4873,0.57794,0.33277
4,1,72,0,33.642,29.187,36.375,0.00335,2e-05,0.00093,0.0013,...,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361


In [301]:
df = df.drop(columns=["subject#", "sex", "age", "total_UPDRS"])

In [302]:
df

Unnamed: 0,test_time,motor_UPDRS,Jitter_8_,Jitter_Abs_,Jitter_RAP,Jitter_PPQ5,Jitter_DDP,Shimmer,Shimmer_dB_,Shimmer_APQ3,Shimmer_APQ5,Shimmer_APQ11,Shimmer_DDA,NHR,HNR,RPDE,DFA,PPE
0,5.6431,28.199,0.00662,0.000034,0.00401,0.00317,0.01204,0.02565,0.230,0.01438,0.01309,0.01662,0.04314,0.014290,21.640,0.41888,0.54842,0.16006
1,12.6660,28.447,0.00300,0.000017,0.00132,0.00150,0.00395,0.02024,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.10810
2,19.6810,28.695,0.00481,0.000025,0.00205,0.00208,0.00616,0.01675,0.181,0.00734,0.00844,0.01458,0.02202,0.020220,23.047,0.46222,0.54405,0.21014
3,25.6470,28.905,0.00528,0.000027,0.00191,0.00264,0.00573,0.02309,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.48730,0.57794,0.33277
4,33.6420,29.187,0.00335,0.000020,0.00093,0.00130,0.00278,0.01703,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5870,142.7900,22.485,0.00406,0.000031,0.00167,0.00168,0.00500,0.01896,0.160,0.00973,0.01133,0.01549,0.02920,0.025137,22.369,0.64215,0.55314,0.21367
5871,149.8400,21.988,0.00297,0.000025,0.00119,0.00147,0.00358,0.02315,0.215,0.01052,0.01277,0.01904,0.03157,0.011927,22.886,0.52598,0.56518,0.12621
5872,156.8200,21.495,0.00349,0.000025,0.00152,0.00187,0.00456,0.02499,0.244,0.01371,0.01456,0.01877,0.04112,0.017701,25.065,0.47792,0.57888,0.14157
5873,163.7300,21.007,0.00281,0.000020,0.00128,0.00151,0.00383,0.01484,0.131,0.00693,0.00870,0.01307,0.02078,0.007984,24.422,0.56865,0.56327,0.14204


In [303]:
scaled_df, train_df, test_df = scale_split_df(
    df, 'motor_UPDRS', 0.2, DATA_DIR, scaled_file_name, train_file_name, test_file_name, scaler_params_file)
scaled_df.describe()

Unnamed: 0,test_time,Jitter_8_,Jitter_Abs_,Jitter_RAP,Jitter_PPQ5,Jitter_DDP,Shimmer,Shimmer_dB_,Shimmer_APQ3,Shimmer_APQ5,Shimmer_APQ11,Shimmer_DDA,NHR,HNR,RPDE,DFA,PPE,motor_UPDRS
count,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0
mean,3.50736e-17,1.451321e-16,1.161057e-16,1.935095e-17,-1.935095e-17,1.693208e-16,1.741586e-16,1.83834e-16,4.8377380000000006e-17,1.548076e-16,-3.87019e-17,9.675476000000001e-17,-3.87019e-17,-6.385814e-16,1.935095e-16,4.450719e-16,1.741586e-16,21.296229
std,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,8.129282
min,-1.817446,-0.946657,-1.161111,-0.850693,-0.7629815,-0.8517702,-1.199058,-1.237697,-1.174502,-1.092523,-1.250516,-1.174245,-0.5333439,-4.665987,-3.866744,-1.963436,-2.15985,5.0377
25%,-0.861065,-0.4576594,-0.6001095,-0.4505077,-0.3904491,-0.4515863,-0.5773716,-0.5905295,-0.5950218,-0.5613909,-0.5912529,-0.5952688,-0.3545954,-0.529862,-0.7099418,-0.8048365,-0.6913181,15.0
50%,-0.02508787,-0.2229405,-0.2639533,-0.2360084,-0.2108831,-0.2360206,-0.2525927,-0.2517438,-0.2610839,-0.2523139,-0.2387287,-0.2608275,-0.2290577,0.05605219,0.007695473,-0.135969,-0.1539957,20.871
75%,0.8529263,0.1149124,0.2586939,0.09694579,0.04908555,0.09693236,0.2212206,0.2347177,0.2583332,0.216703,0.2619106,0.2585891,-0.01100432,0.6442969,0.7186976,0.819445,0.4907692,27.5965
max,2.294608,16.68571,11.16063,17.46499,17.76448,17.46598,9.08122,7.800931,10.99383,8.814745,12.40861,10.99408,11.99822,3.774533,4.20498,2.99538,5.597736,39.511


In [304]:
train_df = pd.read_csv("data/telemetry/train_telemetry_scaled.csv")
test_df = pd.read_csv("data/telemetry/test_telemetry_scaled.csv")

In [305]:

X_train = train_df.loc[:, train_df.columns != 'motor_UPDRS']
y_train = train_df.loc[:, train_df.columns == 'motor_UPDRS']
X_test = test_df.loc[:, test_df.columns != 'motor_UPDRS']
y_test = test_df.loc[:, test_df.columns == 'motor_UPDRS']

In [306]:
X_test

Unnamed: 0,test_time,Jitter_8_,Jitter_Abs_,Jitter_RAP,Jitter_PPQ5,Jitter_DDP,Shimmer,Shimmer_dB_,Shimmer_APQ3,Shimmer_APQ5,Shimmer_APQ11,Shimmer_DDA,NHR,HNR,RPDE,DFA,PPE
0,-1.535313,-0.345634,-0.316204,-0.184785,-0.224284,-0.183730,0.276189,0.199970,0.419636,0.441459,0.218627,0.419640,-0.346470,-0.379072,-1.197923,1.679649,-0.418831
1,1.421497,4.994219,5.700422,5.549070,3.088307,5.547970,2.565513,2.888539,2.698271,1.663964,2.091083,2.698021,1.756660,-2.025178,1.254413,1.487819,3.585400
2,1.548553,-0.167817,-0.373458,-0.107949,-0.213563,-0.106895,-0.179043,-0.064977,-0.094115,-0.273319,-0.457399,-0.094111,-0.284162,0.093575,-0.020925,0.194374,0.403329
3,1.545559,-0.178486,-0.445164,-0.194389,-0.243044,-0.194401,-0.556468,-0.560126,-0.542136,-0.477370,-0.594505,-0.542383,-0.240651,1.240466,0.089793,1.144499,0.315342
4,-1.544893,-0.281620,-0.086356,-0.556157,-0.326127,-0.555101,-0.590146,-0.620933,-0.764257,-0.598000,-0.374835,-0.764252,-0.298553,0.301465,0.573764,-0.218343,-0.284391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1170,-1.130474,-0.404314,-0.620537,-0.293635,-0.331487,-0.294714,-0.551823,-0.564469,-0.549691,-0.533184,-0.686077,-0.549938,-0.315559,1.425283,-1.900656,-0.878042,-1.004465
1171,-0.452381,-0.212271,-0.472679,-0.130359,-0.200163,-0.131439,0.079928,-0.004170,0.237557,-0.026658,-0.089613,0.237309,-0.236212,0.215932,-0.407547,-0.938130,-0.201323
1172,0.756091,-0.361638,-0.702805,-0.184785,-0.315407,-0.184797,-0.622275,-0.534065,-0.588978,-0.604602,-0.742120,-0.588721,-0.336602,1.205973,-0.832197,-0.674223,-0.670442
1173,0.111979,-0.580353,-0.853998,-0.511336,-0.457451,-0.512414,-0.695438,-0.681741,-0.644130,-0.716229,-0.872221,-0.644125,-0.075088,1.188260,-0.866561,-1.452405,-1.489793


In [307]:
grid_params = {
    'max_depth': [1, 2, 3, 4],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [308]:
grid_params_big = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [None]:
%%time
best = grid_search(grid_params, X_train, y_train, X_test, y_test, num_boost_round=40)

In [267]:
best.best_params_

{'eta': 0.3, 'max_depth': 4, 'subsample': 0.8}

In [268]:
len(best.best_estimator_.get_booster().get_dump())

40

In [None]:
%%time
best = grid_search(grid_params_big, X_train, y_train, X_test, y_test, num_boost_round=20)

In [270]:
best.best_params_

{'eta': 0.2, 'max_depth': 9, 'subsample': 0.8}

In [271]:
len(best.best_estimator_.get_booster().get_dump())

20

In [272]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [273]:
grid_params_single = {
    'max_depth': [3],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [274]:
%%time
best = grid_search(grid_params_single, X_train, y_train, X_test, y_test, num_boost_round=1)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  if is_sparse(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_cate

CPU times: user 1.07 s, sys: 140 ms, total: 1.21 s
Wall time: 4.6 s


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)

In [275]:
best.best_params_

{'eta': 0.9, 'max_depth': 3, 'subsample': 0.8}

In [276]:
len(best.best_estimator_.get_booster().get_dump())

1

In [277]:
param_big= {
    'eta': 0.2,
    'max_depth': 7,
    'objective': 'reg:squarederror',
    'seed': 42,
    'subsample': 0.8,
}
steps = 20

In [278]:
%%time
train_and_save_model(param_big, steps, dtrain, dtest, MODEL_PATH, telemetry_model_name + '_big')

[0]	test-rmse:7.74401	train-rmse:7.54965
[19]	test-rmse:6.41928	train-rmse:4.64089
CPU times: user 2.32 s, sys: 0 ns, total: 2.32 s
Wall time: 165 ms


<xgboost.core.Booster at 0x70c5942db6d0>

In [293]:
param = {
    'eta': 0.2,
    'max_depth': 4,
    'objective': 'reg:squarederror',
    'seed': 42,
    'subsample': 0.8,
}
steps = 40

In [294]:
%%time
train_and_save_model(param, steps, dtrain, dtest, MODEL_PATH, telemetry_model_name )

[0]	test-rmse:7.89711	train-rmse:7.85245
[39]	test-rmse:6.76884	train-rmse:5.86271
CPU times: user 11.3 s, sys: 16.9 ms, total: 11.3 s
Wall time: 854 ms


<xgboost.core.Booster at 0x70c5941803d0>

In [295]:
param_single = {
    'eta': 0.9,
    'max_depth': 3,
    'objective': 'reg:squarederror',
    'seed': 42,
    'subsample': 0.9,
}
steps=1

In [296]:
%%time
train_and_save_model(param_single, steps, dtrain, dtest, MODEL_PATH, telemetry_model_name+'_single' )

[0]	test-rmse:7.65970	train-rmse:7.56464
CPU times: user 3.58 s, sys: 6.62 ms, total: 3.58 s
Wall time: 282 ms


<xgboost.core.Booster at 0x70c5942db4c0>