In [1]:
import polars as pl
from pathlib import Path

In [2]:
import json
# Root folder of the experiment; every artefact read in this notebook lives below it.
DIR = Path('/home/ubuntu/experiments/subsample_new_with_recsys_train_small')

# Full training frame, loaded eagerly into memory.
train_ds = pl.read_parquet(DIR / 'train_ds.parquet')

# Dataset metadata; the preprocessing step reads its 'categorical_columns' entry.
with (DIR / 'data_info.json').open() as data_info_file:
    data_info = json.load(data_info_file)

In [8]:
# import math
# BATCH_SIZE = math.ceil(len(train_ds) / 5)
# for i, slice in enumerate(train_ds.iter_slices(BATCH_SIZE)):
#     dir_path = DIR / 'slices' / 'train'
#     dir_path.mkdir(exist_ok=True, parents=True)
#     slice.write_parquet(dir_path / f'slice_{i}.parquet')

In [3]:
import gc
import numpy as np

def preprocess_slice(train_ds: pl.DataFrame):
    """Turn a polars training frame into a pandas ``(X, y)`` pair.

    Steps, in order:

    1. Null values in ``postcode`` / ``article_type`` are replaced by ``5`` /
       ``'article_default'`` (each only when the column is present).
    2. ``impression_time`` is dropped when present; ``impression_id``,
       ``article`` and ``user_id`` are dropped unconditionally, so they are
       expected to exist in the input.
    3. The frame is converted to pandas and the columns listed in the
       module-level ``data_info['categorical_columns']`` are cast to the
       pandas ``category`` dtype.
    4. ``target`` is split off as the label and +/-inf feature values are
       replaced by NaN.

    Args:
        train_ds: polars frame holding the features, the identifier columns
            listed above and a ``target`` column.

    Returns:
        Tuple ``(X, y)``: the pandas feature frame and the ``target`` series.
    """
    # Column -> value used to fill its nulls; applied only to columns that exist.
    null_defaults = {'postcode': 5, 'article_type': 'article_default'}
    for col_name, default in null_defaults.items():
        if col_name in train_ds.columns:
            train_ds = train_ds.with_columns(pl.col(col_name).fill_null(default))

    if 'impression_time' in train_ds.columns:
        train_ds = train_ds.drop(['impression_time'])

    frame = train_ds.drop(['impression_id', 'article', 'user_id']).to_pandas()

    # NOTE(review): categories are inferred from the values of *this* frame only,
    # so two slices may end up with different category sets -- confirm this is
    # acceptable when slices are fed to XGBoost one at a time.
    cat_cols = data_info['categorical_columns']
    frame[cat_cols] = frame[cat_cols].astype('category')

    y = frame['target']
    X = frame.drop(columns=['target']).replace([np.inf, -np.inf], np.nan)
    return X, y

In [5]:
# Run the preprocessing on the first row only -- presumably a quick sanity check
# of the pipeline before it is applied to the full frame.
X, y = preprocess_slice(train_ds.head(1))

In [10]:
import os
import time
from typing import List, Callable
import xgboost
from sklearn.datasets import load_svmlight_file
from polimi.utils._custom import read_json

class Iterator(xgboost.DataIter):
    """Feeds parquet slices to XGBoost one file at a time.

    Each call to ``next`` reads one parquet file, runs ``preprocess_slice`` on
    it and hands the resulting features/labels to XGBoost.
    """

    def __init__(self, file_paths: List[str]):
        """Remember the slice files and start at the first one.

        Args:
            file_paths: parquet files to serve, in the order they are yielded.
        """
        self._file_paths = file_paths
        self._it = 0  # index of the next file to serve

        # No cache prefix is passed. The alternative
        #   super().__init__(cache_prefix=os.path.join(".", "cache"))
        # would make XGBoost generate cache files with the prefix "cache" under
        # the current directory.
        super().__init__()

    def next(self, input_data: Callable):
        """Serve the next slice to XGBoost.

        Called by XGBoost during the construction of ``DMatrix``.

        Args:
            input_data: callback supplied by XGBoost; it has the exact same
                signature as ``DMatrix``.

        Returns:
            ``0`` once every file has been served (end of iteration),
            ``1`` after a slice has been passed on.
        """
        position = self._it
        if position == len(self._file_paths):
            # Nothing left: tell XGBoost the iteration is over.
            return 0

        features, labels = preprocess_slice(pl.read_parquet(self._file_paths[position]))
        input_data(data=features, label=labels)

        # Advance only after the slice was handed over successfully.
        self._it = position + 1
        return 1

    def reset(self):
        """Rewind so the next call to ``next`` serves the first file again."""
        self._it = 0

# Collect the pre-written training slices, ordered by the integer in
# 'slice_<i>.parquet' (a plain string sort would misorder two-digit indices).
slices_paths = sorted((DIR / 'slices' / 'train').glob('slice_*.parquet'), key=lambda x: int(x.stem.split('_')[1]))
print(slices_paths)
it = Iterator(slices_paths)

# Model hyper-parameters; only 'max_bin' is used below.
params = read_json(Path('/home/ubuntu/RecSysChallenge2024/configuration_files') / 'xgb_cls_new_with_recsys_noK.json')
print(params)

# Baseline: build the QuantileDMatrix from the whole frame at once.
# Preprocessing happens before the timer starts, so it is NOT part of this
# measurement.
X, y = preprocess_slice(train_ds)
print('Creating complete QuantilDMatrix...')
# perf_counter() is monotonic, unlike time.time(), so the interval cannot be
# distorted by wall-clock adjustments.
start_time = time.perf_counter()
dmatrix = xgboost.QuantileDMatrix(X, label=y, enable_categorical=True, max_bin=params['max_bin'])
print(f'Elapsed time: {((time.perf_counter() - start_time)/60):.2f} min')

# Release the baseline matrix before building the second one; otherwise both
# matrices are alive at the same time until `dmatrix` is rebound.
# X and y are deliberately kept, since other cells may still use them.
del dmatrix
gc.collect()

# Batch variant: XGBoost pulls the slices through the iterator, so this
# measurement additionally includes reading and preprocessing every slice.
print('Creating batch QuantilDMatrix...')
start_time = time.perf_counter()
dmatrix = xgboost.QuantileDMatrix(it, enable_categorical=True, max_bin=params['max_bin'])
print(f'Elapsed time: {((time.perf_counter() - start_time)/60):.2f} min')

[PosixPath('/home/ubuntu/experiments/subsample_new_with_recsys_train_small/slices/train/slice_0.parquet'), PosixPath('/home/ubuntu/experiments/subsample_new_with_recsys_train_small/slices/train/slice_1.parquet'), PosixPath('/home/ubuntu/experiments/subsample_new_with_recsys_train_small/slices/train/slice_2.parquet'), PosixPath('/home/ubuntu/experiments/subsample_new_with_recsys_train_small/slices/train/slice_3.parquet'), PosixPath('/home/ubuntu/experiments/subsample_new_with_recsys_train_small/slices/train/slice_4.parquet')]
{'n_estimators': 4860, 'learning_rate': 0.007673025629394672, 'reg_alpha': 0.0002675014847368601, 'reg_lambda': 0.009527258861955984, 'max_depth': 9, 'max_leaves': 459, 'grow_policy': 'lossguide', 'max_bin': 185, 'gamma': 0.0002936826358088138, 'min_child_weight': 0.14432522926107985, 'subsample': 0.4781490371876336, 'colsample_bytree': 0.5859934501543211}
Creating complete QuantilDMatrix...
Elapsed time: 0.23 min
Creating batch QuantilDMatrix...
Elapsed time: 0.37