In [1]:
import polars as pl
from pathlib import Path

In [1]:
import json
# Root folder of the preprocessed dataset; every other path in the notebook is derived from it.
DIR = Path('/Users/lorecampa/Desktop/Projects/RecSysChallenge2024/dataset/preprocessing/subsample_new_with_recsys_small')

# Dataset metadata; later cells read the 'categorical_columns' entry from it.
data_info = json.loads((DIR / 'data_info.json').read_text())

# Full training frame, loaded eagerly into memory.
train_ds = pl.read_parquet(DIR / 'train_ds.parquet')

NameError: name 'Path' is not defined

In [None]:
import math
# Split the training frame into (at most) 5 row-wise chunks and persist each one
# as its own parquet file, so they can later be streamed to XGBoost one at a time.
# max(1, ...) keeps the slice size valid even when the frame is empty.
BATCH_SIZE = max(1, math.ceil(len(train_ds) / 5))

# write_parquet does not create missing parent folders, so make sure the target exists.
SLICES_DIR = DIR / 'slices'
SLICES_DIR.mkdir(parents=True, exist_ok=True)

# `ds_slice` instead of `slice` so the builtin is not shadowed for the rest of the kernel session.
for i, ds_slice in enumerate(train_ds.iter_slices(BATCH_SIZE)):
    ds_slice.write_parquet(SLICES_DIR / f'slice_{i}.parquet')

In [None]:
import gc
import numpy as np

def preprocess_slice(train_ds: pl.DataFrame):
    """Turn one polars slice of the training set into pandas features and labels.

    Steps:
      * fill nulls in 'postcode' (with 5) and 'article_type' (with 'article_default')
        when those columns are present;
      * drop 'impression_time' when present, plus the identifier columns
        'impression_id', 'article' and 'user_id';
      * convert to pandas and cast the columns listed in
        ``data_info['categorical_columns']`` to the 'category' dtype;
      * replace +/-inf with NaN in the feature matrix.

    Relies on the notebook-level ``data_info`` dict.

    Returns:
        (X, y): the feature DataFrame without 'target', and the 'target' Series.
    """
    available = train_ds.columns

    # Null-filling expressions for whichever optional columns exist in this slice.
    fill_exprs = []
    if 'postcode' in available:
        fill_exprs.append(pl.col('postcode').fill_null(5))
    if 'article_type' in available:
        fill_exprs.append(pl.col('article_type').fill_null('article_default'))
    if fill_exprs:
        train_ds = train_ds.with_columns(fill_exprs)

    # Columns that must not reach the model.
    to_drop = ['impression_id', 'article', 'user_id']
    if 'impression_time' in available:
        to_drop = ['impression_time'] + to_drop

    frame = train_ds.drop(to_drop).to_pandas()

    # NOTE(review): categories are inferred from the values present in this slice only;
    # confirm that the resulting category codes are consistent across slices.
    cat_cols = data_info['categorical_columns']
    frame[cat_cols] = frame[cat_cols].astype('category')

    y = frame['target']
    X = frame.drop(columns=['target']).replace([np.inf, -np.inf], np.nan)
    return X, y

In [None]:
import os
from typing import List, Callable
import xgboost
from sklearn.datasets import load_svmlight_file
from polimi.utils._custom import read_json

class Iterator(xgboost.DataIter):
    """Data iterator that hands parquet slices to XGBoost one file per step.

    Each step reads one file from ``file_paths``, runs it through
    ``preprocess_slice`` and passes the resulting features/labels to XGBoost.
    """

    def __init__(self, file_paths: List[str]):
        self._file_paths = file_paths
        # Index of the next file to be served.
        self._it = 0

        # No ``cache_prefix`` is given to the base class; the original author left the
        # variant ``cache_prefix=os.path.join(".", "cache")`` disabled, noting that it
        # makes XGBoost generate cache files under the current directory.
        super().__init__()

    def next(self, input_data: Callable):
        """Serve the next slice to XGBoost.

        Called by XGBoost during the construction of ``DMatrix``. ``input_data`` is a
        callback supplied by XGBoost that has the same signature as ``DMatrix``.

        Returns 1 after a slice has been passed on, 0 once every file has been served.
        """
        exhausted = self._it == len(self._file_paths)
        if exhausted:
            # 0 signals the end of the iteration.
            return 0

        current_path = self._file_paths[self._it]
        features, labels = preprocess_slice(pl.read_parquet(current_path))
        input_data(data=features, label=labels)

        self._it += 1
        # 1 signals that a batch was delivered on this call.
        return 1

    def reset(self):
        """Rewind the iterator to the first file."""
        self._it = 0

# Collect the slice files ordered by their numeric suffix (slice_2 before slice_10),
# which a plain lexicographic sort would not guarantee.
slices_paths = sorted((DIR / 'slices').glob('slice_*.parquet'), key=lambda x: int(x.stem.split('_')[1]))
print(slices_paths)
it = Iterator(slices_paths)

params = read_json(Path('/Users/lorecampa/Desktop/Projects/RecSysChallenge2024/configuration_files') / 'xgb_cls_new_with_recsys_noK.json')
print(params)

# Build the quantized training matrix by streaming the slices through the iterator.
Xy = xgboost.QuantileDMatrix(it, enable_categorical=True, max_bin=params['max_bin'])

# The native ``xgboost.train`` API takes the number of trees via ``num_boost_round``
# (default 10) and does not read an ``n_estimators`` entry from ``params``.
# NOTE(review): the config file name suggests sklearn-style classifier parameters;
# confirm that 'n_estimators' is the key holding the intended number of rounds.
num_boost_round = int(params.get('n_estimators', 10))
booster = xgboost.train(params, Xy, num_boost_round=num_boost_round, evals=[(Xy, 'train')], verbose_eval=100)


In [None]:
# Prepare the validation set with the same steps applied to the training slices
# in ``preprocess_slice`` (null filling, categorical casting, inf -> NaN).
val_ds = pl.read_parquet(DIR / 'validation_ds.parquet')
if 'postcode' in val_ds.columns:
    val_ds = val_ds.with_columns(pl.col('postcode').fill_null(5))
if 'article_type' in val_ds.columns:
    val_ds = val_ds.with_columns(pl.col('article_type').fill_null('article_default'))
if 'impression_time' in val_ds.columns:
    val_ds = val_ds.drop(['impression_time'])

val_ds = val_ds.to_pandas()
# NOTE(review): categories are inferred from the validation values only; confirm the
# category codes line up with the ones seen during training.
val_ds[data_info['categorical_columns']] = val_ds[data_info['categorical_columns']].astype('category')

# Select the features in the exact order used for training, and replace +/-inf with NaN
# as done for the training data so that infinite values are treated as missing.
X_val = val_ds[Xy.feature_names].replace([np.inf, -np.inf], np.nan)

# Keys and labels needed to evaluate predictions per impression.
evaluation_ds = pl.from_pandas(val_ds[['impression_id', 'article', 'target']])

In [None]:
# The native Booster only predicts on a DMatrix, so wrap the validation features first
# and pass the wrapped matrix (not the raw DataFrame) to predict.
dval = xgboost.DMatrix(X_val, enable_categorical=True)
booster.predict(dval)

In [None]:


# Booster.predict needs a DMatrix (``dval``), not the raw DataFrame.
val_scores = booster.predict(dval)
# A 2-D output holds one column per class: keep the column of class 1.
# A 1-D output is already one score per row, so it is used as is.
if val_scores.ndim == 2:
    val_scores = val_scores[:, 1]

# Attach the scores to the evaluation keys and collect targets/predictions per impression.
prediction_ds = (
    evaluation_ds.with_columns(pl.Series(val_scores).alias('prediction'))
    .group_by('impression_id')
    .agg(pl.col('target'), pl.col('prediction'))
)