In [16]:
import polars as pl
import pandas as pd
from tqdm import tqdm
import os
import json
import numpy as np
import math
from lightgbm import LGBMClassifier
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from polimi.utils._inference import _inference
from ebrec.evaluation.metrics_protocols import *
from ebrec.utils._behaviors import sampling_strategy_wu2019
from polimi.utils._polars import reduce_polars_df_memory_size
from polimi.test.level_2_ensemble.build_model_predictions import require_subsampled_set, train_predict_model
from fastauc.fastauc.fast_auc import CppAuc
import os
import logging
from lightgbm import LGBMClassifier, LGBMRanker
from datetime import datetime
import argparse
import pandas as pd
import joblib
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing_extensions import List, Tuple, Dict, Type
import polars as pl
from polimi.utils._tuning_params import get_models_params
import gc
from polars import testing


In [17]:
# Keyword arguments unpacked into LGBMClassifier(**params) for the baseline fit.
params = dict(
    n_estimators=2400,
    max_depth=5,
    num_leaves=255,
    subsample_freq=4,
    subsample=0.6004099528013062,
    learning_rate=0.020058747735265076,
    colsample_bytree=0.28816104133228293,
    colsample_bynode=0.9436687124253154,
    reg_lambda=0.0009096841984127709,
    reg_alpha=0.00229692020127837,
    min_split_gain=0.06569239337571059,
    min_child_weight=0.0025913515338086167,
    min_child_samples=53,
    extra_trees=True,
    max_bin=8,
    verbosity=-1,  # silence LightGBM's training log
)
# Single CppAuc instance; its roc_auc_score method is called later in the notebook
# to score the predictions of each impression.
cpp_auc = CppAuc()


In [18]:
# Directory that holds the training parquet file and the data_info.json metadata.
dataset_path = '/home/ubuntu/experiments/stacking/sub_features_pred_icm'

# Training rows are read from the 'sub_features_pred_icm' directory, validation rows
# from 'features_pred_icm'.
# NOTE(review): presumably a subsampled train set vs. the full validation set -- confirm.
train_ds = pl.read_parquet(
    '/home/ubuntu/experiments/stacking/sub_features_pred_icm/train_ds.parquet'
)
val_ds = pl.read_parquet(
    '/home/ubuntu/experiments/stacking/features_pred_icm/validation_ds.parquet'
)

# data_info is read below for its 'categorical_columns' entry.
data_info_path = os.path.join(dataset_path, 'data_info.json')
with open(data_info_path) as data_info_file:
    data_info = json.load(data_info_file)

In [19]:
# Attach the click-predictor scores to the training rows and derive per-impression
# normalised variants of every score.
click_pred_train = pl.read_parquet('/home/ubuntu/dataset/click_predictors/train_click_predictor.parquet')

# Every column whose name mentions neither 'user_id' nor 'article' is treated as a score.
click_columns = [
    col for col in click_pred_train.columns
    if not any(key in col for key in ('user_id', 'article'))
]

# Left join: training rows without a matching predictor row keep null scores.
train_ds = train_ds.join(click_pred_train, how='left', on=['user_id', 'article'])

# Step 1: divide each score by its maximum inside the same impression.
train_ds = train_ds.with_columns([
    (pl.col(col) / pl.col(col).max().over('impression_id')).alias(f'normalized_{col}')
    for col in click_columns
])

# Step 2: row-wise sum of all normalised scores.
train_ds = train_ds.with_columns(
    pl.sum_horizontal([pl.col(f'normalized_{col}') for col in click_columns]).alias('sum_click_pred')
)

# Step 3: rescale that sum by its per-impression maximum.
train_ds = train_ds.with_columns(
    (pl.col('sum_click_pred') / pl.col('sum_click_pred').max().over('impression_id'))
    .alias('normalize_sum_click_pred')
)

In [20]:
# Validation counterpart of the click-predictor feature construction: join the scores,
# then build the per-impression normalised columns and their normalised sum.
click_pred_val = pl.read_parquet('/home/ubuntu/dataset/click_predictors/validation_click_predictor.parquet')

# NOTE(review): the score columns are taken from click_pred_train, not click_pred_val.
# This keeps the derived columns identical to the training ones, but requires the
# train cell to have run first -- confirm this is intended.
click_columns = [
    col for col in click_pred_train.columns
    if not any(key in col for key in ('user_id', 'article'))
]

val_ds = val_ds.join(click_pred_val, how='left', on=['user_id', 'article'])

# Each score divided by its maximum inside the same impression.
val_ds = val_ds.with_columns([
    (pl.col(col) / pl.col(col).max().over('impression_id')).alias(f'normalized_{col}')
    for col in click_columns
])

# Row-wise sum of the normalised scores.
val_ds = val_ds.with_columns(
    pl.sum_horizontal([pl.col(f'normalized_{col}') for col in click_columns]).alias('sum_click_pred')
)

# The sum rescaled by its per-impression maximum.
val_ds = val_ds.with_columns(
    (pl.col('sum_click_pred') / pl.col('sum_click_pred').max().over('impression_id'))
    .alias('normalize_sum_click_pred')
)

In [21]:
# Show the frame as it is before the clean-up below.
print(train_ds)

# Replace nulls with a fixed default, but only for the columns that are actually present.
for column_name, fill_value in (('postcode', 5), ('article_type', 'article_default')):
    if column_name in train_ds.columns:
        train_ds = train_ds.with_columns(pl.col(column_name).fill_null(fill_value))

# impression_time is removed from the frame when present.
if 'impression_time' in train_ds.columns:
    train_ds = train_ds.drop(['impression_time'])

shape: (298_335, 447)
┌────────────┬─────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ impression ┆ article ┆ predictio ┆ predictio ┆ … ┆ normalize ┆ normalize ┆ sum_click ┆ normalize │
│ _id        ┆ ---     ┆ n_catboos ┆ n_catboos ┆   ┆ d_SP%W_cl ┆ d_readtim ┆ _pred     ┆ _sum_clic │
│ ---        ┆ i32     ┆ t_ranker  ┆ t_classif ┆   ┆ ick_predi ┆ e_click_p ┆ ---       ┆ k_pred    │
│ u32        ┆         ┆ ---       ┆ ier       ┆   ┆ cto…      ┆ red…      ┆ f32       ┆ ---       │
│            ┆         ┆ f32       ┆ ---       ┆   ┆ ---       ┆ ---       ┆           ┆ f32       │
│            ┆         ┆           ┆ f32       ┆   ┆ f32       ┆ f32       ┆           ┆           │
╞════════════╪═════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 157014     ┆ 9776223 ┆ 0.138048  ┆ 0.181434  ┆ … ┆ 0.785215  ┆ 0.663443  ┆ 17.44491  ┆ 0.641632  │
│ 157014     ┆ 9776438 ┆ 0.359669  ┆ 0.3867    ┆ … ┆ 1.0       ┆ 1.0 

In [22]:
# Convert the training frame to pandas, leaving out the identifier columns.
identifier_columns = ['impression_id', 'article', 'user_id']
train_ds_pandas = train_ds.drop(identifier_columns).to_pandas()

# Columns listed in data_info['categorical_columns'] are cast to the pandas 'category' dtype.
categorical_columns = data_info['categorical_columns']
train_ds_pandas[categorical_columns] = train_ds_pandas[categorical_columns].astype('category')

# Split into the label and the feature matrix.
y = train_ds_pandas['target']
X = train_ds_pandas.drop(columns=['target'])

In [23]:
# Validation clean-up, mirroring the steps applied to train_ds.
if 'postcode' in val_ds.columns:
    val_ds = val_ds.with_columns(pl.col('postcode').fill_null(5))
if 'article_type' in val_ds.columns:
    val_ds = val_ds.with_columns(pl.col('article_type').fill_null('article_default'))
if 'impression_time' in val_ds.columns:
    # Pass the column list positionally, as done for train_ds: the `columns=` keyword of
    # polars' DataFrame.drop is deprecated and emitted a warning on this line.
    val_ds = val_ds.drop(['impression_time'])

# From here on val_ds is a pandas DataFrame (the name is rebound on purpose so that
# the rest of the notebook keeps working unchanged).
val_ds = val_ds.to_pandas()
val_ds[data_info['categorical_columns']] = val_ds[data_info['categorical_columns']].astype('category')

# Select exactly the training feature columns, in the training order.
X_val = val_ds[X.columns]
# Keys and labels needed to group the predictions by impression.
evaluation_ds = pl.from_pandas(val_ds[['impression_id', 'article', 'target']])

print('Baseline')
model = LGBMClassifier(**params)
model.fit(X, y)

# Column 1 of predict_proba is used as the score of each (impression, article) row.
val_scores = model.predict_proba(X_val)[:, 1]

# One row per impression, holding the list of targets and the list of predictions.
evaluation_ds_copy = (
    evaluation_ds
    .with_columns(pl.Series(val_scores).alias('prediction'))
    .group_by('impression_id')
    .agg(pl.col('target'), pl.col('prediction'))
)

# Mean of the per-impression ROC AUC values.
result = np.mean(
    [
        cpp_auc.roc_auc_score(np.array(y_t).astype(bool), np.array(y_s).astype(np.float32))
        for y_t, y_s in zip(
            evaluation_ds_copy['target'].to_list(),
            evaluation_ds_copy['prediction'].to_list(),
        )
    ]
)
print(f'Baseline AUC: {result}')



  val_ds = val_ds.drop(columns = ['impression_time'])


Baseline
Baseline AUC: 0.8239305693953106
