In [27]:
import os
import configparser
from tqdm import tqdm
from helpers.helper_functions import *
from helpers.helper_classes import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

In [76]:
# Read config.ini file
config = configparser.ConfigParser()
config.read('src/config.ini')
os.chdir(config['PATH']['ROOT_DIR'])

# # Load data
df = pd.read_parquet(config['PATH']['INT_DIR'] + '/training_set_preprocessed.parquet', engine = 'fastparquet')
df_test = pd.read_parquet(config['PATH']['INT_DIR'] + '/test_set_preprocessed.parquet', engine = 'fastparquet')

# mini df for testing quickly 
df_mini = df[df['srch_id'] < 10000]


In [29]:
# Perform logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import roc_auc_score

def train_test_split(df, target_str, test_size=.2):
    splitter = GroupShuffleSplit(test_size=test_size, n_splits=2, random_state = 7)
    split = splitter.split(df, groups=df['srch_id'])
    train_inds, test_inds = next(split)

    df_ideal = df.iloc[test_inds].copy().sort_values(by=['srch_id', target_str], ascending=[True, False], inplace=False)

    X = df.drop([target_str], axis=1)
    y = df[target_str]
    X_train, X_test, y_train, y_test, test_ideal = X.iloc[train_inds], X.iloc[test_inds], y.iloc[train_inds], y.iloc[test_inds], df_ideal, 


    return X_train, X_test, y_train, y_test, test_ideal[['srch_id', 'prop_id', target_str]]

def construct_pred_ideal(df_in, df_ideal, y_pred):
    df = df_in.copy()
    df['pred_grades'] = y_pred
    df = df.sort_values(by=['srch_id', 'pred_grades'], ascending=[True, False], inplace=False)

    # Merge grades from ideal on srch_id and prop_id
    df = df.merge(df_ideal, on=['srch_id', 'prop_id'], how='left')

    # Return srch_id, prop_id and pred_grades
    return df[['srch_id', 'prop_id', 'pred_grades', 'target']]

def construct_pred_submission(df_in, y_pred):
    df = df_in.copy()
    df['pred_grades'] = y_pred
    df = df.sort_values(by=['srch_id', 'pred_grades'], ascending=[True, False], inplace=False)

    # Return srch_id, prop_id and pred_grades
    return df[['srch_id', 'prop_id']]

def constructs_predictions(model, data, ideal_df = None):
    y_pred = model.predict_proba(data)
    pred_grades = y_pred @ [0, 1, 5]

    if ideal_df is not None:
        pred_df = construct_pred_ideal(data, ideal_df, pred_grades)
    else:
        pred_df = construct_pred_submission(data, pred_grades)
    return pred_df


def calc_NDCG(df_ideal, df_pred, k = 5):
    # Group by 5
    df_ideal = df_ideal.groupby('srch_id').head(k)
    df_pred = df_pred.groupby('srch_id').head(k)

    assert df_ideal.shape[0] % k == 0
    assert df_pred.shape[0] % k == 0
    
    # Get grades matrices
    ideal_grades = df_ideal['target'].values.reshape(int(df_ideal.shape[0] / k), k)
    pred_grades = df_pred['target'].values.reshape(int(df_pred.shape[0] / k), k)

    discount_vec = [1/np.log2(i+2) for i in range(k)]

    # Calculate NDCG
    NDCG = (pred_grades @ discount_vec).sum() / (ideal_grades @ discount_vec).sum()

    return NDCG


In [49]:
df_cur = df
X_train, X_test, y_train, y_test, test_ideal = train_test_split(df_cur, 'target')

In [60]:
# LightGBM ranker
import lightgbm as lgb
import optuna.integration.lightgbm as optuna_lgb

# Create dataset
group_train = X_train.groupby('srch_id').size().values
group_val = X_test.groupby('srch_id').size().values
group_test = df_test.groupby('srch_id').size().values

X_train_lgb = X_train.drop(['srch_id'], axis=1)
X_val_lgb = X_test.drop(['srch_id'], axis=1)

# Creating the ranker object
ranker = lgb.LGBMRanker(
                    objective="lambdarank",
                    boosting_type = "gbdt",
                    n_estimators = 300,
                    importance_type = "gain",
                    metric= "ndcg",
                    num_leaves = 10,
                    learning_rate = 0.05,
                    max_depth = 10)
                    # label_gain =[i for i in range(max(y_train.max(), y_test.max()) + 1)])

# Training the model
ranker.fit(
      X=X_train_lgb,
      y=y_train,
      group=group_train,
      eval_set=[(X_train_lgb, y_train),(X_val_lgb, y_test)],
      eval_group=[group_train, group_val],
      eval_at=[5])



[1]	training's ndcg@5: 0.260898	valid_1's ndcg@5: 0.257203
[2]	training's ndcg@5: 0.286388	valid_1's ndcg@5: 0.282774
[3]	training's ndcg@5: 0.307505	valid_1's ndcg@5: 0.305398
[4]	training's ndcg@5: 0.32327	valid_1's ndcg@5: 0.321026
[5]	training's ndcg@5: 0.331178	valid_1's ndcg@5: 0.328779
[6]	training's ndcg@5: 0.336028	valid_1's ndcg@5: 0.3334
[7]	training's ndcg@5: 0.337903	valid_1's ndcg@5: 0.334434
[8]	training's ndcg@5: 0.342956	valid_1's ndcg@5: 0.339629
[9]	training's ndcg@5: 0.344279	valid_1's ndcg@5: 0.340288
[10]	training's ndcg@5: 0.345494	valid_1's ndcg@5: 0.341548
[11]	training's ndcg@5: 0.347928	valid_1's ndcg@5: 0.344023
[12]	training's ndcg@5: 0.348417	valid_1's ndcg@5: 0.344556
[13]	training's ndcg@5: 0.350871	valid_1's ndcg@5: 0.346388
[14]	training's ndcg@5: 0.353239	valid_1's ndcg@5: 0.348623
[15]	training's ndcg@5: 0.353023	valid_1's ndcg@5: 0.348551
[16]	training's ndcg@5: 0.353705	valid_1's ndcg@5: 0.348446
[17]	training's ndcg@5: 0.355641	valid_1's ndcg@5: 0

In [73]:
# Predicting the scores
# test = X_test
test = df_test

test_input = test.drop(['srch_id'], axis=1)

y_pred = ranker.predict(test_input)
df_res = test.copy()
df_res['pred_grades'] = y_pred
df_res = df_res.sort_values(by=['srch_id', 'pred_grades'], ascending=[True, False], inplace=False)
# df_res = df_res.merge(test_ideal, on=['srch_id', 'prop_id'], how='left')
# df_res



In [77]:
lgbm_submission = df_res[['srch_id', 'prop_id']]
lgbm_submission.to_csv(config['PATH']['SUBMISSION_DIR'] + '/lgbm_submission.csv', index=False)

In [71]:
print(f"LGBM: {calc_NDCG(test_ideal, df_res)}, Random: {calc_NDCG(test_ideal, pred_random)}")


LGBM: 0.39285307523155705, Random: 0.004541951536113568


In [31]:
# # Random forest
# from sklearn.ensemble import RandomForestClassifier
# print('Training RF')
# rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
# rf.fit(X_train, y_train)

# XGBoost
print('Training XGB')
import xgboost as xgb
y_train_xgb = y_train.astype(int)
y_train_xgb[y_train == 5] = 2
xgb_model = xgb.XGBClassifier(objective="multi:softprob", random_state=42)
xgb_model.fit(X_train, y_train_xgb)

# pred_ideal_rf = constructs_predictions(rf, X_test, ideal_df=test_ideal)
pred_xgb = constructs_predictions(xgb_model, X_test, ideal_df=test_ideal)
pred_random = construct_pred_ideal(X_test, test_ideal, np.random.rand(len(X_test)))

Training XGB


## Evaluation


In [66]:
pred_xgb

Unnamed: 0,srch_id,prop_id,pred_grades,target
0,1,68914,1.333926,5.0
1,1,88218,0.736154,0.0
2,1,97247,0.577588,0.0
3,1,29604,0.575803,0.0
4,1,95307,0.480201,0.0
...,...,...,...,...
29766,9988,39329,0.089737,0.0
29767,9988,1495,0.088096,0.0
29768,9988,77411,0.054477,0.0
29769,9988,139360,0.045843,0.0


In [32]:
# print(f"RF: {calc_NDCG(test_ideal, pred_ideal_rf)}\n,XGB: {calc_NDCG(test_ideal, pred_xgb_optimized)},\nRandom: {calc_NDCG(test_ideal, pred_random)}")
print(f"XGB: {calc_NDCG(test_ideal, pred_xgb)}, Random: {calc_NDCG(test_ideal, pred_random)}")

XGB: 0.3375755506487008, Random: 0.15050172446700524


## Optuna + XGBRegressor

In [96]:
# Optimize XGB with optuna
import optuna
from functools import partial

def objective(trial, X_train, y_train, X_test, test_ideal):
    y_train_xgb = y_train.astype(int)
    y_train_xgb[y_train == 5] = 2

    params = {
        "objective": "multi:softprob",
        "random_state": 42,
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 1e-1, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 1e-1, log=True),
    }

    xgb_model = xgb.XGBClassifier(**params)
    xgb_model.fit(X_train, y_train_xgb)

    pred_xgb = constructs_predictions(xgb_model, X_test, ideal_df=test_ideal)
    ndcg = calc_NDCG(test_ideal, pred_xgb)

    return ndcg

print("Training XGB")
# Assuming you have defined X_train, y_train, X_test, and test_ideal before this point.

# Wrap the objective function with the input data
objective_with_data = partial(objective, X_train=X_train, y_train=y_train, X_test=X_test, test_ideal=test_ideal)

# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction="maximize")
study.optimize(objective_with_data, n_trials=20)


# Train the final model with the best hyperparameters
y_train_xgb = y_train.astype(int)
y_train_xgb[y_train == 5] = 2

best_params = study.best_params
xgb_model_optimized = xgb.XGBClassifier(objective="multi:softprob", random_state=42, **best_params)
xgb_model_optimized.fit(X_train, y_train_xgb)

# Evaluate the optimized model
pred_xgb_optimized = constructs_predictions(xgb_model_optimized, X_test, ideal_df=test_ideal)
pred_xgb_submission = constructs_predictions(xgb_model_optimized, df_test)
print(f"XGB Optimized: {calc_NDCG(test_ideal, pred_xgb_optimized)}")

# pred_submission.to_csv(config['PATH']['DATA_DIR'] + '/submission_RF.csv', index=False)
pred_xgb_submission.to_csv(config['PATH']['DATA_DIR'] + '/submission_XGB.csv', index=False)

[32m[I 2023-05-09 13:56:31,830][0m A new study created in memory with name: no-name-3a7cbff6-7ed5-4fc1-8c20-9778f5cce14a[0m


Training XGB


[32m[I 2023-05-09 13:58:02,635][0m Trial 0 finished with value: 0.3422170675116014 and parameters: {'n_estimators': 218, 'max_depth': 10, 'learning_rate': 0.002795642578981349, 'subsample': 0.8932459525721343, 'colsample_bytree': 0.6546014752508442, 'gamma': 0.4545479889258107, 'reg_alpha': 0.0006735472057143736, 'reg_lambda': 0.05659086785788689}. Best is trial 0 with value: 0.3422170675116014.[0m
[32m[I 2023-05-09 13:58:34,110][0m Trial 1 finished with value: 0.3296238751532763 and parameters: {'n_estimators': 122, 'max_depth': 5, 'learning_rate': 0.00020205115375924383, 'subsample': 0.6995347755906247, 'colsample_bytree': 0.9885228465832642, 'gamma': 0.19381601429279216, 'reg_alpha': 0.03803815623242628, 'reg_lambda': 0.00015357257740569215}. Best is trial 0 with value: 0.3422170675116014.[0m
[32m[I 2023-05-09 13:59:37,760][0m Trial 2 finished with value: 0.3368665578611753 and parameters: {'n_estimators': 472, 'max_depth': 4, 'learning_rate': 0.004549507912707027, 'subsampl

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got [0 1 5]