In [1]:
%load_ext autoreload
%autoreload 2

from __future__ import annotations # For type hinting my own class!
from DecisionTreeRegressor import DecisionTreeRegressor


In [2]:
import numpy as np

def load_letor_file(file_path):
    """
    Load a LETOR-formatted file into features and relevance labels.

    Args:
        file_path (str): Path to the LETOR file.

    Returns:
        X (np.ndarray): Feature matrix of shape (num_samples, num_features).
        y (np.ndarray): Relevance labels of shape (num_samples, ).
        qids (list): Query IDs corresponding to each sample.
    """
    features = []
    labels = []
    query_ids = []

    with open(file_path, 'r') as file:
        for line in file:
            # Remove comments and split the line
            line = line.split('#')[0].strip()
            parts = line.split()

            # Extract relevance score, query ID, and features
            relevance = int(parts[0])
            qid = int(parts[1].split(':')[1])
            feature_values = [float(x.split(':')[1]) for x in parts[2:]]

            # Append to respective lists
            labels.append(relevance)
            query_ids.append(qid)
            features.append(feature_values)

    # Convert to NumPy arrays
    X = np.array(features)
    y = np.array(labels)
    return X, y, query_ids

# Paths to the dataset files
train_file = 'OHSUMED/Data/Fold1/trainingset.txt'  # Replace with actual path
test_file = 'OHSUMED/Data/Fold1/testset.txt'
valid_file = 'OHSUMED/Data/Fold1/validationset.txt'

# Load datasets
X_train, y_train, qids_train = load_letor_file(train_file)
X_valid, y_valid, qids_valid = load_letor_file(valid_file)
X_test, y_test, qids_test = load_letor_file(test_file)

# Print dataset shapes
print(f"Train set: X shape {X_train.shape}, y shape {y_train.shape}")
print(f"Validation set: X shape {X_valid.shape}, y shape {y_valid.shape}")
print(f"Test set: X shape {X_test.shape}, y shape {y_test.shape}")

# Example output (first 5 rows)
print("First 5 training samples:\n", X_train[:5])
print("First 5 relevance labels:\n", y_train[:5])
print("First 5 query IDs:\n", qids_train[:5])


Train set: X shape (9219, 25), y shape (9219,)
Validation set: X shape (3538, 25), y shape (3538,)
Test set: X shape (3383, 25), y shape (3383,)
First 5 training samples:
 [[ 3.00000000e+00  2.07944154e+00  2.72727270e-01  2.61034130e-01
   3.73305651e+01  1.14312412e+01  3.72997501e+01  1.13865735e+00
   1.55242894e+01  8.83129655e+00  1.20000000e+01  5.37527841e+00
   8.75912400e-02  8.64936400e-02  2.83030646e+01  9.34002375e+00
   2.48087847e+01  3.93090680e-01  5.74165170e+01  3.29489291e+00
   2.50231000e+01  3.21979940e+00 -3.87098000e+00 -3.90273000e+00
  -3.87512000e+00]
 [ 3.00000000e+00  2.07944154e+00  4.28571430e-01  4.00594180e-01
   3.73305651e+01  1.14312412e+01  3.72997501e+01  1.81447983e+00
   1.74549923e+01  1.16179306e+01  1.00000000e+01  5.19295685e+00
   8.54700900e-02  8.45371100e-02  2.83030646e+01  9.34002375e+00
   2.48087847e+01  3.49204570e-01  4.32406261e+01  2.65472417e+00
   2.34903000e+01  3.15658757e+00 -3.96838000e+00 -4.00865000e+00
  -3.98670000e+00

# LightGBM

In [3]:
import lightgbm as lgb

# Convert query IDs to group lengths (LightGBM requires group info for ranking tasks)
def get_group_sizes(qids):
    """
    Compute group sizes for LightGBM from query IDs.

    Args:
        qids (list): List of query IDs.

    Returns:
        group_sizes (list): Number of samples per query group.
    """
    from collections import Counter
    qid_counts = Counter(qids)
    return [qid_counts[qid] for qid in sorted(qid_counts.keys())]

# Group sizes
group_train = get_group_sizes(qids_train)
group_valid = get_group_sizes(qids_valid)

# Prepare LightGBM datasets
train_data = lgb.Dataset(X_train, label=y_train, group=group_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, group=group_valid, reference=train_data)

# LightGBM parameters for LambdaMART
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'max_position': 10,  # Evaluate NDCG@10
    'label_gain': [0, 1, 3, 7],  # Relevance levels (adjust based on dataset)
    'learning_rate': 0.05,
    'num_leaves': 31,
    'min_data_in_leaf': 20,
    'num_iterations': 100,  # Number of boosting iterations
}

# Train the LambdaMART model
eval_result = {}
print("Training LightGBM LambdaMART model...")
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'valid'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(),
        lgb.record_evaluation(eval_result)
    ]
)


Training LightGBM LambdaMART model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000399 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4077
[LightGBM] [Info] Number of data points in the train set: 9219, number of used features: 25
[1]	train's ndcg@1: 0.444444	train's ndcg@2: 0.430117	train's ndcg@3: 0.452901	train's ndcg@4: 0.451371	train's ndcg@5: 0.452569	valid's ndcg@1: 0.190476	valid's ndcg@2: 0.294865	valid's ndcg@3: 0.318789	valid's ndcg@4: 0.321234	valid's ndcg@5: 0.306383
Training until validation scores don't improve for 50 rounds
[2]	train's ndcg@1: 0.772487	train's ndcg@2: 0.6856	train's ndcg@3: 0.639455	train's ndcg@4: 0.600256	train's ndcg@5: 0.581698	valid's ndcg@1: 0.222222	valid's ndcg@2: 0.320471	valid's ndcg@3: 0.345835	valid's ndcg@4: 0.354408	valid's ndcg@5: 0.356392
[3]	train's ndcg@1: 0.777778	train's ndcg@2: 0.702461	train's ndcg@3: 0.665371	train's ndcg@4: 0.637833	tra



# H E A V Y gbm.

In [5]:
from LambdaMARTy import LambdaMART
from collections import Counter

sorted_indices = np.argsort(qids_train)[::-1]
q_train2 = [Counter(qids_train)[i+1] for i in range(len(set(qids_train)))]
x_train2 = X_train[sorted_indices]
y_train2 = y_train[sorted_indices].reshape(-1,1)

marty = LambdaMART(
    min_samples_per_node=5,
    max_depth=20,
    impurity_measure='variance',
    learning_rate=0.1,
    num_trees=20
)

marty.build_forest(q_train2, x_train2, y_train2)

Training Tree 1
Precomputing Lambdas
Training Regressor
Evaluating NDCG
NDCG: 0.7691412518043562
Training Tree 2
Precomputing Lambdas
Training Regressor
Evaluating NDCG
NDCG: 0.776891532705439
Training Tree 3
Precomputing Lambdas
Training Regressor
Evaluating NDCG
NDCG: 0.7853137349546829
Training Tree 4
Precomputing Lambdas
Training Regressor
Evaluating NDCG
NDCG: 0.7811929831633312
Training Tree 5
Precomputing Lambdas
Training Regressor
Evaluating NDCG
NDCG: 0.7898327315656435
Training Tree 6
Precomputing Lambdas
Training Regressor
Evaluating NDCG
NDCG: 0.7966073036955552
Training Tree 7
Precomputing Lambdas
Training Regressor
Evaluating NDCG
NDCG: 0.7962148133401229
Training Tree 8
Precomputing Lambdas
Training Regressor
Evaluating NDCG
NDCG: 0.7989808756882015
Training Tree 9
Precomputing Lambdas
Training Regressor
Evaluating NDCG
NDCG: 0.7994412597551389
Training Tree 10
Precomputing Lambdas
Training Regressor
Evaluating NDCG
NDCG: 0.8016322821929259
Training Tree 11
Precomputing 

In [26]:
wawa = np.array([1,2,3]).reshape(-1,1)

In [None]:
sorted(wawa.reshape(-1))[::-1]

[3, 2, 1]