In [1]:
import pandas as pd 
import numpy as np 

#visualization
import matplotlib.pyplot as plt
import seaborn as sns


#statistical analysis 
import statsmodels.api as sm

#ML models 
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, IsolationForest
import xgboost as xgb
from sklearn.metrics import ndcg_score
from sklearn.model_selection import GroupKFold

#preprocessing
from sklearn.model_selection import train_test_split  # This line was missing
from sklearn.model_selection import GroupShuffleSplit

from tqdm import tqdm 

## Reading data 

In [2]:
df = pd.read_csv('cleaned_data.csv')
df.head(5)

Unnamed: 0.1,Unnamed: 0,q_id,d_id,relevancy_score_1,relevancy_score_2,category_id,brand_id,price,target_score,relevancy_score_1_rank,relevancy_score_2_rank,price_rank,discount_rank,search_view_mean,search_view_std,search_click_std,acceptance_ratio_mean,search_view_rank,search_sales_rank,acceptance_ratio_rank
0,0,0,351626,1029.9591,0.692121,1.0,1.0,2555000.0,0.621563,20.5,23.0,7.0,20.0,1851.6,251.503567,26.601796,0.047858,10.0,10.0,9.0
1,1,0,392222,1029.9591,0.27383,1.0,1.0,2450000.0,0.589059,20.5,2.0,5.5,20.0,1660.3,209.06833,22.299975,0.060965,9.0,9.0,10.0
2,2,0,475019,1029.9591,0.316562,1.0,1.0,,0.584239,20.5,7.0,,20.0,,,,,,,
3,3,0,480445,1029.9591,0.318788,1.0,1.0,1317500.0,0.579027,20.5,10.0,1.0,40.0,728.6,170.300519,9.989439,0.036558,7.0,7.0,7.0
4,4,0,286374,1029.9591,0.397037,1.0,1.0,5380000.0,0.577228,20.5,14.0,8.0,20.0,1079.8,212.732383,10.622827,0.044706,8.0,3.5,8.0


## Train-Test Split:
In this step, we partitioned the data into training and testing sets using an 80-20 split. Crucially, we treated `q_id` as our grouping variable to ensure that all instances of a given `q_id` are exclusively in either the training or testing set, but not both. 

In [3]:
#to split train and test without conflict with `q_id`s 
gss = GroupShuffleSplit(test_size=0.20, n_splits=1, random_state = 0).split(df, groups=df['q_id'])

X_train_inds, X_test_inds = next(gss)

#train data 
train_data= df.iloc[X_train_inds]
X_train = train_data.loc[:, ~train_data.columns.isin(['d_id','target_score'])]
y_train = train_data.loc[:, train_data.columns.isin(['target_score'])]
#test data 
test_data= df.iloc[X_test_inds]
X_test = test_data.loc[:, ~test_data.columns.isin(['target_score','d_id'])]
y_test = test_data.loc[:, test_data.columns.isin(['target_score','q_id'])]

## Cross-Validation and Hyperparameter Tuning:
1. Initially, we define an evaluation function leveraging NDCG scores, which aligns perfectly with the requirements of our ranking task. 
2. Subsequently, we employed a combination of grid search and cross-validation techniques exclusively on the training data to identify the optimal set of hyperparameters. 

In [5]:
# evaluate the model using NDCG score.
def eval_metric(model, X_val, y_val, groups_val):

    # predictions from the model
    preds = model.predict(X_val)
    
    # to handle groups 
    group_start_idx = 0
    
    ndcg_scores = []
    
    for group_size in groups_val:
        end_idx = group_start_idx + group_size
        
        # true labels and predictions for the current group
        true_labels = y_val.iloc[group_start_idx:end_idx].to_numpy().reshape(-1)
        group_preds = preds[group_start_idx:end_idx]
        ndcg_score_val = ndcg_score([true_labels], [group_preds])
        ndcg_scores.append(ndcg_score_val)
        
        group_start_idx = end_idx  # move to the next group
    
    return np.mean(ndcg_scores)


In [6]:
# define the parameter grid
param_grid = {
    'learning_rate': [0.1,0.01,0.001],
    'n_estimators': [50,100,150,200],
    'max_depth': [3, 6, 9]
    
}

# best parameters and score
best_params = None
best_score = float('inf')

# cross-validation based on group ('q_id')
gkf = GroupKFold(n_splits=5)
for n_estimators in tqdm(param_grid['n_estimators']):
    for learning_rate in param_grid['learning_rate']:
        for max_depth in param_grid['max_depth']:
            scores = []
            for train_idx, val_idx in gkf.split(X_train, y_train, groups=train_data['q_id']):
                X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
                groups_train_fold = train_data.iloc[train_idx].groupby('q_id').size().to_numpy()
                groups_val_fold = train_data.iloc[val_idx].groupby('q_id').size().to_numpy()

                model = xgb.XGBRanker(
                    objective='rank:pairwise',
                    learning_rate=learning_rate,
                    n_estimators=n_estimators,
                    max_depth=max_depth,
                )
                
                model.fit(X_train_fold, y_train_fold, group=groups_train_fold, verbose=False)
                score = eval_metric(model, X_val_fold, y_val_fold, groups_val_fold) 
                scores.append(score)
            
            avg_score = np.mean(scores)
            if avg_score < best_score:
                best_score = avg_score
                best_params = {
                    'learning_rate': learning_rate,
                    'n_estimators': n_estimators,
                    'max_depth': max_depth,
                }

# train model with the best parameters
model = xgb.XGBRanker(
    objective='rank:pairwise',
    **best_params
)
groups = train_data.groupby('q_id').size().to_numpy()
model.fit(X_train, y_train, group=groups, verbose=True)

print("Best parameters found: ", best_params)


100%|████████████████████████████████████████████| 4/4 [33:29<00:00, 502.41s/it]


Best parameters found:  {'learning_rate': 0.001, 'n_estimators': 50, 'max_depth': 3}


## Testing Phase:
In the final stage of our modeling process, we:
1. Train the model on the training data using the best set of hyperparameters.
2. Evaluate the model's performance on the test data by calculating the NDCG score. 
3. Save the trained model for future use.

In [7]:

# predict the scores using the trained model
y_pred = model.predict(X_test)

# calculate NDCG scores 
ndcg_scores = []

#grouping by 'q_id' 
test_groups = test_data.groupby('q_id').size().to_numpy()

start_idx = 0
for group_size in test_groups:
    end_idx = start_idx + group_size
    true_relevance = [y_test[start_idx:end_idx]['target_score'].values]
    scores_pred = np.asarray([y_pred[start_idx:end_idx]])
    ndcg_score_val = ndcg_score(true_relevance, scores_pred)
    ndcg_scores.append(ndcg_score_val)
    start_idx = end_idx

mean_ndcg_score = np.mean(ndcg_scores)

print("Mean NDCG Score across all groups in test data:", mean_ndcg_score)

Mean NDCG Score across all groups in test data: 0.9260870670857707


In [8]:
# save the model
model.save_model('xgbranker_model.json')