In [12]:
import numpy as np
import pandas as pd
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Activation, Dense, Input, Subtract
from tensorflow.keras.models import Model
import numpy as np
import math

In [4]:
class Ranker(object):

    def __init__(self, input_size, hidden_layer_sizes=(100,), activation=('relu',), solver='adam'):
        """
        Parameters
        ----------
        input_size : integer
            Number of input features.
        hidden_layer_sizes : tuple, length = n_layers - 2, default (100,)
            The ith element represents the number of neurons in the ith
            hidden layer.
        activation : tuple, length = n_layers - 2, default ('relu',)
            The ith element represents activation function in the ith
            hidden layer.
        solver : {'adam', 'sgd', 'rmsprop', 'adagrad', 'adadelta', adamax},
        default 'adam'
            The solver for weight optimization.
            - 'adam' refers to a stochastic gradient-based optimizer proposed
              by Kingma, Diederik, and Jimmy Ba
        """
        if len(hidden_layer_sizes) != len(activation):
            raise ValueError('hidden_layer_sizes and activation should have the same size.')
        self.model = self._build_model(input_size, hidden_layer_sizes, activation)
        self.model.compile(optimizer=solver, loss="binary_crossentropy")

    @staticmethod
    def _build_model(input_shape, hidden_layer_sizes, activation):
        """
        Build Keras Ranker model (Ranknet / LambdaRank).
        """
        # Neural network structure
        hidden_layers = []
        for i in range(len(hidden_layer_sizes)):
            hidden_layers.append(Dense(hidden_layer_sizes[i], activation=activation[i], name=str(activation[i]) + '_layer' + str(i)))
        h0 = Dense(1, activation='linear', name='Identity_layer')
        input1 = Input(shape=(input_shape,), name='Input_layer1')
        input2 = Input(shape=(input_shape,), name='Input_layer2')
        x1 = input1
        x2 = input2
        for i in range(len(hidden_layer_sizes)):
            x1 = hidden_layers[i](x1)
            x2 = hidden_layers[i](x2)
        x1 = h0(x1)
        x2 = h0(x2)
        # Subtract layer
        subtracted = Subtract(name='Subtract_layer')([x1, x2])
        # sigmoid
        out = Activation('sigmoid', name='Activation_layer')(subtracted)
        # build model
        model = Model(inputs=[input1, input2], outputs=out)
        return model

    @staticmethod
    def _CalcDCG(labels):
        sumdcg = 0.0
        for i in range(len(labels)):
            rel = labels[i]
            if rel != 0:
                sumdcg += ((2 ** rel) - 1) / math.log2(i + 2)
        return sumdcg

    def _fetch_qid_data(self, y, qid, eval_at=None):
        """Fetch indices, relevances, idcg and dcg for each query id.
        Parameters
        ----------
        y : array, shape (n_samples,)
            Target labels.
        qid: array, shape (n_samples,)
            Query id that represents the grouping of samples.
        eval_at: integer
            The rank postion to evaluate dcg and idcg.
        Returns
        -------
        qid2indices : array, shape (n_unique_qid,)
            Start index for each qid.
        qid2rel : array, shape (n_unique_qid,)
            A list of target labels (relevances) for each qid.
        qid2idcg: array, shape (n_unique_qid,)
            Calculated idcg@eval_at for each qid.
        qid2dcg: array, shape (n_unique_qid,)
            Calculated dcg@eval_at for each qid.
        """
        qid_unique, qid2indices, qid_inverse_indices = np.unique(qid, return_index=True, return_inverse=True)
        # get item releveance for each query id
        qid2rel = [[] for _ in range(len(qid_unique))]
        for i, qid_unique_index in enumerate(qid_inverse_indices):
            qid2rel[qid_unique_index].append(y[i])
        # get dcg, idcg for each query id @eval_at
        if eval_at:
            qid2dcg = [self._CalcDCG(qid2rel[i][:eval_at]) for i in range(len(qid_unique))]
            qid2idcg = [self._CalcDCG(sorted(qid2rel[i], reverse=True)[:eval_at]) for i in range(len(qid_unique))]
        else:
            qid2dcg = [self._CalcDCG(qid2rel[i]) for i in range(len(qid_unique))]
            qid2idcg = [self._CalcDCG(sorted(qid2rel[i], reverse=True)) for i in range(len(qid_unique))]
        return qid2indices, qid2rel, qid2idcg, qid2dcg


    def _transform_pairwise(self, X, y, qid):
        return None, None, None, None


    def fit(self, X, y, qid, batch_size=None, epochs=1, verbose=1, validation_split=0.0):
        """Transform data and fit model.
        Parameters
        ----------
        X : array, shape (n_samples, n_features)
            Features.
        y : array, shape (n_samples,)
            Target labels.
        qid: array, shape (n_samples,)
            Query id that represents the grouping of samples.
        """
        X1_trans, X2_trans, y_trans, weight = self._transform_pairwise(X, y, qid)
        self.model.fit([X1_trans, X2_trans], y_trans, sample_weight=weight, batch_size=batch_size, epochs=epochs,
                       verbose=verbose, validation_split=validation_split)
        fit_temp = self.evaluate(X, y, qid)
        return fit_temp

    def predict(self, X):
        """Predict output.
        Parameters
        ----------
        X : array, shape (n_samples, n_features)
            Features.
        Returns
        -------
        y_pred: array, shape (n_samples,)
            Model prediction.
        """
        ranker_output = K.function([self.model.layers[0].input], [self.model.layers[-3].get_output_at(0)])
        return ranker_output([X])[0].ravel()

    def evaluate(self, X, y, qid, eval_at=None):
        """Predict and evaluate ndcg@eval_at.
        Parameters
        ----------
        X : array, shape (n_samples, n_features)
            Features.
        y : array, shape (n_samples,)
            Target labels.
        qid: array, shape (n_samples,)
            Query id that represents the grouping of samples.
        eval_at: integer
            The rank postion to evaluate NDCG.
        Returns
        -------
        ndcg@eval_at: float
        """
        ndcg_temp = []
        y_pred = self.predict(X)
        tmp = np.array(np.hstack([y.reshape(-1, 1), y_pred.reshape(-1, 1), qid.reshape(-1, 1)]))
        tmp = tmp[np.lexsort((-tmp[:, 1], tmp[:, 2]))]
        y_sorted = tmp[:, 0]
        qid_sorted = tmp[:, 2]
        ndcg = self._EvalNDCG(y_sorted, qid_sorted, eval_at)
        if eval_at:
            print('ndcg@' + str(eval_at) + ': ' + str(ndcg))
        else:
            print('ndcg: ' + str(ndcg))
            ndcg_temp.append(ndcg)
        return ndcg

    def _EvalNDCG(self, y, qid, eval_at=None):
        """Evaluate ndcg@eval_at.
        Calculated ndcg@n is consistent with ndcg@n- in xgboost.
        """
        _, _, qid2idcg, qid2dcg = self._fetch_qid_data(y, qid, eval_at)
        sumndcg = 0
        count = 0.0
        for qid_unique_idx in range(len(qid2idcg)):
            count += 1
            if qid2idcg[qid_unique_idx] == 0:
                continue
            idcg = qid2idcg[qid_unique_idx]
            dcg = qid2dcg[qid_unique_idx]
            sumndcg += dcg / idcg
        return sumndcg / count

class LambdaRank(Ranker):

    def __init__(self, input_size, hidden_layer_sizes=(100,), activation=('relu',), solver='adam'):
        super(LambdaRank, self).__init__(input_size, hidden_layer_sizes, activation, solver)

    def _transform_pairwise(self, X, y, qid):
        """Transform data into lambdarank pairs with balanced labels
        for binary classification.
        Parameters
        ----------
        X : array, shape (n_samples, n_features)
            Features.
        y : array, shape (n_samples,)
            Target labels.
        qid: array, shape (n_samples,)
            Query id that represents the grouping of samples.
        Returns
        -------
        X1_trans : array, shape (k, n_feaures)
            Features of pair 1
        X2_trans : array, shape (k, n_feaures)
            Features of pair 2
        weight: array, shape (k, n_faetures)
            Sample weight lambda.
        y_trans : array, shape (k,)
            Output class labels, where classes have values {0, 1}
        """
        qid2indices, qid2rel, qid2idcg, _ = self._fetch_qid_data(y, qid)
        X1 = []
        X2 = []
        weight = []
        Y = []
        for qid_unique_idx in range(len(qid2indices)):
            if qid2idcg[qid_unique_idx] == 0:
                continue
            IDCG = 1.0 / qid2idcg[qid_unique_idx]
            rel_list = qid2rel[qid_unique_idx]
            qid_start_idx = qid2indices[qid_unique_idx]
            for pos_idx in range(len(rel_list)):
                for neg_idx in range(len(rel_list)):
                    if rel_list[pos_idx] <= rel_list[neg_idx]:
                        continue
                    # calculate lambda
                    pos_loginv = 1.0 / math.log2(pos_idx + 2)
                    neg_loginv = 1.0 / math.log2(neg_idx + 2)
                    pos_label = rel_list[pos_idx]
                    neg_label = rel_list[neg_idx]
                    original = ((1 << pos_label) - 1) * pos_loginv + ((1 << neg_label) - 1) * neg_loginv
                    changed = ((1 << neg_label) - 1) * pos_loginv + ((1 << pos_label) - 1) * neg_loginv
                    delta = (original - changed) * IDCG
                    if delta < 0:
                        delta = -delta
                    # balanced class
                    if 1 != (-1) ** (qid_unique_idx + pos_idx + neg_idx):
                        X1.append(X[qid_start_idx + pos_idx])
                        X2.append(X[qid_start_idx + neg_idx])
                        weight.append(delta)
                        Y.append(1)
                    else:
                        X1.append(X[qid_start_idx + neg_idx])
                        X2.append(X[qid_start_idx + pos_idx])
                        weight.append(delta)
                        Y.append(0)
        return np.asarray(X1), np.asarray(X2), np.asarray(Y), np.asarray(weight)


In [5]:
def adjustLETOR(df):
    df[96] = df[0]
    df[0] = df[2]
    df[2] = df[97]
    drop_cols = list(range(1, 96, 2))
    drop_cols.extend(range(97, 104))
    df_adjusted = df.drop(drop_cols, 1)
    df_adjusted.columns = list(range(0, 49))
    df_adjusted[49] = df_adjusted[48] > 0
    df_adjusted.infer_objects()
    df_adjusted[49] = df_adjusted[49].apply(int)
    return df_adjusted

In [51]:
def preprocessing_df(Path):
    df_raw = pd.read_csv(Path, " |:", header=None, engine='python')
    df = adjustLETOR(df_raw)
    return df

def learn_to_rank(Path, epochs_val):
    train_df = preprocessing_df(Path + "train.txt")
    val_df = preprocessing_df(Path + "vali.txt")
    test_df = preprocessing_df(Path + "test.txt")
    X_train = train_df.iloc[:,2:48]
    X_train = X_train.to_numpy()
    Y_train = train_df.iloc[:,49]
    Y_train = Y_train.to_numpy()
    qid_train = train_df.iloc[:,:1]
    qid_train = qid_train.to_numpy().flatten()
    
    # train model
    ranker = LambdaRank(input_size=X_train.shape[1], hidden_layer_sizes=(16,8,), activation=('relu', 'relu',), solver='adam')
    ndcg = ranker.fit(X_train, Y_train, qid_train, epochs=epochs_val)
    y_pred = ranker.predict(X_train)
    return ndcg

def ndcg_eval(Path, eval_val):
    train_df = preprocessing_df(Path + "train.txt")
    val_df = preprocessing_df(Path + "vali.txt")
    test_df = preprocessing_df(Path + "test.txt")
    X_train = train_df.iloc[:,2:48]
    X_train = X_train.to_numpy()
    Y_train = train_df.iloc[:,49]
    Y_train = Y_train.to_numpy()
    qid_train = train_df.iloc[:,:1]
    qid_train = qid_train.to_numpy().flatten()
    
    # evaluate model
    ranker = LambdaRank(input_size=X_train.shape[1], hidden_layer_sizes=(16,8,), activation=('relu', 'relu',), solver='adam')
    ndcg_eval = ranker.evaluate(X_train, Y_train, qid_train, eval_at=eval_val)
    return ndcg_eval

In [52]:
list_ndcg = []
epochs = 5
for idx in range(1,6):
    directory = "MQ2008/Fold"+str(idx)+"/"
    dir_ndcg = learn_to_rank(directory, epochs)
    list_ndcg.append(dir_ndcg)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ndcg: 0.5748880181368702
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ndcg: 0.6050986062167453
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ndcg: 0.6001564358780641
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ndcg: 0.5670915651442661
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ndcg: 0.5437057449486725


In [123]:
list_ndcg_eval = []
list_eval = [1,3,5,10]
temp = []
for fold in range(1,6):
    print("NDCG Eval for fold" + str(fold) + ":")
    for idx in range(len(list_eval)):
        directory = "MQ2008/Fold"+str(idx+1)+"/"
        dir_ndcg = ndcg_eval(directory, list_eval[idx])
        temp.append(dir_ndcg)
list_ndcg_eval.append(temp)

NDCG Eval for fold1:
ndcg@1: 0.17197452229299362
ndcg@3: 0.2609337968578328
ndcg@5: 0.36966646339211134
ndcg@10: 0.3688230307598316
NDCG Eval for fold2:
ndcg@1: 0.28662420382165604
ndcg@3: 0.36741360605509954
ndcg@5: 0.4433940972500616
ndcg@10: 0.4527724711928939
NDCG Eval for fold3:
ndcg@1: 0.14012738853503184
ndcg@3: 0.35673311844196365
ndcg@5: 0.24145519084343575
ndcg@10: 0.45511303224601674
NDCG Eval for fold4:
ndcg@1: 0.13800424628450106
ndcg@3: 0.28762830970463577
ndcg@5: 0.18690519785799095
ndcg@10: 0.44753609125776045
NDCG Eval for fold5:
ndcg@1: 0.06157112526539278
ndcg@3: 0.355936695144012
ndcg@5: 0.2705458672705965
ndcg@10: 0.3324718774390957


In [131]:
for idx in range(1,6):
    print("NDGC for fold " + str(idx) + " : ", round(list_ndcg[idx-1], 3))
print("NDGC Average : " + str(round(sum(list_ndcg) / len(list_ndcg), 3)))
print("------------------------------")
count = 0;
num = 0;
eval_num  = 0;
for fold in range(1,6):
    eval_num = 0;
    print("NDGC Evaluate for fold " + str(fold) + " :")
    num += 4;
    while(count != num):
        print("NDGC@" + str(list_eval[eval_num]) + ": " + str(round(list_ndcg_eval[0][count], 3)))
        count+=1;
        eval_num+=1;

NDGC for fold 1 :  0.575
NDGC for fold 2 :  0.605
NDGC for fold 3 :  0.6
NDGC for fold 4 :  0.567
NDGC for fold 5 :  0.544
NDGC Average : 0.578
------------------------------
NDGC Evaluate for fold 1 :
NDGC@1: 0.172
NDGC@3: 0.261
NDGC@5: 0.37
NDGC@10: 0.369
NDGC Evaluate for fold 2 :
NDGC@1: 0.287
NDGC@3: 0.367
NDGC@5: 0.443
NDGC@10: 0.453
NDGC Evaluate for fold 3 :
NDGC@1: 0.14
NDGC@3: 0.357
NDGC@5: 0.241
NDGC@10: 0.455
NDGC Evaluate for fold 4 :
NDGC@1: 0.138
NDGC@3: 0.288
NDGC@5: 0.187
NDGC@10: 0.448
NDGC Evaluate for fold 5 :
NDGC@1: 0.062
NDGC@3: 0.356
NDGC@5: 0.271
NDGC@10: 0.332
