In [1]:
import numpy as np
import math
import random
import copy
from sklearn.tree import DecisionTreeRegressor
import pandas as pd

# Return the list of the dcg by browsing the list of documensts
def dcg(scores):
    return np.sum([(np.power(2, scores[i]) - 1) / np.log2(i + 2) for i in range(len(scores))]) 

# Return the list of the dcg by browsing the list of documents 
#(and truncating, that is, taking only the first k documents)
def dcg_k(scores, k):
    return np.sum([(np.power(2, scores[i]) - 1) / np.log2(i + 2) for i in range(len(scores[:k]))])

# Return the list of the dcg by browsing the list of documents in the descending order of the scores 
# In other words, the documents are well classified
def ideal_dcg(scores):
    scores = [score for score in sorted(scores)[::-1]]
    return dcg(scores)

# Return the list of the dcg by browsing the list of documents in the descending order of the scores
#(and truncating, that is, taking only the first k documents)
def ideal_dcg_k(scores, k):
    scores = [score for score in sorted(scores)[::-1]]
    return dcg_k(scores, k)

# Returns the dcg of two documents 
def single_dcg(scores, i, j):
    return (np.power(2, scores[i]) - 1) / np.log2(j + 2)

# Calculate the lambdas (which will allow the gradient descent)
def compute_lambda(args):
     # We take as argument the list of real scores, the list of predicted scores, 
    # pairs of documents such as document i has a higher score than document j,
    # the list of documents idcg and the list of requests  
    true_scores, predicted_scores, good_ij_pairs, idcg, query_key = args  
    # Count the number of documents
    num_docs = len(true_scores)
    # Sort the places of the documents in descending order of the scores
    # Otherwise we have first place in the list of the highest score document etc.
    sorted_indexes = np.argsort(predicted_scores)[::-1]
    # Sort the places of the documents in ascending order of the scores
    # Otherwise we have first place in the list of document at the lowest score etc.
    rev_indexes = np.argsort(sorted_indexes)
    # We take the real scores of the documents sorted according to the order of the scores
    true_scores = true_scores[sorted_indexes]
    # We take the predicted scores of the documents sorted according to the order of the scores
    predicted_scores = predicted_scores[sorted_indexes]
    # We create an empty vector of size the number of documents for lambdas
    lambdas = np.zeros(num_docs)
    # We create an empty vector of the size the number of documents for the weights
    w = np.zeros(num_docs)
    # We create a dictionary so the keys are the document pairs and the dcg values between the two documents 
    single_dcgs = {}
    # We go through the pairs of documents so the document i has a score higher than the document j
    # We add to the dictator the cdg between i and j and between j and i, then between i and i and j and j if they do not already exist
    for i,j in good_ij_pairs:
        if (i,i) not in single_dcgs:
            single_dcgs[(i,i)] = single_dcg(true_scores, i, i)
        single_dcgs[(i,j)] = single_dcg(true_scores, i, j)
        if (j,j) not in single_dcgs:
            single_dcgs[(j,j)] = single_dcg(true_scores, j, j)
        single_dcgs[(j,i)] = single_dcg(true_scores, j, i) 
    for i,j in good_ij_pairs:
        # Calculation of the NDCG for each pair (differences of dcg on dcg of i (the largest dcg))
        z_ndcg = abs(single_dcgs[(i,j)] - single_dcgs[(i,i)] + single_dcgs[(j,i)] - single_dcgs[(j,j)]) / idcg
        # Function defined in the article for the calculation of lambdas
        rho = 1 / (1 + np.exp(predicted_scores[i] - predicted_scores[j]))
        # rho_complement ?
        rho_complement = 1.0 - rho
        # Multiply the NDCG for the pair i, j by rho
        lambda_val = z_ndcg * rho
        # We sum lambda i, j to obtain lambda i and lambda j
        lambdas[i] += lambda_val
        lambdas[j] -= lambda_val
        # We update the weights too 
        w_val = rho * rho_complement * z_ndcg
        w[i] += w_val
        w[j] += w_val
    # We return the lambdas and weights for the sorted documents in ascending order of the scores
    # In other words, first, we have the document with the lowest score
    return lambdas[rev_indexes], w[rev_indexes], query_key

def group_queries(training_data, qid_index):
    # We create a dictionary whose keys are the queries and the values the lists of places of the documents in the list
    # documents used in the train database (list of lists containing the query the score and the features associated with the documents)
    query_indexes = {}
    index = 0
    for record in training_data:
        query_indexes.setdefault(record[qid_index], [])
        query_indexes[record[qid_index]].append(index)
        index += 1
    return query_indexes
 
# At the input of the function, we have a list of score lists according to the request
def get_pairs(scores):
    #We define a list 
    query_pair = []
    # Here we go through the lists in the list 
    for query_scores in scores:
        # We sort the scores in descending order (highest scores in premiere)
        temp = sorted(query_scores, reverse=True)
        pairs = []
        for i in range(len(temp)):
            for j in range(len(temp)):
                if temp[i] > temp[j]:
                    pairs.append((i,j))
        # We add to the final list 
        query_pair.append(pairs)
    return query_pair

# We define the LambdaMART class
class LambdaMART:

    #We fix 5 trees and a learning rate of 0.1 .. we can test other learning rate after !!
    def __init__(self, training_data=None, number_of_trees=5, learning_rate=0.1):

        self.training_data = training_data
        self.number_of_trees = number_of_trees
        self.learning_rate = learning_rate
        self.trees = []
    
    # The function fit allows to fitter the lambdas tree
    def fit(self):
        # We define as many predicted scores as lines in the training set
        predicted_scores = np.zeros(len(self.training_data))
        # On définit notre dictionnaire dont les clés sont les requêtes et les valeurs les places des documents
        # in the list of documents (the 1 because the second value of the list for each document has the second place the request)
        query_indexes = group_queries(self.training_data, 1)
        # We retrieve the list of requests
        query_keys = query_indexes.keys()
        # We obtain here a list of lists of scores, by request
        true_scores = [self.training_data[query_indexes[query], 0] for query in query_keys]
        # We obtain here the pairs of documents for each list of scores, so for each request
        good_ij_pairs = get_pairs(true_scores)
        # Here we obtain the set of feature vectors for documents
        tree_data = pd.DataFrame(self.training_data[:, 2:7])
        # We obtain here all the scores for the documents (the labels)
        labels = self.training_data[:, 0]
        # The list of dcg is calculated for the lists of scores ranked in descending order (ideal ranking)
        # We get a list of lists again
        idcg = [ideal_dcg(scores) for scores in true_scores]
        # We go through the number of trees 
        for k in range(self.number_of_trees):
            # We create as many lambdas as we have lines in our training set
            lambdas = np.zeros(len(predicted_scores))
            # We create as much weight as we have lines in our training set
            w = np.zeros(len(predicted_scores))
            # Here we obtain a list of predicted scores lists, by query
            pred_scores = [predicted_scores[query_indexes[query]] for query in query_keys]
            # pool allows to parrallelize already on the 4 hearts of the computer, by request 
            # Here, we obtain the lambdas, the weights for the documents of each request
            # In input of compute_lambda, one has the lists of the real scores for the docs of each request,
            # lists of predicted scores, lists of pairs of docs such as the first element has a higher score than the second,
            # the list of the dcg of the ideal classification of the documents, by request
            for lambda_val, w_val, query_key in map(compute_lambda, zip(true_scores, pred_scores, good_ij_pairs, idcg, query_keys)):
                # Here, we obtain the list of the places of the documents associated with the request 
                indexes = query_indexes[query_key]
                # We get the lambdas of the docs associated with the input request, we enter the final lambda vector
                lambdas[indexes] = lambda_val
                # We obtain the weights of the docs associated with the input request, we enter the vector final weight
                w[indexes] = w_val
            # Implementation sklearn of the three
            tree = DecisionTreeRegressor(max_depth=50)
            # We made the lambdas tree
            tree.fit(self.training_data[:,2:], lambdas)
            # We add the tree to our list of trees
            self.trees.append(tree)
            # We predict thanks to our tree and our features (the predict function is below)
            # Calculate the prediction for each document thanks to our previous trees and the new
            prediction = tree.predict(self.training_data[:,2:])
            # We update our predictions using our recent prediciton and learning_rate
            predicted_scores += prediction * self.learning_rate

    
    def predict(self, data):
        data = np.array(data)
        query_indexes = group_queries(data, 0)
        predicted_scores = np.zeros(len(data))
        for query in query_indexes:
            results = np.zeros(len(query_indexes[query]))
            for tree in self.trees:
                results += self.learning_rate * tree.predict(data[query_indexes[query], 1:])
            predicted_scores[query_indexes[query]] = results
        return predicted_scores

    # Validate function that predicts on the test base according to all the trees we have built, which also calculates the NDCG average
    def validate(self, data, k):
        data = np.array(data)
        query_indexes = group_queries(data, 1)
        average_ndcg = []
        predicted_scores = np.zeros(len(data))
        for query in query_indexes:
            results = np.zeros(len(query_indexes[query]))
            for tree in self.trees:
                results += self.learning_rate * tree.predict(data[query_indexes[query], 2:])
            predicted_sorted_indexes = np.argsort(results)[::-1]
            t_results = data[query_indexes[query], 0]
            t_results = t_results[predicted_sorted_indexes]
            predicted_scores[query_indexes[query]] = results
            dcg_val = dcg_k(t_results, k)
            idcg_val = ideal_dcg_k(t_results, k)
            ndcg_val = (dcg_val / idcg_val)
            average_ndcg.append(ndcg_val)
        average_ndcg = np.nanmean(average_ndcg)
        return average_ndcg, predicted_scores

def group_queries(training_data, qid_index):
    # We create a dictionary whose keys are the queries and the values the lists of places of the documents in the list
    # documents used in the train database (list of lists containing the query the score and the features associated with the documents)
    query_indexes = {}
    index = 0
    for record in training_data:
        query_indexes.setdefault(record[qid_index], [])
        query_indexes[record[qid_index]].append(index)
        index += 1
    return query_indexes
 
# At the input of the function, we have a list of score lists according to the request
def get_pairs(scores):
    #At the input of the function, we have a list of score lists according to the request
    query_pair = []
    # Here we go through the lists in the list 
    for query_scores in scores:
        # We sort the scores in descending order (highest scores in premiere)
        temp = sorted(query_scores, reverse=True)
        pairs = []
        for i in range(len(temp)):
            for j in range(len(temp)):
                if temp[i] > temp[j]:
                    pairs.append((i,j))
        # We add to the final list 
        query_pair.append(pairs)
    return query_pair

# We define the LambdaMART class
class LambdaMART:

    #We fix 5 trees and a learning rate of 0.1 .. we can test other learning rate after !!
    def __init__(self, training_data=None, number_of_trees=5, learning_rate=0.1):

        self.training_data = training_data
        self.number_of_trees = number_of_trees
        self.learning_rate = learning_rate
        self.trees = []
    
    # The function fit allows to fitter the lambdas tree
    def fit(self):
        # We define as many predicted scores as lines in the training set
        predicted_scores = np.zeros(len(self.training_data))
        # We define our dictionary whose keys are the requests and the values the places of the documents
        # in the list of documents (the 1 because the second value of the list for each document has the second place the request)
        query_indexes = group_queries(self.training_data, 1)
        # We retrieve the list of requests
        query_keys = query_indexes.keys()
        # We obtain here a list of lists of scores, by request
        true_scores = [self.training_data[query_indexes[query], 0] for query in query_keys]
        # We obtain here the pairs of documents for each list of scores, so for each request
        good_ij_pairs = get_pairs(true_scores)
        # Here we obtain the set of feature vectors for documents
        tree_data = pd.DataFrame(self.training_data[:, 2:7])
        # We obtain here all the scores for the documents (the labels)
        labels = self.training_data[:, 0]
        # The list of dcg is calculated for the lists of scores ranked in descending order (ideal ranking)
        # We get a list of lists again
        idcg = [ideal_dcg(scores) for scores in true_scores]
        # We go through the number of trees 
        for k in range(self.number_of_trees):
            # We create as many lambdas as we have lines in our training set
            lambdas = np.zeros(len(predicted_scores))
            # We create as much weight as we have lines in our training set
            w = np.zeros(len(predicted_scores))
            # Here we obtain a list of predicted scores lists, by query
            pred_scores = [predicted_scores[query_indexes[query]] for query in query_keys]
            # pool allows to parrallelize already on the 4 hearts of the computer, by request
            # Here, we obtain the lambdas, the weights for the documents of each request
            # In input of compute_lambda, one has the lists of the real scores for the docs of each request,
            # lists of predicted scores, lists of pairs of docs such as the first element has a higher score than the second,
            # the list of the dcg of the ideal classification of the documents, by request
            for lambda_val, w_val, query_key in map(compute_lambda, zip(true_scores, pred_scores, good_ij_pairs, idcg, query_keys)):
                # Here, we obtain the list of the places of the documents associated with the request 
                indexes = query_indexes[query_key]
                # We get the lambdas of the docs associated with the input request, we enter the final lambda vector
                lambdas[indexes] = lambda_val
                # We obtain the weights of the docs associated with the input request, we enter the vector final weight
                w[indexes] = w_val
            # Implementation sklearn of the three
            tree = DecisionTreeRegressor(max_depth=50)
            # We made the lambdas tree
            tree.fit(self.training_data[:,2:], lambdas)
            # We add the tree to our list of trees
            self.trees.append(tree)
            # We predict thanks to our tree and our features (the predict function is below)
            # Calculate the prediction for each document thanks to our previous trees and the new
            prediction = tree.predict(self.training_data[:,2:])
            # We update our predictions using our recent prediciton and learning_rate
            predicted_scores += prediction * self.learning_rate

    
    def predict(self, data):
        data = np.array(data)
        query_indexes = group_queries(data, 0)
        predicted_scores = np.zeros(len(data))
        for query in query_indexes:
            results = np.zeros(len(query_indexes[query]))
            for tree in self.trees:
                results += self.learning_rate * tree.predict(data[query_indexes[query], 1:])
            predicted_scores[query_indexes[query]] = results
        return predicted_scores

    #Validate function that predicts on the test base according to all the trees we have built, which also calculates the NDCG average
    def validate(self, data, k):
        data = np.array(data)
        query_indexes = group_queries(data, 1)
        average_ndcg = []
        predicted_scores = np.zeros(len(data))
        for query in query_indexes:
            results = np.zeros(len(query_indexes[query]))
            for tree in self.trees:
                results += self.learning_rate * tree.predict(data[query_indexes[query], 2:])
            predicted_sorted_indexes = np.argsort(results)[::-1]
            t_results = data[query_indexes[query], 0]
            t_results = t_results[predicted_sorted_indexes]
            predicted_scores[query_indexes[query]] = results
            dcg_val = dcg_k(t_results, k)
            idcg_val = ideal_dcg_k(t_results, k)
            ndcg_val = (dcg_val / idcg_val)
            average_ndcg.append(ndcg_val)
        average_ndcg = np.nanmean(average_ndcg)
        return average_ndcg, predicted_scores

def group_queries(training_data, qid_index):
    # We create a dictionary whose keys are the queries and the values the lists of places of the documents in the list
    # documents used in the train database (list of lists containing the query the score and the features associated with the documents)
    query_indexes = {}
    index = 0
    for record in training_data:
        query_indexes.setdefault(record[qid_index], [])
        query_indexes[record[qid_index]].append(index)
        index += 1
    return query_indexes
 
# At the input of the function, we have a list of score lists according to the request
def get_pairs(scores):
    # We define a list 
    query_pair = []
    # Here we go through the lists in the list
    for query_scores in scores:
        # We sort the scores in descending order (highest scores in premiere)
        temp = sorted(query_scores, reverse=True)
        pairs = []
        for i in range(len(temp)):
            for j in range(len(temp)):
                if temp[i] > temp[j]:
                    pairs.append((i,j))
        # We add to the final list 
        query_pair.append(pairs)
    return query_pair

# We define the LambdaMART class
class LambdaMART:

    # On fixed 5 trees and a learning rate of 0.1 .. we can test other learning rate after !!
    def __init__(self, training_data=None, number_of_trees=5, learning_rate=0.1):

        self.training_data = training_data
        self.number_of_trees = number_of_trees
        self.learning_rate = learning_rate
        self.trees = []
    
    #The function fit allows to fitter the lambdas tree
    def fit(self):
        # We define as many predicted scores as lines in the training set
        predicted_scores = np.zeros(len(self.training_data))
        # We define our dictionary whose keys are the requests and the values the places of the documents
        # in the list of documents (the 1 because the second value of the list for each document has the second place the request)
        query_indexes = group_queries(self.training_data, 1)
        # We retrieve the list of requests
        query_keys = query_indexes.keys()
        # We obtain here a list of lists of scores, by request
        true_scores = [self.training_data[query_indexes[query], 0] for query in query_keys]
        # We obtain here the pairs of documents for each list of scores, so for each request
        good_ij_pairs = get_pairs(true_scores)
        # Here we obtain the set of feature vectors for documents
        tree_data = pd.DataFrame(self.training_data[:, 2:7])
        # We obtain here all the scores for the documents (the labels)
        labels = self.training_data[:, 0]
        # The list of dcg is calculated for the lists of scores ranked in descending order (ideal ranking)
        # We get a list of lists again
        idcg = [ideal_dcg(scores) for scores in true_scores]
        # We go through the number of trees 
        for k in range(self.number_of_trees):
            # We create as many lambdas as we have lines in our training set
            lambdas = np.zeros(len(predicted_scores))
            # We create as much weight as we have lines in our training set
            w = np.zeros(len(predicted_scores))
            # Here we obtain a list of predicted scores lists, by query
            pred_scores = [predicted_scores[query_indexes[query]] for query in query_keys]
            # pool allows to parrallelize already on the 4 hearts of the computer, by request
            # Here, we obtain the lambdas, the weights for the documents of each request
            # In input of compute_lambda, one has the lists of the real scores for the docs of each request,
            # lists of predicted scores, lists of pairs of docs such as the first element has a higher score than the second,
            # the list of the dcg of the ideal classification of the documents, by request
            for lambda_val, w_val, query_key in map(compute_lambda, zip(true_scores, pred_scores, good_ij_pairs, idcg, query_keys)):
                # Here, we obtain the list of the places of the documents associated with the request 
                indexes = query_indexes[query_key]
                # We get the lambdas of the docs associated with the input request, we enter the final lambda vector
                lambdas[indexes] = lambda_val
                # We obtain the weights of the docs associated with the input request, we enter the vector final weight
                w[indexes] = w_val
            # Implementation sklearn of the three
            tree = DecisionTreeRegressor(max_depth=50)
            # We made the lambdas tree
            tree.fit(self.training_data[:,2:], lambdas)
            # We add the tree to our list of trees
            self.trees.append(tree)
            # We predict thanks to our tree and our features (the predict function is below)
            # Calcule la prédiction pour chaque document grâce à nos précédents arbres et au nouveau
            prediction = tree.predict(self.training_data[:,2:])
            # We update our predictions using our recent prediciton and learning_rate
            predicted_scores += prediction * self.learning_rate

    
    def predict(self, data):
        data = np.array(data)
        query_indexes = group_queries(data, 0)
        predicted_scores = np.zeros(len(data))
        for query in query_indexes:
            results = np.zeros(len(query_indexes[query]))
            for tree in self.trees:
                results += self.learning_rate * tree.predict(data[query_indexes[query], 1:])
            predicted_scores[query_indexes[query]] = results
        return predicted_scores

    #Validate function that predicts on the test base according to all the trees we have built, which also calculates the NDCG average
    def validate(self, data, k):
        data = np.array(data)
        query_indexes = group_queries(data, 1)
        average_ndcg = []
        predicted_scores = np.zeros(len(data))
        for query in query_indexes:
            results = np.zeros(len(query_indexes[query]))
            for tree in self.trees:
                results += self.learning_rate * tree.predict(data[query_indexes[query], 2:])
            predicted_sorted_indexes = np.argsort(results)[::-1]
            t_results = data[query_indexes[query], 0]
            t_results = t_results[predicted_sorted_indexes]
            predicted_scores[query_indexes[query]] = results
            dcg_val = dcg_k(t_results, k)
            idcg_val = ideal_dcg_k(t_results, k)
            ndcg_val = (dcg_val / idcg_val)
            average_ndcg.append(ndcg_val)
        average_ndcg = np.nanmean(average_ndcg)
        return average_ndcg, predicted_scores



In [2]:
import numpy as np
import pandas as pd
import time


# Main function
def main():
    start_time = time.time()
    total_ndcg = 0.0
    df=pd.read_csv('train2.csv')
    values=[[df.loc[(df.index == j),][str(u)][j] for u in list(range(1,49))] for j in range(len(df.index))][0:18]
    training_data = np.asarray(values)
    df2=pd.read_csv('test2.csv')
    values2=[[df2.loc[(df2.index == j),][str(u)][j] for u in list(range(1,49))] for j in range(len(df2.index))][0:18]
    test_data = np.asarray(values2)
    # We build the 100 trees on the test base
    model = LambdaMART(training_data, 100, 0.001)
    model.fit()
    # We truncate on the first 10 docs for the calculations of the DCG average
    # Predicted scores are calculated on the test basis
    average_ndcg, predicted_scores = model.validate(test_data, 20) 
    print (average_ndcg)
    print("--- %s seconds ---" % (time.time() - start_time))


    
if __name__ == '__main__':
    main()

0.793643251190486
--- 13.553044796 seconds ---
