In [1]:
!pip install pyltr
!pip install more_itertools



In [2]:
import warnings
import pyltr
warnings.filterwarnings('ignore')
import numpy as np
from more_itertools import sort_together
import collections
import heapq
from sklearn.metrics import precision_score
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
import pandas as pd



In [3]:
# input: iterable with one query id per query-document pair
# output: list of the distinct query ids in ascending order
def get_qids(Qids):
    """Return the distinct values of ``Qids`` as an ascending list."""
    return sorted(set(Qids))

In [4]:
## input evaluation data and the predicted cluster index of every query
## output an array with one NDCG@5 score per query id
def evaluate_BFC_NDCG_selective_cluster(EX, Ey, Eqids, E_k, models):
    """Score each query with the single model chosen for it (selective cluster).

    EX     -- feature vectors, one per query-document pair
    Ey     -- relevance labels aligned with EX
    Eqids  -- query id of every pair (compared element-wise with ``==``)
    E_k    -- for the n-th query id in ascending order, the index into
              ``models`` of the model to use
    models -- fitted models exposing ``predict``

    Returns a 1-D float array of NDCG@5 values ordered by ascending query id.
    """
    ndcg_at_5 = pyltr.metrics.NDCG(k=5)
    per_query = []

    for pos, query_id in enumerate(get_qids(Eqids)):
        rows = np.where(Eqids == query_id)[0]
        doc_features = [EX[r] for r in rows]            # feature vectors of this query
        true_labels = np.asarray([Ey[r] for r in rows])  # ground-truth relevance

        # rank the documents with the model of the predicted cluster
        chosen = models[E_k[pos]]
        predicted = np.asarray(chosen.predict(doc_features))

        # the position in the sorted qid list is what is handed to the metric as qid
        per_query.append(ndcg_at_5.evaluate_preds(pos, true_labels, predicted))

    return np.array(per_query, dtype=float)

In [5]:
## input evaluation data and the number of cluster models
## output an array with one NDCG@5 score per query id
def evaluate_BFC_NDCG_cluster_oracle(EX, Ey, Eqids, no_cluster, models):
    """Score each query with whichever cluster model ranks it best (oracle).

    For every query, the first ``no_cluster`` entries of ``models`` are all
    evaluated and the highest NDCG@5 is kept (0 if none exceeds 0).

    Returns a 1-D float array ordered by ascending query id.
    """
    ndcg_at_5 = pyltr.metrics.NDCG(k=5)
    per_query = []

    for pos, query_id in enumerate(get_qids(Eqids)):
        rows = np.where(Eqids == query_id)[0]
        doc_features = [EX[r] for r in rows]            # feature vectors of this query
        true_labels = np.asarray([Ey[r] for r in rows])  # ground-truth relevance

        # try every cluster model and remember the best score seen so far
        top_score = 0
        for cluster in range(no_cluster):
            predicted = np.asarray(models[cluster].predict(doc_features))
            candidate = ndcg_at_5.evaluate_preds(pos, true_labels, predicted)
            if candidate > top_score:
                top_score = candidate

        per_query.append(top_score)

    return np.array(per_query, dtype=float)

In [6]:
## input evaluation data and per-query cluster probabilities
## output an array with one NDCG@5 score per query id
def evaluate_BFC_NDCG_cluster_fusion(EX, Ey, Eqids, prob_dist, no_cluster,  models):
    """Score each query with a probability-weighted blend of all cluster models.

    ``prob_dist[n][k]`` is the weight given to ``models[k]`` for the n-th query
    id in ascending order; the weighted predictions are summed and the blend is
    evaluated with NDCG@5.

    Returns a 1-D float array ordered by ascending query id.
    """
    ndcg_at_5 = pyltr.metrics.NDCG(k=5)
    per_query = []

    for pos, query_id in enumerate(get_qids(Eqids)):
        rows = np.where(Eqids == query_id)[0]
        doc_features = [EX[r] for r in rows]            # feature vectors of this query
        true_labels = np.asarray([Ey[r] for r in rows])  # ground-truth relevance

        # weighted sum of the scores produced by every cluster model
        blended = np.zeros(len(rows))
        for cluster in range(no_cluster):
            weighted = models[cluster].predict(doc_features) * prob_dist[pos][cluster]
            blended = blended + weighted

        per_query.append(ndcg_at_5.evaluate_preds(pos, true_labels, np.asarray(blended)))

    return np.array(per_query, dtype=float)

In [7]:
## input evaluation data and the predicted cluster index of every query
## output an array with one MRR score per query id
def evaluate_BFC_MRR_selective_cluster(EX, Ey, Eqids, E_k, models):
    """Reciprocal rank per query using the single model chosen for it.

    ``E_k[n]`` is the index into ``models`` for the n-th query id in ascending
    order. A document counts as relevant when its label is greater than 0.

    Returns a 1-D float array ordered by ascending query id.
    """
    per_query = []

    for pos, query_id in enumerate(get_qids(Eqids)):
        rows = np.where(Eqids == query_id)[0]
        doc_features = [EX[r] for r in rows]       # feature vectors of this query
        is_relevant = [Ey[r] > 0 for r in rows]    # binarised ground truth

        # rank the documents with the model of the predicted cluster
        chosen = models[E_k[pos]]
        predicted = np.asarray(chosen.predict(doc_features))

        per_query.append(compute_MRR(is_relevant, predicted))

    return np.array(per_query, dtype=float)
    

In [8]:
## input evaluation data and the number of cluster models
## output an array with one MRR score per query id
def evaluate_BFC_MRR_cluster_oracle(EX, Ey, Eqids, no_cluster, models):
    """Reciprocal rank per query using whichever cluster model ranks it best.

    For every query, the first ``no_cluster`` entries of ``models`` are all
    evaluated and the highest reciprocal rank is kept (0 if none exceeds 0).
    A document counts as relevant when its label is greater than 0.

    Returns a 1-D float array ordered by ascending query id.
    """
    per_query = []

    for query_id in get_qids(Eqids):
        rows = np.where(Eqids == query_id)[0]
        doc_features = [EX[r] for r in rows]       # feature vectors of this query
        is_relevant = [Ey[r] > 0 for r in rows]    # binarised ground truth

        # try every cluster model and remember the best score seen so far
        top_score = 0
        for cluster in range(no_cluster):
            predicted = np.asarray(models[cluster].predict(doc_features))
            candidate = compute_MRR(is_relevant, predicted)
            if candidate > top_score:
                top_score = candidate

        per_query.append(top_score)

    return np.array(per_query, dtype=float)
    

In [9]:
## input evaluation data and probability distributions of clusters
## output an array with one MRR score per query id
def evaluate_BFC_MRR_cluster_fusion(EX, Ey, Eqids, prob_dist, no_cluster, models):
    """Reciprocal rank per query from a probability-weighted blend of all models.

    ``prob_dist[n][k]`` is the weight given to ``models[k]`` for the n-th query
    id in ascending order. A document counts as relevant when its label is
    greater than 0.

    Returns a 1-D float array ordered by ascending query id.
    """
    per_query = []

    for pos, query_id in enumerate(get_qids(Eqids)):
        rows = np.where(Eqids == query_id)[0]
        doc_features = [EX[r] for r in rows]       # feature vectors of this query
        is_relevant = [Ey[r] > 0 for r in rows]    # binarised ground truth

        # weighted sum of the scores produced by every cluster model
        blended = np.zeros(len(rows))
        for cluster in range(no_cluster):
            weighted = models[cluster].predict(doc_features) * prob_dist[pos][cluster]
            blended = blended + weighted

        per_query.append(compute_MRR(is_relevant, np.asarray(blended)))

    return np.array(per_query, dtype=float)
    

In [10]:
def compute_precision(relevance_list, feature_list, k):
    """Precision at ``k`` of a ranking induced by ``feature_list``.

    relevance_list -- per-document relevance flags (non-zero / True = relevant)
    feature_list   -- per-document scores; documents are ranked by descending score
    k              -- cut-off; when fewer than ``k`` documents exist, the
                      number of available documents is used as denominator

    Returns the fraction of relevant documents among the top ``k``.
    Returns 0.0 when there is nothing to rank (empty input or ``k <= 0``)
    instead of raising.
    """
    # Sorting the (score, relevance) pairs means that equal scores are ordered
    # by relevance, relevant first -- this tie-break is kept from the original.
    ranked = sorted(zip(feature_list, relevance_list), reverse = True)
    top = ranked[:k] if k > 0 else []
    if not top:
        return 0.0

    return np.count_nonzero([relevant for _, relevant in top])/len(top)

In [11]:
def compute_MRR(relevance_list, feature_list):
    """Reciprocal rank (cut off at 100) of the first relevant document.

    relevance_list -- per-document relevance flags (non-zero / True = relevant)
    feature_list   -- per-document scores; documents are ranked by descending score

    Returns ``1 / rank`` of the highest-ranked relevant document among the top
    100, or 0.0 when none of them is relevant (including empty input).
    """
    # Sorting the (score, relevance) pairs means that equal scores are ordered
    # by relevance, relevant first -- this tie-break is kept from the original.
    # MRR@100: only the top 100 documents are inspected.
    ranked = sorted(zip(feature_list, relevance_list), reverse = True)[:100]

    # Walk the ranking with 1-based ranks. The previous implementation tested
    # `any(idx)` on the array of relevant positions, which is False when the
    # only relevant document sits at position 0, so a perfect ranking scored 0.
    for rank, (_, relevant) in enumerate(ranked, start = 1):
        if relevant:
            return 1/rank
    return 0.0

In [12]:
# Dataset layout: a fold's files are addressed as
# direc_name + <fold number> + <file name>.
direc_name = "./MQ2007/Fold"   # common prefix of every fold directory
train_name = "/train.txt"      # training split inside a fold directory
valid_name = "/vali.txt"       # validation split inside a fold directory
test_name = "/test.txt"        # test split inside a fold directory

In [13]:
def _mean_top_docs_by_feature(X, row_qids, query_ids, feature_idx, top_n=10):
    """Per query, average the feature vectors of its `top_n` highest documents.

    Documents of each query in `query_ids` are ordered by column `feature_idx`
    and the feature vectors of the `top_n` largest are averaged (all documents
    when the query has fewer). Returns a float array with one row per query,
    in the order of `query_ids`.
    """
    n_cols = np.size(X, 1)
    rows = []
    for qid in query_ids:
        idxs = np.where(row_qids == qid)[0]  # rows belonging to this query
        docs = np.array([X[idx] for idx in idxs], dtype=float)
        ascending = docs[docs[:, feature_idx].argsort()]
        rows.append(np.mean(ascending[-top_n:], axis=0))
    return np.array(rows, dtype=float).reshape(-1, n_cols)


# Per-query results accumulated over all folds; the arrays stay aligned index-by-index.
Qid = np.array([]) # store all the qids
e_ndcg_sc = np.array([]) # store all ndcg scores of selective cluster
e_mrr_sc = np.array([]) # store all mrr scores of selective cluster
e_ndcg_co = np.array([]) # store all ndcg scores of cluster oracle
e_mrr_co = np.array([]) # store all mrr scores of cluster oracle
e_ndcg_cf = np.array([]) # store all ndcg scores of cluster fusion
e_mrr_cf = np.array([]) # store all mrr scores of cluster fusion

for fold_no in range(5):
    print('fold number: ' + str(fold_no))
    fold_dir = direc_name + str(fold_no+1)  # fold directories are numbered from 1
    with open(fold_dir + train_name) as trainfile, \
            open(fold_dir + test_name) as evalfile:
        TX, Ty, Tqids, _ = pyltr.data.letor.read_dataset(trainfile)
        EX, Ey, Eqids, _ = pyltr.data.letor.read_dataset(evalfile)

    Tqids = np.array([int(i) for i in Tqids])
    Eqids = np.array([int(i) for i in Eqids])

    Qids_e = np.array(get_qids(Eqids))

    # sorted query ids of the training data and, aligned with them, the rows
    # of TX/Ty that belong to each query (computed once, reused below)
    qids = get_qids(Tqids)
    n_features = np.size(TX, 1)
    train_rows = [np.where(Tqids == qid)[0] for qid in qids]

    ### Average of features of relevant documents for query IDs

    avg_rows = []
    for idxs in train_rows:
        # Keep dataset-level row indexes of the relevant documents. The
        # previous code took positions *within* the query's document list and
        # used them to index TX directly, averaging unrelated rows; it also
        # tested `any(rel_idxs)`, which is False when the only relevant
        # document is at position 0.
        rel_rows = [idx for idx in idxs if Ty[idx] > 0]
        if len(rel_rows) > 0:
            avg_rows.append(np.mean([TX[idx] for idx in rel_rows], axis=0))
        else:
            # if there is no relevant document in training, give features 0s
            avg_rows.append(np.zeros(n_features))

    q_vec = np.array(avg_rows, dtype=float).reshape(-1, n_features)


    ###  Cluster once with average features of relevant docs, and twice more using evaluation metrics

    NO_OF_CLUSTER=5

    for count in range(3):
        print('loop count: ' + str(count))
        ##cluster the feature into n clusters as stated in the research paper
        km_f_avg = KMeans(n_clusters= NO_OF_CLUSTER, random_state=0).fit(q_vec)

        cluster_label =km_f_avg.labels_

        # one block of 8 metrics per cluster model for every training query;
        # becomes the query representation of the next clustering round
        new_q = np.empty([len(qids), 8*NO_OF_CLUSTER])
        cluster_models = [0]*NO_OF_CLUSTER

        for k in range(NO_OF_CLUSTER):
            # training rows of every query assigned to cluster k, grouped by
            # query in ascending qid order (same order as the row-by-row
            # np.append this replaces, without the quadratic copying)
            member_rows = [train_rows[j] for j in np.where(cluster_label == k)[0]]
            if member_rows:
                cluster_rows = np.concatenate(member_rows)
            else:
                cluster_rows = np.array([], dtype=int)

            TX_cluster = np.array([TX[r] for r in cluster_rows], dtype=float).reshape(-1, n_features)
            Tqids_cluster = np.array([Tqids[r] for r in cluster_rows], dtype=float)
            Ty_cluster = np.array([Ty[r] for r in cluster_rows], dtype=float)

            # train cluster
            metric = pyltr.metrics.NDCG(k=5)
            model = pyltr.models.LambdaMART(
                metric=metric,
                n_estimators=500,
                verbose=0,
            )
            model.fit(TX_cluster, Ty_cluster, Tqids_cluster)

            # store the fitted model
            cluster_models[k] = model

            # metrics from pyltr
            # NOTE(review): instances are intentionally re-created here rather
            # than hoisted out of the fold loop -- the position `i` is passed as
            # qid and pyltr's NDCG appears to memoise per-qid state; confirm
            # before sharing instances across folds.
            metric1 = pyltr.metrics.NDCG(k=3)
            metric2 = pyltr.metrics.NDCG(k=5)
            metric3 = pyltr.metrics.NDCG(k=10)
            metric4 = pyltr.metrics.AP(k=100)

            first_idx = 8*k  # first column of this model's metric block

            for i, idxs in enumerate(train_rows):
                TX_i = [TX[idx] for idx in idxs] # data
                Ty_i = np.asarray([Ty[idx] for idx in idxs]) # labels
                rel_i = [Ty[idx] > 0 for idx in idxs] # boolean(relevant)

                #prediction using the trained model
                Tpred_i = np.asarray(model.predict(TX_i))

                # store metrics
                new_q[i][first_idx] = metric1.evaluate_preds(i, Ty_i, Tpred_i) # ndcg@3
                new_q[i][first_idx + 1] = metric2.evaluate_preds(i, Ty_i, Tpred_i) # ndcg@5
                new_q[i][first_idx + 2] = metric3.evaluate_preds(i, Ty_i, Tpred_i) # ndcg@10
                new_q[i][first_idx + 3] = metric4.evaluate_preds(i, Ty_i, Tpred_i) # MAP@100
                new_q[i][first_idx + 4] = compute_MRR(rel_i, Tpred_i) # MRR@100
                new_q[i][first_idx + 5] = compute_precision(rel_i, Tpred_i, 3) # p@3
                new_q[i][first_idx + 6] = compute_precision(rel_i, Tpred_i, 5) # p@5
                new_q[i][first_idx + 7] = compute_precision(rel_i, Tpred_i, 10)# p@10
        q_vec = new_q


    ### Testing

    # from BFC, we know the strongest feature is index 38 for every fold
    best_feature = 38

    # sorted query ids of the evaluation and training data
    e_qids = get_qids(Eqids)
    t_qids = get_qids(Tqids)

    ## Query representation: mean feature vector of the 10 documents with the
    ## largest value of `best_feature` (rows follow t_qids / e_qids order)
    TX_top10 = _mean_top_docs_by_feature(TX, Tqids, t_qids, best_feature, top_n=10)
    EX_top10 = _mean_top_docs_by_feature(EX, Eqids, e_qids, best_feature, top_n=10)


    ### logistics regression
    # learn to predict the final cluster label of a query from its top-10 representation
    clf = LogisticRegression(random_state=0).fit(TX_top10, cluster_label)

    #cluster model label of evaluation data set for selective cluster
    label_SC = clf.predict(EX_top10)
    prob = clf.predict_proba(EX_top10)


    # evaluation of selective cluster
    ndcg_sc = evaluate_BFC_NDCG_selective_cluster(EX, Ey, Eqids, label_SC, cluster_models)
    mrr_sc = evaluate_BFC_MRR_selective_cluster(EX, Ey, Eqids, label_SC, cluster_models)

    # evaluation of cluster oracle
    ndcg_co = evaluate_BFC_NDCG_cluster_oracle(EX, Ey, Eqids, NO_OF_CLUSTER, cluster_models)
    mrr_co = evaluate_BFC_MRR_cluster_oracle(EX, Ey, Eqids, NO_OF_CLUSTER, cluster_models)

    # evaluation of cluster fusion
    ndcg_cf = evaluate_BFC_NDCG_cluster_fusion(EX, Ey, Eqids, prob, NO_OF_CLUSTER, cluster_models)
    mrr_cf = evaluate_BFC_MRR_cluster_fusion(EX, Ey, Eqids, prob, NO_OF_CLUSTER, cluster_models)


    # finally, add the evaluations

    Qid = np.append(Qid, Qids_e)
    e_ndcg_sc = np.append(e_ndcg_sc, ndcg_sc)
    e_mrr_sc = np.append(e_mrr_sc, mrr_sc)
    e_ndcg_co = np.append(e_ndcg_co, ndcg_co)
    e_mrr_co = np.append(e_mrr_co, mrr_co)
    e_ndcg_cf = np.append(e_ndcg_cf, ndcg_cf)
    e_mrr_cf = np.append(e_mrr_cf, mrr_cf)
    

fold number: 0
loop count: 0
loop count: 1
loop count: 2
fold number: 1
loop count: 0
loop count: 1
loop count: 2
fold number: 2
loop count: 0
loop count: 1
loop count: 2
fold number: 3
loop count: 0
loop count: 1
loop count: 2
fold number: 4
loop count: 0
loop count: 1
loop count: 2


In [14]:
# Wrap every accumulated 1-D result array in a single-column DataFrame.
# The arrays are not dicts, so the plain constructor is used rather than
# DataFrame.from_dict (which merely forwarded them to the constructor).
Qid_df = pd.DataFrame(Qid)
e_ndcg_sc_df = pd.DataFrame(e_ndcg_sc)
e_mrr_sc_df = pd.DataFrame(e_mrr_sc)
e_ndcg_co_df = pd.DataFrame(e_ndcg_co)
e_mrr_co_df = pd.DataFrame(e_mrr_co)
e_ndcg_cf_df = pd.DataFrame(e_ndcg_cf)
e_mrr_cf_df = pd.DataFrame(e_mrr_cf)

In [15]:
# Name the single column of every result frame after the quantity it holds.
Qid_df.columns = ['Q_ID']
for ndcg_frame in (e_ndcg_sc_df, e_ndcg_co_df, e_ndcg_cf_df):
    ndcg_frame.columns = ['NDCG@5']
for mrr_frame in (e_mrr_sc_df, e_mrr_co_df, e_mrr_cf_df):
    mrr_frame.columns = ['MRR@100']

In [16]:
# Write every result frame to its own CSV file (no index column).
for result_frame, csv_name in (
    (Qid_df, 'Qid_kmeans.csv'),
    (e_ndcg_sc_df, 'ndcg_sc_kmeans.csv'),
    (e_ndcg_co_df, 'ndcg_co_kmeans.csv'),
    (e_mrr_sc_df, 'mrr_sc_kmeans.csv'),
    (e_mrr_co_df, 'mrr_co_kmeans.csv'),
    (e_ndcg_cf_df, 'ndcg_cf_kmeans.csv'),
    (e_mrr_cf_df, 'mrr_cf_kmeans.csv'),
):
    result_frame.to_csv(csv_name, index = False)

array([7968., 7979., 7993., ..., 7943., 7953., 7959.])