In [1]:
!pip install pyltr
!pip install more_itertools



In [108]:
import warnings
import pyltr
warnings.filterwarnings('ignore')
import numpy as np
from more_itertools import sort_together
import collections
import heapq
from sklearn.metrics import precision_score
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

In [109]:
# input array of qids of each query-document pair
# output sorted qids 
def get_qids(Qids):
    qs = list(set(Qids))
    qs.sort()
    
    return qs

In [110]:
with open('./MQ2007/Fold1/train.txt') as trainfile, \
         open('./MQ2007/Fold1/vali.txt') as valifile, \
         open('./MQ2007/Fold1/test.txt') as evalfile:
    TX, Ty, Tqids, _ = pyltr.data.letor.read_dataset(trainfile)
    VX, Vy, Vqids, _ = pyltr.data.letor.read_dataset(valifile)
    EX, Ey, Eqids, _ = pyltr.data.letor.read_dataset(evalfile)

Tqids = np.array([int(i) for i in Tqids])
Vqids = np.array([int(i) for i in Vqids])
Eqids = np.array([int(i) for i in Eqids])

In [111]:
# get the qids in the trainind data
qids = get_qids(Tqids)

## Average of features of relevant documents for query IDs

In [112]:
avg_features_qids = np.empty((0,np.size(TX, 1)), float)

# iterate over qid
for i in qids: 
    idxs = np.where(Tqids == i)[0] # retrieve indexes of corresponding qid
    relevance_list = [Ty[idx] > 0 for idx in idxs] # list of boolean(relevant) of q-d pairs
    rel_idxs =np.where(relevance_list)[0] # retrieve indexes of relevant docs
    
    ### if np where is never empty
    rel_doc_list = [TX[idx] for idx in rel_idxs] # retrieve relevant docs
    avg_features = np.mean(rel_doc_list, axis=0) #average each feature of relevant docs
    
    if any(rel_idxs):
        rel_doc_list = [TX[idx] for idx in rel_idxs]
        avg_features = np.mean(rel_doc_list, axis=0) #average each feature of relevant docs
    else:
        avg_features = np.zeros(np.size(TX, 1)) # if there is no relevant document in training, give features 0s
    
    avg_features_qids = np.append(avg_features_qids, [avg_features], axis = 0)

q_vec = avg_features_qids

## Cluster queries

In [113]:
def compute_precision(relevance_list, feature_list, k):
    feature_list, relevance_list = (list(t) for t in zip(*sorted(zip(feature_list, relevance_list), reverse = True)))
    if len(feature_list) > k:
        feature_list = feature_list[0:k]
        relevance_list = relevance_list[0:k]
    else:
        k = len(feature_list)
    
    return np.count_nonzero(relevance_list)/k

In [114]:
def compute_MRR(relevance_list, feature_list):
    # sort arrays by the feature value
    feature_list, relevance_list = (list(t) for t in zip(*sorted(zip(feature_list, relevance_list), reverse = True)))
    # MRR@100, only look at top 100
    if len(feature_list) > 100:
        feature_list = feature_list[0:100]
        relevance_list = relevance_list[0:100]

    idx = np.where(relevance_list)[0]#get the indexesof relevant document
    if any(idx):
        return 1/(idx[0]+1) # index starts from 0 so add 1
    else:
        return 0

## Cluster once with average features of relevant docs, and twice more using evaluation metrics

In [115]:
NO_OF_CLUSTER=5

for count in range(3):
    ##cluster the feature into n clusters as stated in the research paper
    km_f_avg = KMeans(n_clusters= NO_OF_CLUSTER, random_state=0).fit(q_vec)
    
    cluster_label =km_f_avg.labels_
    print(collections.Counter(cluster_label))
    
    
    new_q = np.empty([len(qids), 8*NO_OF_CLUSTER])
    cluster_models = [0]*NO_OF_CLUSTER
    
    
    for k in range(NO_OF_CLUSTER):
        l = np.where(cluster_label == k)[0]
        qids_cluster = [qids[idx] for idx in l]
    
        # retrieve qids from indexes of the cluster
        qids_cluster = [qids[idx] for idx in l]

        TX_cluster = np.empty((0,np.size(TX, 1)), float)
        Tqids_cluster = np.array([])
        Ty_cluster = np.array([])

        for qs in qids_cluster:
            idxs = np.where(Tqids == qs)[0] # retrieve indexes of corresponding qid
    
            # create data set that only contains the qids from the cluster
            for i in idxs:
                TX_cluster = np.append(TX_cluster, [TX[i]], axis = 0)
                Tqids_cluster = np.append(Tqids_cluster, [Tqids[i]])
                Ty_cluster = np.append(Ty_cluster, [Ty[i]])
    
    
    
        # train cluster
        metric = pyltr.metrics.NDCG(k=5)
        model = pyltr.models.LambdaMART(
            metric=metric,
            n_estimators=50,
            verbose=0,
        )
        model.fit(TX_cluster, Ty_cluster, Tqids_cluster)
    
        # store the fitted model
        cluster_models[k] = model

    
        # metrics from pyltr
        metric1 = pyltr.metrics.NDCG(k=3)
        metric2 = pyltr.metrics.NDCG(k=5)
        metric3 = pyltr.metrics.NDCG(k=10)
        metric4 = pyltr.metrics.AP(k=100)

        for i, qid in enumerate(qids):
            idxs = np.where(Tqids == qid)[0] # retrieve indexes of corresponding qid
            TX_i = [TX[idx] for idx in idxs] # data
            Ty_i = [Ty[idx] for idx in idxs] #l abels
            rel_i = [Ty[idx] > 0 for idx in idxs]# boolean(relevant)
    
            Tpred_i = model.predict(TX_i)
            Ty_i = np.asarray(Ty_i)
            Tpred_i = np.asarray(Tpred_i)
    
            # store metrics
            first_idx = 8*k

            new_q[i][first_idx] = metric1.evaluate_preds(i, Ty_i, Tpred_i) # ndcg@3
            new_q[i][first_idx + 1] = metric2.evaluate_preds(i, Ty_i, Tpred_i) # ndcg@5
            new_q[i][first_idx + 2] = metric3.evaluate_preds(i, Ty_i, Tpred_i) # ndcg@10
            new_q[i][first_idx + 3] = metric4.evaluate_preds(i, Ty_i, Tpred_i) # MAP@100
            new_q[i][first_idx + 4] = compute_MRR(rel_i, Tpred_i) # MRR@100
            new_q[i][first_idx + 5] = compute_precision(rel_i, Tpred_i, 3) # p@3
            new_q[i][first_idx + 6] = compute_precision(rel_i, Tpred_i, 5) # p@5
            new_q[i][first_idx + 7] = compute_precision(rel_i, Tpred_i, 10)# p@10
        print('cluster ' + str(k) + ' done')
    q_vec = new_q

Counter({1: 521, 3: 173, 0: 156, 4: 144, 2: 23})
cluster 0 done
cluster 1 done
cluster 2 done
cluster 3 done
cluster 4 done
Counter({1: 268, 0: 209, 4: 195, 3: 186, 2: 159})
cluster 0 done
cluster 1 done
cluster 2 done
cluster 3 done
cluster 4 done
Counter({0: 298, 1: 203, 4: 183, 3: 178, 2: 155})
cluster 0 done
cluster 1 done
cluster 2 done
cluster 3 done
cluster 4 done


In [116]:
#after clustering 3 times, cluster_models give the models trained with the clusters for the 3rd iteration
cluster_models

[<pyltr.models.lambdamart.LambdaMART at 0x7f7401818f10>,
 <pyltr.models.lambdamart.LambdaMART at 0x7f740182d050>,
 <pyltr.models.lambdamart.LambdaMART at 0x7f740182d110>,
 <pyltr.models.lambdamart.LambdaMART at 0x7f740182ddd0>,
 <pyltr.models.lambdamart.LambdaMART at 0x7f7401826050>]

# Testing

## choosing clusters

In [117]:
# from BFC, we know the strongest feature is index 38 for every fold
best_feature = 38

In [118]:
# get the qids in the trainind data
e_qids = get_qids(Eqids)
t_qids = get_qids(Tqids)

### Get top 10 values 

In [218]:
TX_top10 = np.empty((0,np.size(TX, 1)), float)
EX_top10 = np.empty((0,np.size(TX, 1)), float)

# iterate over qid for training
for i in t_qids: 
    idxs = np.where(Tqids == i)[0] # retrieve indexes of corresponding qid
    TX_feature = np.array([TX[idx].tolist()for idx in idxs]) # list of best feature for all docs
    sorted_array = TX_feature[TX_feature[:, best_feature].argsort()]
    top10 = sorted_array[-10:]
    
    TX_top10 = np.append(TX_top10, [np.mean(top10, axis = 0)], axis = 0)

# iterate over qid for evaluation
for i in e_qids: 
    idxs = np.where(Eqids == i)[0] # retrieve indexes of corresponding qid
    EX_feature = np.array([EX[idx].tolist()for idx in idxs]) # list of best feature for all docs
    sorted_array = EX_feature[EX_feature[:, best_feature].argsort()]
    top10 = sorted_array[-10:]
    
    EX_top10 = np.append(EX_top10, [np.mean(top10, axis = 0).tolist()], axis = 0)
    

In [220]:
len(EX_top10[0])

46

#### use logistic regression

In [221]:
clf = LogisticRegression(random_state=0).fit(TX_top10, cluster_label)

In [227]:
#cluster model label of evaluation data set for selective cluster
label_SC = clf.predict(EX_top10)