In [1]:
!pip install pyltr

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


In [2]:
!pip install more_itertools

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


In [3]:
import warnings
import pyltr
warnings.filterwarnings('ignore')

In [4]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from more_itertools import sort_together

In [5]:
def find_matching_key_index(id, data):
    """Return the position of the first entry of ``data`` whose ``'id'`` equals ``id``.

    Parameters
    ----------
    id : value compared with ``==`` against each entry's ``'id'``.
    data : sequence of dicts, each holding an ``'id'`` key.

    Returns
    -------
    int
        Index of the first matching entry, or ``-1`` when nothing matches.
    """
    hits = (pos for pos, entry in enumerate(data) if entry['id'] == id)
    return next(hits, -1)


def partition_data_cluster_wise(X_d, Y_d, T_q, labels):
    """Split three parallel row arrays into one record per cluster label.

    Parameters
    ----------
    X_d, Y_d, T_q : array-likes indexed by row; row ``i`` of each belongs to
        cluster ``labels[i]``.
    labels : sequence of hashable cluster labels, one per row.

    Returns
    -------
    list of dict
        One dict per distinct label, in order of the label's first appearance,
        with keys ``'id'`` (the label), ``'data'``, ``'label'`` and ``'tqid'``
        (the rows of ``X_d``, ``Y_d`` and ``T_q`` for that cluster, in their
        original relative order).
    """
    X_d = np.asarray(X_d)
    Y_d = np.asarray(Y_d)
    T_q = np.asarray(T_q)

    # Collect the row numbers of every cluster in a single pass. A dict keeps
    # insertion order, so clusters come out in order of first appearance.
    rows_by_cluster = {}
    for row, cluster_id in enumerate(labels):
        rows_by_cluster.setdefault(cluster_id, []).append(row)

    # One fancy-index per cluster instead of growing arrays row by row, which
    # copied the accumulated array on every append.
    return [
        {
            'id': cluster_id,
            'data': X_d[rows],
            'label': Y_d[rows],
            'tqid': T_q[rows],
        }
        for cluster_id, rows in rows_by_cluster.items()
    ]

def concatenate_cluster_eval(p_data):
    """Stack the per-cluster evaluation scores, labels and query ids into one record.

    Parameters
    ----------
    p_data : non-empty list of dicts, each with ``'eval'`` (iterable of scores),
        ``'label'`` and ``'tqid'`` (arrays).

    Returns
    -------
    dict
        ``'data'``: the scores of all clusters, one single-element row per score,
        in cluster order; ``'label'`` and ``'tqid'``: the matching labels and
        query ids concatenated in the same order.
    """
    def as_column(scores):
        # One single-element row per score.
        return np.array([[np.array(score)] for score in scores])

    first, remaining = p_data[0], p_data[1:]
    merged_scores = as_column(first['eval'])
    merged_labels = first['label']
    merged_tqids = first['tqid']

    for part in remaining:
        merged_scores = np.concatenate((merged_scores, as_column(part['eval'])), axis=0)
        merged_labels = np.concatenate((merged_labels, part['label']), axis=0)
        merged_tqids = np.concatenate((merged_tqids, part['tqid']), axis=0)

    return {'data': merged_scores, 'label': merged_labels, 'tqid': merged_tqids}

def train_model_using_LambdaMART(p_data, n_estimators=500, ndcg_k=5, verbose=1):
    """Fit one LambdaMART model per cluster record and attach it to that record.

    Parameters
    ----------
    p_data : list of dicts with ``'data'``, ``'label'`` and ``'tqid'`` entries,
        passed in that order to ``model.fit``.
    n_estimators : int, forwarded to ``pyltr.models.LambdaMART`` (default 500,
        the value that used to be hard-coded).
    ndcg_k : int, cutoff ``k`` of the ``pyltr.metrics.NDCG`` metric (default 5).
    verbose : forwarded to ``pyltr.models.LambdaMART`` (default 1).

    Returns
    -------
    list of dict
        The same ``p_data`` list; every record is updated in place with
        ``'model'`` (the fitted model) and ``'metric'`` (its metric object).
    """
    for part in p_data:
        # A separate metric/model pair per cluster, so each record owns its own objects.
        metric = pyltr.metrics.NDCG(k=ndcg_k)
        model = pyltr.models.LambdaMART(
            metric=metric,
            n_estimators=n_estimators,
            verbose=verbose,
        )
        model.fit(part['data'], part['label'], part['tqid'])
        part['model'] = model
        part['metric'] = metric

    return p_data

def evaluate_partitioned_data(p_data) :
    """Score every cluster's model on that cluster's own rows.

    For each record, the record's ``'model'`` predicts on the record's
    ``'data'``; the predictions are then passed to ``evalute_preds`` together
    with the record's ``'label'``, ``'tqid'`` and ``'metric'``. The result is
    stored under ``'eval'``.

    Returns the same ``p_data`` list, with its records updated in place.
    """
    for part in p_data:
        predictions = part['model'].predict(part['data'])
        part['eval'] = evalute_preds(
            part['label'], predictions, part['tqid'], part['metric']
        )
    return p_data

def evaluate_features(p_data, X, Y, tqids):
    """Score one shared dataset with every cluster's model.

    Parameters
    ----------
    p_data : list of dicts, each with a ``'model'`` (object with ``predict``)
        and a ``'metric'``.
    X, Y, tqids : the dataset every model is evaluated on; ``X`` goes to
        ``predict``, and ``Y``, the predictions, ``tqids`` and the record's
        metric go to ``evalute_preds``.

    Returns
    -------
    list of dict
        One independent dict per entry of ``p_data``, in the same order, with
        keys ``'id'`` (position of the model in ``p_data``), ``'features'``
        (the ``X`` that was passed in) and ``'eval'`` (the value returned by
        ``evalute_preds`` for that model).
    """
    eval_dict_arr = []
    for cluster_idx, part in enumerate(p_data):
        predictions = part['model'].predict(X)
        # Build a fresh dict on every iteration. Reusing one dict would make
        # all list entries alias the same object, so only the last model's
        # result would remain visible.
        eval_dict_arr.append({
            'id': cluster_idx,
            'features': X,
            'eval': evalute_preds(Y, predictions, tqids, part['metric']),
        })
    return eval_dict_arr
    
"""
Evaluate prediction
"""
def evalute_preds(Y,Y_pred,Qids, met):
    """Evaluate predictions query by query and return one score per row.

    ``pyltr.util.group.get_groups(Qids)`` yields ``(qid, start, stop)`` for
    each query group. For every group, ``met.evaluate_preds`` is called on the
    ``[start:stop]`` slices of ``Y`` and ``Y_pred``, and the resulting score is
    repeated once per row of the group.

    Returns a list of scores, built group by group.
    """
    per_row_scores = []
    for qid, start, stop in pyltr.util.group.get_groups(Qids):
        group_score = met.evaluate_preds(qid, Y[start:stop], Y_pred[start:stop])
        # Every row of the query receives the query-level score.
        per_row_scores += [group_score] * (stop - start)
    return per_row_scores

"""
Group the rows inside each cluster by query id, as the ids need to be contiguous.
After clustering the tqids may no longer be contiguous, in which case LambdaMART will throw an error.
"""
def group_tqids(p_data):
    """Reorder the rows of every cluster so that rows sharing a query id are adjacent.

    Each record's ``'data'``, ``'label'`` and ``'tqid'`` are permuted together
    by a stable sort on ``'tqid'``. Rows of the same query therefore end up
    contiguous and keep their relative order.

    Parameters
    ----------
    p_data : list of dicts with ``'data'`` (one row per sample, any number of
        columns; a 1-D array is treated as a single column), ``'label'`` and
        ``'tqid'``.

    Returns
    -------
    list of dict
        The same ``p_data`` list, with its records updated in place.
    """
    for part in p_data:
        features = np.asarray(part['data'])
        if features.ndim == 1:
            # Keep the one-column 2-D layout the rest of the pipeline works with.
            features = features.reshape(-1, 1)
        qids = np.asarray(part['tqid'])

        # A stable sort keeps the original order of rows within a query. Indexing
        # whole rows with the permutation keeps every feature column attached
        # to its row, however many columns there are.
        order = np.argsort(qids, kind='stable')

        part['data'] = features[order]
        part['label'] = np.asarray(part['label'])[order]
        part['tqid'] = qids[order]
    return p_data

In [6]:
# Load the train / validation / test files of MQ2007 Fold1. The paths are relative,
# so they resolve against the notebook's working directory.
with open('./MQ2007/Fold1/train.txt') as trainfile, \
         open('./MQ2007/Fold1/vali.txt') as valifile, \
         open('./MQ2007/Fold1/test.txt') as evalfile:
    # read_dataset returns four values per file; the fourth is discarded here.
    # The first three are used further down as (features, labels, query ids),
    # e.g. base_model.fit(TX, Ty, Tqids).
    TX, Ty, Tqids, _ = pyltr.data.letor.read_dataset(trainfile)
    VX, Vy, Vqids, _ = pyltr.data.letor.read_dataset(valifile)
    EX, Ey, Eqids, _ = pyltr.data.letor.read_dataset(evalfile)

### Train the baseline model on all features

In [7]:
# Baseline: a single LambdaMART model fit on the full training feature matrix,
# using NDCG with k=5 as its metric and n_estimators=200.
base_metric = pyltr.metrics.NDCG(k=5)
    
base_model = pyltr.models.LambdaMART(
            metric=base_metric,
            n_estimators=200,
            verbose=1,
)

# fit() is the cell's last expression, so its return value is displayed below the log.
base_model.fit(TX,Ty, Tqids)

 Iter  Train score    Remaining                           Monitor Output 
    1       0.1600        2.54m                                         
    2       0.1986        2.77m                                         
    3       0.2943        2.78m                                         
    4       0.3486        2.79m                                         
    5       0.3530        2.80m                                         
    6       0.3555        2.81m                                         
    7       0.3613        2.89m                                         
    8       0.3678        2.87m                                         
    9       0.3696        2.86m                                         
   10       0.3730        2.85m                                         
   15       0.3894        2.79m                                         
   20       0.4031        2.71m                                         
   25       0.4104        2.64m                   

<pyltr.models.lambdamart.LambdaMART at 0x1a1f9d5050>

#### Step 1, take the average of the features

In [8]:
%%time
## Step 1
## Reduce every training row to a single value: the mean over that row's features.
f_avg = np.array([[np.mean(subarray)] for subarray in TX])
## Cluster the rows on that mean into 4 clusters.
## NOTE(review): this comment used to say "5 clusters as stated in the research paper",
## but the code requests 4 here and in the later clustering cells. Confirm which is intended.
clustering = AgglomerativeClustering(n_clusters=4).fit(f_avg)
clustering.labels_

CPU times: user 1min 2s, sys: 1min 16s, total: 2min 19s
Wall time: 5min 10s


array([2, 0, 0, ..., 3, 2, 1])

#### Train the model on each cluster

In [9]:
## Arrange the data cluster-wise: one record per cluster label, holding that cluster's rows
partitioned_data = partition_data_cluster_wise(f_avg,Ty,Tqids,clustering.labels_)
## Train one model per cluster
partitioned_data = train_model_using_LambdaMART(partitioned_data)

 Iter  Train score    Remaining                           Monitor Output 
    1       0.1534       54.10s                                         
    2       0.1592       49.80s                                         
    3       0.1713       48.67s                                         
    4       0.1792       47.72s                                         
    5       0.1976       47.20s                                         
    6       0.2016       46.73s                                         
    7       0.2024       46.15s                                         
    8       0.2022       45.22s                                         
    9       0.2070       44.76s                                         
   10       0.2070       44.19s                                         
   15       0.2084       42.65s                                         
   20       0.2139       41.98s                                         
   25       0.2155       41.58s                   

  140       0.4044       53.63s                                         
  160       0.4136       50.79s                                         
  180       0.4172       47.81s                                         
  200       0.4218       44.83s                                         
  220       0.4331       41.83s                                         
  240       0.4373       38.84s                                         
  260       0.4418       35.83s                                         
  280       0.4437       32.84s                                         
  300       0.4497       29.84s                                         
  320       0.4528       26.86s                                         
  340       0.4580       23.87s                                         
  360       0.4591       20.88s                                         
  380       0.4648       17.89s                                         
  400       0.4703       14.91s                    

#### Evaluate the partitioned data and create a new feature set based on the evaluation

In [10]:
## Evaluate each cluster's model on that cluster's own rows
partitioned_data = evaluate_partitioned_data(partitioned_data)
## Merge the per-cluster evaluations in cluster order, as mentioned in the paper [q11,q12, ..q1n, q21,q22, ...., q41,...]
merg_data = concatenate_cluster_eval(partitioned_data)

### Step 2: cluster the data on the new feature set created from the evaluation and train again

In [11]:
## Re-cluster the rows into 4 clusters; merg_data['data'] holds the evaluation scores from the previous step
clustering = AgglomerativeClustering(n_clusters=4).fit(merg_data['data'])
clustering.labels_

array([1, 1, 1, ..., 3, 3, 3])

In [12]:
## Split the merged rows by their new cluster label
partitioned_data = partition_data_cluster_wise(merg_data['data'],merg_data['label'],merg_data['tqid'],clustering.labels_)
## Group the rows by query id, as the ids need to be contiguous within each cluster
partitioned_data = group_tqids(partitioned_data)
## Train a model again on each cluster of this dataset
partitioned_data = train_model_using_LambdaMART(partitioned_data)

 Iter  Train score    Remaining                           Monitor Output 
    1       0.0868        1.15m                                         
    2       0.0920        1.11m                                         
    3       0.0926        1.09m                                         
    4       0.0910        1.10m                                         
    5       0.0907        1.10m                                         
    6       0.0940        1.09m                                         
    7       0.0885        1.09m                                         
    8       0.0929        1.09m                                         
    9       0.0933        1.08m                                         
   10       0.0973        1.08m                                         
   15       0.0957        1.07m                                         
   20       0.0990        1.06m                                         
   25       0.0986        1.04m                   

  140       0.0000       25.28s                                         
  160       0.0000       23.88s                                         
  180       0.0000       22.41s                                         
  200       0.0000       20.98s                                         
  220       0.0000       19.54s                                         
  240       0.0000       18.12s                                         
  260       0.0000       16.70s                                         
  280       0.0000       15.29s                                         
  300       0.0000       13.89s                                         
  320       0.0000       12.50s                                         
  340       0.0000       11.11s                                         
  360       0.0000        9.71s                                         
  380       0.0000        8.32s                                         
  400       0.0000        6.93s                    

#### Evaluate the partitioned data and create a new feature set based on the second clustering

In [13]:
## Evaluate each cluster's model on that cluster's own rows
partitioned_data = evaluate_partitioned_data(partitioned_data)
## Merge the per-cluster evaluations in cluster order, as mentioned in the paper [q11,q12, ..q1n, q21,q22, ...., q41,...]
merg_data = concatenate_cluster_eval(partitioned_data)

#### Step 3: cluster the data one last time on the new feature set created from the evaluation and train again

In [14]:
## Re-cluster the rows into 4 clusters on the evaluation scores from the previous step.
## (A bare `clustering.labels_` expression used to follow this line; in the middle of a
## cell it displays nothing, so it has been dropped.)
clustering = AgglomerativeClustering(n_clusters=4).fit(merg_data['data'])
## Split the merged rows by their new cluster label
partitioned_data = partition_data_cluster_wise(merg_data['data'],merg_data['label'],merg_data['tqid'],clustering.labels_)
## Group the rows by query id, as the ids need to be contiguous within each cluster
partitioned_data = group_tqids(partitioned_data)
## Train a model again on each cluster of this dataset
partitioned_data = train_model_using_LambdaMART(partitioned_data)

 Iter  Train score    Remaining                           Monitor Output 
    1       0.0000        3.81m                                         
    2       0.0000        3.70m                                         
    3       0.0000        3.67m                                         
    4       0.0000        3.62m                                         
    5       0.0000        3.61m                                         
    6       0.0000        3.59m                                         
    7       0.0000        3.59m                                         
    8       0.0000        3.57m                                         
    9       0.0000        3.56m                                         
   10       0.0000        3.54m                                         
   15       0.0000        3.48m                                         
   20       0.0000        3.43m                                         
   25       0.0000        3.39m                   

  140       0.6158        5.46s                                         
  160       0.6158        5.14s                                         
  180       0.6158        4.83s                                         
  200       0.6158        4.52s                                         
  220       0.6158        4.20s                                         
  240       0.6158        3.90s                                         
  260       0.6158        3.60s                                         
  280       0.6158        3.30s                                         
  300       0.6158        3.01s                                         
  320       0.6158        2.73s                                         
  340       0.6158        2.46s                                         
  360       0.6158        2.15s                                         
  380       0.6158        1.84s                                         
  400       0.6158        1.54s                    

In [15]:
## Evaluate each cluster's model on that cluster's own rows
partitioned_data = evaluate_partitioned_data(partitioned_data)

### Prepare training data for cluster classification

In [None]:
## Score the full training set (TX, Ty, Tqids) with every cluster model;
## evaluate_features returns a list with one entry per model.
xxx = evaluate_features(partitioned_data, TX,Ty, Tqids )

In [None]:
## Mean evaluation score of every cluster model over the full training set.
## (This cell used to read entry 4 only, which does not exist when there are 4 cluster
## models, since valid indices are 0..3.)
[np.mean(entry['eval']) for entry in xxx]

In [21]:
## The per-row feature mean does not depend on the model, so compute it once
## instead of once per loop iteration.
f_avg = np.array([[np.mean(subarray)] for subarray in TX])
## NOTE(review): the models in partitioned_data were last fit on merg_data['data']
## (evaluation scores), while here they predict on the per-row feature mean.
## Confirm that this is the intended input.
## Iterate over the cluster models themselves rather than a hard-coded range(0, 4).
for part in partitioned_data:
    Epred = part['model'].predict(f_avg)
    print ('Random ranking:',  part['metric'].calc_mean_random(Tqids, Ty))
    print ('Our model:', part['metric'].calc_mean(Tqids, Ty, Epred))

Random ranking: 341443188670396.94
Our model: 5333387515706.951
Random ranking: 0.23600386783826527
Our model: 0.20345739558503712
Random ranking: 0.25241019519320473
Our model: 0.007937142931938607
Random ranking: 0.2600063674729722
Our model: 0.22840703608300203
