# Pairwise
* RankNET
* LambdaMart -> Using LGBRanker


## Import 

In [1]:
import pandas as pd
import json
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import pickle
import math 

In [2]:
# 導入資料
train_df = pd.read_pickle('/home/adam/Steph_C/my_thesis/data/Train_by_postoal_code_without_review.pkl')
test_df = pd.read_pickle('/home/adam/Steph_C/my_thesis/data/Test_by_postoal_code_without_review.pkl')

## Model

In [3]:
from lightgbm import LGBMRanker
train_features = ['density', 'entropy', 'competitiveness','area_pop'\
                  , 'accessibility','complementary']
model = LGBMRanker(objective="lambdarank")

In [4]:
get_group_size = lambda df: df.reset_index().groupby("name")['name'].count()

train_groups = get_group_size(train_df).to_numpy()
test_groups = get_group_size(test_df).to_numpy()

print(sum(train_groups) , sum(test_groups))

4710 3966


### Training

In [5]:
print(len(Counter(train_df.name)) , len(Counter(test_df.name)))

564 564


In [6]:
print(train_df[train_features].shape ,test_df[train_features].shape )

(4710, 6) (3966, 6)


In [7]:
# the original version 
# model.fit(
#     train_df[train_features],
#     train_df[['relevance']],
#     group=train_groups,
#     eval_set=[(new_test[train_features],new_test[['relevance']])],
#     eval_group=[test_groups],
#     eval_at=6,
#     eval_metric=['ndcg'])

######### SCORE ############
# valid_0's ndcg@6: 0.0315787
# valid_0's ndcg@6: 0.0315787
# valid_0's ndcg@6: 0.0315787

In [8]:
model.fit(train_df[train_features], train_df[['relevance']], group=train_groups)

LGBMRanker(objective='lambdarank')

### Testing

In [9]:
predict = model.predict(test_df[train_features])
test_df['predictions'] = predict

In [10]:
len(predict)

3966

In [11]:
test_df.head()

Unnamed: 0,name,postal_code,density,entropy,competitiveness,area_pop,accessibility,complementary,relevance,predictions
1,1933 Lounge,46037,6.0,1.791759,-0.166667,1813.0,0.0,0.0,5.0,-0.538611
99,1933 Lounge,33558,2.0,0.693147,-0.5,117.0,0.0,0.0,0.0,-1.855596
100,1933 Lounge,33559,5.0,1.609438,-0.2,1796.0,0.0,0.0,0.0,0.707293
101,1933 Lounge,33563,1.0,-0.0,-1.0,344.0,0.0,0.0,0.0,-0.928484
102,1933 Lounge,33573,1.0,-0.0,-1.0,104.0,0.0,0.0,0.0,-1.139348


### Evaluation 

In [12]:
# create list of list for query ranking
def get_ranking(df , top = None):
    
    """
    Turn the probability array into a list of lists for calculation.
    
    Parameters:
    df(DataFrame): the test dataframe
    
    Returns:
    prediction (list of lists): A list of predicted rankings for each query.
    actual (list of lists): A list of actual rankings for each query.
    """
 
    pred_list = []
    pred_rel = []
    true_list = []
    
    
    for res in Counter(df.name):
        
        tmp = df[df.name == res]
        a_sorted = tmp.sort_values(by=['relevance'])
        p_sorted = tmp.sort_values(by=['predictions'])
#         p_sorted = p_sorted[p_sorted.predictions>0]
        if top != None:
            p_sorted = p_sorted[:top]

        true_list.append(list(a_sorted[a_sorted.relevance!=0].postal_code))
        pred_list.append(list(p_sorted.postal_code))
        pred_rel.append(list(a_sorted[a_sorted.relevance!=0].relevance))
        
        
        
    return pred_list, pred_rel , true_list

In [13]:
def _precision(predictions , actuals, k = None):
    
    """
    Calculate the precision at k
    
    Returns: a list of precisions
    """
    
    precisions =[]

    for i in range(len(predictions)):
        
        prediction = predictions[i]

        if  k != None:
            prediction =  predictions[i][:k]
        
        score = 0
        for j in prediction:
            if j in actuals[i]:
                score+=1
        precisions.append(score/len(prediction))
    return precisions
    

def _recall(predictions , actuals, k = None):
    
    """
    Calculate the precision at k
    
    Returns: a list of recalls
    """
    recalls =[]
    
    for i in range(len(predictions)):
        
        prediction =  predictions[i]
        
        if  k != None:
            prediction =  predictions[i][:k]
        
        score = 0
        for j in range(len(prediction)):
            if prediction[j] in actuals[i]:
                score+=1
        recalls.append(score/len(actuals[i]))
    
    return recalls



In [14]:
def calculate_mrr(predictions, actuals):
    """
    Calculate the mean reciprocal rank (MRR) for a set of predictions and actual values.
    
    Parameters:
    predictions (list of lists): A list of predicted rankings sorted by probability.
    actual (list of lists): A list of actual rankings sorted by relevance.
    
    Returns:    
    float: A list of MRR scores.
    """
    mrr_list = []
    for i in range(len(predictions)):
        reciprocal_rank = 0
        if actuals[i][0] in predictions[i]:
            reciprocal_rank = 1/ (predictions[i].index(actuals[i][0]) + 1)
        mrr_list.append(reciprocal_rank)
    return mrr_list

In [15]:
def calculate_map( predictions , actuals, k=None):
    """
    Calculate the mean average precision (MAP) for a set of queries.

    Parameters:
    actual (list of sets or lists): A list of sets or lists of the actual relevant items for each query.
    predicted (list of lists): A list of lists of predicted items for each query.
    k (int): The maximum number of predicted items to consider for each query.

    Returns:
    float: A list of MAP scores.
    """
    
    map_list = []
    

    for i in range(len(predictions)):
        
        ap_list = []
        hit = 0 
        cnt = 0 
        
        prediction =  predictions[i]
        
        if k != None:
            prediction =  predictions[i][:k]
        
        
        for j in prediction:
            if j in actuals[i]:
                hit+=1
                cnt+=1
                ap_list.append(hit/cnt)
            else:
                cnt+=1
        map_list.append(np.mean(ap_list))
    
    return map_list
                

In [16]:
def calculate_dcg_ndcg( predictions , actuals, rel ,k=None):
    """
    Calculate the DCG@k , NDCG@k for a set of queries.

    Parameters:
    actual (list of sets or lists): A list of sets or lists of the actual relevant items for each query.
    predicted (list of lists): A list of lists of predicted items for each query.
    k (int): The maximum number of predicted items to consider for each query.

    Returns:
    float: A list of DCG , NDCG scores.
    """
    dcg_list = []
    ndcg_list = []
    
    for i in range(len(predictions)):
        dcg =0
        idcg =0
        
        prediction = predictions[i]
        
        if k != None:
            prediction = predictions[i][:k]
        
        for j in range(len(actuals[i])):
            if actuals[i][j] in prediction:
                rank = prediction.index(actuals[i][j]) + 1
                dcg += np.divide(float(rel[i][j]),np.log2(rank+1))
                idcg += np.divide(float(rel[i][j]),np.log2((j+1)+1))
        dcg_list.append(dcg)
        ndcg_list.append(np.divide(dcg,idcg))
        
    return dcg_list , ndcg_list

In [17]:
pred_list, pred_rel , true_list = get_ranking(test_df)

In [31]:
mrr_list = calculate_mrr(pred_list , true_list)
map_list = calculate_map(pred_list , true_list)
dcg_list , ndcg_list = calculate_dcg_ndcg(pred_list,true_list,pred_rel)
precision_list = _precision(pred_list , true_list,k=3)
recall_list = _recall(pred_list , true_list,k=3)

In [32]:
print(f'Precision at 3 : {np.mean(precision_list)}')
print(f'Recall at 3 : {np.mean(recall_list)}')
print(f'MRR : {np.mean(mrr_list)}')
print(f'MAP : {np.mean(map_list)}')
print(f'DCG at 3 : {np.mean(dcg_list)}')
print(f'NDCG at 3 : {np.mean(ndcg_list)}')

Precision at 3 : 0.044326241134751775
Recall at 3 : 0.11908983451536641
MRR : 0.22518302963781686
MAP : 0.23465489145505833
DCG at 3 : 1.992562722947393
NDCG at 3 : 0.41761038598090067


# Results

In [20]:
# Only one true answer with all pred
# MRR : 0.22518302963781686
# MAP : 0.23465489145505833
# Precision at 3 : 0.16666666666666666
# Recall at 3 : 1.0
# DCG at 3 : 1.992562722947393
# NDCG at 3 : 0.41761038598090067

In [21]:
# one true answer with top 20 pred
# MRR : 0.07773851590106007
# MAP : 0.07773851590106007
# Precision at 3 : 0.025912838633686687
# Recall at 3 : 0.025912838633686687
# DCG at 3 : 0.34275618374558303
# NDCG at 3 : 0.0685512367491166

In [22]:
a = [1,2,3,4]
print(a[:3])

[1, 2, 3]
