# Learn to Rank

## Import 

In [156]:
import pandas as pd
import json
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import pickle
import math 

In [157]:
df = pd.read_pickle('/home/adam/Steph_C/my_thesis/data/FE_by_postoal_code_without_review.pkl')

In [158]:
df.shape

(1450, 24)

In [159]:
df.columns

Index(['business_id', 'stars_x', 'useful', 'funny', 'cool', 'text', 'date',
       'name', 'address', 'city', 'state', 'postal_code', 'stars_y',
       'review_count', 'is_open', 'attributes', 'categories', 'hours',
       'density', 'entropy', 'competitiveness', 'area_pop', 'accessibility',
       'complementary'],
      dtype='object')

## DF Construction
* add relevance

In [6]:
df = df.drop(columns=[ 'business_id', 'stars_x', 'useful', 'funny', 'cool', 'text',
       'date','address','is_open', 'stars_y','attributes','hours'])

In [7]:
# sort by name and review count 
df = df.sort_values(['name', 'review_count'],
              ascending = [True, False]).reset_index(drop=True)

In [8]:
# create relevance score
cnt = 0
new_df = pd.DataFrame()
for i in Counter(df.name):
    tmp = df[df.name==i].reset_index(drop=True)
    tmp['relevance']=''
    score = 6
    for j in range(len(tmp)):
        tmp['relevance'][j]=score
        score -=1
    new_df = pd.concat([new_df,tmp])

# check the shape
if new_df.shape[0] != df.shape[0]:
    print(f'There is a mistake creating the relevance score')
else:
    print(f'Relevance score added')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Relevance score added


In [9]:
# # breakdown affinity & complementary
# from operator import itemgetter
# indices = range(len(df['affinity'][0]))
# a_df = df['affinity'].transform({f'affinity_{i+1}': itemgetter(i) for i in indices})

# indices = range(len(df['complementary'][0]))
# c_df = df['complementary'].transform({f'complementary_{i+1}': itemgetter(i) for i in indices})

In [10]:
# new_df = new_df.join(a_df)
# new_df = new_df.join(c_df)

## Create Dataset

In [11]:
new_df.columns

Index(['name', 'city', 'state', 'postal_code', 'review_count', 'categories',
       'density', 'entropy', 'competitiveness', 'relevance'],
      dtype='object')

In [12]:
# split train test 
# 拿每一個餐廳最後一家分店當作 testting set (因為大部分只有兩家分店)

cnt = 0
train_df = pd.DataFrame()
test_df = pd.DataFrame()
for i in Counter(df.name):
    tmp = new_df[new_df.name==i]
    train_df = pd.concat([train_df ,tmp.iloc[:-1,:]])
    test_df = pd.concat([test_df ,tmp.iloc[-1:,:]])
    
# check if add = original shape
if train_df.shape[0] + test_df.shape[0] != new_df.shape[0]:
    print(f'There is something wrong with the splitting !!! ')
elif len(Counter(train_df.name)) != len(Counter(test_df.name)):
    print(f'The number of the restaurants is wrong !!!')
else:
    print(f'Perfect split')

Perfect split


In [13]:
# postal code and feature dict
postal_code_feature_dict = {}

for postal in Counter(df.postal_code):
    postal_code_feature_dict[postal]={}
    postal_code_feature_dict[postal]['density'] = df.loc[df['postal_code']==postal]['density'].iloc[0]
    postal_code_feature_dict[postal]['entropy'] = df.loc[df['postal_code']==postal]['entropy'].iloc[0]
    postal_code_feature_dict[postal]['competitiveness'] = df.loc[df['postal_code']==postal]['competitiveness'].iloc[0]

pd.DataFrame(postal_code_feature_dict).T.reset_index().rename(columns={'index': 'postal_code'})

Unnamed: 0,postal_code,density,entropy,competitiveness
0,46225,6.0,1.791759,-0.166667
1,46037,6.0,1.791759,-0.166667
2,33618,13.0,2.564949,-0.076923
3,33609,11.0,2.397895,-0.090909
4,63126,3.0,1.098612,-0.333333
...,...,...,...,...
334,18969,1.0,-0.000000,-1.000000
335,46235,1.0,-0.000000,-1.000000
336,08055,1.0,-0.000000,-1.000000
337,18915,1.0,-0.000000,-1.000000


In [14]:
# reconstruct testing set

new_test = pd.DataFrame()

for i in Counter(df.name):
#     print(i)
    subset = pd.DataFrame(postal_code_feature_dict).T.reset_index().rename(columns={'index': 'postal_code'})
    subset['relevance']=0
    subset = subset[~subset['postal_code'].isin(train_df[train_df.name==i].postal_code)]
#     print(f'After deleting train : {subset.shape}')
    subset = subset[~subset['postal_code'].isin(test_df[test_df.name==i].postal_code)]
#     print(f'After deleting test : {subset.shape}')
    subset['name']=i
    subset = pd.concat([subset , test_df[test_df.name==i][['name','postal_code','density', 'entropy', 'competitiveness','relevance']]])
#     print(f'After adding test : {subset.shape}')
    
    # check if the test is constructed properly
    if len(subset) + len(Counter(train_df[train_df.name==i].postal_code)) != len(Counter(df.postal_code)):
        print(f'{i} has some problem constructing the testing dataset')
        print(subset.shape , Counter(df[df.name==i].postal_code))
        break
     
    
    new_test = pd.concat([new_test , subset])
    

In [15]:
for i in ['density', 'entropy', 'competitiveness','relevance']:
    train_df[i] = train_df[i].astype('float')
    new_test[i] = new_test[i].astype('float')

## Model

In [16]:
from lightgbm import LGBMRanker
train_features = ['density', 'entropy', 'competitiveness']
model = LGBMRanker(objective="lambdarank")

In [17]:
get_group_size = lambda df: df.reset_index().groupby("name")['name'].count()

train_groups = get_group_size(train_df).to_numpy()
test_groups = get_group_size(new_test).to_numpy()

print(sum(train_groups) , sum(test_groups))

884 190990


### Training

In [18]:
print(len(Counter(train_df.name)) , len(Counter(new_test.name)))

566 566


In [19]:
print(train_df[train_features].shape ,new_test[train_features].shape )

(884, 3) (190990, 3)


In [20]:
# the original version 
# model.fit(
#     train_df[train_features],
#     train_df[['relevance']],
#     group=train_groups,
#     eval_set=[(new_test[train_features],new_test[['relevance']])],
#     eval_group=[test_groups],
#     eval_at=6,
#     eval_metric=['ndcg'])

######### SCORE ############
# valid_0's ndcg@6: 0.0315787
# valid_0's ndcg@6: 0.0315787
# valid_0's ndcg@6: 0.0315787

In [21]:
model.fit(train_df[train_features], train_df[['relevance']], group=train_groups)

LGBMRanker(objective='lambdarank')

### Testing

In [22]:
predict = model.predict(new_test[train_features])
new_test['predictions'] = predict

In [23]:
len(predict)

190990

In [24]:
new_test.head()

Unnamed: 0,postal_code,density,entropy,competitiveness,relevance,name,predictions
2,33618,13.0,2.564949,-0.076923,0.0,1933 Lounge,-0.287881
3,33609,11.0,2.397895,-0.090909,0.0,1933 Lounge,0.085094
4,63126,3.0,1.098612,-0.333333,0.0,1933 Lounge,-0.957807
5,62269,4.0,1.386294,-0.25,0.0,1933 Lounge,-0.187775
6,37064,11.0,2.397895,-0.090909,0.0,1933 Lounge,0.085094


### Evaluation 

In [142]:
# create list of list for query ranking
def get_ranking(df , top = None):
    
    """
    Turn the probability array into a list of lists for calculation.
    
    Parameters:
    df(DataFrame): the test dataframe
    
    Returns:
    prediction (list of lists): A list of predicted rankings for each query.
    actual (list of lists): A list of actual rankings for each query.
    """
 
    pred_list = []
    pred_rel = []
    true_list = []
    
    
    for res in Counter(df.name):
        
        tmp = df[df.name == res]
        a_sorted = tmp.sort_values(by=['relevance'])
        p_sorted = tmp.sort_values(by=['predictions'])
#         p_sorted = p_sorted[p_sorted.predictions>0]
        if top != None:
            p_sorted = p_sorted[:top]

        true_list.append(list(a_sorted[a_sorted.relevance!=0].postal_code))
        pred_list.append(list(p_sorted.postal_code))
        pred_rel.append(list(p_sorted.relevance))
        
        
        
    return pred_list, pred_rel , true_list

In [143]:
pred_list, pred_rel , true_list = get_ranking(new_test , top=20)

In [144]:
def _precision(predictions , actuals, k =3):
    
    """
    Calculate the precision at 3
    
    Returns: a list of precisions
    """
    
    precisions =[]
    for i in range(len(predictions)):
        score = 0
        for j in range(k):
            if predictions[i][j] in actuals[i]:
                score+=1
        precisions.append(score/k)
            
    return precisions
    

def _recall(predictions , actuals, k = 3):
    
    """
    Calculate the precision at 3
    
    Returns: a list of recalls
    """
    
    recalls =[]
    for i in range(len(actuals)):
        score = 0
        for j in actuals[i]:
            if j in predictions[i]:
                score+=1
        recalls.append(score/k)
    
    return recalls
    

In [149]:
def calculate_mrr(predictions, actuals):
    """
    Calculate the mean reciprocal rank (MRR) for a set of predictions and actual values.
    
    Parameters:
    predictions (list of lists): A list of predicted rankings sorted by probability.
    actual (list of lists): A list of actual rankings sorted by relevance.
    
    Returns:    
    float: A list of MRR scores.
    """
    reciprocal_ranks = []
    for i in range(len(predictions)):
        reciprocal_rank = 0
        for j in actuals[i]:
            if j in predictions[i]:
                rank = predictions[i].index(j) + 1
                reciprocal_rank += 1.0 / rank
        reciprocal_ranks.append(reciprocal_rank/len(actuals[i]))
    return sum(reciprocal_ranks) / len(reciprocal_ranks)


In [150]:
def calculate_ap(actual, prediction, k=None):
    """
    Calculate the average precision (AP) for a single query.

    Parameters:
    actual (set or list): A set or list of the actual relevant items.
    predicted (list): A list of predicted items.
    k (int): The maximum number of predicted items to consider.

    Returns:
    float: The AP score.
    """
    if k is not None:
        prediction = prediction[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(prediction):
        if p in actual and p not in prediction[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), len(prediction))


def calculate_map( predictions , actuals, k=None):
    """
    Calculate the mean average precision (MAP) for a set of queries.

    Parameters:
    actual (list of sets or lists): A list of sets or lists of the actual relevant items for each query.
    predicted (list of lists): A list of lists of predicted items for each query.
    k (int): The maximum number of predicted items to consider for each query.

    Returns:
    float: A list of MAP scores.
    """
    return [calculate_ap(a, p, k) for a, p in zip(actuals, predictions)]


In [151]:
def calculate_dcg_ndcg(predictions,k=None):
    """
    Calculate the DCG@k , NDCG@k for a set of queries.

    Parameters:
    actual (list of sets or lists): A list of sets or lists of the actual relevant items for each query.
    predicted (list of lists): A list of lists of predicted items for each query.
    k (int): The maximum number of predicted items to consider for each query.

    Returns:
    float: A list of DCG , NDCG scores.
    """
    dcg_list = []
    ndcg_list = []
    for i in range(len(predictions)):
        dcg =0
        idcg =0
        rank =1
    
        if k is not None:
            prediction = predictions[i][:k]
        else:
            prediction = predictions[i]
        # cal dcg
        for j in prediction:
            dcg += np.divide(float(j),np.log2(rank+1))
            rank +=1
        idcg = np.divide(5.0,np.log2(1+1))
        dcg_list.append(dcg)
        ndcg_list.append(np.divide(dcg,idcg))
        
    return dcg_list , ndcg_list

    

In [152]:
mrr_list = calculate_mrr(pred_list , true_list)
map_list = calculate_map(pred_list , true_list)
dcg_list , ndcg_list = calculate_dcg_ndcg(pred_rel, k=3)
precision_list = _precision(pred_list , true_list)
recall_list = _recall(pred_list , true_list)

In [153]:
cnt =0 
for i in range(len(dcg_list)):
    if ndcg_list[i]!=0:
        cnt+=1
print(cnt)

44


In [154]:
print(f'MRR : {np.mean(mrr_list)}')
print(f'MAP : {np.mean(map_list)}')
print(f'Precision at 3 : {np.mean(precision_list)}')
print(f'Recall at 3 : {np.mean(recall_list)}')
print(f'DCG at 3 : {np.mean(dcg_list)}')
print(f'NDCG at 3 : {np.mean(ndcg_list)}')

MRR : 0.07773851590106007
MAP : 0.07773851590106007
Precision at 3 : 0.025912838633686687
Recall at 3 : 0.025912838633686687
DCG at 3 : 0.34275618374558303
NDCG at 3 : 0.0685512367491166


# Results

In [None]:
# Only one true answer with all pred
# MRR : 0.08307696445985394
# MAP : 0.08307696445985367
# Precision at 3 : 0.025912838633686687
# Recall at 3 : 0.33333333333333326

In [None]:
# one true answer with top 20 pred
# MRR : 0.07773851590106007
# MAP : 0.07773851590106007
# Precision at 3 : 0.025912838633686687
# Recall at 3 : 0.025912838633686687
# DCG at 3 : 0.34275618374558303
# NDCG at 3 : 0.0685512367491166

In [141]:
a = [1,2,3,4]
print(a[:3])

[1, 2, 3]
