# Pointwise
* predict the relevance scores

# Import

In [1]:
from collections import Counter
import numpy as np  
import pandas as pd 
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt 
from sklearn.preprocessing import scale 
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import itertools

# models
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMRanker

In [2]:
RANDOM_STATE = 24

In [3]:
# 導入資料
train_df = pd.read_pickle('/home/adam/Steph_C/my_thesis/data/Train_by_postoal_code_without_review_pointwise_v3_3.pkl')
test_df = pd.read_pickle('/home/adam/Steph_C/my_thesis/data/Test_by_postoal_code_without_review_pointwise_v3_3.pkl')

In [4]:
ori_train_df = train_df.reset_index(drop=True)
ori_test_df = test_df.reset_index(drop=True)

In [5]:
# create pairwise datasets

def pairwise_transform(df):
    
    COL_NAME = ['name','loc_a','loc_b','density', 'entropy',\
                'competitiveness','area_pop', 'accessibility',\
                'complementary','relevance','pair_importance']
    
    new_df = pd.DataFrame(columns=COL_NAME)
    
    for res in Counter(df.name):
        new_row = {}
        tmp = df[df.name==res]
        loc_a_l = []
        loc_b_l = []
        comb_list = list(itertools.combinations(list(tmp.postal_code),2))
        
        for sets in comb_list:  #[('46142', '46250'), ('46142', '19341'), ('46142', '46123')]
            rel_a = list(tmp.loc[tmp.postal_code==sets[0]].relevance)[0]
            rel_b = list(tmp.loc[tmp.postal_code==sets[1]].relevance)[0]
            
            if rel_a != rel_b and sets[0] not in loc_a_l and sets[1] not in loc_b_l:
                loc_a_l.append(sets[0])
                loc_b_l.append(sets[1])
                new_row['name'] = res
                new_row['loc_a'] = list(tmp.loc[tmp.postal_code==sets[0]].postal_code)[0]
                new_row['loc_b'] = list(tmp.loc[tmp.postal_code==sets[1]].postal_code)[0]
                new_row['density'] = list(tmp.loc[tmp.postal_code==sets[0]].density)[0]-list(tmp.loc[tmp.postal_code==sets[1]].density)[0]
                new_row['entropy'] = list(tmp.loc[tmp.postal_code==sets[0]].entropy)[0]-list(tmp.loc[tmp.postal_code==sets[1]].entropy)[0]
                new_row['competitiveness'] = list(tmp.loc[tmp.postal_code==sets[0]].competitiveness)[0]-list(tmp.loc[tmp.postal_code==sets[1]].competitiveness)[0]
                new_row['area_pop'] = list(tmp.loc[tmp.postal_code==sets[0]].area_pop)[0]-list(tmp.loc[tmp.postal_code==sets[1]].area_pop)[0]
                new_row['accessibility'] = list(tmp.loc[tmp.postal_code==sets[0]].accessibility)[0]-list(tmp.loc[tmp.postal_code==sets[1]].accessibility)[0]
                new_row['complementary'] = list(tmp.loc[tmp.postal_code==sets[0]].complementary)[0]-list(tmp.loc[tmp.postal_code==sets[1]].complementary)[0]
                new_row['relevance'] = int(rel_a >rel_b )
                new_row['pair_importance'] = list(tmp.loc[tmp.postal_code==sets[0]].relevance)[0]+list(tmp.loc[tmp.postal_code==sets[1]].relevance)[0]
            new_df = new_df.append(new_row,ignore_index=True)  
    return new_df

In [6]:
train_df = pairwise_transform(ori_train_df)
test_df = pairwise_transform(ori_test_df)

In [7]:
train_df = train_df.drop_duplicates().reset_index(drop = True)
test_df = test_df.drop_duplicates().reset_index(drop = True)

In [8]:
train_df = train_df.sample(frac=1 , random_state = RANDOM_STATE).reset_index(drop=True)
test_df = test_df.sample(frac=1 , random_state = RANDOM_STATE).reset_index(drop=True)

In [9]:
for i in ['density', 'entropy', 'competitiveness','area_pop', 'accessibility','complementary','relevance']:
    train_df[i] = train_df[i].astype('float')
    test_df[i] = test_df[i].astype('float')

In [10]:
len(Counter(train_df.name))

405

In [11]:
train_df.shape

(1136, 11)

In [12]:
test_df.shape

(712, 11)

# Models

In [None]:
def _precision(predictions , actuals, k = None):
    
    """
    Calculate the precision at k
    
    Returns: a list of precisions
    """
    
    precisions =[]

    for i in range(len(predictions)):
        
        prediction = predictions[i]

        if  k != None:
            prediction =  predictions[i][:k]
        
        score = 0
        for j in prediction:
            if j in actuals[i]:
                score+=1
        if len(prediction) != 0:
            precisions.append(score/len(prediction))
        else:
            precisions.append(0)
    return precisions
    

def _recall(predictions , actuals, k = None):
    
    """
    Calculate the precision at k
    
    Returns: a list of recalls
    """
    recalls =[]
    
    for i in range(len(predictions)):
        
        prediction =  predictions[i]
        
        if  k != None:
            prediction =  predictions[i][:k]
        
        score = 0
        for j in range(len(prediction)):
            if prediction[j] in actuals[i]:
                score+=1
        recalls.append(score/len(actuals[i]))
    
    return recalls



In [None]:
def calculate_mrr(predictions, actuals):
    """
    Calculate the mean reciprocal rank (MRR) for a set of predictions and actual values.
    
    Parameters:
    predictions (list of lists): A list of predicted rankings sorted by probability.
    actual (list of lists): A list of actual rankings sorted by relevance.
    
    Returns:    
    float: A list of MRR scores.
    """
    mrr_list = []
    for i in range(len(predictions)):
        reciprocal_rank = 0
        if actuals[i][0] in predictions[i]:
            reciprocal_rank = 1/ (predictions[i].index(actuals[i][0]) + 1)
        mrr_list.append(reciprocal_rank)
    return mrr_list

In [None]:
def calculate_map( predictions , actuals, k=None):
    """
    Calculate the mean average precision (MAP) for a set of queries.

    Parameters:
    actual (list of sets or lists): A list of sets or lists of the actual relevant items for each query.
    predicted (list of lists): A list of lists of predicted items for each query.
    k (int): The maximum number of predicted items to consider for each query.

    Returns:
    float: A list of MAP scores.
    """
    
    map_list = []
    

    for i in range(len(predictions)):
        
        ap_list = []
        hit = 0 
        cnt = 0 
        
        prediction =  predictions[i]
        
        if k != None:
            prediction =  predictions[i][:k]
        
        
        for j in prediction:
            if j in actuals[i]:
                hit+=1
                cnt+=1
                ap_list.append(hit/cnt)
            else:
                cnt+=1
        if len(ap_list) != 0:
            map_list.append(np.mean(ap_list))
        else:
            map_list.append(0)
    
    return map_list
                

In [None]:
def calculate_dcg_ndcg( predictions , actuals, rel ,k=None):
    """
    Calculate the DCG@k , NDCG@k for a set of queries.

    Parameters:
    actual (list of sets or lists): A list of sets or lists of the actual relevant items for each query.
    predicted (list of lists): A list of lists of predicted items for each query.
    k (int): The maximum number of predicted items to consider for each query.

    Returns:
    float: A list of DCG , NDCG scores.
    """
    dcg_list = []
    ndcg_list = []
    
    for i in range(len(predictions)):
        dcg =0
        idcg =0
        
        prediction = predictions[i]
        
        if k != None:
            prediction = predictions[i][:k]
        
        for j in range(len(actuals[i])):
            if actuals[i][j] in prediction:
                rank = prediction.index(actuals[i][j]) + 1
                dcg += np.divide(float(rel[i][j]),np.log2(rank+1))
            idcg += np.divide(float(rel[i][j]),np.log2((j+1)+1))
        dcg_list.append(dcg)
        if np.divide(dcg,idcg) > 1:
            print(rel[i], prediction,actuals[i]  )
            print(i,dcg,idcg  , 'Wrong !!!')
        ndcg_list.append(np.divide(dcg,idcg))
        
    return dcg_list , ndcg_list

In [None]:
# create list of list for query ranking

def get_ranking_list(df):
    
    ranked_list = []
    
    for i in range(len(df)):
        loc_a = df.loc_a[i]
        loc_b = df.loc_b[i]

        if df.predictions[i]==1: # loc_a > loc_b
            if loc_a in ranked_list:
                ranked_list.append(loc_b)
            elif loc_b in ranked_list:
                ranked_list.insert(loc_a,ranked_list.index(loc_b))
            else:
                ranked_list.append(loc_a)
                ranked_list.append(loc_b)
        else:                     # loc_b > loc_a
            if loc_a in ranked_list:
                ranked_list.insert(loc_b,ranked_list.index(loc_a))
            elif loc_b in ranked_list:
                ranked_list.append(loc_a)
            else:
                ranked_list.append(loc_b)
                ranked_list.append(loc_a)

    return ranked_list


def get_ranking_pair(df,ori_df):
    
    """
    Turn the probability array into a list of lists for calculation.
    
    Parameters:
    df(DataFrame): the test dataframe
    
    Returns:
    prediction (list of lists): A list of predicted rankings for each query.
    actual (list of lists): A list of actual rankings for each query.
    """
 
    pred_list = []
    pred_rel = []
    true_list = []
    
    
    for res in Counter(df.name):
        
        p_sorted = []
        a_sorted = []
        
        # prediction
        tmp = df[df.name == res].sort_values(by=['pred_importance'],ascending=[False]).reset_index(drop=True)
        p_sorted = get_ranking_list(tmp)
 
        # true
        ori_tmp = df[df.name == res].sort_values(by=['pair_importance'],ascending=[False]).reset_index(drop=True)
        a_sorted = get_ranking_list(ori_tmp)
        
        
        true_list.append(a_sorted)
        true_rel.append()
        pred_list.append(p_sorted)

        
        
        
    return pred_list, true_rel , true_list

In [20]:
p_list = [(5,8),(6,8),(7,5)]
p_list.append((7,9))
sorted(p_list,reverse = True)

[(7, 9), (7, 5), (6, 8), (5, 8)]

In [28]:
p_list = [(5,8),(6,8),(7,5)]
p_dict = {}

cnt =0
for i in p_list:
    p_dict[i] = 12-cnt
    cnt+=1

for i in range(len(p_list)):
    print(p_dict)
    for j in range(len(p_list)):
        if i !=j:
            if sorted(p_list[i] ,reverse=True) < sorted(p_list[j],reverse = True) and p_dict[p_list[i]] > p_dict[p_list[j]]:
                tmp = p_dict[p_list[i]]
                p_dict[p_list[i]] = p_dict[p_list[j]]
                p_dict[p_list[j]] = tmp
p_dict
                      
    

{(5, 8): 12, (6, 8): 11, (7, 5): 10}
{(5, 8): 11, (6, 8): 12, (7, 5): 10}
{(5, 8): 11, (6, 8): 12, (7, 5): 10}


{(5, 8): 11, (6, 8): 12, (7, 5): 10}

In [23]:
sorted((5,8),reverse = True) > sorted((6,8),reverse = True)

False

# Model

In [29]:
# try
train_features=['density', 'entropy', 'competitiveness','area_pop', 'accessibility','complementary']

get_group_size = lambda df: df.reset_index().groupby("name")['name'].count()

train_groups = get_group_size(train_df).to_numpy()
test_groups = get_group_size(test_df).to_numpy()
# predict relevance score
LGBM = LGBMRanker(objective="lambdarank",random_state=RANDOM_STATE)
LGBM.fit(train_df[train_features], train_df[['relevance']], group=train_groups)
predict = LGBM.predict(test_df[train_features])
test_df['pred_rel'] = predict

# # predict importance
# LR = LogisticRegression(random_state=RANDOM_STATE)
# LR.fit(train_df[train_features], train_df[['pair_importance']])
# predict = LR.predict(test_df[train_features])
# test_df['pred_importance'] = predict

# pred_list, pred_rel , true_list = get_ranking_pair(test_df,ori_test_df)

In [30]:
for i in Counter(test_df[:5].name):
    tmp = test_df[test_df.name== i]
    
    print(tmp)

                           name  loc_a  loc_b  density   entropy  \
0    Sakura Japanese Restaurant  37076  63053     65.0  3.483729   
629  Sakura Japanese Restaurant  19020  37076     22.0  0.200665   

     competitiveness  area_pop  accessibility  complementary  relevance  \
0                0.0    3998.0            0.0            0.0        1.0   
629              0.0     431.0            0.0            0.0        0.0   

     pair_importance  pred_rel  pred_importance  
0                9.0  7.548513             12.0  
629             17.0  2.411267             12.0  
                name  loc_a  loc_b  density   entropy  competitiveness  \
1    Manhattan Bagel  19083  18914     37.0  1.108957              0.0   
237  Manhattan Bagel  18914  08057    -21.0 -0.885920              0.0   
338  Manhattan Bagel  08034  19083      6.0  0.233721              0.0   
611  Manhattan Bagel  08057  93190     32.0  1.845827              0.0   

     area_pop  accessibility  complementary  rel

In [None]:
# models
LR = LogisticRegression(random_state=RANDOM_STATE)
RF = RandomForestClassifier(random_state=RANDOM_STATE)
DTC = DecisionTreeClassifier(random_state=RANDOM_STATE)
KNC = KNeighborsClassifier()
SVC = svm.SVC(random_state=RANDOM_STATE)
GNB = GaussianNB()
LGBM = LGBMRanker(objective="lambdarank",random_state=RANDOM_STATE)

train_features=['density', 'entropy', 'competitiveness','area_pop', 'accessibility','complementary']

get_group_size = lambda df: df.reset_index().groupby("name")['name'].count()

train_groups = get_group_size(train_df).to_numpy()
test_groups = get_group_size(test_df).to_numpy()

print(sum(train_groups) , sum(test_groups))

In [None]:
models = [LR, RF, DTC, KNC, SVC, GNB, LGBM]
model_name =['LR', 'RF', 'DTC', 'KNC', 'SVC', 'GNB','LGBMRanker']
score_dict = {}

for i in range(len(models)):
    score_dict[model_name[i]]={}
    model = models[i]
    # Train
    if model_name[i] != 'LGBMRanker':
        model.fit(train_df[train_features], train_df[['relevance']])
    else:
        model.fit(train_df[train_features], train_df[['relevance']], group=train_groups)
    
    # Predict
    predict = model.predict(test_df[train_features])
    test_df['predictions'] = predict
    pred_list, pred_rel , true_list = get_ranking(test_df)

    # Evaluation
    mrr_list = calculate_mrr(pred_list , true_list)
    map_list = calculate_map(pred_list , true_list)
    dcg_list , ndcg_list = calculate_dcg_ndcg(pred_list , true_list,pred_rel)
    precision_list_1 = _precision(pred_list , true_list, k=1)
    recall_list_1 = _recall(pred_list , true_list,k=1)
    precision_list_3 = _precision(pred_list , true_list, k=3)
    recall_list_3 = _recall(pred_list , true_list,k=3)
    precision_list = _precision(pred_list , true_list)
    recall_list = _recall(pred_list , true_list)

    score_dict[model_name[i]]['precision @ 1'] = np.mean(precision_list_1)
    score_dict[model_name[i]]['recall @ 1'] = np.mean(recall_list_1)
    score_dict[model_name[i]]['precision @ 3 '] = np.mean(precision_list_3)
    score_dict[model_name[i]]['recall @ 3'] = np.mean(recall_list_3)
    score_dict[model_name[i]]['precision'] = np.mean(precision_list)
    score_dict[model_name[i]]['recall'] = np.mean(recall_list)
    score_dict[model_name[i]]['mrr'] = np.mean(mrr_list)
    score_dict[model_name[i]]['map'] = np.mean(map_list)
    score_dict[model_name[i]]['dcg'] = np.mean(dcg_list)
    score_dict[model_name[i]]['ndcg'] = np.mean(ndcg_list)

# Evaluation

In [None]:
pd.DataFrame(score_dict).round(3)