# Pointwise
* predict the relevance scores

# Import

In [1]:
from collections import Counter
import numpy as np  
import pandas as pd 
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt 
from sklearn.preprocessing import scale 
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# models
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMRanker

In [2]:
# 導入資料
res_df = pd.read_pickle ('/home/adam/Steph_C/my_thesis/data/ORI_by_postal_code_L_dropped.pkl').reset_index(drop=True)
train_df = pd.read_pickle('/home/adam/Steph_C/my_thesis/data/Train_by_postoal_code_without_review_pointwise_v2.pkl')
test_df = pd.read_pickle('/home/adam/Steph_C/my_thesis/data/Test_by_postoal_code_without_review_pointwise_v2.pkl')

In [3]:
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [4]:
len(Counter(train_df.name))

1053

In [5]:
train_df.shape

(15355, 27)

In [6]:
test_df.shape

(8679, 27)

In [7]:
res_df = res_df.drop_duplicates(subset=['name','postal_code']).reset_index(drop=True)

In [8]:
# create a branch cnt dictionary

branch_dict = {}

for i in Counter(res_df.name):
    tmp = res_df[res_df.name ==i]
    cnt = len(tmp)
    if cnt not in branch_dict:
        branch_dict[cnt]=[]
    else:
        branch_dict[cnt].append(i)
    

# Models

In [9]:
def _precision(predictions , actuals, k = None):
    
    """
    Calculate the precision at k
    
    Returns: a list of precisions
    """
    
    precisions =[]

    for i in range(len(predictions)):
        
        prediction = predictions[i]

        if  k != None:
            prediction =  predictions[i][:k]
        
        score = 0
        for j in prediction:
            if j in actuals[i]:
                score+=1
        precisions.append(score/len(prediction))
    return precisions
    

def _recall(predictions , actuals, k = None):
    
    """
    Calculate the precision at k
    
    Returns: a list of recalls
    """
    recalls =[]
    
    for i in range(len(predictions)):
        
        prediction =  predictions[i]
        
        if  k != None:
            prediction =  predictions[i][:k]
        
        score = 0
        for j in range(len(prediction)):
            if prediction[j] in actuals[i]:
                score+=1
        recalls.append(score/len(actuals[i]))
    
    return recalls



In [10]:
def calculate_mrr(predictions, actuals):
    """
    Calculate the mean reciprocal rank (MRR) for a set of predictions and actual values.
    
    Parameters:
    predictions (list of lists): A list of predicted rankings sorted by probability.
    actual (list of lists): A list of actual rankings sorted by relevance.
    
    Returns:    
    float: A list of MRR scores.
    """
    mrr_list = []
    for i in range(len(predictions)):
        reciprocal_rank = 0
        if actuals[i][0] in predictions[i]:
            reciprocal_rank = 1/ (predictions[i].index(actuals[i][0]) + 1)
        mrr_list.append(reciprocal_rank)
    return mrr_list

In [11]:
def calculate_map( predictions , actuals, k=None):
    """
    Calculate the mean average precision (MAP) for a set of queries.

    Parameters:
    actual (list of sets or lists): A list of sets or lists of the actual relevant items for each query.
    predicted (list of lists): A list of lists of predicted items for each query.
    k (int): The maximum number of predicted items to consider for each query.

    Returns:
    float: A list of MAP scores.
    """
    
    map_list = []
    

    for i in range(len(predictions)):
        
        ap_list = []
        hit = 0 
        cnt = 0 
        
        prediction =  predictions[i]
        
        if k != None:
            prediction =  predictions[i][:k]
        
        
        for j in prediction:
            if j in actuals[i]:
                hit+=1
                cnt+=1
                ap_list.append(hit/cnt)
            else:
                cnt+=1
        map_list.append(np.mean(ap_list))
    
    return map_list
                

In [12]:
def calculate_dcg_ndcg( predictions , actuals, rel ,k=None):
    """
    Calculate the DCG@k , NDCG@k for a set of queries.

    Parameters:
    actual (list of sets or lists): A list of sets or lists of the actual relevant items for each query.
    predicted (list of lists): A list of lists of predicted items for each query.
    k (int): The maximum number of predicted items to consider for each query.

    Returns:
    float: A list of DCG , NDCG scores.
    """
    dcg_list = []
    ndcg_list = []
    
    for i in range(len(predictions)):
        dcg =0
        idcg =0
        
        prediction = predictions[i]
        
        if k != None:
            prediction = predictions[i][:k]
        
        for j in range(len(actuals[i])):
            if actuals[i][j] in prediction:
                rank = prediction.index(actuals[i][j]) + 1
                dcg += np.divide(float(rel[i][j]),np.log2(rank+1))
                idcg += np.divide(float(rel[i][j]),np.log2((j+1)+1))
        dcg_list.append(dcg)
        print(rel[i], prediction,actuals[i]  )
        print(i,dcg,idcg )
        ndcg_list.append(np.divide(dcg,idcg))
        
    return dcg_list , ndcg_list

In [13]:
# create list of list for query ranking
def get_ranking(df ):
    
    """
    Turn the probability array into a list of lists for calculation.
    
    Parameters:
    df(DataFrame): the test dataframe
    
    Returns:
    prediction (list of lists): A list of predicted rankings for each query.
    actual (list of lists): A list of actual rankings for each query.
    """
 
    pred_list = []
    pred_rel = []
    true_list = []
    
    
    for res in Counter(df.name):
        
        tmp = df[df.name == res]
        a_sorted = tmp.sort_values(by=['relevance'],ascending=[False])
        p_sorted = tmp.sort_values(by=['predictions'],ascending=[False])
#         p_sorted = p_sorted[p_sorted.predictions>0]

        true_list.append(list(a_sorted[a_sorted.relevance!=0].postal_code))
        pred_list.append(list(p_sorted.postal_code))
        pred_rel.append(list(a_sorted[a_sorted.relevance!=0].relevance))
        
        
        
    return pred_list, pred_rel , true_list

# Model

In [14]:
# models
train_features=['density', 'entropy', 'competitiveness','area_pop', 'accessibility','complementary']

model = LogisticRegression()
model.fit(train_df[train_features], train_df[['relevance']])
predict = model.predict(test_df[train_features])
test_df['predictions'] = predict

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Evaluate by branch cnt

In [15]:
# models = [LR, RF, DTC, KNC, SVC, GNB, LGBM]
# model_name =['LR', 'RF', 'DTC', 'KNC', 'SVC', 'GNB','LGBMRanker']
# score_dict = {}

# for i in range(len(models)):
    
#     score_dict[model_name[i]]={}
#     model = models[i]
#     # Train
#     if model_name[i] != 'LGBMRanker':
#         model.fit(train_df[train_features], train_df[['relevance']])
#     else:
#         model.fit(train_df[train_features], train_df[['relevance']], group=train_groups)
    
#     # Predict
#     predict = model.predict(test_df[train_features])
#     test_df['predictions'] = predict
#     pred_list, pred_rel , true_list = get_ranking(test_df)

#     # Evaluation
#     mrr_list = calculate_mrr(pred_list , true_list)
#     map_list = calculate_map(pred_list , true_list)
#     dcg_list , ndcg_list = calculate_dcg_ndcg(pred_list , true_list,pred_rel)
#     precision_list_1 = _precision(pred_list , true_list, k=1)
#     recall_list_1 = _recall(pred_list , true_list,k=1)
#     precision_list_3 = _precision(pred_list , true_list, k=3)
#     recall_list_3 = _recall(pred_list , true_list,k=3)
#     precision_list = _precision(pred_list , true_list)
#     recall_list = _recall(pred_list , true_list)

#     score_dict[model_name[i]]['precision @ 1'] = np.mean(precision_list_1)
#     score_dict[model_name[i]]['recall @ 1'] = np.mean(recall_list_1)
#     score_dict[model_name[i]]['precision @ 3 '] = np.mean(precision_list_3)
#     score_dict[model_name[i]]['recall @ 3'] = np.mean(recall_list_3)
#     score_dict[model_name[i]]['precision'] = np.mean(precision_list)
#     score_dict[model_name[i]]['recall'] = np.mean(recall_list)
#     score_dict[model_name[i]]['mrr'] = np.mean(mrr_list)
#     score_dict[model_name[i]]['map'] = np.mean(map_list)
#     score_dict[model_name[i]]['dcg'] = np.mean(dcg_list)
#     score_dict[model_name[i]]['ndcg'] = np.mean(ndcg_list)

In [16]:
score_dict = {}
for i in sorted(branch_dict):
#     print(i)
    model_name = 'branch_cnt_'+str(i)
    score_dict[model_name]={}
    tmp = test_df[test_df.name.isin(branch_dict[i])]
    pred_list, pred_rel , true_list = get_ranking(tmp)

    # Evaluation
    mrr_list = calculate_mrr(pred_list , true_list)
    map_list = calculate_map(pred_list , true_list)
    dcg_list , ndcg_list = calculate_dcg_ndcg(pred_list , true_list,pred_rel)
    precision_list_1 = _precision(pred_list , true_list, k=1)
    recall_list_1 = _recall(pred_list , true_list,k=1)
    precision_list_3 = _precision(pred_list , true_list, k=3)
    recall_list_3 = _recall(pred_list , true_list,k=3)
    precision_list = _precision(pred_list , true_list)
    recall_list = _recall(pred_list , true_list)

    score_dict[model_name]['precision @ 1'] = np.mean(precision_list_1)
    score_dict[model_name]['recall @ 1'] = np.mean(recall_list_1)
    score_dict[model_name]['precision @ 3 '] = np.mean(precision_list_3)
    score_dict[model_name]['recall @ 3'] = np.mean(recall_list_3)
    score_dict[model_name]['precision'] = np.mean(precision_list)
    score_dict[model_name]['recall'] = np.mean(recall_list)
    score_dict[model_name]['mrr'] = np.mean(mrr_list)
    score_dict[model_name]['map'] = np.mean(map_list)
    score_dict[model_name]['dcg'] = np.mean(dcg_list)
    score_dict[model_name]['ndcg'] = np.mean(ndcg_list)

[12.0] ['33759', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['33759']
0 12.0 12.0
[12.0] ['33558', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['33558']
1 12.0 12.0
[11.0] ['19102', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['19102']
2 11.0 11.0
[11.0] ['19104', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['19104']
3 11.0 11.0
[11.0] ['85706', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['85706']
4 11.0 11.0
[12.0] ['37064', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['37064']
5 12.0 12.0
[12.0] ['70065', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['70065']
6 12.0 12.0
[12.0] ['19130', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['19130']
7 12.0 12.0
[11.0] ['33612', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['33612']
8 11.0 11.0
[11.0] ['37172', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['

[10.0] ['85745', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['85745']
0 10.0 10.0
[10.0] ['37211', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['37211']
1 10.0 10.0
[10.0] ['46268', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['46268']
2 10.0 10.0
[11.0] ['46250', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['46250']
3 11.0 11.0
[10.0] ['19406', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['19406']
4 10.0 10.0
[12.0] ['89595', '89501', '89519', '89508', '89503', '89439', '89440', '89512'] ['89595']
5 12.0 12.0
[10.0] ['19107', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['19107']
6 10.0 10.0
[11.0] ['19125', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['19125']
7 11.0 11.0
[11.0] ['33702', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['33702']
8 11.0 11.0
[10.0] ['33781', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['

[11.0, 9.0] ['19003', '19107', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['19107', '19003']
0 15.940227289286032 16.678367782143116
[9.0, 8.0] ['37066', '37214', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['37214', '37066']
1 13.678367782143116 14.04743802857166
[10.0, 8.0] ['08053', '46032', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['46032', '08053']
2 14.309297535714574 15.04743802857166
[12.0, 10.0] ['19064', '85712', '89501', '89521', '89509', '89506', '89439', '89440', '89519'] ['85712', '19064']
3 17.57115704285749 18.309297535714574
[11.0, 9.0] ['89595', '89503', '89512', '89440', '89521', '89502', '89439', '89519'] ['89503', '89512']
4 11.440227289286032 16.678367782143116
[12.0, 8.0] ['19341', '33612', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] ['19341', '33612']
5 17.047438028571662 17.047438028571662
[10.0, 9.0] ['08002', '33607', '89509', '89440', '89521', '89439', '89502', '89501', '89511'] 

In [17]:
pd.DataFrame(score_dict).round(3)

Unnamed: 0,branch_cnt_2,branch_cnt_3,branch_cnt_4,branch_cnt_5,branch_cnt_6,branch_cnt_7,branch_cnt_8,branch_cnt_9,branch_cnt_10,branch_cnt_11,branch_cnt_12
precision @ 1,1.0,0.984,0.929,0.933,0.895,1.0,0.8,0.778,1.0,1.0,0.9
recall @ 1,1.0,0.984,0.464,0.467,0.447,0.333,0.267,0.259,0.25,0.25,0.225
precision @ 3,0.333,0.333,0.667,0.667,0.667,1.0,0.933,0.926,1.0,1.0,0.967
recall @ 3,1.0,1.0,1.0,1.0,1.0,1.0,0.933,0.926,0.75,0.75,0.725
precision,0.126,0.126,0.224,0.225,0.227,0.3,0.326,0.312,0.364,0.364,0.367
recall,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
mrr,1.0,0.992,0.679,0.752,0.728,0.5,0.692,0.602,0.617,0.611,0.6
map,1.0,0.992,0.97,0.972,0.956,1.0,0.928,0.92,1.0,1.0,0.968
dcg,11.527,10.862,16.69,16.014,15.321,19.325,16.724,16.543,19.805,18.575,17.05
ndcg,1.0,0.994,0.956,0.963,0.94,0.934,0.916,0.885,0.921,0.891,0.903
