# Pointwise
* predict the relevance scores

# Import

In [1]:
from collections import Counter
import numpy as np  
import pandas as pd 
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt 
from sklearn.preprocessing import scale 
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# models
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMRanker

In [2]:
RANDOM_STATE = 24

In [3]:
# 導入資料
res_df = pd.read_pickle ('/home/adam/Steph_C/my_thesis/data/ORI_by_postal_code.pkl').reset_index(drop=True)
train_df = pd.read_pickle('/home/adam/Steph_C/my_thesis/data/Train_by_postoal_code_without_review_pointwise_v1_1.pkl')
test_df = pd.read_pickle('/home/adam/Steph_C/my_thesis/data/Test_by_postoal_code_without_review_pointwise_v1_1.pkl')

In [4]:
# shuffle
train_df = train_df.sample(frac=1 , random_state = RANDOM_STATE).reset_index(drop=True)
test_df = test_df.sample(frac=1 , random_state = RANDOM_STATE).reset_index(drop=True)

In [5]:
len(Counter(train_df.name))

565

In [6]:
train_df.shape

(4650, 11)

In [7]:
test_df.shape

(3900, 11)

In [8]:
res_df = res_df.drop_duplicates(subset=['name','postal_code']).reset_index(drop=True)

In [9]:
# create a branch cnt dictionary

branch_dict = {}

for i in Counter(res_df.name):
    tmp = res_df[res_df.name ==i]
    cnt = len(tmp)
    if cnt not in branch_dict:
        branch_dict[cnt]=[]
    else:
        branch_dict[cnt].append(i)
    

# Models

In [10]:
def _precision(predictions , actuals, k = None):
    
    """
    Calculate the precision at k
    
    Returns: a list of precisions
    """
    
    precisions =[]

    for i in range(len(predictions)):
        
        prediction = predictions[i]

        if  k != None:
            prediction =  predictions[i][:k]
        
        score = 0
        for j in prediction:
            if j in actuals[i]:
                score+=1
        if len(prediction) != 0:
            precisions.append(score/len(prediction))
        else:
            precisions.append(0)
    return precisions
    

def _recall(predictions , actuals, k = None):
    
    """
    Calculate the precision at k
    
    Returns: a list of recalls
    """
    recalls =[]
    
    for i in range(len(predictions)):
        
        prediction =  predictions[i]
        
        if  k != None:
            prediction =  predictions[i][:k]
        
        score = 0
        for j in range(len(prediction)):
            if prediction[j] in actuals[i]:
                score+=1
        recalls.append(score/len(actuals[i]))
    
    return recalls



In [11]:
def calculate_mrr(predictions, actuals):
    """
    Calculate the mean reciprocal rank (MRR) for a set of predictions and actual values.
    
    Parameters:
    predictions (list of lists): A list of predicted rankings sorted by probability.
    actual (list of lists): A list of actual rankings sorted by relevance.
    
    Returns:    
    float: A list of MRR scores.
    """
    mrr_list = []
    for i in range(len(predictions)):
        reciprocal_rank = 0
        if actuals[i][0] in predictions[i]:
            reciprocal_rank = 1/ (predictions[i].index(actuals[i][0]) + 1)
        mrr_list.append(reciprocal_rank)
    return mrr_list

In [12]:
def calculate_map( predictions , actuals, k=None):
    """
    Calculate the mean average precision (MAP) for a set of queries.

    Parameters:
    actual (list of sets or lists): A list of sets or lists of the actual relevant items for each query.
    predicted (list of lists): A list of lists of predicted items for each query.
    k (int): The maximum number of predicted items to consider for each query.

    Returns:
    float: A list of MAP scores.
    """
    
    map_list = []
    

    for i in range(len(predictions)):
        
        ap_list = []
        hit = 0 
        cnt = 0 
        
        prediction =  predictions[i]
        
        if k != None:
            prediction =  predictions[i][:k]
        
        
        for j in prediction:
            if j in actuals[i]:
                hit+=1
                cnt+=1
                ap_list.append(hit/cnt)
            else:
                cnt+=1
        if len(ap_list) != 0:
            map_list.append(np.mean(ap_list))
        else:
            map_list.append(0)
    
    return map_list
                

In [13]:
def calculate_dcg_ndcg( predictions , actuals, rel ,k=None):
    """
    Calculate the DCG@k , NDCG@k for a set of queries.

    Parameters:
    actual (list of sets or lists): A list of sets or lists of the actual relevant items for each query.
    predicted (list of lists): A list of lists of predicted items for each query.
    k (int): The maximum number of predicted items to consider for each query.

    Returns:
    float: A list of DCG , NDCG scores.
    """
    dcg_list = []
    ndcg_list = []
    
    for i in range(len(predictions)):
        dcg =0
        idcg =0
        
        prediction = predictions[i]
        
        if k != None:
            prediction = predictions[i][:k]
        
        for j in range(len(actuals[i])):
            if actuals[i][j] in prediction:
                rank = prediction.index(actuals[i][j]) + 1
                dcg += np.divide(float(rel[i][j]),np.log2(rank+1))
            idcg += np.divide(float(rel[i][j]),np.log2((j+1)+1))
        dcg_list.append(dcg)
        if np.divide(dcg,idcg) > 1:
            print(rel[i], prediction,actuals[i]  )
            print(i,dcg,idcg  , 'Wrong !!!')
        ndcg_list.append(np.divide(dcg,idcg))
        
    return dcg_list , ndcg_list

In [14]:
# create list of list for query ranking
def get_ranking(df ):
    
    """
    Turn the probability array into a list of lists for calculation.
    
    Parameters:
    df(DataFrame): the test dataframe
    
    Returns:
    prediction (list of lists): A list of predicted rankings for each query.
    actual (list of lists): A list of actual rankings for each query.
    """
 
    pred_list = []
    pred_rel = []
    true_list = []
    
    
    for res in Counter(df.name):
        
        tmp = df[df.name == res]
        a_sorted = tmp.sort_values(by=['relevance'],ascending=[False])
        p_sorted = tmp.sort_values(by=['predictions'],ascending=[False])
        p_sorted = p_sorted[p_sorted.predictions>0]

        true_list.append(list(a_sorted[a_sorted.relevance!=0].postal_code))
        pred_list.append(list(p_sorted.postal_code))
        pred_rel.append(list(a_sorted[a_sorted.relevance!=0].relevance))
        
        
        
    return pred_list, pred_rel , true_list

# Model

In [15]:
# models
train_features=['density', 'entropy', 'competitiveness','area_pop', 'accessibility','complementary']

model = LogisticRegression(random_state=RANDOM_STATE)
model.fit(train_df[train_features], train_df[['relevance']])
predict = model.predict(test_df[train_features])
test_df['predictions'] = predict

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Evaluate by branch cnt

In [16]:
score_dict = {}
for i in sorted(branch_dict):
#     print(i)
    model_name = 'branch_cnt_'+str(i)
    score_dict[model_name]={}
    tmp = test_df[test_df.name.isin(branch_dict[i])]
    pred_list, true_rel , true_list = get_ranking(tmp)

    # Evaluation
    mrr_list = calculate_mrr(pred_list , true_list)
    map_list = calculate_map(pred_list , true_list)
    dcg_list , ndcg_list = calculate_dcg_ndcg(pred_list , true_list,true_rel)
    precision_list_1 = _precision(pred_list , true_list, k=1)
    recall_list_1 = _recall(pred_list , true_list,k=1)
    precision_list_3 = _precision(pred_list , true_list, k=3)
    recall_list_3 = _recall(pred_list , true_list,k=3)
    precision_list = _precision(pred_list , true_list)
    recall_list = _recall(pred_list , true_list)

    score_dict[model_name]['precision @ 1'] = np.mean(precision_list_1)
    score_dict[model_name]['recall @ 1'] = np.mean(recall_list_1)
    score_dict[model_name]['precision @ 3 '] = np.mean(precision_list_3)
    score_dict[model_name]['recall @ 3'] = np.mean(recall_list_3)
    score_dict[model_name]['precision'] = np.mean(precision_list)
    score_dict[model_name]['recall'] = np.mean(recall_list)
    score_dict[model_name]['mrr'] = np.mean(mrr_list)
    score_dict[model_name]['map'] = np.mean(map_list)
    score_dict[model_name]['dcg'] = np.mean(dcg_list)
    score_dict[model_name]['ndcg'] = np.mean(ndcg_list)
    score_dict[model_name]['size'] = len(tmp)

In [17]:
pd.DataFrame(score_dict).round(3)

Unnamed: 0,branch_cnt_2,branch_cnt_3,branch_cnt_4,branch_cnt_5,branch_cnt_6
precision @ 1,0.123,0.121,0.162,0.125,0.1
recall @ 1,0.123,0.121,0.081,0.062,0.033
precision @ 3,0.122,0.121,0.162,0.125,0.1
recall @ 3,0.123,0.121,0.095,0.062,0.033
precision,0.122,0.121,0.162,0.125,0.1
recall,0.123,0.121,0.095,0.062,0.033
mrr,0.123,0.121,0.027,0.042,0.0
map,0.123,0.121,0.176,0.125,0.1
dcg,0.615,0.485,0.565,0.292,0.2
ndcg,0.123,0.121,0.096,0.068,0.042


# Value Check

In [18]:
pred_list, true_rel , true_list = get_ranking(test_df)

In [19]:
for i in range(len(pred_list)):
    print(f'prediction : {pred_list[i]}')
    print(f'true : {true_list[i]}')
    print(f'relevance score  : {true_rel [i]}')

prediction : []
true : ['37027']
relevance score  : [5.0]
prediction : []
true : ['19003', '19406']
relevance score  : [3.0, 2.0]
prediction : ['37201', '37203']
true : ['46250', '46142', '46123']
relevance score  : [3.0, 2.0, 1.0]
prediction : []
true : ['37067']
relevance score  : [5.0]
prediction : []
true : ['46229']
relevance score  : [5.0]
prediction : []
true : ['63109']
relevance score  : [5.0]
prediction : []
true : ['19342']
relevance score  : [5.0]
prediction : []
true : ['37027', '34655']
relevance score  : [4.0, 3.0]
prediction : []
true : ['63108', '63102']
relevance score  : [4.0, 3.0]
prediction : []
true : ['34689']
relevance score  : [5.0]
prediction : []
true : ['18914']
relevance score  : [4.0]
prediction : ['19107']
true : ['19107']
relevance score  : [5.0]
prediction : []
true : ['08034', '19341', '19468']
relevance score  : [3.0, 2.0, 1.0]
prediction : ['83702']
true : ['83642']
relevance score  : [5.0]
prediction : ['83702']
true : ['89439']
relevance score  : [