# Pointwise
* predict the relevance scores

# Import

In [1]:
from collections import Counter
import numpy as np  
import pandas as pd 
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt 
from sklearn.preprocessing import scale 
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# models
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMRanker

In [2]:
# 導入資料
res_df = pd.read_pickle ('/home/adam/Steph_C/my_thesis/data/ORI_by_postal_code.pkl').reset_index(drop=True)
train_df = pd.read_pickle('/home/adam/Steph_C/my_thesis/data/Train_by_postoal_code_without_review_pointwise.pkl')
test_df = pd.read_pickle('/home/adam/Steph_C/my_thesis/data/Test_by_postoal_code_without_review_pointwise.pkl')

In [3]:
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [4]:
len(Counter(train_df.name))

565

In [5]:
train_df.shape

(3144, 27)

In [6]:
test_df.shape

(2556, 27)

In [7]:
res_df = res_df.drop_duplicates(subset=['name','postal_code']).reset_index(drop=True)

In [8]:
# create a branch cnt dictionary

branch_dict = {}

for i in Counter(res_df.name):
    tmp = res_df[res_df.name ==i]
    cnt = len(tmp)
    if cnt not in branch_dict:
        branch_dict[cnt]=[]
    else:
        branch_dict[cnt].append(i)
    

# Models

In [9]:
def _precision(predictions , actuals, k = None):
    
    """
    Calculate the precision at k
    
    Returns: a list of precisions
    """
    
    precisions =[]

    for i in range(len(predictions)):
        
        prediction = predictions[i]

        if  k != None:
            prediction =  predictions[i][:k]
        
        score = 0
        for j in prediction:
            if j in actuals[i]:
                score+=1
        precisions.append(score/len(prediction))
    return precisions
    

def _recall(predictions , actuals, k = None):
    
    """
    Calculate the precision at k
    
    Returns: a list of recalls
    """
    recalls =[]
    
    for i in range(len(predictions)):
        
        prediction =  predictions[i]
        
        if  k != None:
            prediction =  predictions[i][:k]
        
        score = 0
        for j in range(len(prediction)):
            if prediction[j] in actuals[i]:
                score+=1
        recalls.append(score/len(actuals[i]))
    
    return recalls



In [10]:
def calculate_mrr(predictions, actuals):
    """
    Calculate the mean reciprocal rank (MRR) for a set of predictions and actual values.
    
    Parameters:
    predictions (list of lists): A list of predicted rankings sorted by probability.
    actual (list of lists): A list of actual rankings sorted by relevance.
    
    Returns:    
    float: A list of MRR scores.
    """
    mrr_list = []
    for i in range(len(predictions)):
        reciprocal_rank = 0
        if actuals[i][0] in predictions[i]:
            reciprocal_rank = 1/ (predictions[i].index(actuals[i][0]) + 1)
        mrr_list.append(reciprocal_rank)
    return mrr_list

In [11]:
def calculate_map( predictions , actuals, k=None):
    """
    Calculate the mean average precision (MAP) for a set of queries.

    Parameters:
    actual (list of sets or lists): A list of sets or lists of the actual relevant items for each query.
    predicted (list of lists): A list of lists of predicted items for each query.
    k (int): The maximum number of predicted items to consider for each query.

    Returns:
    float: A list of MAP scores.
    """
    
    map_list = []
    

    for i in range(len(predictions)):
        
        ap_list = []
        hit = 0 
        cnt = 0 
        
        prediction =  predictions[i]
        
        if k != None:
            prediction =  predictions[i][:k]
        
        
        for j in prediction:
            if j in actuals[i]:
                hit+=1
                cnt+=1
                ap_list.append(hit/cnt)
            else:
                cnt+=1
        map_list.append(np.mean(ap_list))
    
    return map_list
                

In [12]:
def calculate_dcg_ndcg( predictions , actuals, rel ,k=None):
    """
    Calculate the DCG@k , NDCG@k for a set of queries.

    Parameters:
    actual (list of sets or lists): A list of sets or lists of the actual relevant items for each query.
    predicted (list of lists): A list of lists of predicted items for each query.
    k (int): The maximum number of predicted items to consider for each query.

    Returns:
    float: A list of DCG , NDCG scores.
    """
    dcg_list = []
    ndcg_list = []
    
    for i in range(len(predictions)):
        dcg =0
        idcg =0
        
        prediction = predictions[i]
        
        if k != None:
            prediction = predictions[i][:k]
        
        for j in range(len(actuals[i])):
            if actuals[i][j] in prediction:
                rank = prediction.index(actuals[i][j]) + 1
                dcg += np.divide(float(rel[i][j]),np.log2(rank+1))
                idcg += np.divide(float(rel[i][j]),np.log2((j+1)+1))
        dcg_list.append(dcg)
        print(rel[i], prediction,actuals[i]  )
        print(i,dcg,idcg )
        ndcg_list.append(np.divide(dcg,idcg))
        
    return dcg_list , ndcg_list

In [13]:
# create list of list for query ranking
def get_ranking(df ):
    
    """
    Turn the probability array into a list of lists for calculation.
    
    Parameters:
    df(DataFrame): the test dataframe
    
    Returns:
    prediction (list of lists): A list of predicted rankings for each query.
    actual (list of lists): A list of actual rankings for each query.
    """
 
    pred_list = []
    pred_rel = []
    true_list = []
    
    
    for res in Counter(df.name):
        
        tmp = df[df.name == res]
        a_sorted = tmp.sort_values(by=['relevance'],ascending=[False])
        p_sorted = tmp.sort_values(by=['predictions'],ascending=[False])
#         p_sorted = p_sorted[p_sorted.predictions>0]

        true_list.append(list(a_sorted[a_sorted.relevance!=0].postal_code))
        pred_list.append(list(p_sorted.postal_code))
        pred_rel.append(list(a_sorted[a_sorted.relevance!=0].relevance))
        
        
        
    return pred_list, pred_rel , true_list

# Model

In [14]:
# models
train_features=['density', 'entropy', 'competitiveness','area_pop', 'accessibility','complementary']

model = LogisticRegression()
model.fit(train_df[train_features], train_df[['relevance']])
predict = model.predict(test_df[train_features])
test_df['predictions'] = predict

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Evaluate by branch cnt

In [15]:
# models = [LR, RF, DTC, KNC, SVC, GNB, LGBM]
# model_name =['LR', 'RF', 'DTC', 'KNC', 'SVC', 'GNB','LGBMRanker']
# score_dict = {}

# for i in range(len(models)):
    
#     score_dict[model_name[i]]={}
#     model = models[i]
#     # Train
#     if model_name[i] != 'LGBMRanker':
#         model.fit(train_df[train_features], train_df[['relevance']])
#     else:
#         model.fit(train_df[train_features], train_df[['relevance']], group=train_groups)
    
#     # Predict
#     predict = model.predict(test_df[train_features])
#     test_df['predictions'] = predict
#     pred_list, pred_rel , true_list = get_ranking(test_df)

#     # Evaluation
#     mrr_list = calculate_mrr(pred_list , true_list)
#     map_list = calculate_map(pred_list , true_list)
#     dcg_list , ndcg_list = calculate_dcg_ndcg(pred_list , true_list,pred_rel)
#     precision_list_1 = _precision(pred_list , true_list, k=1)
#     recall_list_1 = _recall(pred_list , true_list,k=1)
#     precision_list_3 = _precision(pred_list , true_list, k=3)
#     recall_list_3 = _recall(pred_list , true_list,k=3)
#     precision_list = _precision(pred_list , true_list)
#     recall_list = _recall(pred_list , true_list)

#     score_dict[model_name[i]]['precision @ 1'] = np.mean(precision_list_1)
#     score_dict[model_name[i]]['recall @ 1'] = np.mean(recall_list_1)
#     score_dict[model_name[i]]['precision @ 3 '] = np.mean(precision_list_3)
#     score_dict[model_name[i]]['recall @ 3'] = np.mean(recall_list_3)
#     score_dict[model_name[i]]['precision'] = np.mean(precision_list)
#     score_dict[model_name[i]]['recall'] = np.mean(recall_list)
#     score_dict[model_name[i]]['mrr'] = np.mean(mrr_list)
#     score_dict[model_name[i]]['map'] = np.mean(map_list)
#     score_dict[model_name[i]]['dcg'] = np.mean(dcg_list)
#     score_dict[model_name[i]]['ndcg'] = np.mean(ndcg_list)

NameError: name 'LR' is not defined

In [16]:
score_dict = {}
for i in branch_dict:
    print(i)
    model_name = 'branch_cnt'+str(i)
    score_dict[model_name]={}
    tmp = test_df[test_df.name.isin(branch_dict[i])]
    pred_list, pred_rel , true_list = get_ranking(tmp)

    # Evaluation
    mrr_list = calculate_mrr(pred_list , true_list)
    map_list = calculate_map(pred_list , true_list)
    dcg_list , ndcg_list = calculate_dcg_ndcg(pred_list , true_list,pred_rel)
    precision_list_1 = _precision(pred_list , true_list, k=1)
    recall_list_1 = _recall(pred_list , true_list,k=1)
    precision_list_3 = _precision(pred_list , true_list, k=3)
    recall_list_3 = _recall(pred_list , true_list,k=3)
    precision_list = _precision(pred_list , true_list)
    recall_list = _recall(pred_list , true_list)

    score_dict[model_name]['precision @ 1'] = np.mean(precision_list_1)
    score_dict[model_name]['recall @ 1'] = np.mean(recall_list_1)
    score_dict[model_name]['precision @ 3 '] = np.mean(precision_list_3)
    score_dict[model_name]['recall @ 3'] = np.mean(recall_list_3)
    score_dict[model_name]['precision'] = np.mean(precision_list)
    score_dict[model_name]['recall'] = np.mean(recall_list)
    score_dict[model_name]['mrr'] = np.mean(mrr_list)
    score_dict[model_name]['map'] = np.mean(map_list)
    score_dict[model_name]['dcg'] = np.mean(dcg_list)
    score_dict[model_name]['ndcg'] = np.mean(ndcg_list)

2
[5.0] ['34684', '34221', '34604', '34606'] ['34684']
0 5.0 5.0
[6.0] ['33558', '33503', '33510', '33511'] ['33558']
1 6.0 6.0
[5.0] ['33701', '33503', '33510', '33511'] ['33701']
2 5.0 5.0
[5.0] ['19102', '18913', '18914', '18915'] ['19102']
3 5.0 5.0
[6.0] ['46202', '46038', '46040', '46055'] ['46202']
4 6.0 6.0
[5.0] ['85706', '85641', '85653', '85658'] ['85706']
5 5.0 5.0
[6.0] ['37064', '37024', '37027', '37043'] ['37064']
6 6.0 6.0
[5.0] ['93101', '93103', '93013', '93067'] ['93103']
7 3.1546487678572874 5.0
[6.0] ['19130', '18913', '18914', '18915'] ['19130']
8 6.0 6.0
[6.0] ['46268', '46032', '46033', '46037'] ['46268']
9 6.0 6.0
[5.0] ['19125', '18913', '18914', '18915'] ['19125']
10 5.0 5.0
[6.0] ['19103', '18913', '18914', '18915'] ['19103']
11 6.0 6.0
[6.0] ['46205', '46038', '46040', '46055'] ['46205']
12 6.0 6.0
[5.0] ['19106', '18913', '18914', '18915'] ['19106']
13 5.0 5.0
[5.0] ['85737', '85641', '85653', '85658'] ['85737']
14 5.0 5.0
[6.0] ['46225', '46038', '46040',

[4.0] ['37211', '37064', '37066', '37067'] ['37211']
0 4.0 4.0
[5.0] ['46250', '46060', '46075', '46077'] ['46250']
1 5.0 5.0
[4.0] ['19104', '18917', '18928', '18929'] ['19104']
2 4.0 4.0
[4.0] ['33618', '33544', '33545', '33547'] ['33618']
3 4.0 4.0
[6.0] ['33618', '33543', '33545', '33547'] ['33618']
4 6.0 6.0
[5.0] ['33759', '33503', '33510', '33511'] ['33759']
5 5.0 5.0
[6.0] ['33611', '33543', '33544', '33545'] ['33611']
6 6.0 6.0
[6.0] ['34684', '34610', '34619', '34638'] ['34684']
7 6.0 6.0
[6.0] ['37067', '37012', '37013', '37015'] ['37067']
8 6.0 6.0
[6.0] ['85704', '85702', '85705', '85706'] ['85704']
9 6.0 6.0
[5.0] ['37205', '37012', '37013', '37015'] ['37205']
10 5.0 5.0
[5.0] ['70130', '70001', '70002', '70003'] ['70130']
11 5.0 5.0
[6.0] ['33612', '33543', '33544', '33545'] ['33612']
12 6.0 6.0
[6.0] ['37122', '37066', '37067', '37069'] ['37122']
13 6.0 6.0
[6.0] ['19103', '18917', '18928', '18929'] ['19103']
14 6.0 6.0
[6.0] ['93101', '93111', '93190', '93642'] ['93101

In [17]:
pd.DataFrame(score_dict).round(3)

Unnamed: 0,branch_cnt2,branch_cnt6,branch_cnt3,branch_cnt5,branch_cnt4
precision @ 1,0.962,0.8,0.909,0.917,0.784
recall @ 1,0.962,0.4,0.909,0.458,0.392
precision @ 3,0.333,0.667,0.333,0.653,0.658
recall @ 3,1.0,1.0,1.0,0.979,0.986
precision,0.25,0.25,0.25,0.25,0.25
recall,1.0,1.0,1.0,1.0,1.0
mrr,0.98,0.783,0.955,0.743,0.644
map,0.98,0.917,0.955,0.958,0.903
dcg,5.421,5.419,4.818,6.381,6.683
ndcg,0.985,0.874,0.966,0.923,0.888
