# Pointwise
* predict the relevance scores

# Import

In [1]:
from collections import Counter
from Evaluations import evaluation_metrics
import pandas as pd
import numpy as np

# models
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMRanker

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-qz9db95s because the default path (/home/emma/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
RANDOM_STATE = 24

In [3]:
# 導入資料
train_df = pd.read_pickle('../data/Train_by_postoal_code_without_review_pointwise_v3_4.pkl').reset_index(drop=True)
test_df = pd.read_pickle('../data/Test_by_postoal_code_without_review_pointwise_v3_4.pkl').reset_index(drop=True)

In [4]:
# expr. 1 change relevance score to binary
def change_rel_score(df):
    df['binary_score'] = ''
    for idx, row in df.iterrows():
        if row.relevance > 0.0:
            df['binary_score'][idx] = 1
        else:
            df['binary_score'][idx] = 0
    return df
train_df = change_rel_score(train_df)
test_df = change_rel_score(test_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [5]:
train_df['binary_score'] = train_df['binary_score'].astype('float')
test_df['binary_score'] = test_df['binary_score'].astype('float')

In [None]:
label_list = ['binary_score' ,'affinity', 'complementary','cosine_sim']
for i in label_list : 
    train_df[i] = train_df[i].astype('float')
    test_df[i] = test_df[i].astype('float')

In [6]:
train_df = train_df.sample(frac=1 , random_state = RANDOM_STATE)
test_df = test_df.sample(frac=1 , random_state = RANDOM_STATE)

In [7]:
train_df.shape

(21880, 11)

In [8]:
test_df.shape

(13793, 11)

# Models

In [9]:
# create list of list for query ranking
def get_ranking(df , y_label = 'binary_score'):
    
    """
    Turn the probability array into a list of lists for calculation.
    
    Parameters:
    df(DataFrame): the test dataframe
    
    Returns:
    prediction (list of lists): A list of predicted rankings for each query.
    actual (list of lists): A list of actual rankings for each query.
    """
 
    pred_list = []
    true_rel = []
    true_list = []

    output_dict = {}
    
    
    for id in Counter(df.business_id):

        output_dict[id] = {}
        
        tmp = df[df.business_id == id]
        a_sorted = tmp.sort_values(by=[y_label],ascending=[False])
        p_sorted = tmp.sort_values(by=['predictions'],ascending=[False])
        # p_sorted = p_sorted[p_sorted.predictions>0]

        true_list.append(list(a_sorted[a_sorted.relevance!=0].postal_code))
        pred_list.append(list(p_sorted.postal_code))
        true_rel.append(list(a_sorted[a_sorted.relevance!=0].relevance))
        
        output_dict[id]['predict'] = list(p_sorted.postal_code)
        output_dict[id]['true'] = list(a_sorted[a_sorted.relevance!=0].postal_code)
        
        
    return pred_list, true_rel , true_list , output_dict

# Model

In [10]:
# models
LR = LogisticRegression(random_state=RANDOM_STATE)
RF = RandomForestClassifier(random_state=RANDOM_STATE)
DTC = DecisionTreeClassifier(random_state=RANDOM_STATE)
KNC = KNeighborsClassifier()
SVC = svm.SVC(random_state=RANDOM_STATE)
GNB = GaussianNB()
LGBM = LGBMRanker(objective="lambdarank",random_state=RANDOM_STATE)

train_features=['density', 'entropy', 'competitiveness','area_pop']
# train_features=['density', 'entropy', 'competitiveness','area_pop','complementary','affinity']
# train_features=['density', 'entropy', 'competitiveness','area_pop', 'accessibility','cosine_sim']
# train_features=['density', 'entropy', 'competitiveness','area_pop','affinity', 'complementary','cosine_sim']


get_group_size = lambda df: df.reset_index().groupby("business_id")['business_id'].count()

train_groups = get_group_size(train_df).to_numpy()
test_groups = get_group_size(test_df).to_numpy()

print(sum(train_groups) , sum(test_groups))

21880 13793


In [11]:
models = [LR, RF, DTC, KNC, SVC, GNB, LGBM]
model_name =['LR', 'RF', 'DTC', 'KNC', 'SVC', 'GNB' , 'LGBMRanker'] 
score_dict = {}

for i in range(len(models)):
    score_dict[model_name[i]]={}
    model = models[i]
    # Train
    if model_name[i] != 'LGBMRanker':
        model.fit(train_df[train_features], train_df[['binary_score']])
    else:
        model.fit(train_df[train_features], train_df[['relevance']], group=train_groups)
    
    # Predict
    predict = model.predict(test_df[train_features])
    test_df['predictions'] = predict
    if model_name[i] != 'LGBMRanker':
        pred_list, true_rel , true_list , output_dict = get_ranking(test_df)
    else:
        pred_list, true_rel , true_list , output_dict = get_ranking(test_df,'relevance')
        

    # Evaluation
    mrr_list = evaluation_metrics._mrr(pred_list , true_list)
    map_list = evaluation_metrics._map(pred_list , true_list)
    dcg_list , ndcg_list = evaluation_metrics._dcg_ndcg(pred_list , true_list,true_rel)
    precision_list_1 = evaluation_metrics._precision(pred_list , true_list, k=1)
    recall_list_1 = evaluation_metrics._recall(pred_list , true_list,k=1)
    precision_list_3 = evaluation_metrics._precision(pred_list , true_list, k=3)
    recall_list_3 = evaluation_metrics._recall(pred_list , true_list,k=3)
    precision_list =evaluation_metrics. _precision(pred_list , true_list)
    recall_list = evaluation_metrics._recall(pred_list , true_list)

    score_dict[model_name[i]]['precision @ 1'] = np.mean(precision_list_1)
#     score_dict[model_name[i]]['recall @ 1'] = np.mean(recall_list_1)
    score_dict[model_name[i]]['precision @ 3 '] = np.mean(precision_list_3)
    score_dict[model_name[i]]['recall @ 3'] = np.mean(recall_list_3)
    score_dict[model_name[i]]['precision'] = np.mean(precision_list)
    score_dict[model_name[i]]['recall'] = np.mean(recall_list)
    score_dict[model_name[i]]['mrr'] = np.mean(mrr_list)
    score_dict[model_name[i]]['map'] = np.mean(map_list)
    score_dict[model_name[i]]['dcg'] = np.mean(dcg_list)
    score_dict[model_name[i]]['ndcg'] = np.mean(ndcg_list)

  return f(*args, **kwargs)
  # Remove the CWD from sys.path while we load stuff.
  return self._fit(X, y)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [12]:
pd.DataFrame(score_dict).round(3)

Unnamed: 0,LR,RF,DTC,KNC,SVC,GNB,LGBMRanker
precision @ 1,0.077,0.539,0.528,0.49,0.125,0.195,0.639
precision @ 3,0.059,0.195,0.191,0.184,0.074,0.125,0.266
recall @ 3,0.176,0.584,0.572,0.551,0.221,0.375,0.799
precision,0.053,0.053,0.053,0.053,0.053,0.053,0.053
recall,1.0,1.0,1.0,1.0,1.0,1.0,1.0
mrr,0.207,0.604,0.593,0.564,0.249,0.345,0.745
map,0.207,0.604,0.593,0.564,0.249,0.345,0.745
dcg,3.643,6.778,6.696,6.447,3.995,4.772,7.907
ndcg,0.374,0.688,0.679,0.656,0.408,0.486,0.805


In [None]:
# pd.DataFrame(output_dict).T.to_csv('./output.csv')

In [13]:
pd.DataFrame(output_dict).T

Unnamed: 0,predict,true
KvCmKEV_pL0qMmw3Q8Yagg,"[37214, 37122, 37027, 37075, 37067, 37076, 371...",[37214]
3iUCCf1FWmjlFbGYvBgf9w,"[37201, 37027, 37075, 37067, 37076, 37064, 371...",[37201]
0u7ARPwjPyoU_y2UZY9Pkw,"[33609, 33511, 33543, 33559, 33558, 33563, 335...",[33609]
9mA9qYqiv4c0T9ASid3PIQ,"[19107, 18901, 18938, 18940, 18914, 18944, 189...",[19107]
Imj_D35_8VZ7fuVUL9Sd8w,"[33511, 33543, 33569, 33559, 33558, 33563, 335...",[33511]
...,...,...
yI-ahfduU45_bZa3ZgEx4w,"[46219, 46032, 46123, 46038, 46168, 46037, 461...",[46219]
B2K0HFrYQx14TTJc1o3OpA,"[19380, 18901, 18938, 18940, 18914, 18944, 189...",[19380]
adu5voMt1rln1nilzZh9uA,"[85748, 85704, 85712, 85716, 85710, 85719, 857...",[85748]
nygJ4mY0glpdWGv_Hmp6Dg,"[37201, 37027, 37075, 37067, 37076, 37064, 371...",[37201]
