# Learn to Rank

## Import 

In [1]:
import pandas as pd
import json
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import pickle
import math 

In [2]:
df = pd.read_pickle('../data/df.pkl')

## DF Construction
* add relevance

In [3]:
df.columns

Index(['index', 'business_id', 'stars_x', 'useful', 'funny', 'cool', 'text',
       'date', 'name', 'address', 'city', 'state', 'postal_code', 'stars_y',
       'review_count', 'is_open', 'attributes', 'categories', 'hours',
       'affinity', 'complementary', 'density', 'entropy', 'competitiveness'],
      dtype='object')

In [4]:
df = df.drop_duplicates(subset=['business_id','postal_code']).reset_index(drop=True)
df = df.drop(columns=['index', 'business_id', 'stars_x', 'useful', 'funny', 'cool', 'text',
       'date','address','is_open', 'stars_y','attributes','hours'])

In [5]:
# sort by name and review count 
df = df.sort_values(['name', 'review_count'],
              ascending = [True, False]).reset_index(drop=True)

In [6]:
# create relevance score
cnt = 0
for i in Counter(df.name):
    tmp = df[df.name==i].reset_index(drop=True)
    tmp['relevance']=''
    score = 1/len(tmp)
    for j in range(len(tmp)):
        tmp['relevance'][j]=1-score*j
    if cnt ==0 :
        new_df = tmp
        cnt+=1
    new_df = pd.concat([new_df,tmp])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
new_df

Unnamed: 0,name,city,state,postal_code,review_count,categories,affinity,complementary,density,entropy,competitiveness,relevance
0,Acropolis Greek Taverna,Tampa,FL,33605,511,"Greek, Restaurants, Bars, Nightlife, Mediterra...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.9988400936126709, 0.0, 0.0, 0.0, 0.0, 0.0, ...",36,-0.0,-1.0,1.0
1,Acropolis Greek Taverna,St. Petersburg,FL,33701,403,"Restaurants, Greek, Mediterranean, Nightlife, ...","[0.9600942057702895, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.03874588784238142, 0.0, 0.0, 0.0, 0.0, 0.0,...",9,-0.0,-1.0,0.8
2,Acropolis Greek Taverna,Tampa,FL,33613,332,"Restaurants, Mediterranean, Greek, Beer, Wine ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.9988400936126709, 0.0, 0.0, 0.0, 0.0, 0.0, ...",36,-0.0,-1.0,0.6
3,Acropolis Greek Taverna,Riverview,FL,33578,327,"Beer, Wine & Spirits, Food, Bars, Nightlife, R...","[0.5758802507272236, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.4229598428854473, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9,-0.0,-1.0,0.4
4,Acropolis Greek Taverna,Tampa,FL,33609,231,"Greek, Restaurants, Nightlife, Mediterranean, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.9988400936126709, 0.0, 0.0, 0.0, 0.0, 0.0, ...",36,-0.0,-1.0,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...
2,WingHouse Bar & Grill,Clearwater,FL,33759,146,"Restaurants, Chicken Wings, Burgers, Sports Ba...","[0.809322657726625, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","[0.0009984348789657815, 0.18866568803787231, 0...",22,-0.0,-1.0,0.5
3,WingHouse Bar & Grill,Brandon,FL,33511,138,"Chicken Wings, Restaurants, Sports Bars, Night...","[0.022958152011844035, 0.0, 0.0, 0.0, 0.0, 0.0...","[0.7873629405937468, 0.18866568803787231, 0.0,...",23,-0.0,-1.0,0.25
0,Wingstop,Wesley Chapel,FL,33544,56,"Restaurants, Chicken Wings","[0.9929083489698982, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.001119444437541972, 0.0, 0.0, 0.0, 0.0, 0.0...",15,-0.0,-1.0,1.0
1,Wingstop,Brandon,FL,33511,54,"Fast Food, Restaurants, Chicken Wings","[0.028162960822931016, 0.0, 0.0, 0.0, 0.0, 0.0...","[0.9658648325845092, 0.0, 0.0, 0.0, 0.0, 0.0, ...",23,-0.0,-1.0,0.666667


In [8]:
# breakdown affinity & complementary
from operator import itemgetter
indices = range(len(df['affinity'][0]))
a_df = df['affinity'].transform({f'affinity_{i+1}': itemgetter(i) for i in indices})

indices = range(len(df['complementary'][0]))
c_df = df['complementary'].transform({f'complementary_{i+1}': itemgetter(i) for i in indices})

In [9]:
new_df = new_df.join(a_df)
new_df = new_df.join(c_df)

## Model

### Create Dataset

In [12]:
new_df.columns

Index(['name', 'city', 'state', 'postal_code', 'review_count', 'categories',
       'affinity', 'complementary', 'density', 'entropy', 'competitiveness',
       'relevance', 'affinity_1', 'affinity_2', 'affinity_3', 'affinity_4',
       'affinity_5', 'affinity_6', 'affinity_7', 'affinity_8', 'affinity_9',
       'affinity_10', 'complementary_1', 'complementary_2', 'complementary_3',
       'complementary_4', 'complementary_5', 'complementary_6',
       'complementary_7', 'complementary_8', 'complementary_9',
       'complementary_10'],
      dtype='object')

In [19]:
features = ['name', 'city', 'state', 'postal_code', 'review_count', 'categories',
       'density', 'entropy', 'competitiveness',
       'affinity_1', 'affinity_2', 'affinity_3', 'affinity_4',
       'affinity_5', 'affinity_6', 'affinity_7', 'affinity_8', 'affinity_9',
       'affinity_10', 'complementary_1', 'complementary_2', 'complementary_3',
       'complementary_4', 'complementary_5', 'complementary_6',
       'complementary_7', 'complementary_8', 'complementary_9',
       'complementary_10']
target = 'relevance'

for i in ['density', 'entropy', 'competitiveness',
       'relevance', 'affinity_1', 'affinity_2', 'affinity_3', 'affinity_4',
       'affinity_5', 'affinity_6', 'affinity_7', 'affinity_8', 'affinity_9',
       'affinity_10', 'complementary_1', 'complementary_2', 'complementary_3',
       'complementary_4', 'complementary_5', 'complementary_6',
       'complementary_7', 'complementary_8', 'complementary_9',
       'complementary_10']:
       new_df[i] = new_df[i].astype('int32')
# train test split 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( new_df[features], new_df[target], test_size=0.33, random_state=42)

In [20]:
get_group_size = lambda new_df: new_df.reset_index().groupby("name")['name'].count()

train_groups = get_group_size(X_train)
test_groups = get_group_size(X_test)

print(sum(train_groups) , sum(test_groups))

146 72


### training

In [21]:
from lightgbm import LGBMRanker
train_features = ['density', 'entropy', 'competitiveness',
       'affinity_1', 'affinity_2', 'affinity_3', 'affinity_4',
       'affinity_5', 'affinity_6', 'affinity_7', 'affinity_8', 'affinity_9',
       'affinity_10', 'complementary_1', 'complementary_2', 'complementary_3',
       'complementary_4', 'complementary_5', 'complementary_6',
       'complementary_7', 'complementary_8', 'complementary_9',
       'complementary_10']

model = LGBMRanker(objective="lambdarank")
model.fit(X_train[train_features],y_train,group=train_groups,eval_set=[(X_test[train_features],y_test)],eval_group=[test_groups],eval_metric=['ndcg'])

[1]	valid_0's ndcg@1: 0.818182	valid_0's ndcg@2: 0.921174	valid_0's ndcg@3: 0.921174	valid_0's ndcg@4: 0.929176	valid_0's ndcg@5: 0.929176
[2]	valid_0's ndcg@1: 0.818182	valid_0's ndcg@2: 0.921174	valid_0's ndcg@3: 0.930464	valid_0's ndcg@4: 0.930464	valid_0's ndcg@5: 0.930464
[3]	valid_0's ndcg@1: 0.818182	valid_0's ndcg@2: 0.921174	valid_0's ndcg@3: 0.930464	valid_0's ndcg@4: 0.930464	valid_0's ndcg@5: 0.930464
[4]	valid_0's ndcg@1: 0.818182	valid_0's ndcg@2: 0.921174	valid_0's ndcg@3: 0.930464	valid_0's ndcg@4: 0.930464	valid_0's ndcg@5: 0.930464
[5]	valid_0's ndcg@1: 0.818182	valid_0's ndcg@2: 0.921174	valid_0's ndcg@3: 0.930464	valid_0's ndcg@4: 0.930464	valid_0's ndcg@5: 0.930464
[6]	valid_0's ndcg@1: 0.818182	valid_0's ndcg@2: 0.921174	valid_0's ndcg@3: 0.930464	valid_0's ndcg@4: 0.930464	valid_0's ndcg@5: 0.930464
[7]	valid_0's ndcg@1: 0.818182	valid_0's ndcg@2: 0.921174	valid_0's ndcg@3: 0.930464	valid_0's ndcg@4: 0.930464	valid_0's ndcg@5: 0.930464
[8]	valid_0's ndcg@1: 0.818

LGBMRanker(objective='lambdarank')