# Learn to Rank

## Import 

In [1]:
import pandas as pd
import json
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import pickle
import math 

In [2]:
df = pd.read_pickle('../data/df.pkl')

## DF Construction
* add relevance

In [3]:
df.columns

Index(['index', 'business_id', 'stars_x', 'useful', 'funny', 'cool', 'text',
       'date', 'name', 'address', 'city', 'state', 'postal_code', 'stars_y',
       'review_count', 'is_open', 'attributes', 'categories', 'hours',
       'affinity', 'complementary', 'density', 'entropy', 'competitiveness'],
      dtype='object')

In [4]:
df = df.drop_duplicates(subset=['business_id','postal_code']).reset_index(drop=True)
df = df.drop(columns=['index', 'business_id', 'stars_x', 'useful', 'funny', 'cool', 'text',
       'date','address','is_open', 'stars_y','attributes','hours'])

In [5]:
# sort by name and review count 
df = df.sort_values(['name', 'review_count'],
              ascending = [True, False]).reset_index(drop=True)

In [6]:
# create relevance score
cnt = 0
for i in Counter(df.name):
    tmp = df[df.name==i].reset_index(drop=True)
    tmp['relevance']=''
    score = 1/len(tmp)
    for j in range(len(tmp)):
        tmp['relevance'][j]=1-score*j
    if cnt ==0 :
        new_df = tmp
        cnt+=1
    new_df = pd.concat([new_df,tmp])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
for i in Counter(new_df.name):
    tmp = new_df[new_df.name==i]
    print(f'Restaurant : {i} , cities : {Counter(tmp.postal_code)}')

Restaurant : Acropolis Greek Taverna , cities : Counter({'33605': 2, '33701': 2, '33613': 2, '33578': 2, '33609': 2})
Restaurant : Another Broken Egg Cafe , cities : Counter({'33767': 1, '33611': 1, '33761': 1})
Restaurant : Anthony's Coal Fired Pizza , cities : Counter({'33618': 1, '33511': 1, '33629': 1, '33761': 1})
Restaurant : Ashley HomeStore , cities : Counter({'33763': 1, '33543': 1, '33510': 1, '33781': 1})
Restaurant : BJ's Restaurant & Brewhouse , cities : Counter({'33781': 1, '33559': 1, '33625': 1, '33761': 1})
Restaurant : Bahama Breeze , cities : Counter({'33607': 1, '33511': 1, '33559': 1})
Restaurant : Bonefish Grill , cities : Counter({'33544': 1, '33618': 1, '33511': 1, '33761': 1, '33609': 1, '33703': 1, '33770': 1, '34655': 1, '33710': 1})
Restaurant : Buffalo Wild Wings , cities : Counter({'33544': 1, '34655': 1, '33605': 1, '33511': 1, '33579': 1, '34677': 1, '33611': 1})
Restaurant : Burger Monger , cities : Counter({'33618': 1, '33543': 1, '33701': 1, '33759': 

In [8]:
# breakdown affinity & complementary
from operator import itemgetter
indices = range(len(df['affinity'][0]))
a_df = df['affinity'].transform({f'affinity_{i+1}': itemgetter(i) for i in indices})

indices = range(len(df['complementary'][0]))
c_df = df['complementary'].transform({f'complementary_{i+1}': itemgetter(i) for i in indices})

In [9]:
new_df = new_df.join(a_df)
new_df = new_df.join(c_df)

## Model

### Create Dataset

In [10]:
new_df.columns

Index(['name', 'city', 'state', 'postal_code', 'review_count', 'categories',
       'affinity', 'complementary', 'density', 'entropy', 'competitiveness',
       'relevance', 'affinity_1', 'affinity_2', 'affinity_3', 'affinity_4',
       'affinity_5', 'affinity_6', 'affinity_7', 'affinity_8', 'affinity_9',
       'affinity_10', 'complementary_1', 'complementary_2', 'complementary_3',
       'complementary_4', 'complementary_5', 'complementary_6',
       'complementary_7', 'complementary_8', 'complementary_9',
       'complementary_10'],
      dtype='object')

In [11]:
features = ['name', 'city', 'state', 'postal_code', 'review_count', 'categories',
       'density', 'entropy', 'competitiveness',
       'affinity_1', 'affinity_2', 'affinity_3', 'affinity_4',
       'affinity_5', 'affinity_6', 'affinity_7', 'affinity_8', 'affinity_9',
       'affinity_10', 'complementary_1', 'complementary_2', 'complementary_3',
       'complementary_4', 'complementary_5', 'complementary_6',
       'complementary_7', 'complementary_8', 'complementary_9',
       'complementary_10']
target = 'relevance'

for i in ['density', 'entropy', 'competitiveness',
       'relevance', 'affinity_1', 'affinity_2', 'affinity_3', 'affinity_4',
       'affinity_5', 'affinity_6', 'affinity_7', 'affinity_8', 'affinity_9',
       'affinity_10', 'complementary_1', 'complementary_2', 'complementary_3',
       'complementary_4', 'complementary_5', 'complementary_6',
       'complementary_7', 'complementary_8', 'complementary_9',
       'complementary_10']:
       new_df[i] = new_df[i].astype('int32')
# train test split 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( new_df[features], new_df[target], test_size=0.33, random_state=42)

In [12]:
get_group_size = lambda new_df: new_df.reset_index().groupby("name")['name'].count()

train_groups = get_group_size(X_train)
test_groups = get_group_size(X_test)

print(sum(train_groups) , sum(test_groups))

146 72


### training

In [13]:
from lightgbm import LGBMRanker
train_features = ['density', 'entropy', 'competitiveness']

model = LGBMRanker(objective="lambdarank")
model.fit(X_train[train_features],y_train,group=train_groups,eval_set=[(X_test[train_features],y_test)],eval_group=[test_groups],eval_metric=['ndcg'])

[1]	valid_0's ndcg@1: 0.818182	valid_0's ndcg@2: 0.921174	valid_0's ndcg@3: 0.921174	valid_0's ndcg@4: 0.929176	valid_0's ndcg@5: 0.929176
[2]	valid_0's ndcg@1: 0.818182	valid_0's ndcg@2: 0.921174	valid_0's ndcg@3: 0.930464	valid_0's ndcg@4: 0.930464	valid_0's ndcg@5: 0.930464
[3]	valid_0's ndcg@1: 0.818182	valid_0's ndcg@2: 0.921174	valid_0's ndcg@3: 0.930464	valid_0's ndcg@4: 0.930464	valid_0's ndcg@5: 0.930464
[4]	valid_0's ndcg@1: 0.818182	valid_0's ndcg@2: 0.921174	valid_0's ndcg@3: 0.930464	valid_0's ndcg@4: 0.930464	valid_0's ndcg@5: 0.930464
[5]	valid_0's ndcg@1: 0.818182	valid_0's ndcg@2: 0.921174	valid_0's ndcg@3: 0.930464	valid_0's ndcg@4: 0.930464	valid_0's ndcg@5: 0.930464
[6]	valid_0's ndcg@1: 0.818182	valid_0's ndcg@2: 0.921174	valid_0's ndcg@3: 0.930464	valid_0's ndcg@4: 0.930464	valid_0's ndcg@5: 0.930464
[7]	valid_0's ndcg@1: 0.818182	valid_0's ndcg@2: 0.921174	valid_0's ndcg@3: 0.930464	valid_0's ndcg@4: 0.930464	valid_0's ndcg@5: 0.930464
[8]	valid_0's ndcg@1: 0.818

LGBMRanker(objective='lambdarank')