# IMPORT

In [1]:
import pandas as pd
import numpy as np
from numba import jit

import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("data/training_set_VU_DM_2014.csv")

In [3]:
print(data.columns.values)

['srch_id' 'date_time' 'site_id' 'visitor_location_country_id'
 'visitor_hist_starrating' 'visitor_hist_adr_usd' 'prop_country_id'
 'prop_id' 'prop_starrating' 'prop_review_score' 'prop_brand_bool'
 'prop_location_score1' 'prop_location_score2' 'prop_log_historical_price'
 'position' 'price_usd' 'promotion_flag' 'srch_destination_id'
 'srch_length_of_stay' 'srch_booking_window' 'srch_adults_count'
 'srch_children_count' 'srch_room_count' 'srch_saturday_night_bool'
 'srch_query_affinity_score' 'orig_destination_distance' 'random_bool'
 'comp1_rate' 'comp1_inv' 'comp1_rate_percent_diff' 'comp2_rate'
 'comp2_inv' 'comp2_rate_percent_diff' 'comp3_rate' 'comp3_inv'
 'comp3_rate_percent_diff' 'comp4_rate' 'comp4_inv'
 'comp4_rate_percent_diff' 'comp5_rate' 'comp5_inv'
 'comp5_rate_percent_diff' 'comp6_rate' 'comp6_inv'
 'comp6_rate_percent_diff' 'comp7_rate' 'comp7_inv'
 'comp7_rate_percent_diff' 'comp8_rate' 'comp8_inv'
 'comp8_rate_percent_diff' 'click_bool' 'gross_bookings_usd'
 'booking_

# SCORE FUNCTION

In [15]:
@jit
def nDCG(relevances: np.ndarray, positions: np.ndarray) -> float:
    # Compute Normalized Discounted Cumulative Gain based on:
    # - relevances: Numpy Array containing DCG Relevances (5 if booked, 1 if clicked)
    # - positions: Numpy Array containing Positions (The display order)
    
    positions_normalized = np.argsort(positions)
    relevances_ordered = relevances[positions_normalized]
    relevances_sorted = np.sort(relevances)[::-1]
    
    gain = 2 ** relevances_ordered - 1
    ideal_gain = 2 ** relevances_sorted - 1
    
    discount = np.log2(np.arange(len(positions)) + 2)
    
    DCG = np.sum(gain / discount)
    IDCG = np.sum(ideal_gain / discount)
    
    if IDCG == 0: return 0.0
    else: return DCG / IDCG

@jit
def nDCG_mean(sid_groups: dict, relevance: np.ndarray, position: np.ndarray) -> float:
    # Calculate Mean Normalized Discounted Cumulative Gain
    
    nDCG_sum = 0.0
    for sid, indices in sid_groups.items():
        nDCG_sum += nDCG(relevance[indices], position[indices])
    return nDCG_sum / len(sid_groups)

@jit
def group_by_search_id(sids) -> dict:
    # Group Indices by Search ID, makes calculating mean nDCG faster!
    
    sid_groups = {}
    
    for i, sid in enumerate(sids):
        if sid not in sid_groups:
            sid_groups[sid] = []
        sid_groups[sid].append(i)
    
    return sid_groups

In [16]:
# Test nDCG - Should be 1.0
relevances = np.array([0, 0, 5, 1])
positions = np.array([3, 2, 0, 1])
nDCG(relevances, positions)

1.0

# CLEANING

In [17]:
# Select Part of Data
selection = data[data['random_bool'] == 0]

In [18]:
# Get Search ID groups
sid_selection = np.array(selection['srch_id'])
sid_groups = group_by_search_id(sid_selection)
sid_groups = {sid: indices for sid, indices in sid_groups.items() if np.random.uniform() < 0.2}

In [19]:
# Get Position in Expedia List
position = np.array(selection['position'])

In [20]:
# Compute Relevances based on Clicks and Books
relevance = np.zeros(len(selection))
relevance[selection['click_bool'] == 1] = 1
relevance[selection['booking_bool'] == 1] = 5

In [21]:
# Get Mean nDCG Score for positions in data (0.5-ish, quite high!)
nDCG_mean(sid_groups, relevance, position)

0.5162606500699968

In [22]:
# Features
price = data['price_usd']
price_stars = data['price_usd'] / (data['prop_starrating'] + 1)

review_score = np.array(data['prop_review_score'], np.float32)
review_score[np.isnan(review_score)] = np.percentile(review_score[~np.isnan(review_score)], 25)

location_score_2 = np.array(data['prop_location_score2'], np.float32)
location_score_2[np.isnan(location_score_2)] = np.percentile(location_score_2[~np.isnan(location_score_2)], 25)

features = np.array([
    price_stars,
    review_score,
    location_score_2,
], np.float32).T

print(features.shape)

(4958347, 3)


# TRAIN & TEST SPLIT

In [23]:
split = 0.8

# Create random probabilities for all Search ID's in Data Set
sids = np.array([sid for sid in sid_groups.keys()])
prob = np.random.uniform(0, 1, len(sids))

# Get indices for rows belonging to Seach ID's based on random probability
indices_train = np.concatenate([sid_groups[sid] for sid in sids[prob <= split]])
indices_test = np.concatenate([sid_groups[sid] for sid in sids[prob > split]])

# Get Test and Train Search ID's
sids_train = sid_selection[indices_train]
sids_test = sid_selection[indices_test]

# Get Test and Train Search ID groups: Dict[SearchID, Indices]
sid_train_groups = group_by_search_id(sids_train)
sid_test_groups = group_by_search_id(sids_test)

# Get Test and Train Positions
position_train = position[indices_train]
position_test = position[indices_test]

# Get Test and Train Relevances
relevance_train = relevance[indices_train]
relevance_test = relevance[indices_test]

# Get Test and Train Features
features_train = features[indices_train]
features_test = features[indices_test]

# Print nDCG Performance of the Data itself
print("Train nDCG:", len(sid_train_groups), len(indices_train), nDCG_mean(sid_train_groups, relevance_train, position_train))
print("Test nDCG:", len(sid_test_groups), len(indices_test), nDCG_mean(sid_test_groups, relevance_test, position_test))

Train nDCG: 22581 566591 0.5166371209176909
Test nDCG: 5297 133515 0.5146557627353202


# Machine Learning

In [24]:
from sklearn import ensemble, tree, metrics

# Train Classifier to predict Relevance
classifier = ensemble.GradientBoostingRegressor()
classifier.fit(features_train, relevance_train)
relevance_predict = classifier.predict(features_test)

# Position is the inverse of Relevance
# (We want high relevance to have a low position a.k.a. high on the displayed Expedia List)
position_predict = -relevance_predict

print("Mean nDCG Score:", 
      nDCG_mean(sid_test_groups, relevance_test, position_predict))
print("Random Mean nDCG Score:",
      nDCG_mean(sid_test_groups, relevance_test, np.random.uniform(size=len(position_predict))))

Mean nDCG Score: 0.34691870990949836
Random Mean nDCG Score: 0.3478128431733729
