In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
def nDCG(relevances: np.ndarray, positions: np.ndarray) -> float:
    """Compute the Normalized Discounted Cumulative Gain of one ranked list.

    Parameters
    ----------
    relevances:
        Relevance grade of every item (5 if booked, 1 if clicked, 0 otherwise).
    positions:
        Display order of every item, aligned element-wise with `relevances`.
        Only the relative order matters: items are ranked by ascending
        position, so any sortable score works (e.g. negated predictions).

    Returns
    -------
    float
        DCG / IDCG, a value in [0, 1]. Returns 0.0 when the ideal DCG is zero
        (empty input, or no item has a positive relevance) instead of
        propagating the NaN that a 0 / 0 division would produce.
    """
    relevances = np.asarray(relevances, dtype=float)
    positions = np.asarray(positions)

    # A stable sort keeps tied positions in input order, which makes the
    # score deterministic when many predictions share the same value.
    display_order = np.argsort(positions, kind='stable')
    relevances_ordered = relevances[display_order]
    relevances_sorted = np.sort(relevances)[::-1]

    gain = 2 ** relevances_ordered - 1
    ideal_gain = 2 ** relevances_sorted - 1

    # Rank r (0-based) is discounted by log2(r + 2), i.e. log2(2), log2(3), ...
    discount = np.log2(np.arange(len(positions)) + 2)

    DCG = np.sum(gain / discount)
    IDCG = np.sum(ideal_gain / discount)

    if IDCG == 0:
        # Nothing relevant to rank: every ordering is equally (un)informative.
        return 0.0

    return float(DCG / IDCG)


def nDCG_mean(dataframe: pd.DataFrame) -> float:
    """
    Average the per-search nDCG over every search in a DataFrame.

    The DataFrame must provide the columns [srch_id, relevance, position];
    rows sharing a srch_id form one ranked list that is scored with nDCG.
    """
    per_search = dataframe.groupby('srch_id')

    # Accumulate sequentially so the result is the plain arithmetic mean
    total = 0.0
    for _, rows in per_search:
        score = nDCG(rows['relevance'].values, rows['position'].values)
        total += score

    return total / len(per_search)

In [3]:
# Load the training set into `df` (this takes a while).
# The path is relative to the notebook's working directory.
df = pd.read_csv("data/training_set_VU_DM_2014.csv")

In [4]:
# Add Relevance Column to DataFrame:
#   5 when the result was booked, 1 when it was only clicked, 0 otherwise.
# Conditions are checked in order, so a booking takes precedence over a click.
relevance = np.select(
    [df['booking_bool'] == 1, df['click_bool'] == 1],
    [5.0, 1.0],
    default=0.0,
)

df['relevance'] = relevance

In [5]:
# Reorganize DataFrame: keep only the columns listed below, in this order.
# This drops every column not named here (a.k.a. throw out competitor info),
# which is handy for tweaking and getting an overview.
# NOTE: the order is significant for any later cell that selects feature
# columns by position, so keep the ID/label columns at the front.
df = df[[
    # ID
    'srch_id',
    
    # Labels (to predict)
    'position',
    'relevance',
    'click_bool',
    'booking_bool',
    
    # Per Search Features
    'site_id',
    'date_time',
    'srch_destination_id',
    'srch_length_of_stay',
    'srch_booking_window',
    'srch_adults_count',
    'srch_children_count',
    'srch_room_count',
    'srch_saturday_night_bool',
    'srch_query_affinity_score',
    'orig_destination_distance',
    
    # Property Features
    'price_usd',
    'promotion_flag',
    'prop_country_id',
    'prop_id',
    'prop_starrating',
    'prop_review_score',
    'prop_brand_bool',
    'prop_location_score1',
    'prop_location_score2',
    'prop_log_historical_price',
    
    # Visitor Features
    'visitor_location_country_id',
    'visitor_hist_starrating',
    'visitor_hist_adr_usd',
    
    # Random Order
    'random_bool',
]]

In [6]:
# Fix NaN's
# Each column gets its own replacement for missing values. `fillna` returns a
# new frame, so the result does not depend on whether `df[column]` hands back
# a view or a copy: assigning into an extracted Series raises
# SettingWithCopyWarning and leaves `df` untouched under pandas Copy-on-Write.
nan_fill_values = {
    # Missing affinity: fall back to the lowest observed score (NaNs skipped)
    'srch_query_affinity_score': df['srch_query_affinity_score'].min(),

    # -1 acts as an explicit "unknown" marker for these columns
    'orig_destination_distance': -1,
    'prop_review_score': -1,

    # Missing second location score: use the column mean (NaNs skipped)
    'prop_location_score2': df['prop_location_score2'].mean(),

    # -1 marks visitors without purchase history
    'visitor_hist_starrating': -1,
    'visitor_hist_adr_usd': -1,
}

df = df.fillna(value=nan_fill_values)

In [7]:
# Report, per column, whether any NaN is left.
# A column that still contains NaN prints "NaN"; a clean column prints a
# randomly picked smiley, so an all-smiley listing means you're fine ^^

from random import choice
smileys = [':)', ':D', ':P']

for column in df.columns:
    if df[column].isnull().any():
        status = "NaN"
    else:
        status = choice(smileys)
    print("{:30s} {}".format(column, status))

srch_id                        :D
position                       :)
relevance                      :)
click_bool                     :)
booking_bool                   :)
site_id                        :P
date_time                      :)
srch_destination_id            :D
srch_length_of_stay            :)
srch_booking_window            :)
srch_adults_count              :P
srch_children_count            :)
srch_room_count                :)
srch_saturday_night_bool       :P
srch_query_affinity_score      :)
orig_destination_distance      :P
price_usd                      :D
promotion_flag                 :P
prop_country_id                :D
prop_id                        :)
prop_starrating                :P
prop_review_score              :D
prop_brand_bool                :P
prop_location_score1           :)
prop_location_score2           :P
prop_log_historical_price      :)
visitor_location_country_id    :)
visitor_hist_starrating        :P
visitor_hist_adr_usd           :)
random_bool   

In [8]:
# Sample a random fraction of the searches in the DataFrame (to speed things up).
# Every search is kept independently with probability RANDOM_FRACTION, and all
# rows of a kept search stay together. The draw is done once per unique
# srch_id and applied with `isin`, which avoids calling a Python function for
# every group as `groupby().filter(...)` would.
# NOTE: the draw is unseeded; call np.random.seed(...) first for a repeatable sample.
RANDOM_FRACTION = 0.1

all_search_ids = np.unique(df.srch_id)
keep_search = np.random.uniform(0, 1, len(all_search_ids)) < RANDOM_FRACTION

selection = df[df.srch_id.isin(all_search_ids[keep_search])]

In [9]:
# Split Train and Test from random selection, again per Search ID
# (all rows of one search end up on the same side of the split).
# TEST_TRAIN_SPLIT is the probability that a search lands in the TRAIN set.
TEST_TRAIN_SPLIT = 0.8

unique_search_ids = np.unique(selection.srch_id)
mask = np.random.uniform(0, 1, len(unique_search_ids)) < TEST_TRAIN_SPLIT

train = selection[selection.srch_id.isin(unique_search_ids[mask])]
test = selection[selection.srch_id.isin(unique_search_ids[~mask])]

# Columns that must never be fed to the model: the search ID, the labels
# (position / relevance / click / booking), plus site_id and date_time.
# Excluding them by name selects the same columns as slicing from the 8th
# column onwards, but cannot leak a label if the column order ever changes.
NON_FEATURE_COLUMNS = [
    'srch_id',
    'position',
    'relevance',
    'click_bool',
    'booking_bool',
    'site_id',
    'date_time',
]

train_features = train.drop(NON_FEATURE_COLUMNS, axis=1)
train_labels = train['relevance']

test_features = test.drop(NON_FEATURE_COLUMNS, axis=1)
test_labels = test['relevance']

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Train Regressor on Train Features and Train Labels (a.k.a. Relevances)
# This should be fairly quick now :) (20 seconds?, idk)
# NOTE: despite its name, `classifier` holds a regressor; it is built with
# default arguments, so no random_state is fixed here.
classifier = RandomForestRegressor()
classifier.fit(train_features, train_labels)

# Predict Relevances for the held-out searches
predict_labels = classifier.predict(test_features)

# Throw the data needed to calculate the score into a new DataFrame
# Note that predicted position = - predicted relevance: nDCG ranks items by
# ascending position, so negating puts the highest predicted relevance first.
result = pd.DataFrame({
    'srch_id': test.srch_id,
    'relevance': test.relevance,
    'position': -predict_labels})

print("Prediction:", nDCG_mean(result))


# Throw Random Positions in the Mix, to show we're doing better than Random :)
# Same frame as above, but the ranking comes from uniform random numbers.
result = pd.DataFrame({
    'srch_id': test.srch_id,
    'relevance': test.relevance,
    'position': np.random.uniform(0, 1, len(test.relevance))
})

print("Random:", nDCG_mean(result))


# Plot Feature Importances Graph: one horizontal bar per feature column,
# labelled with the column name.
plt.barh(np.arange(len(test_features.columns)), classifier.feature_importances_,
       tick_label=test_features.columns)