# Sorting place results

The goal is to determine a good ordering of place results to show to the end user.

In [None]:
# %load_ext autoreload
# %autoreload 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# import sys
# sys.path.append('../../')

In [None]:
# root folder of the wikivoyage data
data_dir = '../../data/wikivoyage/'
# folder where data should live for flask API
api_dir = '../../api/data/'

# the notebook reads the 'processed' file and writes the result to both
# the 'enriched' data folder and the API data folder
input_path = '{}processed/wikivoyage_destinations.csv'.format(data_dir)
output_path1 = '{}enriched/wikivoyage_destinations.csv'.format(data_dir)
output_path2 = '{}wikivoyage_destinations.csv'.format(api_dir)

### Read data

In [None]:
# load the processed destinations file and preview its first rows
df = pd.read_csv(filepath_or_buffer=input_path)
df.head(n=5)

### Remove destinations with no tokens

This is required for the weighted resampling below: destinations without tokens would get a sampling weight of 0, which means they would never be drawn. As a result, you could not 'sort' the *entire* data set, because some observations would never appear in the sample.

In [None]:
# keep only the destinations that have at least one token
has_tokens = df['nr_tokens'] > 0
df = df.loc[has_tokens]

## Biased sorting

In order to get some randomness, but make sure the more important destinations get oversampled, use `nr_tokens` as a weight in the sampling method.

For now, let's first have a look at the overall distribution of `nr_tokens` in our data. It is strongly skewed towards destinations with very few tokens:

In [None]:
# bin edges: steps of 500 tokens from 0 up to 9500, plus one catch-all upper edge
token_bin_edges = list(range(0, 10000, 500)) + [99999]

# to inspect a subset instead (e.g. a single country), filter `df` before binning
tokens_binned = pd.cut(df['nr_tokens'], bins=token_bin_edges).rename('nr_tokens_bins')

# number of destinations per bin, shown in bin order
bin_counts = tokens_binned.value_counts().sort_index()
bin_counts.plot(kind='bar');

You can imagine that you don't want to sample uniformly at random from this distribution: it would mean that you mostly show very unknown destinations to the user.

Let's compare 3 different ways of sampling:
1. without weights (so random)
2. weighting by `nr_tokens`
3. weighting by `nr_tokens` to the power `X`

The stronger the weighting, the more often places with a large number of tokens are drawn early.

In [None]:
n_results = 16 # number of fetched results per API call
power_factor = 1.5 # exponent applied to nr_tokens, to oversample bigger documents
n_rows = 8 # number of plot rows; row i shows the first (i+1)*n_results drawn places

# squeeze=False keeps `axes` two-dimensional, so the row loop also works for n_rows = 1
fig, axes = plt.subplots(nrows=n_rows, ncols=3, figsize=(16, n_rows*4), squeeze=False)

df_bins = (
    df
    .assign(nr_tokens_bins = lambda df: pd.cut(df['nr_tokens'], bins = list(range(0, 10000, 500)) + [99999]))
    .assign(nr_tokens_powered = lambda df: df['nr_tokens']**power_factor)
)

# (plot label, weight column) per sampling scheme; None means unweighted sampling
sampling_schemes = [
    ('random', None),
    ('nr_tokens', 'nr_tokens'),
    ('nr_tokens^{}'.format(power_factor), 'nr_tokens_powered'),
]

# Shuffle the full data set once per scheme. With a fixed random_state the shuffle
# is the same for every plot row, so it is computed here instead of inside the loop.
shuffled = [
    (label, df_bins.sample(frac=1, random_state=1234, weights=weight_column))
    for label, weight_column in sampling_schemes
]

for i, row in enumerate(axes):
    # number of places the user has seen after i+1 API calls
    n = (i+1)*n_results

    for (label, df_shuffled), ax in zip(shuffled, row):
        # plot the distribution over token bins of the first n drawn places
        (
            df_shuffled
            .head(n)
            ['nr_tokens_bins']
            .value_counts()
            .sort_index()
            .plot(kind='bar', ax=ax)
        )
        # prettify plot: only the bottom row keeps its x tick labels
        if i < n_rows - 1:
            ax.get_xaxis().set_ticks([])
        ax.set_title('{} - {} obs'.format(label, n))

fig.tight_layout()
plt.show()

A power factor of 1.5 seems to work nicely. Raising it further will deplete the places with the most tokens very quickly. For the user this means that they would first get all the well-known destinations, and only then the rest. The aim of our app is to surprise and inspire, so we also want to show lesser-known destinations early on.

## Write to CSV

Add the sampling weight feature and write the final data set, to be used by the frontend.

In [None]:
power_factor = 1.5

# sampling weight: nr_tokens raised to power_factor, truncated to an integer
sampling_weight = (df['nr_tokens'] ** power_factor).astype(int)

# add the feature
output_df = df.assign(weight=sampling_weight)
# other hygiene: drop columns that are not written out, and index by id (keeping the id column)
output_df = output_df.drop(columns=['nr_tokens', 'ispartof', 'parentid'])
output_df = output_df.set_index('id', drop=False)
# need to do this to convert numpy int and float to native data types
output_df = output_df.astype('object')

output_df.head()

In [None]:
# write the 'approved' file to both the data folder and the api folder
for path in (output_path1, output_path2):
    output_df.to_csv(path, index=False)

Done.