# Create queries for simple flask api with pandas

The goal is to replace the call to the Firestore database with a read from local disk.

I.e. we will save a CSV file on the App Engine machine and query it with pandas.

In [22]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
import pandas as pd
import sys
# Add the directory two levels up to the import path so the `api.*` modules
# imported later in this notebook can be resolved.
sys.path.append('../../')

In [24]:
# Directory (relative to this notebook) that holds the CSV files read below.
api_data_dir = '../../api/data/'

# File names inside `api_data_dir`: destinations, destination features and
# the features types file.
file_name, features_file_name, features_types = (
    'wikivoyage_destinations.csv',
    'wikivoyage_features.csv',
    'wikivoyage_features_types.csv',
)

### Read data

Read data that has been prepared for the frontend by `feature_engineering.py`


In [25]:
# Destinations: `id` becomes the index (for `.loc` lookups) and, because of
# `drop=False`, also stays available as a regular column.
destinations_path = api_data_dir + file_name
df = pd.read_csv(destinations_path).set_index("id", drop=False)

# Destination features, indexed on `id` (the column itself is dropped).
features_path = api_data_dir + features_file_name
df_features = pd.read_csv(features_path).set_index("id")

# Features types file, read as-is without setting an index.
feature_types_path = api_data_dir + features_types
df_feature_types = pd.read_csv(feature_types_path)

## Row lookup

The data was read with the `id` column as index, so a single row can be fetched with `.loc`.

In [None]:
# Label-based lookup on the `id` index, returned as a plain dict.
destination_row = df.loc[146019]
destination_row.to_dict()

To fetch random:

In [None]:
# Draw a single row at random and return it as a plain dict.
random_row = df.sample(1).iloc[0]
random_row.to_dict()

## Build queries

### based on geo

ne_lat, ne_lng, sw_lat, and sw_lng will be given:

`&ne_lat=43.97363914475397&ne_lng=5.173845810128569&sw_lat=38.69043481932856&sw_lng=-0.5720037992464313`

In [26]:
# Example bounding box: north-east corner first, then south-west corner.
ne_lat, ne_lng = 43.9, 5.17
sw_lat, sw_lng = 38.7, -0.57

Apply filter:

In [27]:
from api.resources.utils.selection import filter_on_geolocation

In [28]:
# Apply the geolocation filter to the destinations and preview the first rows.
in_bbox = filter_on_geolocation(df, ne_lat, ne_lng, sw_lat, sw_lng)
in_bbox.head()

Unnamed: 0_level_0,id,wiki_id,name,status,type,lat,lng,country,weight,nr_tokens_norm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
902784,902784,370,Aigues-Mortes,outline,city,43.5667,4.2,France,2455,0.005104
690103,690103,379,Ainsa,outline,city,42.41622,0.13819,Spain,598,0.001732
111488,111488,489,Alaigne,outline,city,43.1,2.0833,France,14550,0.01768
240854,240854,594,Alcudia,outline,city,39.850833,3.120833,Spain,16001,0.018865
423393,423393,737,Almàssera,usable,city,39.511667,-0.356111,Spain,3263,0.006258


### Sampling

What to do if the area is too small? Need to handle an error as there is no data available!

In [None]:
# A much smaller bounding box, used to see what happens with few matches.
ne_lat, ne_lng = 48.9, 2.47
sw_lat, sw_lng = 48.82, 2.22

There are two cases: 

1. Really nothing can be found
2. Only a small number can be found, for example fewer than the 10 requested by `sample(10)`

Rather than sampling, let's try a technique where we sort the dataframe at random, and then pick the top x observations. This way, we can also work with offsets if we preserve the ordering. Preserve the ordering by setting a random seed.

In [None]:
# Filter on the bounding box first, then shuffle every remaining row.
# The fixed `random_state` makes the shuffled order repeatable.
small_area = filter_on_geolocation(df, ne_lat, ne_lng, sw_lat, sw_lng)
small_area.sample(frac=1, random_state=1234)

### Error handling

Now, what is returned in case no records are found?

In [None]:
# Bounding box used for the "no records found" case.
ne_lat, ne_lng, sw_lat, sw_lng = 48.8, 2.2, 48.82, 2.22

# Keep the outcome in a variable and display it at the end of the cell: an
# expression evaluated inside a `try` block is never rendered as cell output,
# so otherwise the cell shows nothing unless the exception fires.
try:
    no_match = (
        df
        .pipe(filter_on_geolocation, ne_lat, ne_lng, sw_lat, sw_lng)
    ).sample(frac=1, random_state=1234)
except ValueError:
    print("Oops, ValueError! Must have at least one record. Return empty list?")
    # Fall back to an empty frame with the same columns, so the result has
    # one consistent type whichever path ran.
    no_match = df.iloc[0:0]

no_match

### Offsets

Select subset of results when working with an offset.

In [44]:
# n: number of shuffled rows to keep; offset / n_results: window cut from them.
n, offset, n_results = 10, 0, 3

# Shuffle all rows with a fixed seed so the order is the same on every run,
# then keep the first `n`.
df_shuffled = df.sample(frac=1, random_state=1234)
subset = df_shuffled.head(n)

In [45]:
# Show the first `n` rows of the seeded shuffle.
subset

Unnamed: 0_level_0,id,wiki_id,name,status,type,lat,lng,country,weight,nr_tokens_norm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
163999,163999,148007,Lower Gwynedd Township,outline,city,40.213,-75.284,United States of America,103,0.000243
468369,468369,13029,Gorkhi-Terelj National Park,outline,park,47.91404,107.43599,Mongolia,9898,0.013579
550618,550618,40241,Zahedan,outline,city,29.496389,60.862778,Iran,5118,0.008597
646697,646697,33184,Solana Beach,outline,city,32.995278,-117.260278,United States of America,4660,0.00805
709756,709756,147941,Matagi Island,outline,city,-16.733333,-179.750833,Fiji,16878,0.019564
903195,903195,26780,Paraty,usable,city,-23.219444,-44.714722,Brazil,32864,0.030743
362874,362874,7093,Chiaramonte Gulfi,outline,city,37.0333,14.7,Italy,985,0.002582
116262,116262,24235,New Canaan (Connecticut),usable,city,41.1468,-73.4949,United States of America,783,0.002157
874121,874121,24838,North Bay,usable,city,46.310772,-79.462605,Canada,26417,0.02652
783020,783020,16542,Junagadh,usable,city,21.52,70.47,India,11857,0.015372


In [31]:
# Positional window: `n_results` rows starting at `offset`.
page_start, page_stop = offset, offset + n_results
subset.iloc[page_start:page_stop]

Unnamed: 0_level_0,id,wiki_id,name,status,type,lat,lng,country,weight,nr_tokens_norm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
163999,163999,148007,Lower Gwynedd Township,outline,city,40.213,-75.284,United States of America,103,0.000243
468369,468369,13029,Gorkhi-Terelj National Park,outline,park,47.91404,107.43599,Mongolia,9898,0.013579
550618,550618,40241,Zahedan,outline,city,29.496389,60.862778,Iran,5118,0.008597


Works.

### Weighted sampling

In order to get some randomness, but still sample more important destinations first, use weights created in one of the feature engineering notebooks.

In [None]:
# Shuffle all rows using the `weight` column as sampling weights (seeded, so
# the order is repeatable), then look at the first three.
weighted_order = df.sample(frac=1, random_state=1234, weights='weight')
weighted_order.head(3)

In [None]:
# How often does each weight value occur?
weight_column = df['weight']
weight_column.value_counts()

### Add top X destination features

Given the place id, grab and sort the features directly:

In [34]:
from api.resources.utils.features import select_features, select_features_with_profiles, select_feature_columns_with_profiles

In [35]:
# Destination to look up.
# NOTE(review): `top_x` is defined here but not passed to `select_features`;
# presumably the helper applies its own default — confirm.
dest_id, top_x = 662248, 5

select_features(dest_id, df_features)

['Historic villages',
 'Landmarks',
 'Sightseeing tours',
 'Neighborhood walks',
 'Town centers/squares/plazas']

In [36]:
# Expected to yield exactly one feature for this destination.
single_feature_dest_id = 146019
select_features(single_feature_dest_id, df_features)

['Town centers/squares/plazas']

Then simply add to the output doc.

In [37]:
# Turn the destination row into a dict and append its features under the
# "features" key (added last, after the row's own columns).
destination_fields = df.loc[int(dest_id)].to_dict()
doc = dict(
    destination_fields,
    features=select_features(destination_fields['id'], df_features),
)
doc

{'id': 662248,
 'wiki_id': 10,
 'name': "'s-Hertogenbosch",
 'status': 'guide',
 'type': 'city',
 'lat': 51.69014,
 'lng': 5.29897,
 'country': 'Netherlands',
 'weight': 99691,
 'nr_tokens_norm': 0.06488851084513032,
 'features': ['Historic villages',
  'Landmarks',
  'Sightseeing tours',
  'Neighborhood walks',
  'Town centers/squares/plazas']}

For the explore endpoint, apply the function to all destinations.

In [54]:
# Take the requested window and copy it, so that adding a column to `places`
# does not write into a slice of `subset`.
window = subset.iloc[offset:offset + n_results]
places = window.copy()
places

Unnamed: 0_level_0,id,wiki_id,name,status,type,lat,lng,country,weight,nr_tokens_norm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
163999,163999,148007,Lower Gwynedd Township,outline,city,40.213,-75.284,United States of America,103,0.000243
468369,468369,13029,Gorkhi-Terelj National Park,outline,park,47.91404,107.43599,Mongolia,9898,0.013579
550618,550618,40241,Zahedan,outline,city,29.496389,60.862778,Iran,5118,0.008597


In [55]:
def _features_without_profiles(place_id):
    """Look up the features of one place, passing an empty profile list."""
    return select_features_with_profiles(
        place_id, [], df_features, df_feature_types
    )


# One features lookup per place id, stored in a new "features" column.
places["features"] = places["id"].apply(_features_without_profiles)

In [50]:
# Same assignment as the previous cell, written with `.loc[:, "features"]`.
# NOTE(review): the SettingWithCopyWarning shown below presumably stems from a
# run where `places` was still a plain slice of `subset` (no `.copy()`) — confirm.
places.loc[:,"features"] = places["id"].apply(
            lambda x: select_features_with_profiles(
                x, [], df_features, df_feature_types
            )
        )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [53]:
# `.loc["id"]` looks up a ROW labelled "id" in the index, not the column, so
# this raises the KeyError shown below; column access is `places["id"]`.
places.loc["id"]

KeyError: 'id'

In [49]:
# Column selection with plain `[]`: shows the `id` of each selected place.
places["id"]

id
163999    163999
468369    468369
550618    550618
Name: id, dtype: int64

In [38]:
# Sample two destinations, attach the features of each one, and serialise
# the result as a list of per-row dicts.
test = df.sample(2).assign(
    features=lambda sampled: sampled['id'].apply(
        lambda dest: select_features(dest, df_features)
    )
)
test.to_dict(orient="records")

[{'id': 199055,
  'wiki_id': 39328,
  'name': 'Wildwood (New Jersey)',
  'status': 'usable',
  'type': 'city',
  'lat': 38.99167,
  'lng': -74.815,
  'country': 'United States of America',
  'weight': 182889,
  'nr_tokens_norm': 0.09745428033294852,
  'features': ['Marina',
   'Festivals',
   'Sightseeing tours',
   'Shopping',
   'Live music / bands']},
 {'id': 228609,
  'wiki_id': 131164,
  'name': 'San Juan (Metro Manila)',
  'status': 'outline',
  'type': 'city',
  'lat': 14.60488,
  'lng': 121.02963999999999,
  'country': 'Philippines',
  'weight': 1837,
  'nr_tokens_norm': 0.0041314782186038035,
  'features': ['Shopping',
   'Town centers/squares/plazas',
   'Urban parks',
   'Rafting',
   'Water parks']}]

With activity profiles:

In [None]:
# Features from the selected profiles should come first.
profiles = ['nature', 'active']
profile_features = select_features_with_profiles(
    dest_id, profiles, df_features, df_feature_types
)
profile_features

In [None]:
# Should show beaches for this destination.
beach_dest_id = 197270
select_features_with_profiles(beach_dest_id, ['beach'], df_features, df_feature_types)

In [None]:
# Should return only one feature, even with a profile selected.
one_feature_dest_id = 146019
select_features_with_profiles(one_feature_dest_id, ['beach'], df_features, df_feature_types)

### Convert to json

Provide the places, as well as some metadata.

In [None]:
from api.resources.utils.utils import prettify_n_results

In [None]:
# Example: format a result count for display in the front-end.
n_found = 3500
prettify_n_results(n_found)

In [None]:
# Use a dedicated name for the records: `subset` is the shuffled DataFrame
# that earlier cells slice with `.iloc`, so rebinding it to a list here would
# break those cells when they are re-run.
destinations = df.sample(2).to_dict(orient='records')

# Response payload: the places themselves plus some metadata about the count.
{
    "Results": len(destinations),
    "Results_string": prettify_n_results(len(destinations)),
    "Destinations": destinations
}

Done.