In [1]:
import pandas as pd
from pathlib import Path
import importlib

import src.charles.utils as utils

# Re-import utils so that edits to the module are picked up without restarting the kernel.
importlib.reload(utils)

# Folder that holds the CSV/JSON inputs, located next to the notebook's working directory.
DATA_DIR = Path.cwd().joinpath('..', 'data')
# Base URL of the latest GitHub release; individual file names are appended to it.
DATA_URL = "https://github.com/prayer-position/Project_NLP/releases/latest/download"

In [2]:
# Data files the notebook expects to find in DATA_DIR.
file_names = [
    "AttractionSubCategorie.csv",
    "Tripadvisor.csv",
    "AttractionSubType.csv",
    "cuisine.csv",
    "dietary_restrictions.csv",
    "restaurantType.csv",
]

# Hand each file's local target and release URL to utils.ensure_data.
for file_name in file_names:
    url = f"{DATA_URL}/{file_name}"
    utils.ensure_data(DATA_DIR, DATA_DIR.joinpath(file_name), url)

Data found locally, skipping download
Data found locally, skipping download
Data found locally, skipping download
Data found locally, skipping download
Data found locally, skipping download
Data found locally, skipping download


In [3]:
# Input locations inside DATA_DIR.
# NOTE(review): the two JSON files are not in the `file_names` download list —
# presumably they are produced by another step; confirm they exist on a fresh clone.
df_path = DATA_DIR.joinpath('Tripadvisor.csv')
results_path = DATA_DIR.joinpath('similarity_results.json')
bm25_path = DATA_DIR.joinpath("BM25_similarity.json")

# Places table, then the two stored sets of recommendations (loaded in the same order as before).
places = utils.read_places(df_path)
similarity_results, bm25_results = (
    pd.read_json(json_path) for json_path in (results_path, bm25_path)
)

In [4]:
# Preview the first rows of the places table to sanity-check the loaded columns.
places.head()

Unnamed: 0,id,nom,rating,nbAvis,latitude,longitude,typeR,adresse,priceRange,closed,...,ap_exclusion,ap_inclusions,ap_introduction,ap_primary_supplier_attraction_id,ap_primary_supplier_subtype,ap_primary_ta_geo_id,ap_product_code,ap_product_highlights,ap_product_text,ap_raw
0,188467,Place des Vosges,4.108407,5663,48.855614,2.365553,A,"Place des Vosges 4th Arrondissement, 75004 Par...",,0.0,...,,,,,,,,,,
1,188468,Rue des Francs Bourgeois,3.316532,73,48.85814,2.35988,A,"Rue des Francs-Bourgeois, 75003 Paris France",,0.0,...,,,,,,,,,,
2,188470,Village Saint-Paul,3.017118,98,48.853733,2.361295,A,"Rue Saint-paul, 75004 Paris France",,0.0,...,,,,,,,,,,
3,188471,Au Passe-partout,2.743157,2,48.85347,2.3616,A,"21 rue Saint Paul, 75004 Paris France",,0.0,...,,,,,,,,,,
4,188472,Cloître des Billettes,2.942987,23,48.858,2.35498,A,"24 rue des Archives, 75004 Paris France",,0.0,...,,,,,,,,,,


In [5]:
# Preview the stored recommendations: one row per query place (`idplace`) with its `neighbours`.
similarity_results.head()

Unnamed: 0,idplace,neighbours
0,188467,"[{'id': 292257, 'confidence': 0.7759}, {'id': ..."
1,188470,"[{'id': 292257, 'confidence': 0.6664}, {'id': ..."
2,188679,"[{'id': 2397509, 'confidence': 0.9033}, {'id':..."
3,188738,"[{'id': 10386434, 'confidence': 0.9456}, {'id'..."
4,188745,"[{'id': 235552, 'confidence': 0.9779}, {'id': ..."


## Level 1 evaluation

In [6]:
# Reload utils again so edits made to the module since the first cell are picked up.
importlib.reload(utils)
def get_lvl_1_score(results, places):
    """Return the mean level-1 evaluation score over all rows of ``results``.

    For every row, the first column is used as the query place id and the
    second column as its neighbours payload. The payload is converted with
    ``utils.dict_to_tuple``; the first element of that result (the
    recommendations) is scored against the query by ``utils.lvl_1_eval``.

    Parameters
    ----------
    results : pandas.DataFrame
        One row per query. Columns are read by position (0 = query id,
        1 = neighbours), e.g. the ``idplace`` / ``neighbours`` frames loaded
        in this notebook.
    places
        Places table, forwarded unchanged to ``utils.lvl_1_eval``.

    Returns
    -------
    float
        Sum of the per-row scores divided by the number of rows, or ``nan``
        when ``results`` has no rows (instead of raising ``ZeroDivisionError``).
    """
    n_queries = len(results)
    if n_queries == 0:
        # An empty selection (e.g. a filter matching no place) has no defined mean.
        return float("nan")

    # Accumulate in a local name that does not shadow the built-in ``sum``.
    total = 0
    for position in range(n_queries):
        row = results.iloc[position]
        recommendations = utils.dict_to_tuple(row.iloc[1])[0]
        total += utils.lvl_1_eval(row.iloc[0], recommendations, places)

    return total / n_queries

In [7]:
# Level-1 score of each stored recommendation set, similarity results first, then BM25.
for model_results in (similarity_results, bm25_results):
    print(get_lvl_1_score(model_results, places))

0.9097852028639629
0.6683378746593982


## Level 2 evaluation

In [8]:
def get_lvl_2_score(results, places=None):
    """Return the mean level-2 evaluation score over all rows of ``results``.

    For every row, the first column is used as the query place id and the
    second column as its neighbours payload. The recommendations extracted by
    ``utils.dict_to_tuple`` are scored with ``utils.lvl_2_eval``. When that
    call returns ``None``, the query contributes ``len(recommendations)``
    instead (presumably a worst-case value — confirm against
    ``utils.lvl_2_eval``).

    Parameters
    ----------
    results : pandas.DataFrame
        One row per query. Columns are read by position (0 = query id,
        1 = neighbours).
    places : optional
        Places table forwarded to ``utils.lvl_2_eval``. When omitted, the
        notebook-level ``places`` variable is used, so existing one-argument
        calls keep working.

    Returns
    -------
    float
        Sum of the per-row contributions divided by the number of rows, or
        ``nan`` when ``results`` has no rows (instead of raising
        ``ZeroDivisionError``).
    """
    n_queries = len(results)
    if n_queries == 0:
        # An empty selection (e.g. a place type with no queries) has no defined mean.
        return float("nan")

    if places is None:
        # Backward compatibility: earlier versions read the global `places` implicitly.
        places = globals()["places"]

    translation_df = utils.get_translation_dicts()
    # Accumulate in a local name that does not shadow the built-in ``sum``.
    total = 0
    for position in range(n_queries):
        row = results.iloc[position]
        recommendations, _ = utils.dict_to_tuple(row.iloc[1])
        score = utils.lvl_2_eval(row.iloc[0], recommendations, places, translation_df)
        # No score available for this query: fall back to the number of recommendations.
        total += len(recommendations) if score is None else score

    return total / n_queries

In [9]:
# Level-2 score of each stored recommendation set, similarity results first, then BM25.
for model_results in (similarity_results, bm25_results):
    print(get_lvl_2_score(model_results))

1.990453460620525
3.3645776566757495


The score is pretty bad; let's see which place types the model has the most trouble with.

In [10]:
# Level-2 score restricted to queries whose place has typeR == "R".
restaurant_ids = places.loc[places["typeR"] == "R", "id"].values
restaurant_results = similarity_results.loc[similarity_results["idplace"].isin(restaurant_ids)]
print(get_lvl_2_score(restaurant_results))

0.3584905660377358


In [11]:
# Level-2 score restricted to queries whose place has typeR == "A".
attraction_ids = places.loc[places["typeR"] == "A", "id"].values
attraction_results = similarity_results.loc[similarity_results["idplace"].isin(attraction_ids)]
print(get_lvl_2_score(attraction_results))

2.7916666666666665


In [12]:
# Level-2 score restricted to queries whose place has typeR == "H".
hotel_ids = places.loc[places["typeR"] == "H", "id"].values
hotel_results = similarity_results.loc[similarity_results["idplace"].isin(hotel_ids)]
print(get_lvl_2_score(hotel_results))

1.3617021276595744


In [13]:
# Level-2 score restricted to queries whose place has typeR == "AP".
attrac_prod_ids = places.loc[places["typeR"] == "AP", "id"].values
attrac_prod_results = similarity_results.loc[similarity_results["idplace"].isin(attrac_prod_ids)]
print(get_lvl_2_score(attrac_prod_results))

5.0


The metric doesn't work for the AP type: we have no instructions on how to evaluate that kind of recommendation, so these queries presumably all fall back to the default score (the number of recommendations, hence 5.0).

In [14]:
def see_metadata_reco(results, places):
    """Print the metadata of each query place followed by that of its recommendations.

    Each row of ``results`` is read by position: column 0 is the query place id
    and column 1 the neighbours payload, which ``utils.dict_to_tuple`` splits
    into recommendations and a second value that is ignored here. Metadata is
    looked up with ``utils.get_metadata``. Nothing is returned.
    """
    translations = utils.get_translation_dicts()
    rows = (results.iloc[position] for position in range(len(results)))
    for row in rows:
        query_metadata = utils.get_metadata(row.iloc[0], places, translations)
        print(f"Query metadata : {query_metadata}")
        recommended_ids, _unused = utils.dict_to_tuple(row.iloc[1])
        # Recommendations are numbered from 0, in the order they are stored.
        for rank, recommended_id in enumerate(recommended_ids):
            reco_metadata = utils.get_metadata(recommended_id, places, translations)
            print(f"    Recommendation {rank} metadata : {reco_metadata}")

In [15]:
# Print query vs. recommendation metadata for the "AP" queries.
# NOTE(review): this prints one block per query; consider passing
# `attrac_prod_results.head()` if the output gets too long.
see_metadata_reco(attrac_prod_results, places)

Query metadata : set()
    Recommendation 0 metadata : set()
    Recommendation 1 metadata : set()
    Recommendation 2 metadata : set()
    Recommendation 3 metadata : set()
    Recommendation 4 metadata : set()
Query metadata : set()
    Recommendation 0 metadata : set()
    Recommendation 1 metadata : set()
    Recommendation 2 metadata : set()
    Recommendation 3 metadata : set()
    Recommendation 4 metadata : set()
Query metadata : set()
    Recommendation 0 metadata : set()
    Recommendation 1 metadata : set()
    Recommendation 2 metadata : set()
    Recommendation 3 metadata : set()
    Recommendation 4 metadata : set()
Query metadata : set()
    Recommendation 0 metadata : set()
    Recommendation 1 metadata : set()
    Recommendation 2 metadata : set()
    Recommendation 3 metadata : set()
    Recommendation 4 metadata : set()
Query metadata : set()
    Recommendation 0 metadata : set()
    Recommendation 1 metadata : set()
    Recommendation 2 metadata : set()
    Recomme

In [16]:
# Print query vs. recommendation metadata for the "A" (attraction) queries.
# NOTE(review): this prints one block per query; consider passing
# `attraction_results.head()` if the output gets too long.
see_metadata_reco(attraction_results, places)

Query metadata : {'sites touristiques', 'sites et monuments'}
    Recommendation 0 metadata : {'quartiers'}
    Recommendation 1 metadata : {'vegetarian friendly', 'restaurants'}
    Recommendation 2 metadata : {'sites touristiques', 'fontaines'}
    Recommendation 3 metadata : set()
    Recommendation 4 metadata : {'vegetarian friendly', 'restaurants'}
Query metadata : set()
    Recommendation 0 metadata : {'quartiers'}
    Recommendation 1 metadata : set()
    Recommendation 2 metadata : {'sites touristiques', 'sites et monuments'}
    Recommendation 3 metadata : {'sites touristiques', 'sites et monuments'}
    Recommendation 4 metadata : {'quartiers'}
Query metadata : {'sites touristiques'}
    Recommendation 0 metadata : {'sites touristiques'}
    Recommendation 1 metadata : {'sites touristiques'}
    Recommendation 2 metadata : {'sites touristiques', 'ruines anciennes'}
    Recommendation 3 metadata : {'sites touristiques', 'ponts'}
    Recommendation 4 metadata : set()
Query meta

## BM25 model

In [None]:
# NOTE(review): this cell has no execution count ("In [None]") and `bm25_reco` is not
# used in the visible part of the notebook — presumably work in progress; confirm it
# runs on a fresh kernel before relying on it.
bm25_reco = utils.bm25_recommendations()