In [None]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import copy

# Connection string
connection_string = "postgresql://postgres.svsobttfvdpdxpiwjeqg:z36ow70ANRJB5GHa@aws-0-eu-central-1.pooler.supabase.com:6543/postgres"
engine = create_engine(connection_string)

# Load tables from the database
dive_sites = pd.read_sql("SELECT * FROM dive_site", con=engine)
user_ratings_data = pd.read_sql("SELECT * FROM dive_site_rating", con=engine)
occurrences = pd.read_sql("SELECT * FROM occurrence", con=engine)
animals = pd.read_sql("SELECT * FROM animal", con=engine)
animals_ratings = pd.read_sql("SELECT * FROM animal_rating", con=engine)
categories = pd.read_sql("SELECT * FROM dive_site_category", con=engine)
categories_per_dive_site = pd.read_sql("SELECT * FROM categories_per_dive_site", con=engine)
alembic_version = pd.read_sql("SELECT * FROM alembic_version", con=engine)

# sort the dive sites by the id
dive_sites = dive_sites.sort_values(by='id')
dive_sites

Unnamed: 0,id,title,lat,long,description,image_url,url,max_depth,region,cluster
46,1,HMS Maori,35.90250,14.51532,Max Depth: -15mt √ Shore dive √ Beginners Free...,https://d2p1cf6997m1ir.cloudfront.net/media/th...,https://www.padi.com/dive-site/malta/hms-maori/,,Malta,2
47,2,USAT Liberty Shipwreck,-8.27396,115.59307,This is probably the most famous dive site in ...,https://d2p1cf6997m1ir.cloudfront.net/media/th...,https://www.padi.com/dive-site/indonesia/usat-...,,Indonesia,7
48,3,Ped,-8.67438,115.51499,Ped dive site consists of a fairly wide and sh...,https://d2p1cf6997m1ir.cloudfront.net/media/th...,https://www.padi.com/dive-site/indonesia/ped/,,Indonesia,1
49,4,Manta Point,-8.79547,115.52553,One of the most famous dive sites in the Bali ...,https://d2p1cf6997m1ir.cloudfront.net/media/th...,https://www.padi.com/dive-site/indonesia/manta...,65 feet / 20 meters,Indonesia,2
42,5,THE HOLE / GREEN BAY CAVES,34.99989,34.06868,Shore entry leading over reefs to a series of ...,https://d2p1cf6997m1ir.cloudfront.net/media/th...,https://www.padi.com/dive-site/cyprus/the-hole...,,Cyprus,1
...,...,...,...,...,...,...,...,...,...,...
4390,4391,"Lekuan 1, 2, 3",1.59822,124.76752,One of the most popular dive sites on Bunaken ...,https://d2p1cf6997m1ir.cloudfront.net/media/th...,https://www.padi.com/dive-site/indonesia/lekua...,,Indonesia,7
4391,4392,Moc-Che,20.64070,-87.04884,,https://d2p1cf6997m1ir.cloudfront.net/media/th...,https://www.padi.com/dive-site/mexico/moc-che/,,Mexico,2
4392,4393,Fish Market,20.81187,-86.88260,"Shallow reef, FULL of fish. 30 ft maximum dept...",https://d2p1cf6997m1ir.cloudfront.net/media/th...,https://www.padi.com/dive-site/mexico/fish-mar...,,Mexico,2
4393,4394,The Zenobia Wreck,34.88500,33.74000,The Zenobia Wreck is one of the top ten wreck ...,https://d2p1cf6997m1ir.cloudfront.net/media/th...,https://www.padi.com/dive-site/cyprus/the-zeno...,137 feet / 42 meters,Cyprus,6


In [53]:
# Query to list all tables in the 'public' schema
query = """
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'public';
"""

# Load the table names into a DataFrame
tables = pd.read_sql(query, con=engine)

tables

Unnamed: 0,table_name
0,user_vectors
1,item_vectors
2,categories_per_dive_site
3,dive_site
4,alembic_version
5,dive_site_category
6,occurrence
7,animal_rating
8,animal
9,user


In [3]:
occurrences

Unnamed: 0,id,dive_site_id,animal_id
0,1,1,1
1,2,1,2
2,3,1,3
3,4,1,4
4,5,1,5
...,...,...,...
21803,21856,4395,51
21804,21857,4395,92
21805,21858,4395,2
21806,21859,4395,37


In [4]:
categories_per_dive_site

Unnamed: 0,dive_site_id,dive_site_category_id
0,1,1
1,1,2
2,2,1
3,2,3
4,2,4
...,...,...
8553,4392,2
8554,4393,2
8555,4394,1
8556,4394,10


In [5]:
categories

Unnamed: 0,id,name,image_url
0,20,Fjord,https://www.divingsquad.com/wp-content/uploads...
1,18,Muck,https://murexresorts.com/wp-content/uploads/20...
2,19,Sea Loch,https://meanderapparel.com/cdn/shop/articles/r...
3,17,Spring,https://imgds360live.s3.amazonaws.com/storefro...
4,16,Pool,https://encrypted-tbn0.gstatic.com/images?q=tb...
5,15,Archaeological,https://marineprotectedareas.noaa.gov/toolkit/...
6,14,Quarry,https://images.downeast.com/wp-content/uploads...
7,13,Pinnacle,https://go2similan.com/wp-content/uploads/2021...
8,12,Cavern,https://aquaworld.com.mx/uploads/0000/1/2023/0...
9,11,River,https://www.macssports.com/wp-content/uploads/...


In [6]:
animals

Unnamed: 0,id,name,image_url
0,149,Spider Crab,https://i.ytimg.com/vi/knh7lQFWnnw/hqdefault.jpg
1,325,Ribbon Eel,https://i0.wp.com/www.australiangeographic.com...
2,83,Chondrichthyes,https://cdn.oceanographicmagazine.com/wp-conte...
3,121,Jellyfish,https://s3.eu-west-1.amazonaws.com/media.mcsuk...
4,1,Moray Eel,https://blog.mares.com/wp-content/uploads/2017...
...,...,...,...
495,498,Heteractis Anemones,https://as1.ftcdn.net/v2/jpg/01/65/57/74/1000_...
496,499,Rocketfishes,https://www.montereybayaquarium.org/globalasse...
497,500,Serpent Eel,https://upload.wikimedia.org/wikipedia/commons...
498,254,Tree Dorids,https://encrypted-tbn0.gstatic.com/images?q=tb...


In [7]:
# Analyse if animal names are unique
animal_names = animals['name'].values
unique_animal_names = np.unique(animal_names)
print("Number of animals: ", len(animal_names))
print("Number of unique animal names: ", len(unique_animal_names))


Number of animals:  500
Number of unique animal names:  500


In [8]:
# Analyze categotires_per_dive_site: How many dive sites have no categories? 3/4392
# check if every dive site id appears in the categories_per_dive_site table
dive_sites['id'].isin(categories_per_dive_site['dive_site_id']).value_counts()

id
True     4392
False       3
Name: count, dtype: int64

In [9]:
# how many dive sites have no description? -> 12,7 % of the dive sites have no description
dive_sites['description'].isnull().sum() / len(dive_sites)

#how many dive sites have no max_depth? -> 50 % of the dive sites have no max_depth
dive_sites['max_depth'].isnull().sum() / len(dive_sites)

np.float64(0.48282138794084184)

In [10]:
# get the max and min of latitude and longitude
dive_sites[['lat', 'long']].describe()

Unnamed: 0,lat,long
count,4395.0,4395.0
mean,19.856706,20.518053
std,22.578385,79.242567
min,-45.77355,-169.93568
25%,8.567945,-59.63482
50%,20.45098,14.28576
75%,38.009065,99.811555
max,69.639,179.93404


We need a vector-like representation for each dive-site:
- geodata latitude, longitude
- category
- animals


Create a vector like representation for each dive site.

In [88]:
# This needs to be stored in the database

# let's generate a copy for the current dive sites dataframe
converted_dive_sites = copy.deepcopy(dive_sites)    

# for each category, we generate a new column (indicator if category is present in the list of genres)
for cat_id in categories['id']:
    # create a new column for the current category
    category_name = categories.loc[categories['id'] == cat_id, 'name'].values[0]
    converted_dive_sites[category_name] = 0

    # iterate over all rows
    for index, row in converted_dive_sites.iterrows():
        # get a list of all dive_site_category_ids for the current dive_site_id
        list_of_cat_ids = list(categories_per_dive_site[categories_per_dive_site['dive_site_id'] == row['id']]['dive_site_category_id'])
        # check if the current cat_id in the list of categories for the current dive_site_id
        if cat_id in list_of_cat_ids:
            converted_dive_sites.at[index, category_name] = 1

# Scale lat and long to be between 0 and 1 with a MinMaxScaler
scaler = MinMaxScaler()
converted_dive_sites[['lat_scaled', 'long_scaled']] = scaler.fit_transform(converted_dive_sites[['lat', 'long']])


# Initialize animal feature columns in a single operation
animal_columns = {animal_name: 0 for animal_name in animals['name']}
converted_dive_sites = pd.concat(
    [converted_dive_sites, pd.DataFrame(animal_columns, index=converted_dive_sites.index)],
    axis=1
)


# Populate the animal feature columns
for index, row in converted_dive_sites.iterrows():
    dive_site_id = row['id']
    animal_ids = occurrences[occurrences['dive_site_id'] == dive_site_id]['animal_id'].values   # get all animal_ids for the current dive_site_id
    for animal_id in animal_ids:
        animal_name = animals[animals['id'] == animal_id]['name'].values[0]
        converted_dive_sites.at[index, animal_name] = 1

# START
#  The following is just for the examples in the end, delete if not needed:

# add a new column 'occurences' to the converted_dive_sites dataframe
converted_dive_sites['occurences'] = ''
for index, row in converted_dive_sites.iterrows():
    dive_site_id = row['id']
    # get all animal names for the current dive_site_id
    animal_ids = occurrences[occurrences['dive_site_id'] == dive_site_id]['animal_id'].values
    animal_names = []
    for animal_id in animal_ids:
        animal_name = animals[animals['id'] == animal_id]['name'].values[0]
        animal_names.append(animal_name)

    animal_names = ', '.join(animal_names)
    converted_dive_sites.at[index, 'occurences'] = animal_names

# add a new column 'categories' to the converted_dive_sites dataframe
converted_dive_sites['categories'] = ''
for index, row in converted_dive_sites.iterrows():
    dive_site_id = row['id']
    # get all category names for the current dive_site_id
    category_ids = categories_per_dive_site[categories_per_dive_site['dive_site_id'] == dive_site_id]['dive_site_category_id'].values
    category_names = []
    for category_id in category_ids:
        category_name = categories[categories['id'] == category_id]['name'].values[0]
        category_names.append(category_name)

    category_names = ', '.join(category_names)
    converted_dive_sites.at[index, 'categories'] = category_names


# END


# sort converted_dive_sites by id 
# drop the label index
converted_dive_sites = converted_dive_sites.sort_values(by='id').reset_index(drop=True)

converted_dive_sites

NotImplementedError: 

In [12]:
# Quick Test

# Dive site 1 has the following animals
animal_ids_in_dive_site_1 = occurrences[occurrences['dive_site_id'] == 1]["animal_id"].tolist()

# Get the names of the animals in animal_ids_in_dive_site_1
animal_names_in_dive_site_1 = []
for animal_id in animal_ids_in_dive_site_1:
    animal_name = animals[animals['id'] == animal_id]['name'].values[0]
    animal_names_in_dive_site_1.append(animal_name)

print(animal_names_in_dive_site_1)

# Check if the columns with an animal_name included in animal_names_in_dive_site_1 are set to 1 - the rest should be 0
assert converted_dive_sites.loc[converted_dive_sites['id'] == 1, animal_names_in_dive_site_1].values.all() == 1
assert converted_dive_sites.loc[converted_dive_sites['id'] == 1, ~converted_dive_sites.columns.isin(animal_names_in_dive_site_1)].values.all() == 0

converted_dive_sites.loc[converted_dive_sites['id'] == 1, animal_names_in_dive_site_1]

['Moray Eel', 'Seahorse Family', 'Flounder', 'Eagle Ray', 'Sea Anemone']


Unnamed: 0,Moray Eel,Seahorse Family,Flounder,Eagle Ray,Sea Anemone
0,1,1,1,1,1


Get Item Profiles

In [13]:
# EUCLIDEAN SIMILARITY (for geodata)
# Cosine similarity is not suitable for geocoordinates. We can use Euclidean similarity instead.

def get_euclidean_similarity(v1, v2, max_distance=1):
    """
    Compute similarity between scaled geocoordinates using Euclidean distance.
    """
    lat1, lon1 = v1
    lat2, lon2 = v2
    distance = np.sqrt((lat1 - lat2)**2 + (lon1 - lon2)**2)
    similarity = max(0, 1 - distance / max_distance)
    return similarity


In [14]:
# Cosine Similarity

def get_cosine_similarity(x, y):
    
    numerator = np.dot(x,y)
    denominator = np.linalg.norm(x) * np.linalg.norm(y)

    # sanity check: x and y must be non-zero vectors
    if denominator > 0:
        sim = numerator / denominator
    else:
        raise Exception("The cosine similarity is not defined for vectors containing only zeros!")

    return sim

In [15]:
get_cosine_similarity((0.33776, 0.180727), (0.790692, 0.31076))

np.float64(0.9931825125192547)

In [16]:
get_euclidean_similarity((0.33776, 0.180727), (0.790692, 0.31076))

np.float64(0.5287718411289495)

In [17]:
def get_recommendations_for_a_dive_site(dive_site_id, w_cat=1/3, w_geo=1/3, w_animal=1/3, n=10):
    """
    This function generates a recommendation based on the category & geodata of the input dive site.

    w_cat: weight for the category vector
    w_geo: weight for the geodata (lat_scaled, long_scaled) vector 
    """
    print(f"Generating recommendations for dive site with ID {dive_site_id}...")

    idx = dive_site_id-1 # index of the query dive site in the DataFrame


    # Query Dive Site: Get Feature Vectors
    # Category vector
    query_categories_vector = converted_dive_sites.loc[idx, categories['name']].to_numpy() 
    # Geodata vector
    query_geodata_vector = converted_dive_sites.loc[idx, ['lat_scaled', 'long_scaled']].to_numpy()
    # Animal vector
    query_animal_vector = converted_dive_sites.loc[idx, animals['name']].to_numpy()

    # Other Dive Sites
    
    print(f"Queried dive site index: {idx}")

    # generate recommendations
    recommendations = recommend(query_categories_vector, query_geodata_vector, query_animal_vector, w_cat, w_geo, w_animal, n)
    
    dive_sites_indexes = [d['index'] for d in recommendations]

    # return the list of titles and similarities
    recommendations_df = converted_dive_sites.loc[dive_sites_indexes, ['id', 'title', 'lat', 'long', 'occurences', 'categories']]
    recommendations_df[f'Similarity to dive site {dive_site_id}'] = [d['combined'] for d in recommendations]
    recommendations_df[f'Category Similarity to dive site {dive_site_id}'] = [d['category'] for d in recommendations]
    recommendations_df[f'Geodata Similarity to dive site {dive_site_id}'] = [d['geodata'] for d in recommendations] 
    recommendations_df[f'Animal Similarity to dive site {dive_site_id}'] = [d['animal'] for d in recommendations]

    return recommendations_df

# EXAMPLE RECOMMENDATIONS

In [18]:
# the following examples all use dive site with id = 2

print("I asked for recommendations for dive site with ID 2:")
print(converted_dive_sites.loc[1, ['id', 'title', 'lat', 'long', 'description', 'occurences', 'categories']])

I asked for recommendations for dive site with ID 2:
id                                                             2
title                                     USAT Liberty Shipwreck
lat                                                     -8.27396
long                                                   115.59307
description    This is probably the most famous dive site in ...
occurences     Sea Turtle, Jackfish, Grouper, Dorid Nudibranc...
categories                                    Wreck, Beach, Wall
Name: 1, dtype: object


### Example 1: General Recommendation

Category, Location and Occurences are equally important.


In [85]:
example_recommendations = get_recommendations_for_a_dive_site(2, w_cat=1/3, w_geo=1/3, w_animal=1/3, n=10)

print("\nRESULT")
example_recommendations


Generating recommendations for dive site with ID 2...
Queried dive site index: 1
Iterate over all dive sites...
 4394 / 4395
RESULT


Unnamed: 0,id,title,lat,long,occurences,categories,Similarity to dive site 2,Category Similarity to dive site 2,Geodata Similarity to dive site 2,Animal Similarity to dive site 2
1,2,USAT Liberty Shipwreck,-8.27396,115.59307,"Sea Turtle, Jackfish, Grouper, Dorid Nudibranc...","Wreck, Beach, Wall",1.0,1.0,1.0,1.0
4376,4377,USAT Liberty Shipwreck,-8.27396,115.59307,"Mackerel, Cardinalfish, Butterflyfish, Gobies,...","Wreck, Beach, Wall",1.0,1.0,1.0,1.0
2858,2859,Indonesia Bali Tulamben USS Liberty Wreck,-8.27397,115.59312,"Baracuda, Moray Eel, Saddleback Fish, Angelfis...","Wreck, Beach, Reef, Wall",0.955342,0.866025,1.0,1.0
3247,3248,自由号沉船,-8.27299,115.59239,"Surgeonfish, Sea Turtle, Trumpetfish, Baracuda...","Wreck, Beach, Ocean",0.888886,0.666667,0.999991,1.0
1069,1070,Drop Off,-8.27779,115.59651,"Saber-Toothed Blennies, Tigerfish, Butterflyfi...","Beach, Reef, Wall",0.888877,0.666667,0.999965,1.0
5,6,Tulamben,-8.27341,115.59235,"Jackfish, Goatfish, Cuttlefish, Seahorse Famil...",Wreck,0.855123,0.57735,0.999995,0.988024
4380,4381,Tulamben,-8.27341,115.59235,"Seahorse Family, Stingray, Cuttlefish, Scorpio...",Wreck,0.855123,0.57735,0.999995,0.988024
1957,1958,Jemeluk Wall,-8.33771,115.66067,"Stingray, Lionfish, Stonefish, Sea Turtle, Dam...","Beach, Wall",0.730521,0.816497,0.999415,0.375653
4151,4152,Wreck Point,13.52245,120.9847,"Snaper, Hawkfish, Baracuda, Tigerfish, Moray E...","Wreck, Drift, Reef, Wall",0.698237,0.57735,0.810516,0.706845
1962,1963,Gili Meno Wall,-8.34323,116.04936,"Stingray, Eagle Ray, Boxfish, Damselfish, Scor...",Wall,0.691787,0.57735,0.998564,0.499445


### Example 2: Only look at geographically close dive sites to id=2. Ignore animals and categories

In [81]:
example_recommendations = get_recommendations_for_a_dive_site(2, w_cat=0, w_geo=1, w_animal=0, n=10)

print("\nRESULT")
example_recommendations


Generating recommendations for dive site with ID 2...
Queried dive site index: 1
Iterate over all dive sites...
 4394 / 4395
RESULT


Unnamed: 0,id,title,lat,long,occurences,categories,Similarity to dive site 2,Category Similarity to dive site 2,Geodata Similarity to dive site 2,Animal Similarity to dive site 2
1,2,USAT Liberty Shipwreck,-8.27396,115.59307,"Sea Turtle, Jackfish, Grouper, Dorid Nudibranc...","Wreck, Beach, Wall",1.0,,1.0,
4376,4377,USAT Liberty Shipwreck,-8.27396,115.59307,"Mackerel, Cardinalfish, Butterflyfish, Gobies,...","Wreck, Beach, Wall",1.0,,1.0,
2858,2859,Indonesia Bali Tulamben USS Liberty Wreck,-8.27397,115.59312,"Baracuda, Moray Eel, Saddleback Fish, Angelfis...","Wreck, Beach, Reef, Wall",1.0,,1.0,
5,6,Tulamben,-8.27341,115.59235,"Jackfish, Goatfish, Cuttlefish, Seahorse Famil...",Wreck,0.999995,,0.999995,
4380,4381,Tulamben,-8.27341,115.59235,"Seahorse Family, Stingray, Cuttlefish, Scorpio...",Wreck,0.999995,,0.999995,
3247,3248,自由号沉船,-8.27299,115.59239,"Surgeonfish, Sea Turtle, Trumpetfish, Baracuda...","Wreck, Beach, Ocean",0.999991,,0.999991,
1069,1070,Drop Off,-8.27779,115.59651,"Saber-Toothed Blennies, Tigerfish, Butterflyfi...","Beach, Reef, Wall",0.999965,,0.999965,
3237,3238,house reff,-8.2653,115.58853,"Baracuda, Parrotfish, Saddleback Fish, Grouper...","Beach, Reef, Sea Loch",0.999924,,0.999924,
1041,1042,"Tulamben Area (Seraya, Sidem)",-8.28477,115.60512,,Beach,0.9999,,0.9999,
1098,1099,"Kubu, Boga Wreck",-8.24955,115.58064,,"Wreck, Reef",0.999786,,0.999786,


### Example 3: Get similar dive site regardless of distance

In [80]:
# We can also insert negative weights, e.g. w_geo = -1 to punish recommendations that are close geographically (in theory, doesnt work)

# Example: Punish recommendations that are close to the query dive site
example_recommendations = get_recommendations_for_a_dive_site(2, w_cat=0.5, w_geo=0, w_animal=0.5, n=10)

print("\nRESULT")
example_recommendations


Generating recommendations for dive site with ID 2...
Queried dive site index: 1
Iterate over all dive sites...
 4394 / 4395
RESULT


Unnamed: 0,id,title,lat,long,occurences,categories,Similarity to dive site 2,Category Similarity to dive site 2,Geodata Similarity to dive site 2,Animal Similarity to dive site 2
1,2,USAT Liberty Shipwreck,-8.27396,115.59307,"Sea Turtle, Jackfish, Grouper, Dorid Nudibranc...","Wreck, Beach, Wall",1.0,1.0,,1.0
4376,4377,USAT Liberty Shipwreck,-8.27396,115.59307,"Mackerel, Cardinalfish, Butterflyfish, Gobies,...","Wreck, Beach, Wall",1.0,1.0,,1.0
2858,2859,Indonesia Bali Tulamben USS Liberty Wreck,-8.27397,115.59312,"Baracuda, Moray Eel, Saddleback Fish, Angelfis...","Wreck, Beach, Reef, Wall",0.933013,0.866025,,1.0
1069,1070,Drop Off,-8.27779,115.59651,"Saber-Toothed Blennies, Tigerfish, Butterflyfi...","Beach, Reef, Wall",0.833333,0.666667,,1.0
3247,3248,自由号沉船,-8.27299,115.59239,"Surgeonfish, Sea Turtle, Trumpetfish, Baracuda...","Wreck, Beach, Ocean",0.833333,0.666667,,1.0
5,6,Tulamben,-8.27341,115.59235,"Jackfish, Goatfish, Cuttlefish, Seahorse Famil...",Wreck,0.782687,0.57735,,0.988024
4380,4381,Tulamben,-8.27341,115.59235,"Seahorse Family, Stingray, Cuttlefish, Scorpio...",Wreck,0.782687,0.57735,,0.988024
683,684,Lighthouse,28.49902,34.51988,"Electric Ray, Snaper, Ghostpipefish, Stonefish...","Beach, Reef, Wall",0.654268,0.666667,,0.64187
685,686,Mashraba,28.4952,34.51702,"Goatfish, Leptastrea Fish, Damselfish, Moray E...","Beach, Reef, Wall",0.650112,0.666667,,0.633556
4151,4152,Wreck Point,13.52245,120.9847,"Snaper, Hawkfish, Baracuda, Tigerfish, Moray E...","Wreck, Drift, Reef, Wall",0.642098,0.57735,,0.706845


### Example 4: Find dive sites with a similar profile, prioritise result that are far away

In [79]:

# Example: Punish recommendations that are close to the query dive site
# CAUTION! Summed weights should be 1
example_recommendations = get_recommendations_for_a_dive_site(2, w_cat=0.75, w_geo=-0.5, w_animal=0.75, n=10)

print("\nRESULT")
example_recommendations


Generating recommendations for dive site with ID 2...
Queried dive site index: 1
Iterate over all dive sites...
 4394 / 4395
RESULT


Unnamed: 0,id,title,lat,long,occurences,categories,Similarity to dive site 2,Category Similarity to dive site 2,Geodata Similarity to dive site 2,Animal Similarity to dive site 2
1,2,USAT Liberty Shipwreck,-8.27396,115.59307,"Sea Turtle, Jackfish, Grouper, Dorid Nudibranc...","Wreck, Beach, Wall",1.0,1.0,1.0,1.0
4376,4377,USAT Liberty Shipwreck,-8.27396,115.59307,"Mackerel, Cardinalfish, Butterflyfish, Gobies,...","Wreck, Beach, Wall",1.0,1.0,1.0,1.0
2858,2859,Indonesia Bali Tulamben USS Liberty Wreck,-8.27397,115.59312,"Baracuda, Moray Eel, Saddleback Fish, Angelfis...","Wreck, Beach, Reef, Wall",0.899519,0.866025,1.0,1.0
1069,1070,Drop Off,-8.27779,115.59651,"Saber-Toothed Blennies, Tigerfish, Butterflyfi...","Beach, Reef, Wall",0.750017,0.666667,0.999965,1.0
3247,3248,自由号沉船,-8.27299,115.59239,"Surgeonfish, Sea Turtle, Trumpetfish, Baracuda...","Wreck, Beach, Ocean",0.750004,0.666667,0.999991,1.0
3577,3578,Front Porch,12.16451,-68.28722,"Pufferfish, Brain Coral, Serpulid Worms, Ircin...","Wreck, Beach, Reef",0.684258,0.666667,0.445399,0.54261
683,684,Lighthouse,28.49902,34.51988,"Electric Ray, Snaper, Ghostpipefish, Stonefish...","Beach, Reef, Wall",0.67839,0.666667,0.606025,0.64187
5,6,Tulamben,-8.27341,115.59235,"Jackfish, Goatfish, Cuttlefish, Seahorse Famil...",Wreck,0.674033,0.57735,0.999995,0.988024
4380,4381,Tulamben,-8.27341,115.59235,"Seahorse Family, Stingray, Cuttlefish, Scorpio...",Wreck,0.674033,0.57735,0.999995,0.988024
668,669,USCGC Bibb,25.004902,-80.373159,"Angelfish, Requiem Sharks, Eagle Ray, Snaper, ...",Wreck,0.672529,0.57735,0.370024,0.566038


# ADDING USER PROFILE


In [58]:
user = pd.read_sql('SELECT * FROM "user"', con=engine)

#user

Unnamed: 0,name,email,id
0,Steffen Kirchhoff,steffen.kirchhoff99@gmx.de,152e8e72-c295-4a60-9249-63d6aba3be23
1,Dominik Eitner,dominikeitner@gmail.com,1b4839eb-7c8a-4598-9ce6-a4e2cb9e8f8f
2,Veronica Thompson,crystalrogers@example.com,f8f0a8f5-599a-4874-bef5-a0d33b1f6467
3,Melissa Hoover,harriselizabeth@example.com,975185b7-0919-4726-b132-c83e95010e53
4,Karen Allen,tammyhuff@example.org,9701fe9f-e2c5-4ab4-bcd9-ee1c38fc2c71
...,...,...,...
397,Tracy Haas,etucker@example.org,5bad2a20-80f8-4706-b284-599433d3e456
398,Marie Herrera,elizabethtaylor@example.com,02cb64d2-9a73-4b18-bf26-9648a4516d4f
399,Philip Stuart,amanda39@example.com,df7c9f90-78a0-49b0-abdb-af10595b6e86
400,Robert Middleton,jennifer16@example.org,4b62f0d4-6ae2-4e91-aae9-0d01fa5862cb


In [60]:
# TODO: Can be deleted as soon as lat and long is added to the user database

# Add lat and long columns to user dataframe

user['user_lat'] = np.random.uniform(-90, 90, len(user))
user['user_long'] = np.random.uniform(-180, 180, len(user))

user


Unnamed: 0,name,email,id,user_lat,user_long
0,Steffen Kirchhoff,steffen.kirchhoff99@gmx.de,152e8e72-c295-4a60-9249-63d6aba3be23,-53.697738,136.691290
1,Dominik Eitner,dominikeitner@gmail.com,1b4839eb-7c8a-4598-9ce6-a4e2cb9e8f8f,77.878758,115.675834
2,Veronica Thompson,crystalrogers@example.com,f8f0a8f5-599a-4874-bef5-a0d33b1f6467,-1.759962,56.150637
3,Melissa Hoover,harriselizabeth@example.com,975185b7-0919-4726-b132-c83e95010e53,-80.817103,27.331433
4,Karen Allen,tammyhuff@example.org,9701fe9f-e2c5-4ab4-bcd9-ee1c38fc2c71,-60.219738,-65.977175
...,...,...,...,...,...
397,Tracy Haas,etucker@example.org,5bad2a20-80f8-4706-b284-599433d3e456,21.386186,60.731803
398,Marie Herrera,elizabethtaylor@example.com,02cb64d2-9a73-4b18-bf26-9648a4516d4f,68.515758,-172.603559
399,Philip Stuart,amanda39@example.com,df7c9f90-78a0-49b0-abdb-af10595b6e86,-85.743940,38.104181
400,Robert Middleton,jennifer16@example.org,4b62f0d4-6ae2-4e91-aae9-0d01fa5862cb,18.440468,-176.054865


In [63]:

user_ratings_data = pd.read_sql('SELECT * FROM dive_site_rating', con=engine)

user_ratings_data

Unnamed: 0,id,dive_site_id,rating,user_id
0,1,3071,4.0,152e8e72-c295-4a60-9249-63d6aba3be23
1,4,2845,2.0,152e8e72-c295-4a60-9249-63d6aba3be23
2,5,3839,5.0,152e8e72-c295-4a60-9249-63d6aba3be23
3,6,4079,3.0,f8f0a8f5-599a-4874-bef5-a0d33b1f6467
4,7,1348,4.0,f8f0a8f5-599a-4874-bef5-a0d33b1f6467
...,...,...,...,...
23213,23217,3333,5.0,152e8e72-c295-4a60-9249-63d6aba3be23
23214,23218,1034,5.0,152e8e72-c295-4a60-9249-63d6aba3be23
23215,23219,3696,5.0,152e8e72-c295-4a60-9249-63d6aba3be23
23216,23220,3101,5.0,152e8e72-c295-4a60-9249-63d6aba3be23


In [67]:
user_ratings_data['user_id'][0]

UUID('152e8e72-c295-4a60-9249-63d6aba3be23')

In [68]:
import uuid

# Define the target user_id as a UUID object
target_user_id = uuid.UUID('152e8e72-c295-4a60-9249-63d6aba3be23')

# Filter rows by user_id
user_ratings = user_ratings_data[user_ratings_data['user_id'] == target_user_id]

user_ratings


Unnamed: 0,id,dive_site_id,rating,user_id
0,1,3071,4.0,152e8e72-c295-4a60-9249-63d6aba3be23
1,4,2845,2.0,152e8e72-c295-4a60-9249-63d6aba3be23
2,5,3839,5.0,152e8e72-c295-4a60-9249-63d6aba3be23
23209,23213,3674,4.0,152e8e72-c295-4a60-9249-63d6aba3be23
23210,23214,231,1.0,152e8e72-c295-4a60-9249-63d6aba3be23
23211,23215,3581,5.0,152e8e72-c295-4a60-9249-63d6aba3be23
23212,23216,808,5.0,152e8e72-c295-4a60-9249-63d6aba3be23
23213,23217,3333,5.0,152e8e72-c295-4a60-9249-63d6aba3be23
23214,23218,1034,5.0,152e8e72-c295-4a60-9249-63d6aba3be23
23215,23219,3696,5.0,152e8e72-c295-4a60-9249-63d6aba3be23


In [70]:
# This are all the features columns we use for recommending
# it wouldnt make sense to include geodata here. If a user rates one dive site in Australia and one in America, we would calculate the mean of the geodata, which would lead us ending up in Africa or Europa - nonsense
feature_columns = categories['name'].tolist() + animals['name'].tolist()


# for the given user, extract all ratings and item profiles

def get_item_profile_of_user(user_id):
    """
    This function returns a list of item profiles and a list of ratings for the given user_id.
    """
    user_ratings = user_ratings_data[user_ratings_data['user_id'] == user_id]

    print(f"User with ID {user_id} has rated {len(user_ratings)} dive sites.")
    print(user_ratings)

    item_profiles = []
    ratings = []

    for index, row in user_ratings.iterrows():
        
        rating = row['rating']
        dive_site_id = row['dive_site_id']
        item_profile = converted_dive_sites[converted_dive_sites['id'] == dive_site_id][feature_columns].to_numpy().flatten()
        item_profiles.append(item_profile)
        ratings.append(rating)

    ratings = np.array(ratings)
    item_profiles = np.array(item_profiles)

    # Create a DataFrame for better interpretability
    item_profiles = pd.DataFrame(item_profiles, columns=feature_columns)

    return ratings, item_profiles


# TEST

# Define the target user_id as a UUID object
target_user_id = uuid.UUID('152e8e72-c295-4a60-9249-63d6aba3be23')

# get the item profiles of user with id 1
ratings, item_profiles = get_item_profile_of_user(target_user_id)

# check if the item profiles are correct
assert np.array_equal(ratings, user_ratings_data[user_ratings_data['user_id'] == target_user_id]['rating'].to_numpy())

i = 0
for row in user_ratings_data[user_ratings_data['user_id'] == target_user_id].to_numpy():
    dive_site_id = row[1] # get the dive site id
    item_profile = converted_dive_sites[converted_dive_sites['id'] == dive_site_id][feature_columns].to_numpy().flatten() # get the item profile of that id
    assert np.array_equal(item_profile, item_profiles.iloc[i].to_numpy())
    i += 1


User with ID 152e8e72-c295-4a60-9249-63d6aba3be23 has rated 12 dive sites.
          id  dive_site_id  rating                               user_id
0          1          3071     4.0  152e8e72-c295-4a60-9249-63d6aba3be23
1          4          2845     2.0  152e8e72-c295-4a60-9249-63d6aba3be23
2          5          3839     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23209  23213          3674     4.0  152e8e72-c295-4a60-9249-63d6aba3be23
23210  23214           231     1.0  152e8e72-c295-4a60-9249-63d6aba3be23
23211  23215          3581     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23212  23216           808     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23213  23217          3333     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23214  23218          1034     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23215  23219          3696     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23216  23220          3101     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23217  23221          2140     5.0  152e8e72-c295

In [72]:
def generate_user_profile(ratings, item_profiles):
    """
    This function generates a user profile based on the given ratings and item profiles.
    """
    user_profile = None

    item_profiles = item_profiles.to_numpy()

    # option 1
    # new_ratings = ratings # normal ratings

    # option 2
    #new_ratings = ratings - ratings.mean() # mean ratings

    # option 3
    new_ratings = ratings - 2.5

    for i in range(len(item_profiles)):
        
        if user_profile is None:
            user_profile = new_ratings[i] * item_profiles[i]
        else:
            user_profile += new_ratings[i] * item_profiles[i]

    user_profile = user_profile / len(item_profiles)

    user_profile = pd.DataFrame(user_profile.reshape(1, user_profile.shape[0]), columns=feature_columns)
        
    return user_profile



# TEST
ratings, item_profiles = get_item_profile_of_user(target_user_id)
user_profile = generate_user_profile(ratings, item_profiles)

ratings = ratings - 2.5
# Check if the user profile is correct. It should be the average of the item profiles weighted by the ratings
non_zero_features = user_profile.loc[:, (user_profile != 0).any(axis=0)]

expected_user_profile = np.zeros(len(feature_columns))
for i in range(len(item_profiles)):
    expected_user_profile += ratings[i] * item_profiles.iloc[i].to_numpy()

expected_user_profile = expected_user_profile / len(item_profiles)

assert np.array_equal(expected_user_profile, user_profile.to_numpy().flatten())

for column in user_profile.columns:
    print(f"{column}: {user_profile[column].values[0]}") #, Expected {expected_user_profile[feature_columns.index(column)]}")


def normalize_user_profile(user_profile):
    """
    Normalize the user profile so that the highest values become 1,
    the lowest become 0, and neutral (0) values remain 0.
    """
    user_profile_array = user_profile.to_numpy().flatten()
    
    max_val = np.max(user_profile_array)
    min_val = np.min(user_profile_array)
    
    # Avoid division by zero
    if max_val == min_val:
        return user_profile  # No meaningful scaling possible
    
    # Normalize values to [0, 1]
    normalized_array = (user_profile_array - min_val) / (max_val - min_val)
    
    # Retain 0s for unimportant features
    normalized_array[user_profile_array == 0] = 0
    
    # Convert back to DataFrame
    normalized_profile = pd.DataFrame(normalized_array.reshape(1, -1), columns=user_profile.columns)
    return normalized_profile

# TEST
normalized_user_profile = normalize_user_profile(user_profile)

# Check if the normalization is correct
assert np.all(normalized_user_profile >= 0)
assert np.all(normalized_user_profile <= 1)

for column in normalized_user_profile.columns:
    print(f"{column}: {normalized_user_profile[column].values[0]}")


def add_geodata_to_user_profile(user_profile, user_id):
    """
    This function adds the user's geodata to the user profile.
    """

    user_lat = user.loc[user['id'] == user_id, 'user_lat'].values
    user_long = user.loc[user['id'] == user_id, 'user_long'].values

    # Scale the user's geodata between 0 and 1
    lat_min = -90
    lat_max = 90

    long_min = -180
    long_max = 180

    user_lat_scaled = (user_lat - lat_min) / (lat_max - lat_min)
    user_long_scaled = (user_long - long_min) / (long_max - long_min)

    # Add the scaled geodata to the user profile 
    # TODO:
    user_profile['user_lat_scaled'] = user_lat_scaled
    user_profile['user_long_scaled'] = user_long_scaled

    return user_profile


User with ID 152e8e72-c295-4a60-9249-63d6aba3be23 has rated 12 dive sites.
          id  dive_site_id  rating                               user_id
0          1          3071     4.0  152e8e72-c295-4a60-9249-63d6aba3be23
1          4          2845     2.0  152e8e72-c295-4a60-9249-63d6aba3be23
2          5          3839     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23209  23213          3674     4.0  152e8e72-c295-4a60-9249-63d6aba3be23
23210  23214           231     1.0  152e8e72-c295-4a60-9249-63d6aba3be23
23211  23215          3581     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23212  23216           808     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23213  23217          3333     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23214  23218          1034     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23215  23219          3696     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23216  23220          3101     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23217  23221          2140     5.0  152e8e72-c295

In [73]:
def recommend(input_categories_vector, input_geodata_vector, input_animal_vector, w_cat=1/3, w_geo=1/3, w_animal=1/3, n=10):
    """
    This is a helper function used to recommend dive sites based on input vectors. These input vectors can describe a user or a dive site.     

    CAUTION:
    If we do not have any animal data for a specific dive site, but the w_animal is not 0, we will not consider this dive site for recommendations. (same for categories and geodata)    
    """

    # TODO: Add optional ignore argument if input vectors are a dive spot, ignore this dive spot in the recommendations with index x
    # Precompute dive site vectors
    # they are not ordered by dive_site_id
    dive_site_categories = converted_dive_sites[categories['name']].to_numpy() 
    dive_site_geodata = converted_dive_sites[['lat_scaled', 'long_scaled']].to_numpy()
    dive_site_animals = converted_dive_sites[animals['name']].to_numpy()


    # compute cosine similarities between the user feature vectors and all
    # dive sites in the catalog (except for the query dive site)
    similarities = []

    # iterate over all dive sites
    print("Iterate over all dive sites...")
    for i in range(len(converted_dive_sites)):

        print(f" {i} / {len(converted_dive_sites)}", end="\r")

        similiarity_dict = {}
        similiarity_dict['index'] = i  
        similiarity_dict['animal'] = None
        similiarity_dict['category'] = None
        similiarity_dict['geodata'] = None
        similiarity_dict['combined'] = None

        total_weight = 0
        combined_similarity = 0

        # Category Similarity
        if w_cat != 0:
            other_categories_vector = dive_site_categories[i]
            
            if np.count_nonzero(other_categories_vector) > 0:
                sim_cat = get_cosine_similarity(input_categories_vector, other_categories_vector)
                similiarity_dict['category'] = sim_cat
                combined_similarity += w_cat * sim_cat
                total_weight += w_cat
            else:
                continue

        # Geodata Similarity
        if w_geo != 0:
            other_geodata_vector = dive_site_geodata[i]
            if np.count_nonzero(other_geodata_vector) > 0:
                sim_geo = get_euclidean_similarity(input_geodata_vector, other_geodata_vector)
                similiarity_dict['geodata'] = sim_geo
                combined_similarity += w_geo * sim_geo
                total_weight += w_geo
            else:
                continue

        # Animal Similarity
        if w_animal != 0:
            other_animal_vector = dive_site_animals[i]
            if np.count_nonzero(other_animal_vector) > 0:
                sim_animal = get_cosine_similarity(input_animal_vector, other_animal_vector)
                similiarity_dict['animal'] = sim_animal
                combined_similarity += w_animal * sim_animal
                total_weight += w_animal
            else:
                continue

        # Normalize the similarity by total weight if any feature contributed
        if total_weight != 0:
            combined_similarity /= total_weight
            similiarity_dict['combined'] = combined_similarity
            similarities.append(similiarity_dict)
        

    # sort pairs w.r.t. combined_similarity in descending order (reverse=True)
    similarities = sorted(similarities, key=lambda x: x['combined'], reverse=True)

    # take the top n elements
    recommendations = similarities[:n]

    return recommendations

    

In [43]:
def get_recommendations_for_a_user(user_id, w_cat=1/3, w_geo=1/3, w_animal=1/3, n=10):

    """
    This function generates a content based recommendation for a specific user.
    It generates a feature vector for the "ideal dive site" which the given user would like based off his ratings.
    Using this feature vector (which includes category, geodata, animal data) it computes the distance to the dive sites given in the dataset.

    w_cat: weight for the category vector
    w_geo: weight for the geodata (lat_scaled, long_scaled) vector
    w_animal: weight for the animal vector 
    n: number of recommendations to return

    """
    print(f"Generating recommendations for the user with the ID {user_id}...")

    # get the ratings and item profiles of the user
    ratings, item_profiles = get_item_profile_of_user(user_id)

    # generate the user profile
    user_profile = generate_user_profile(ratings, item_profiles)
    # normalize the user profile
    user_profile = normalize_user_profile(user_profile)

    # add the geodata to the user profile
    user_profile = add_geodata_to_user_profile(user_profile, user_id)

    # split up the user profile into the different feature vectors: category, geodata, animal
    # Category vector
    user_categories_vector = user_profile[categories['name']].to_numpy().flatten()
    # Geodata vector
    user_geodata_vector = user_profile[['user_lat_scaled', 'user_long_scaled']].to_numpy().flatten()
    # Animal vector
    user_animal_vector = user_profile[animals['name']].to_numpy().flatten()

    # generate recommendations
    recommendations = recommend(user_categories_vector, user_geodata_vector, user_animal_vector, w_cat, w_geo, w_animal, n)
    
    dive_sites_indexes = [d['index'] for d in recommendations]

    # return the list of titles and similarities
    recommendations_df = converted_dive_sites.loc[dive_sites_indexes, ['id', 'title', 'lat', 'long', 'occurences', 'categories']]
    recommendations_df[f'Similarity to user {user_id}'] = [d['combined'] for d in recommendations]
    recommendations_df[f'Category Similarity to user {user_id}'] = [d['category'] for d in recommendations]
    recommendations_df[f'Geodata Similarity to user {user_id}'] = [d['geodata'] for d in recommendations] 
    recommendations_df[f'Animal Similarity to user {user_id}'] = [d['animal'] for d in recommendations]

    return recommendations_df

EXAMPLE RECOMMENDATIONS

In [75]:
# Find recommendations for user with id 1

ratings, item_profiles = get_item_profile_of_user(target_user_id)
user_profile = generate_user_profile(ratings, item_profiles)
user_profile = normalize_user_profile(user_profile)
user_profile = add_geodata_to_user_profile(user_profile, target_user_id)

user_profile[['user_lat_scaled', 'user_long_scaled']] 


User with ID 152e8e72-c295-4a60-9249-63d6aba3be23 has rated 12 dive sites.
          id  dive_site_id  rating                               user_id
0          1          3071     4.0  152e8e72-c295-4a60-9249-63d6aba3be23
1          4          2845     2.0  152e8e72-c295-4a60-9249-63d6aba3be23
2          5          3839     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23209  23213          3674     4.0  152e8e72-c295-4a60-9249-63d6aba3be23
23210  23214           231     1.0  152e8e72-c295-4a60-9249-63d6aba3be23
23211  23215          3581     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23212  23216           808     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23213  23217          3333     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23214  23218          1034     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23215  23219          3696     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23216  23220          3101     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23217  23221          2140     5.0  152e8e72-c295

Unnamed: 0,user_lat_scaled,user_long_scaled
0,0.201679,0.879698


In [77]:
example_recommendations = get_recommendations_for_a_user(target_user_id, w_cat=1/3, w_geo=1/3, w_animal=1/3, n=10)

example_recommendations



Generating recommendations for the user with the ID 152e8e72-c295-4a60-9249-63d6aba3be23...
User with ID 152e8e72-c295-4a60-9249-63d6aba3be23 has rated 12 dive sites.
          id  dive_site_id  rating                               user_id
0          1          3071     4.0  152e8e72-c295-4a60-9249-63d6aba3be23
1          4          2845     2.0  152e8e72-c295-4a60-9249-63d6aba3be23
2          5          3839     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23209  23213          3674     4.0  152e8e72-c295-4a60-9249-63d6aba3be23
23210  23214           231     1.0  152e8e72-c295-4a60-9249-63d6aba3be23
23211  23215          3581     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23212  23216           808     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23213  23217          3333     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23214  23218          1034     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23215  23219          3696     5.0  152e8e72-c295-4a60-9249-63d6aba3be23
23216  23220          3101    

Unnamed: 0,id,title,lat,long,occurences,categories,Similarity to user 152e8e72-c295-4a60-9249-63d6aba3be23,Category Similarity to user 152e8e72-c295-4a60-9249-63d6aba3be23,Geodata Similarity to user 152e8e72-c295-4a60-9249-63d6aba3be23,Animal Similarity to user 152e8e72-c295-4a60-9249-63d6aba3be23
2582,2583,Drum and Drumsticks,-35.04489,150.8416,"Moray Eel, Seahorse Family, Seal, Humpback Whale","Reef, Sandy bottom, Wall, Ocean",0.616321,0.663855,0.885108,0.3
4373,4374,Aruh Fanno,-0.29353,73.41174,"Grouper, Coral, Moray Eel","Reef, Wall, Ocean",0.615547,0.766554,0.733678,0.34641
2624,2625,The Long Drop,-36.82454,175.82138,"Moray Eel, Sea Anemone, Crawfish","Pinnacle, Reef, Wall, Ocean",0.615121,0.663855,0.835096,0.34641
1094,1095,Menjangan Island,-8.0966,114.51103,"Whaleshark, Moray Eel","Reef, Wall",0.603626,0.528094,0.85852,0.424264
3710,3711,Basdio Sanctuary,9.74844,124.49909,"Nudibranch, Moray Eel","Drift, Reef, Wall, Ocean",0.602044,0.663855,0.718014,0.424264
2609,2610,Spectacle Reef,-36.82821,175.82365,"Stingray, Moray Eel, Crawfish, Sponges","Pinnacle, Reef, Wall, Ocean",0.599641,0.663855,0.835068,0.3
3673,3674,Pontinha deep,16.58343,-22.94435,Moray Eel,"Reef, Wall, Ocean",0.59857,0.766554,0.429156,0.6
43,44,Henry Head,-33.99853,151.23762,"Cockatoo Cuttlefish, Berycids, Basket Star, Po...",Ocean,0.598434,0.580873,0.893246,0.321182
4100,4101,Latitude Rock,-32.20896,152.56634,"Scorpionfish, Sea Turtle, Carpet Sharks, Moray...","Reef, Ocean",0.592252,0.625889,0.905917,0.244949
3474,3475,Karacaoren Reef,36.53893,29.05307,Moray Eel,"Reef, Wall, Ocean",0.589312,0.766554,0.401382,0.6


In [306]:

# Check if the recommendations are correct
# get the item profiles of user with id 1
ratings, item_profiles = get_item_profile_of_user(1)

# generate the user profile
user_profile = generate_user_profile(ratings, item_profiles)
user_profile = normalize_user_profile(user_profile)

user_profile[['lat_scaled', 'long_scaled']] = user.loc[user['id'] == 1, ['user_lat_scaled', 'user_long_scaled']].to_numpy().flatten()

# get values out of the user profile that are not 0
non_zero_features = user_profile.loc[:, (user_profile != 0).any(axis=0)]
non_zero_features



User with ID 1 has rated 13 dive sites.
      user_id  dive_site_id  rating
1228        1          1134       3
1229        1          1125       3
1230        1          4190       2
1231        1            87       3
1232        1          1130       3
1233        1          1136       2
1234        1          3077       2
1235        1          4056       3
1236        1          3076       4
1237        1            39       4
1238        1          4186       3
1239        1          4191       3
1240        1          2968       3


Unnamed: 0,Archaeological,Drift,Wall,Beach,Reef,Wreck,lat_scaled,long_scaled,Moray Eel,Trumpetfish Pipefish,...,Pufferfish,Frogfish,Squid,Drums,Lobster,Cusk Eels,Nudibranch,Round Stingrays,Seahorse,Yellowtail
0,0.271312,0.271312,0.406967,0.406967,0.94959,0.678279,0.542419,0.688286,0.271312,0.271312,...,0.406967,0.406967,0.406967,0.406967,0.813935,0.678279,0.542623,0.406967,0.542623,0.271312


In [307]:
# This used to be recommended for user_id=1. It the issue that zero values are taking too much

# get the item profile of the dive site with index 4348
dive_site_4348 = converted_dive_sites.loc[2300]

# convert the Series to a DataFrame
dive_site_4348 = dive_site_4348.to_frame().T

# filter out columns with all zero values
dive_site_4348 = dive_site_4348.loc[:, (dive_site_4348 != 0).any(axis=0)]

dive_site_4348

Unnamed: 0,id,title,lat,long,description,image_url,url,max_depth,region,cluster,...,Squid,Pistol Shrimp,Lobster,Pipefish,Whaleshark,Crown-Of-Thorns Starfish,Elapids,Seahorse,occurences,categories
2300,2301,Koh Ha Lagoon,7.42942,98.89539,A beautiful site - perfect for both diving and...,https://d2p1cf6997m1ir.cloudfront.net/media/th...,https://www.padi.com/dive-site/thailand/koh-ha...,,Thailand,7,...,1,1,1,1,1,1,1,1,"Squid, Frogfish, Crown-Of-Thorns Starfish, Gho...",Reef
