In [7]:
import pandas as pd
import ast

def compute_average_ratings(user_ratings_df):
    """
    Compute the average rating for each dive site.
    """
    avg_ratings = user_ratings_df.groupby("dive_site_id")["rating"].mean().reset_index()
    avg_ratings.columns = ["dive_site_id", "average_rating"]
    return avg_ratings


def build_user_profile(user_id, user_ratings_df, dive_sites_df):
    """
    Builds a user profile indicating the likelihood of the user liking each cluster.
    """
    # Merge ratings with dive sites to access clusters
    merged_df = pd.merge(user_ratings_df, dive_sites_df, left_on="dive_site_id", right_on="id")
    
    # Filter user's ratings
    user_data = merged_df[merged_df["user_id"] == user_id]
    
    # Calculate average rating per cluster
    cluster_preferences = user_data.groupby("cluster_x")["rating"].mean().reset_index()
    cluster_preferences.columns = ["cluster", "preference_score"]
    
    # Normalize preference scores (optional for consistent scaling)
    cluster_preferences["preference_score"] /= cluster_preferences["preference_score"].sum()
    
    return cluster_preferences.sort_values(by="preference_score", ascending=False)

import numpy as np

def recommend_from_clusters(user_id, cluster_preferences, user_ratings_df, dive_sites_df, avg_ratings, top_n=10):
    """
    Recommends dive sites based on the user's cluster preferences.
    """
    import numpy as np

    # Get dive sites the user has already rated
    rated_sites = user_ratings_df[user_ratings_df["user_id"] == user_id]["dive_site_id"].values
    
    # Filter dive sites the user hasn't rated
    unrated_sites = dive_sites_df[~dive_sites_df["id"].isin(rated_sites)]
    
    # Merge unrated sites with average ratings
    unrated_sites = pd.merge(unrated_sites, avg_ratings, left_on="id", right_on="dive_site_id", how="left")
    
    # Calculate number of recommendations per cluster
    cluster_preferences["num_recommendations"] = (
        (cluster_preferences["preference_score"] * top_n).round().astype(int)
    )
    
    recommendations = []
    for _, row in cluster_preferences.iterrows():
        cluster = row["cluster"]
        num_recommendations = row["num_recommendations"]
        
        # Get dive sites from the current cluster
        cluster_sites = unrated_sites[unrated_sites["cluster"] == cluster]
        
        # Sort sites by average rating
        cluster_sites = cluster_sites.sort_values(by="average_rating", ascending=False)
        
        # Add the top sites from the cluster to recommendations
        recommendations.extend(cluster_sites.head(int(num_recommendations)).to_dict("records"))
    
    # Shuffle the recommendations to avoid cluster grouping in the result
    np.random.shuffle(recommendations)
    
    # Return the top N recommendations
    return recommendations[:top_n]

def is_cluster_in_preferred_clusters(cluster, preferred_clusters):
    """
    Checks if a cluster is in the list of preferred clusters.
    
    Parameters:
    cluster (int): The recommended cluster.
    preferred_clusters (list): List of preferred clusters.
    
    Returns:
    bool:True if the cluster exists in the preferred clusters, otherwise False.
    """
    return cluster in ast.literal_eval(preferred_clusters)

In [10]:
# Load data
user_ratings_df = pd.read_csv("../user_ratings_data.csv")
dive_sites_df = pd.read_csv("../dive_sites.csv")  
features = pd.read_csv('../preferences.csv')

# Compute average ratings
avg_ratings = compute_average_ratings(user_ratings_df)

i = int(input("Enter the user ID to calculate recommendations for: "))
    
ft = features[features['user_id']==i]

# Build user profile
cluster_preferences = build_user_profile(i, user_ratings_df, dive_sites_df)

# Generate recommendations
recommendations = recommend_from_clusters(i, cluster_preferences, user_ratings_df, dive_sites_df, avg_ratings, top_n=10)
    
print(ft)
print(pd.DataFrame(recommendations[:10]))

   Unnamed: 0  user_id           preferred_regions preferred_animals  \
1           1        2  ['South Korea', 'Denmark']        [414, 223]   

     preferred_types preferred_clusters  
1  ['Pool', 'River']             [0, 7]  
   Unnamed: 0    id                     title       lat      long  \
0        2611  2606      Loch Low-Minn Quarry  35.48219 -84.50952   
1        2319  2313               La Catedral  38.95109   1.52769   
2        1961  1961          Kraken Freighter  28.44390 -94.28613   
3        1486  1470     El Arco de San Andrés  36.99124  -1.88652   
4        3503  3493             PIEDRAS ALTAS  36.71949  -3.73396   
5        1108  1083  Grayton Beach State Park  30.30769 -86.15891   
6        3085  3074                Hog Heaven  26.13500 -80.07900   
7        4299  4297                 3 - ROCKS  36.75074  26.99066   
8         138   111  Reserva Marina del Cavet  41.06369   1.07846   
9        3024  3015        Sea Hawk Dive Boat  40.63768 -73.58187   

           

## Region Basierte Ausgabe

In [93]:
import pandas as pd
import numpy as np

def compute_average_ratings(user_ratings_df):
    """
    Compute the weighted average rating for each dive site, incorporating the number of reviews.
    """
    # Count the number of reviews for each dive site
    review_counts = user_ratings_df.groupby("dive_site_id")["rating"].count().reset_index()
    review_counts.columns = ["dive_site_id", "review_count"]
    
    # Compute the average rating
    avg_ratings = user_ratings_df.groupby("dive_site_id")["rating"].mean().reset_index()
    avg_ratings.columns = ["dive_site_id", "average_rating"]
    
    # Merge review counts with average ratings
    avg_ratings = pd.merge(avg_ratings, review_counts, on="dive_site_id")
    
    # Normalize review counts to a reasonable range (optional)
    avg_ratings["review_weight"] = avg_ratings["review_count"] / avg_ratings["review_count"].max()
    
    # Apply weight to the average rating
    avg_ratings["weighted_rating"] = avg_ratings["average_rating"] * avg_ratings["review_weight"]
    
    return avg_ratings


def build_user_profile(user_id, user_ratings_df, dive_sites_df):
    """
    Builds a user profile indicating the likelihood of the user liking each cluster.
    """
    # Merge ratings with dive sites to access clusters
    merged_df = pd.merge(user_ratings_df, dive_sites_df, left_on="dive_site_id", right_on="id")
    
    # Filter user's ratings
    user_data = merged_df[merged_df["user_id"] == user_id]
    
        # Calculate average rating per cluster
    cluster_preferences = user_data.groupby("cluster_x")["rating"].mean().reset_index()
    cluster_preferences.columns = ["cluster", "preference_score"]
    
    # Normalize preference scores (optional for consistent scaling)
    cluster_preferences["preference_score"] /= cluster_preferences["preference_score"].sum()
    
    return cluster_preferences.sort_values(by="preference_score", ascending=False)


def recommend_from_clusters_site(user_id, cluster_preferences, user_ratings_df, dive_sites_df, avg_ratings, top_n=10, top_n_preferences=1):
    """
    Recommends dive sites based on the user's cluster preferences.
    """
    import numpy as np

    # Get dive sites the user has already rated
    rated_sites = user_ratings_df[user_ratings_df["user_id"] == user_id]["dive_site_id"].values
    
    # Filter dive sites the user hasn't rated
    unrated_sites = dive_sites_df[~dive_sites_df["id"].isin(rated_sites)]
    
    # Merge unrated sites with average ratings (use weighted ratings)
    unrated_sites = pd.merge(unrated_sites, avg_ratings[["dive_site_id", "weighted_rating"]], 
                         left_on="id", right_on="dive_site_id", how="left")
    
    cluster_preferences = cluster_preferences.head(top_n_preferences)
    
    # Calculate number of recommendations per cluster
    cluster_preferences["num_recommendations"] = (
        (cluster_preferences["preference_score"] * top_n ).round().astype(int)
    )
    
    recommendations = []
    for _, row in cluster_preferences.iterrows():
        cluster = row["cluster"]
        num_recommendations = row["num_recommendations"]
        
        # Get dive sites from the current cluster
        cluster_sites = unrated_sites[unrated_sites["cluster"] == cluster]
        
        # Sort sites by average rating
        cluster_sites = cluster_sites.sort_values(by="weighted_rating", ascending=False)
        
        # Add the top sites from the cluster to recommendations
        recommendations.extend(cluster_sites.head(int(num_recommendations)).to_dict("records"))
    
    # Shuffle the recommendations to avoid cluster grouping in the result
    np.random.shuffle(recommendations)
    
    # Return the top N recommendations
    return recommendations[:top_n]

import pandas as pd
import random

def recommend_regions_with_sites(recommendations, dive_sites_df, top_n_regions=3, max_sites_per_region=3, min_sites_per_region=15):
    """
    Groups recommendations by region, introduces randomness to the region order, 
    and selects top regions with dive site recommendations, preferring lower indices.
    Enforces a minimum number of dive sites per region.
    """
    # Convert recommendations to DataFrame
    recommendations_df = pd.DataFrame(recommendations)

    # Group recommendations by region
    region_groups = recommendations_df.groupby("region")
    
    # Filter regions with at least the minimum number of dive sites
    valid_regions = dive_sites_df.groupby("region").filter(lambda x: len(x) >= min_sites_per_region)["region"].unique()
    recommendations_df = recommendations_df[recommendations_df["region"].isin(valid_regions)]

    # Recalculate region scores (e.g., sum of site ratings in the region)
    region_scores = recommendations_df.groupby("region")["weighted_rating"].count().reset_index()
    region_scores.columns = ["region", "region_score"]
    
    # Sort regions by score, introduce randomness in sorting for ties
    region_scores["random_tiebreaker"] = [random.random() for _ in range(len(region_scores))]
    top_regions = (
        region_scores.sort_values(by=["region_score", "random_tiebreaker"], ascending=[False, True])
        .head(top_n_regions)
    )

    # Prepare region recommendations
    region_recommendations = []
    for region in top_regions["region"]:
        # Get top dive sites in the region, prioritize lower indices
        region_sites = recommendations_df[recommendations_df["region"] == region] \
                       .sort_values(by=["weighted_rating", "id"], ascending=[False, True]) \
                       .head(max_sites_per_region)
        region_recommendations.append({
            "region": region,
            "top_sites": region_sites[["id", "title", "name", "cluster", "weighted_rating"]]
        })

    # Shuffle the region recommendations for further randomness
    region_recommendations
    
    return region_recommendations



In [98]:
# Load data
user_ratings_df = pd.read_csv("../user_ratings_data.csv")
dive_sites_df = pd.read_csv("../dive_sites.csv")  
features = pd.read_csv('../preferences.csv')

user_id = 3

# Schritt 1: Berechne durchschnittliche Ratings
avg_ratings = compute_average_ratings(user_ratings_df)

# Schritt 2: Erstelle Benutzerprofil
cluster_preferences = build_user_profile(user_id, user_ratings_df, dive_sites_df)

# Schritt 3: Empfehle Dive Sites mit gewichteter Sampling
recommendations = recommend_from_clusters_site(
    user_id, cluster_preferences, user_ratings_df, dive_sites_df, avg_ratings, top_n=100
)

# Schritt 4: Finde beste Regionen und ihre Dive Sites
region_recommendations = recommend_regions_with_sites(
    recommendations, dive_sites_df, top_n_regions=5, max_sites_per_region=5, min_sites_per_region=10
)

# Ergebnisse anzeigen
for rec in region_recommendations:
    print(f"\nRegion: {rec['region']}")
    print("Top Dive Sites:")
    print(rec["top_sites"])



Region: Curacao
Top Dive Sites:
      id                title               name  cluster  weighted_rating
16  1938  Blue Bay The Garden           ['Reef']        0           2.9375
7   1720        Water Factory  ['Beach', 'Reef']        0           2.8750
5   2406           Santa Cruz           ['Reef']        0           2.8125
14   762         Rif St Marie           ['Reef']        0           2.7500
9    771      Mushroom Forest           ['Reef']        0           2.7500

Region: Puerto Rico
Top Dive Sites:
    id                                          title               name  \
4   88  Escambron Marine Park (La poza del Escambron)  ['Reef', 'Ocean']   
10  89                                   Estella Reef  ['Beach', 'Reef']   

    cluster  weighted_rating  
4         0           2.3125  
10        0           2.3125  

Region: Egypt
Top Dive Sites:
      id        title               name  cluster  weighted_rating
11  1144     Daedalus           ['Reef']        0           

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_preferences["num_recommendations"] = (
