In [23]:
import pandas as pd
import ast

def compute_average_ratings(user_ratings_df):
    """
    Return the mean rating of every dive site.

    Parameters:
    user_ratings_df (DataFrame): Must contain "dive_site_id" and "rating" columns.

    Returns:
    DataFrame: One row per dive site with the columns "dive_site_id" and
    "average_rating", indexed 0..n-1.
    """
    # Name the aggregated Series first so reset_index() yields the final column labels directly
    mean_by_site = user_ratings_df.groupby("dive_site_id")["rating"].mean()
    return mean_by_site.rename("average_rating").reset_index()


def build_user_profile(user_id, user_ratings_df, dive_sites_df):
    """
    Builds a user profile: the user's mean rating per cluster, normalized to sum to 1.

    Parameters:
    user_id: Value matched against the "user_id" column of user_ratings_df.
    user_ratings_df (DataFrame): Needs "user_id", "dive_site_id" and "rating" columns.
    dive_sites_df (DataFrame): Needs an "id" column. At least one of the two
        frames must provide a "cluster" column.

    Returns:
    DataFrame: Columns "cluster" and "preference_score", sorted by descending
    score. Empty when the user has no ratings that match a dive site.
    """
    # Filter to this user before joining so the merge only processes the rows that are needed
    user_ratings = user_ratings_df[user_ratings_df["user_id"] == user_id]
    user_data = pd.merge(user_ratings, dive_sites_df, left_on="dive_site_id", right_on="id")

    # If both inputs have a "cluster" column, pandas suffixes them and "cluster_x"
    # is the ratings-side one; if only one input has it, the name stays "cluster".
    cluster_col = "cluster_x" if "cluster_x" in user_data.columns else "cluster"

    # Calculate average rating per cluster
    cluster_preferences = user_data.groupby(cluster_col)["rating"].mean().reset_index()
    cluster_preferences.columns = ["cluster", "preference_score"]

    # Normalize so the scores sum to 1. A zero total would turn every score into
    # NaN, so in that case fall back to equal weights across the rated clusters.
    total = cluster_preferences["preference_score"].sum()
    if total != 0:
        cluster_preferences["preference_score"] /= total
    elif len(cluster_preferences) > 0:
        cluster_preferences["preference_score"] = 1.0 / len(cluster_preferences)

    return cluster_preferences.sort_values(by="preference_score", ascending=False)


def recommend_from_clusters(user_id, cluster_preferences, user_ratings_df, dive_sites_df, avg_ratings, top_n=10):
    """
    Pick dive sites the user has not rated yet, walking through the clusters in
    the row order of cluster_preferences and, inside each cluster, from the
    highest to the lowest average rating.

    Returns a list of at most top_n site records (dicts).
    """
    # Ids of every site this user has already rated
    is_this_user = user_ratings_df["user_id"] == user_id
    seen_ids = user_ratings_df.loc[is_this_user, "dive_site_id"].values

    # Candidate pool: unseen sites, each with its average rating attached.
    # The left join keeps sites that have no average rating.
    candidates = dive_sites_df.loc[~dive_sites_df["id"].isin(seen_ids)].merge(
        avg_ratings, left_on="id", right_on="dive_site_id", how="left"
    )

    ranked = []
    for cluster in cluster_preferences["cluster"]:
        in_cluster = candidates.loc[candidates["cluster"] == cluster]
        best_first = in_cluster.sort_values(by="average_rating", ascending=False)
        ranked += best_first.to_dict("records")

    # Keep only the leading top_n entries
    return ranked[:top_n]

def is_cluster_in_preferred_clusters(cluster, preferred_clusters):
    """
    Checks if a cluster is in the collection of preferred clusters.

    Parameters:
    cluster (int): The recommended cluster.
    preferred_clusters (str | list | tuple | set): The preferred clusters, either
        as a Python-literal string such as "[7, 6]" or as an actual container.

    Returns:
    bool: True if the cluster exists in the preferred clusters, otherwise False.

    Raises:
    ValueError / SyntaxError: If a string argument is not a valid Python literal.
    """
    # Only parse textual input; ast.literal_eval rejects real lists, so
    # containers are used as they are.
    if isinstance(preferred_clusters, (str, ast.AST)):
        preferred_clusters = ast.literal_eval(preferred_clusters)
    return cluster in preferred_clusters

In [24]:
# Read the ratings, the dive-site catalogue and the stated user preferences
user_ratings_df = pd.read_csv("../user_ratings_data.csv")
dive_sites_df = pd.read_csv("../dive_sites.csv")
features = pd.read_csv('../preferences.csv')

# Per-site mean rating, used to rank candidate sites
avg_ratings = compute_average_ratings(user_ratings_df)

i = int(input("Enter the user ID to calculate recommendations for: "))

# Stated preferences of the selected user, printed next to the recommendations for comparison
ft = features.loc[features["user_id"] == i]

# Cluster-level profile derived from the user's own ratings
cluster_preferences = build_user_profile(i, user_ratings_df, dive_sites_df)

# Unrated sites ordered by cluster preference, then by average rating
recommendations = recommend_from_clusters(
    i, cluster_preferences, user_ratings_df, dive_sites_df, avg_ratings, top_n=10
)

print(ft)
print(pd.DataFrame(recommendations[:10]))

    Unnamed: 0  user_id                                  preferred_regions  \
10          10       11  ['Ireland', 'Northern Mariana Islands', 'Andam...   

   preferred_animals                     preferred_types preferred_clusters  
10   [199, 382, 488]  ['Wall', 'Sandy bottom', 'Cavern']             [7, 6]  
   Unnamed: 0    id                        title       lat       long  \
0        1908  1908      Dive Magic Orchard Pool  43.58086 -116.24270   
1        3140  3129               Flagpole Point  47.56457 -123.01457   
2        2611  2606         Loch Low-Minn Quarry  35.48219  -84.50952   
3        3712  3706               Blue Cathedral   9.83881  126.16734   
4        2817  2809              Circle of Heros  28.05100  -83.00000   
5        3079  3068  Manhattan Plaza Health Club  40.75997  -73.99410   
6        1946  1946                     The Arch  33.47290 -119.03600   
7        1948  1948             Farnsworth Banks  33.50556 -118.77556   
8        2319  2313           

## Regionsbasierte Ausgabe

In [25]:
import pandas as pd
import numpy as np

def compute_average_ratings(user_ratings_df):
    """
    Average the "rating" column per "dive_site_id".

    Returns a DataFrame with the columns "dive_site_id" and "average_rating",
    one row per dive site.
    """
    per_site = user_ratings_df.groupby("dive_site_id")["rating"]
    # reset_index(name=...) turns the group key into a column and labels the means in one step
    return per_site.mean().reset_index(name="average_rating")


def build_user_profile(user_id, user_ratings_df, dive_sites_df):
    """
    Builds a user profile: the user's mean rating per cluster, normalized to sum to 1.

    Parameters:
    user_id: Value matched against the "user_id" column of user_ratings_df.
    user_ratings_df (DataFrame): Needs "user_id", "dive_site_id" and "rating" columns.
    dive_sites_df (DataFrame): Needs an "id" column. At least one of the two
        frames must provide a "cluster" column.

    Returns:
    DataFrame: Columns "cluster" and "preference_score", sorted by descending
    score. Empty when the user has no ratings that match a dive site.
    """
    # Filter to this user before joining so the merge only processes the rows that are needed
    user_ratings = user_ratings_df[user_ratings_df["user_id"] == user_id]
    user_data = pd.merge(user_ratings, dive_sites_df, left_on="dive_site_id", right_on="id")

    # If both inputs have a "cluster" column, pandas suffixes them and "cluster_x"
    # is the ratings-side one; if only one input has it, the name stays "cluster".
    cluster_col = "cluster_x" if "cluster_x" in user_data.columns else "cluster"

    # Calculate average rating per cluster
    cluster_preferences = user_data.groupby(cluster_col)["rating"].mean().reset_index()
    cluster_preferences.columns = ["cluster", "preference_score"]

    # Normalize so the scores sum to 1. A zero total would turn every score into
    # NaN, so in that case fall back to equal weights across the rated clusters.
    total = cluster_preferences["preference_score"].sum()
    if total != 0:
        cluster_preferences["preference_score"] /= total
    elif len(cluster_preferences) > 0:
        cluster_preferences["preference_score"] = 1.0 / len(cluster_preferences)

    return cluster_preferences.sort_values(by="preference_score", ascending=False)


def recommend_from_clusters_with_sampling(
    user_id, cluster_preferences, user_ratings_df, dive_sites_df, avg_ratings, sample_size=10,
    random_state=None,
):
    """
    Recommends dive sites based on user's cluster preferences with weighted random sampling.

    For each row of cluster_preferences, up to int(sample_size * preference_score)
    unrated sites are drawn from that cluster without replacement, weighted by
    "average_rating". Sites whose average rating is missing or zero carry no
    weight and are never drawn.

    Parameters:
    user_id: Value matched against the "user_id" column of user_ratings_df.
    cluster_preferences (DataFrame): Columns "cluster" and "preference_score".
    user_ratings_df (DataFrame): Needs "user_id" and "dive_site_id" columns.
    dive_sites_df (DataFrame): Needs "id" and "cluster" columns.
    avg_ratings (DataFrame): Columns "dive_site_id" and "average_rating".
    sample_size (int): Scales each cluster's preference score into a draw count.
    random_state (int | None): Seed for reproducible sampling and shuffling.
        None (the default) uses numpy's global random state, as before.

    Returns:
    list[dict]: The sampled site records in shuffled order.
    """
    # A dedicated generator is only created when a seed is requested
    rng = None if random_state is None else np.random.RandomState(random_state)

    # Get dive sites the user has already rated
    rated_sites = user_ratings_df.loc[user_ratings_df["user_id"] == user_id, "dive_site_id"].values

    # Keep unrated sites and attach their average ratings (left join keeps sites without one)
    unrated_sites = dive_sites_df[~dive_sites_df["id"].isin(rated_sites)]
    unrated_sites = pd.merge(unrated_sites, avg_ratings, left_on="id", right_on="dive_site_id", how="left")

    recommendations = []
    for cluster, preference_weight in zip(
        cluster_preferences["cluster"], cluster_preferences["preference_score"]
    ):
        # A NaN or infinite weight cannot be converted to a draw count
        if not np.isfinite(preference_weight):
            continue

        cluster_sites = unrated_sites[unrated_sites["cluster"] == cluster]

        # sample() treats NaN weights as zero and fails when asked for more rows
        # than have a positive weight, so cap the draw at that count.
        drawable = int((cluster_sites["average_rating"] > 0).sum())
        n_samples = min(drawable, int(sample_size * preference_weight))
        if n_samples <= 0:
            continue

        sampled_sites = cluster_sites.sample(
            n=n_samples,
            weights="average_rating",
            replace=False,
            random_state=rng,
        )
        recommendations.extend(sampled_sites.to_dict("records"))

    # Shuffle so the result is not grouped by cluster
    if rng is None:
        np.random.shuffle(recommendations)
    else:
        rng.shuffle(recommendations)

    return recommendations


def recommend_regions_with_sites(recommendations, dive_sites_df, top_n_regions=3, max_sites_per_region=3):
    """
    Groups recommendations by region and selects top regions with dive site recommendations.

    Parameters:
    recommendations (list[dict]): Site records; each needs "region", "id",
        "title", "name", "cluster" and "average_rating" keys.
    dive_sites_df (DataFrame): Not used by this function; kept so existing calls
        keep working.
    top_n_regions (int): Number of regions to return.
    max_sites_per_region (int): Maximum number of sites listed per region.

    Returns:
    list[dict]: One entry per selected region, best region first, with the keys
    "region" and "top_sites" (a DataFrame of that region's highest-rated sites).
    Empty when there are no recommendations.
    """
    # Without records the frame has no "region" column and groupby would raise KeyError
    if len(recommendations) == 0:
        return []

    recommendations_df = pd.DataFrame(recommendations)

    # Region score = sum of the average ratings of its recommended sites
    region_scores = recommendations_df.groupby("region")["average_rating"].sum().reset_index()
    region_scores.columns = ["region", "region_score"]

    # Keep the best-scoring regions
    top_regions = region_scores.sort_values(by="region_score", ascending=False).head(top_n_regions)

    site_columns = ["id", "title", "name", "cluster", "average_rating"]
    region_recommendations = []
    for region in top_regions["region"]:
        # Highest-rated recommended sites within this region
        region_sites = (
            recommendations_df[recommendations_df["region"] == region]
            .sort_values(by="average_rating", ascending=False)
            .head(max_sites_per_region)
        )
        region_recommendations.append({
            "region": region,
            "top_sites": region_sites[site_columns],
        })

    return region_recommendations


In [29]:
# Read the ratings, the dive-site catalogue and the stated user preferences
user_ratings_df = pd.read_csv("../user_ratings_data.csv")
dive_sites_df = pd.read_csv("../dive_sites.csv")
features = pd.read_csv('../preferences.csv')

user_id = int(input("Enter User ID: "))

# Step 1: compute the average rating of every dive site
avg_ratings = compute_average_ratings(user_ratings_df)

# Step 2: build the user's cluster-preference profile
cluster_preferences = build_user_profile(user_id, user_ratings_df, dive_sites_df)

# Step 3: draw dive-site recommendations by weighted sampling
recommendations = recommend_from_clusters_with_sampling(
    user_id,
    cluster_preferences,
    user_ratings_df,
    dive_sites_df,
    avg_ratings,
    sample_size=55,
)

# Step 4: find the best regions and their dive sites
region_recommendations = recommend_regions_with_sites(
    recommendations, dive_sites_df, top_n_regions=6, max_sites_per_region=5
)

# Show the results, one block per region
for rec in region_recommendations:
    print(f"\nRegion: {rec['region']}", "Top Dive Sites:", rec["top_sites"], sep="\n")



Region: Spain
Top Dive Sites:
      id                        title                      name  cluster  \
42   779    El Cabrón - The Table Top  ['Cave', 'Reef', 'Wall']        1   
43   594                LOS PASADIZOS                  ['Wall']        6   
36   591           TERRITORIO CONGRIO         ['Drift', 'Wall']        6   
10  1332         Cañones de Vedranell       ['Wall', 'Channel']        6   
35  1069  CUEVAS, CT 12 ,CT 17 ,CT 11                  ['Wall']        6   

    average_rating  
42        4.333333  
43        4.000000  
36        3.500000  
10        3.000000  
35        3.000000  

Region: Philippines
Top Dive Sites:
      id                                           title  \
16  3932                                    Tamaraw Reef   
6   3041  Aquaventure Helicopter & Boat wreck dive site.   
21  3829                                     Balas Balas   
47  4167                                         Canyons   
9   1655                                         