In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Connection string
connection_string = "postgresql://postgres.svsobttfvdpdxpiwjeqg:z36ow70ANRJB5GHa@aws-0-eu-central-1.pooler.supabase.com:6543/postgres"
engine = create_engine(connection_string)

# Load tables from the database
dive_sites = pd.read_sql("SELECT * FROM dive_site", con=engine)
occurrences = pd.read_sql("SELECT * FROM occurrence", con=engine)
categories = pd.read_sql("SELECT * FROM dive_site_category", con=engine)
categories_per_dive_site = pd.read_sql("SELECT * FROM categories_per_dive_site", con=engine)

In [2]:
from sklearn.model_selection import train_test_split

# Assuming dive_sites, categories, categories_per_dive_site, and occurrences are already loaded
# Preprocessing dive sites table
dive_sites["lat"] = pd.to_numeric(dive_sites["lat"], errors="coerce")
dive_sites["long"] = pd.to_numeric(dive_sites["long"], errors="coerce")
dive_sites["max_depth"] = pd.to_numeric(dive_sites["max_depth"], errors="coerce")

# Merge categories and occurrences
category_mapping = categories_per_dive_site.merge(categories, left_on="dive_site_category_id", right_on="id", how="left")
dive_site_categories = category_mapping.groupby("dive_site_id")["name"].apply(list).reset_index()
dive_sites = dive_sites.merge(dive_site_categories, left_on="id", right_on="dive_site_id", how="left")

occurrence_mapping = occurrences.groupby("dive_site_id")["animal_id"].apply(list).reset_index()
dive_sites = dive_sites.merge(occurrence_mapping, left_on="id", right_on="dive_site_id", how="left")

# Vectorize description
tfidf = TfidfVectorizer(max_features=100)
description_vectors = tfidf.fit_transform(dive_sites["description"].fillna("")).toarray()

# Normalize numerical features
scaler = StandardScaler()
geo_features = scaler.fit_transform(dive_sites[["max_depth"]].fillna(0))

# Encode categorical data
categories_encoded = pd.get_dummies(dive_sites["name"].explode()).groupby(level=0).sum()
animals_encoded = pd.get_dummies(dive_sites["animal_id"].explode()).groupby(level=0).sum()

# Combine features for clustering
features = np.hstack([geo_features, description_vectors, categories_encoded, animals_encoded])

# Perform clustering
kmeans = KMeans(n_clusters=8, random_state=1)
dive_sites["cluster"] = kmeans.fit_predict(features)

# Generate synthetic user data
np.random.seed(42)

# Number of users and records
num_users = 400
num_records = 4000

# Define user preferences
regions = dive_sites["region"].dropna().unique()
animals = occurrences["animal_id"].dropna().unique()
dive_types = categories["name"].dropna().unique()
clusters = dive_sites["cluster"].unique()  # Available clusters

user_preferences = []
for user_id in range(1, num_users + 1):
    preferred_regions = np.random.choice(regions, size=np.random.randint(2, 4), replace=False).tolist()
    preferred_animals = np.random.choice(animals, size=np.random.randint(2, 4), replace=False).tolist()
    preferred_types = np.random.choice(dive_types, size=np.random.randint(2, 4), replace=False).tolist()
    preferred_clusters = np.random.choice(clusters, size=np.random.randint(2, 3), replace=False).tolist()  # Add cluster preferences
    user_preferences.append({
        "user_id": user_id,
        "preferred_regions": preferred_regions,
        "preferred_animals": preferred_animals,
        "preferred_types": preferred_types,
        "preferred_clusters": preferred_clusters,  # Store preferred clusters
    })
    
pd.DataFrame(user_preferences).to_csv('../preferences.csv')

user_ratings_data = []

import random

# Generate synthetic data for multiple trips per user
for _ in range(num_records):
    user = np.random.choice(user_preferences)
    num_trips = np.random.randint(1, 15)
    visited_sites = set()  # Track visited dive sites for the user
    
    for trip_num in range(num_trips):
        trip_region = np.random.choice(user["preferred_regions"]+[np.random.choice(dive_types)])
        region_filter = dive_sites[dive_sites["region"] == trip_region]
        cluster_filter = region_filter[region_filter["cluster"].isin(user["preferred_clusters"]+[np.random.choice(clusters)])]
        trip_sites = cluster_filter if not cluster_filter.empty else region_filter
        
        num_dives = np.random.randint(2, 24)
        num_dives = min(num_dives, len(trip_sites))
        if num_dives == 0:
            continue
        
        selected_sites = trip_sites.sample(n=num_dives, replace=False) if not trip_sites.empty else region_filter.sample(n=num_dives, replace=False)
        
        for _, dive_site in selected_sites.iterrows():
            # Neutral rating base
            rating = 2.0
            
            # Adjust rating for matching region
            if dive_site["region"] in user["preferred_regions"]:
                rating += np.random.uniform(0.3,1)
            else:
                rating -= np.random.uniform(0,0.3)
            
            # Adjust rating for matching cluster
            if dive_site["cluster"] in user["preferred_clusters"]:
                rating += np.random.uniform(0.3,1)
            else:
                rating -= np.random.uniform(0,0.3)
            
            # Matching animals
            dive_site_animals = dive_site["animal_id"] if isinstance(dive_site["animal_id"], list) else []
            matching_animals = set(user["preferred_animals"]) & set(dive_site_animals)
            rating += len(matching_animals) * 0.3
            
            if len(matching_animals)==0:
                rating -= np.random.uniform(0,1)
            
            # Matching dive types
            dive_site_types = dive_site["name"] if isinstance(dive_site["name"], list) else []
            matching_types = set(user["preferred_types"]) & set(dive_site_types)
            rating += len(matching_types) * 0.3
            
            if len(matching_types)==0:
                rating -= np.random.uniform(-2,-0.5)
            
            # Add subtle random noise
            rating += np.random.uniform(-0.5, 0.5)
            
            # Bound the rating between 1 and 5
            rating = max(min(round(rating), 5), 1)
            
            rating = round(rating,0)
            
            # Add entry to ratings data
            user_ratings_data.append({
                "user_id": user["user_id"],
                "dive_site_id": int(dive_site["id"]),
                "region": dive_site["region"],
                "cluster": dive_site["cluster"],
                "rating": rating,
            })

# Convert to DataFrame
user_ratings_df = pd.DataFrame(user_ratings_data)

# Drop duplicates for unique user-dive site combinations
user_ratings_df = user_ratings_df.drop_duplicates(subset=["user_id", "dive_site_id"])

# Display a sample of the generated dataset
print(user_ratings_df.head())

# Save to CSV for further processing
user_ratings_df.to_csv("../user_ratings_data.csv", index=False)


   user_id  dive_site_id   region  cluster  rating
0      304          4079    Libya        5       3
1      304          1348  Denmark        5       4
5      143          1773   Turkey        0       4
6      143          3757   Turkey        0       4
7      143          1686   Turkey        0       3


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Assume user_ratings_data is a list of dictionaries
# Convert it into a DataFrame for easier manipulation
user_ratings_df

# Plot a histogram of ratings

plt.figure(figsize=(10, 6))
sns.histplot(user_ratings_df["rating"], bins=20, kde=True, color="skyblue")
plt.title("Distribution of Dive Site Ratings", fontsize=16)
plt.xlabel("Rating", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# Plot a boxplot of ratings
plt.figure(figsize=(10, 6))
sns.boxplot(x=user_ratings_df["rating"], color="lightgreen")
plt.title("Boxplot of Dive Site Ratings", fontsize=16)
plt.xlabel("Rating", fontsize=14)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()

# Print basic statistics
mean_rating = user_ratings_df["rating"].mean()
median_rating = user_ratings_df["rating"].median()
std_rating = user_ratings_df["rating"].std()

print(f"Mean Rating: {mean_rating:.2f}")
print(f"Median Rating: {median_rating:.2f}")
print(f"Standard Deviation of Ratings: {std_rating:.2f}")
