# Data Mining Project
## Touristic Hotspot in Lyon


In [22]:
import pandas as pd
import numpy as np
import folium
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from scipy.spatial import ConvexHull
import os
import webbrowser

In [6]:
data = pd.read_table("../data/dataset.csv", sep=",", low_memory=False)

data.columns = data.columns.str.strip()

# coordonnées géographiques retenu pour Lyon = [45.75, 4.85]
# rayon de 15 km autour de ce point
validation_rules = {
    'lat': lambda x: pd.api.types.is_number(x) and 45.614067767464974 <= x <= 45.88380569722158,
    'long': lambda x: pd.api.types.is_number(x) and 4.655505238288724 <= x <= 5.042868327562071,
    'date_taken_minute': lambda x: pd.api.types.is_number(x) and 0 <= x <= 59,
    'date_taken_hour': lambda x: pd.api.types.is_number(x) and 0 <= x <= 23,
    'date_taken_day': lambda x: pd.api.types.is_number(x) and 1 <= x <= 31,
    'date_taken_month': lambda x: pd.api.types.is_number(x) and 1 <= x <= 12,
    'date_taken_year': lambda x: pd.api.types.is_number(x) and 1839 <= x <= 2024,
    'date_upload_minute': lambda x: pd.api.types.is_number(x) and 0 <= x <= 59,
    'date_upload_hour': lambda x: pd.api.types.is_number(x) and 0 <= x <= 23,
    'date_upload_day': lambda x: pd.api.types.is_number(x) and 1 <= x <= 31,
    'date_upload_month': lambda x: pd.api.types.is_number(x) and 1 <= x <= 12,
    'date_upload_year': lambda x: pd.api.types.is_number(x) and 1839 <= x <= 2024,
}

# Fonction de nettoyage des colonnes
def clean_column(dataframe, column_name, validation_func):
    dataframe[column_name] = dataframe[column_name].apply(
        lambda x: x if validation_func(x) else np.nan
    )

for column, rule in validation_rules.items():
    if column in data.columns:
        clean_column(data, column, rule)

print(f"Before removing missing values: {len(data)}")
data_cleaned_mv = data.dropna(subset=['id', 'lat', 'long'])
print(f"After removing missing values: {len(data_cleaned_mv)}")

print(f"Before removing exact duplicates: {len(data_cleaned_mv)}")
data_cleaned_d = data_cleaned_mv.drop_duplicates(subset=['id', 'user', 'lat', 'long'], keep='first')
print(f"After removing exact duplicates: {len(data_cleaned_d)}")

data_cleaned_d.to_csv('../data/datasetCleaned.csv', index=False)
print("Cleaned data saved in 'data/datasetCleaned.csv'")


Before removing missing values: 420240
After removing missing values: 420240
Before removing exact duplicates: 420240
After removing exact duplicates: 168097
Cleaned data saved in 'data/datasetCleaned.csv'


In [17]:
# Load and filter data
print("Loading data...")
data_path = "../data/datasetCleaned.csv"
df = pd.read_csv(data_path, low_memory=False)

# Filter for Lyon area
lyon_bounds = {'lat_min': 45.73, 'lat_max': 45.80, 'lon_min': 4.79, 'lon_max': 4.90}
mask = (
    (df['lat'] >= lyon_bounds['lat_min']) & (df['lat'] <= lyon_bounds['lat_max']) &
    (df['long'] >= lyon_bounds['lon_min']) & (df['long'] <= lyon_bounds['lon_max'])
)
df = df[mask]
print(f"Filtered data: {len(df)} points")

# Apply K-Means clustering
print("Applying K-Means...")
X = df[['lat', 'long']].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=50, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X_scaled)
print(f"K-Means completed with {len(set(df['cluster']))} clusters")

# Create Folium map
print("Creating Folium map...")
map_clusters = folium.Map(location=[45.75, 4.85], zoom_start=13)

# Generate colors for clusters
cluster_colors = [
    f"#{''.join(np.random.choice(list('0123456789ABCDEF'), 6))}"
    for _ in range(len(df['cluster'].unique()))
]

# Add points to map
for _, row in df.iterrows():
    cluster_id = row['cluster']
    color = cluster_colors[cluster_id]
    folium.CircleMarker(
        location=[row['lat'], row['long']],
        radius=3,
        color=color,
        fill=True,
        fill_opacity=0.5
    ).add_to(map_clusters)

# Reverse scaling for cluster centers
cluster_centers_original = scaler.inverse_transform(kmeans.cluster_centers_)

# Add cluster centers to the map
for i, center_coords in enumerate(cluster_centers_original):
    folium.Marker(
        location=[center_coords[0], center_coords[1]],
        popup=f"Cluster Center {i}",
        icon=folium.Icon(color='red', icon='info-sign')
    ).add_to(map_clusters)

# Save and open the map
map_file = "../output/clusteringKMeans.html"
map_clusters.save(map_file)
print(f"Map saved as {map_file}")

# Open the map automatically in the browser
webbrowser.open(f'file:///{os.path.abspath(map_file)}')

print("Map ready")


Loading data...
Filtered data: 132626 points
Applying K-Means...
K-Means completed with 50 clusters
Creating Folium map...
Map saved as ../output/clusteringKMeans.html
Map ready


In [21]:
# Load and filter data
print("Loading data...")
data_path = "../data/datasetCleaned.csv"
df = pd.read_csv(data_path, low_memory=False)

# Remove whitespace from column names
df.columns = df.columns.str.strip()

# Remove duplicates based on latitude/longitude
df = df.drop_duplicates(subset=['lat', 'long'])
print(f"Data size after removing duplicates: {len(df)} points")

# Filter for Lyon area
lyon_bounds = {'lat_min': 45.709, 'lat_max': 45.80, 'lon_min': 4.79, 'lon_max': 4.90}
mask = (
    (df['lat'] >= lyon_bounds['lat_min']) & (df['lat'] <= lyon_bounds['lat_max']) &
    (df['long'] >= lyon_bounds['lon_min']) & (df['long'] <= lyon_bounds['lon_max'])
)
df = df[mask]
print(f"Data size after geographic filtering: {len(df)} points")

# Reduce density by random sampling (keep only 20% of points)
df = df.sample(frac=0.2, random_state=42)
print(f"Data size after sampling: {len(df)} points")

# Apply Hierarchical Clustering
print("Applying Hierarchical Clustering...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[['lat', 'long']])

n_clusters = 25
clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
df['cluster'] = clustering.fit_predict(X_scaled)
print(f"Clustering completed with {n_clusters} clusters")

# Create Folium map
print("Creating Folium map...")
map_clusters = folium.Map(location=[df['lat'].mean(), df['long'].mean()], zoom_start=13)

# Add Lyon boundary
bounds = [[lyon_bounds['lat_min'], lyon_bounds['lon_min']],
          [lyon_bounds['lat_max'], lyon_bounds['lon_max']]]
folium.Rectangle(bounds=bounds, color='red', weight=2, fill=False, popup='Study area').add_to(map_clusters)

# Generate unique colors for each cluster
cluster_colors = [
    f"#{''.join(np.random.choice(list('0123456789ABCDEF'), 6))}"
    for _ in range(len(df['cluster'].unique()))
]

# Add cluster points
for _, row in df.iterrows():
    cluster_id = row['cluster']
    folium.CircleMarker(
        location=[row['lat'], row['long']],
        radius=5,
        color=cluster_colors[cluster_id],
        fill=True,
        fill_opacity=0.5,
        popup=f"Cluster {cluster_id}"
    ).add_to(map_clusters)

# Save and open the map
map_file = "../output/clustersHierarchical.html"
map_clusters.save(map_file)
print(f"Map saved as {map_file}")

# Open the map automatically in the browser
webbrowser.open(f'file:///{os.path.abspath(map_file)}')

print("Map ready")

Loading data...
Data size after removing duplicates: 66819 points
Data size after geographic filtering: 62397 points
Data size after sampling: 12479 points
Applying Hierarchical Clustering...
Clustering completed with 25 clusters
Creating Folium map...
Map saved as ../output/clustersHierarchical.html
Map ready


In [None]:
# Load and filter data
print("Loading data...")
data_path = "../data/datasetCleaned.csv"
df = pd.read_csv(data_path, low_memory=False)

# Remove duplicates based on latitude/longitude
df = df.drop_duplicates(subset=['lat', 'long'])
print(f"Data size after removing duplicates: {len(df)} points")

# Sampling to improve performance (reduce density)
df = df.sample(frac=0.2, random_state=42)  # Keep only 30% of points
print(f"Data size after sampling: {len(df)} points")

# Apply DBSCAN clustering
print("Applying DBSCAN...")
X = df[['lat', 'long']].values

eps = 0.0004
min_samples = 4
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
df['cluster'] = dbscan.fit_predict(X)

# Compute cluster statistics
n_clusters = len(set(df['cluster'])) - (1 if -1 in df['cluster'] else 0)
n_noise = sum(df['cluster'] == -1)
print(f"Clusters found: {n_clusters}")
print(f"Noise points: {n_noise}")

# Create Folium map
print("Creating Folium map...")
map_clusters = folium.Map(location=[df['lat'].mean(), df['long'].mean()], zoom_start=12)

# Generate unique colors for each cluster
cluster_colors = ['#808080']  # Gray for noise points
cluster_colors += [
    f"#{''.join(np.random.choice(list('0123456789ABCDEF'), 6))}"
    for _ in range(n_clusters)
]

# Add cluster points
for _, row in df.iterrows():
    cluster_id = row['cluster']
    color_idx = cluster_id + 1 if cluster_id >= 0 else 0  # Gray for noise
    folium.CircleMarker(
        location=[row['lat'], row['long']],
        radius=3,
        color=cluster_colors[color_idx],
        stroke=False if cluster_id < 0 else True,
        fill=True,
        fill_opacity=0.5 if cluster_id >= 0 else 0.6
    ).add_to(map_clusters)

# Add convex hulls for clusters
for cluster_id in range(n_clusters):
    cluster_points = df[df['cluster'] == cluster_id][['lat', 'long']].values
    if len(cluster_points) >= 3:
        try:
            hull = ConvexHull(cluster_points)
            hull_points = cluster_points[hull.vertices]
            folium.Polygon(
                locations=[[point[0], point[1]] for point in hull_points],
                color=cluster_colors[cluster_id + 1],
                weight=2,
                fill=True,
                fill_color=cluster_colors[cluster_id + 1],
                fill_opacity=0.2
            ).add_to(map_clusters)
        except Exception as e:
            print(f"Error creating polygon for cluster {cluster_id}: {e}")

# Save and open the map
map_file = "../output/clusteringDBSCAN.html"
map_clusters.save(map_file)
print(f"Map saved as {map_file}")

# Open the map automatically in the browser
webbrowser.open(f'file:///{os.path.abspath(map_file)}')

print("Map ready! Opened in your browser.")

Loading data...
Data size after removing duplicates: 66819 points
Data size after sampling: 20046 points
Applying DBSCAN...
Clusters found: 333
Noise points: 3377
Creating Folium map...
Map saved as ../output/clusteringDBSCAN.html
Map ready! Opened in your browser.
