1️⃣ Load the Dataset
- Load the CSV file into a pandas DataFrame.
- Display the first 5 rows of the dataset.

2️⃣ Feature Selection
- Extract the following columns:
- pickup_latitude
- pickup_longitude
- Store them in a variable named X.

3️⃣ Data Preprocessing
- Apply StandardScaler to scale the selected features.
- Store the scaled data in X_scaled.

4️⃣ DBSCAN Model – Experiment 1
- Apply DBSCAN with:
- eps = 0.2
- min_samples = 5
- Store cluster labels in labels_1.

5️⃣ DBSCAN Model – Experiment 2
- Apply DBSCAN with:
- eps = 0.3
- min_samples = 5
- Store cluster labels in labels_2.

6️⃣ DBSCAN Model – Experiment 3
- Apply DBSCAN with:
- eps = 0.5
- min_samples = 5
- Store cluster labels in labels_3.

7️⃣ Cluster Evaluation
- For each experiment:
- Print:
    Number of clusters (excluding noise)
    Number of noise points
    Noise ratio

8️⃣ Silhouette Score Calculation
- Remove noise points (-1) from each experiment.
- Calculate and print the Silhouette Score for:
    - Experiment 1
    - Experiment 2
    - Experiment 3
- If silhouette score is not applicable, print "Not Applicable".

9️⃣ Visualization
- For each experiment:
    - Plot pickup locations using a scatter plot.
    - Color points based on cluster labels.
    - Highlight noise points using a separate color.
    
🔟 Best Model Selection
- Based on:
    - Number of clusters
    - Noise ratio
    - Silhouette score
- Print:
    Best eps value = ___

In [None]:
# Import all required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be opened by path.
drive.mount('/content/drive')


In [None]:

from google.colab import drive

# Mount Google Drive
# NOTE(review): the preceding cell issues the same mount call; this repeat
# only matters when this cell is run on its own.
drive.mount('/content/drive')

# Define file path inside Google Drive
file_path = "/content/drive/MyDrive/Colab Notebooks/DataSets/NewYorkCityTaxiTripDuration.csv"

# Load the CSV into a DataFrame and preview its first rows
# (head() returns 5 rows by default).
df=pd.read_csv(file_path)
df.head()

In [None]:
# 2️⃣ Feature Selection - Extract pickup_latitude & pickup_longitude

# Keep only the two pickup coordinates and discard rows where either is missing.
coordinate_columns = ['pickup_latitude', 'pickup_longitude']
X = df[coordinate_columns].dropna()

# Preview the selected features.
X.head()

In [None]:
# 3️⃣ Data Preprocessing - Apply StandardScaler

from sklearn.preprocessing import StandardScaler

# Learn the per-column scaling statistics from X, then apply them to X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Preview the first five scaled rows.
X_scaled[:5]


In [None]:
# 4️⃣ DBSCAN Model – Experiment 1 (eps=0.2, min_samples=5)

from sklearn.cluster import DBSCAN

# Fit DBSCAN on the scaled coordinates and read the per-point cluster labels
# off the fitted estimator (-1 marks noise points).
db1 = DBSCAN(eps=0.2, min_samples=5).fit(X_scaled)
labels_1 = db1.labels_

# Preview the first ten label assignments.
labels_1[:10]


In [None]:
# 5️⃣ DBSCAN Model – Experiment 2 (eps=0.3, min_samples=5)

# Fit DBSCAN on the scaled coordinates and read the per-point cluster labels
# off the fitted estimator (-1 marks noise points).
db2 = DBSCAN(eps=0.3, min_samples=5).fit(X_scaled)
labels_2 = db2.labels_

# Preview the first ten label assignments.
labels_2[:10]


In [None]:
# 6️⃣ DBSCAN Model – Experiment 3 (eps=0.5, min_samples=5)

# Fit DBSCAN on the scaled coordinates and read the per-point cluster labels
# off the fitted estimator (-1 marks noise points).
db3 = DBSCAN(eps=0.5, min_samples=5).fit(X_scaled)
labels_3 = db3.labels_

# Preview the first ten label assignments.
labels_3[:10]


In [None]:
# 7️⃣ Cluster Evaluation - Print clusters, noise count, noise ratio

import numpy as np

def evaluate(labels, name):
    """Print the cluster count, noise count and noise ratio of one clustering.

    Args:
        labels: One-dimensional array-like of integer cluster labels, one per
            sample. The label ``-1`` is counted as noise.
        name: Heading printed above the statistics.

    Returns:
        None. The summary is written to standard output.
    """
    labels = np.asarray(labels)

    # Count noise in one vectorised pass instead of building a Python list.
    n_noise = int(np.count_nonzero(labels == -1))

    # Noise (-1) is not a cluster, so exclude it from the distinct-label count.
    n_clusters = len(np.unique(labels)) - (1 if n_noise else 0)

    # An empty label array has no noise; avoid dividing by zero.
    noise_ratio = n_noise / labels.size if labels.size else 0.0

    print(f"{name}")
    print("Clusters:", n_clusters)
    print("Noise Points:", n_noise)
    print("Noise Ratio:", round(noise_ratio, 4))
    print("-"*40)

# Summarise each experiment in turn.
for run_labels, run_name in (
    (labels_1, "Experiment 1 (eps=0.2)"),
    (labels_2, "Experiment 2 (eps=0.3)"),
    (labels_3, "Experiment 3 (eps=0.5)"),
):
    evaluate(run_labels, run_name)


In [None]:
# 8️⃣ Silhouette Score - Remove noise and calculate score

from sklearn.metrics import silhouette_score

def calculate_silhouette(X_scaled, labels, name, *, sample_size=None, random_state=None):
    """Print and return the silhouette score of a clustering, ignoring noise.

    Points labelled ``-1`` are removed before scoring. The silhouette score is
    only defined when the remaining points carry at least 2 distinct labels
    and fewer distinct labels than points; otherwise "Not Applicable" is
    printed and ``None`` is returned.

    Args:
        X_scaled: Feature array indexable by a boolean row mask.
        labels: Array-like of integer cluster labels, one per row of
            ``X_scaled``.
        name: Prefix used in the printed line.
        sample_size: Optional number of points to score on; forwarded to
            ``silhouette_score``. ``None`` (the default) scores every
            non-noise point. Note that a subsample containing a single
            label makes ``silhouette_score`` raise.
        random_state: Seed forwarded to ``silhouette_score``; only relevant
            when ``sample_size`` is given.

    Returns:
        The silhouette score, or ``None`` when it is not applicable.
    """
    # Accept plain lists as well as arrays so the mask below is element-wise.
    labels = np.asarray(labels)
    mask = labels != -1
    kept_labels = labels[mask]

    n_samples = kept_labels.size
    n_labels = np.unique(kept_labels).size

    # silhouette_score requires 2 <= n_labels <= n_samples - 1.
    if not 1 < n_labels < n_samples:
        print(f"{name} Silhouette Score: Not Applicable")
        return None

    score = silhouette_score(
        X_scaled[mask],
        kept_labels,
        sample_size=sample_size,
        random_state=random_state,
    )
    print(f"{name} Silhouette Score:", round(score, 4))
    return score

# Score the three experiments in order; None means "Not Applicable".
score1, score2, score3 = [
    calculate_silhouette(X_scaled, run_labels, run_name)
    for run_labels, run_name in (
        (labels_1, "Experiment 1"),
        (labels_2, "Experiment 2"),
        (labels_3, "Experiment 3"),
    )
]


In [None]:
# 9️⃣ Visualization - Scatter plot with cluster colors & noise highlighted

import matplotlib.pyplot as plt

def plot_clusters(X, labels, title):
    """Scatter-plot points coloured by cluster, with noise drawn separately.

    Args:
        X: Two-dimensional array; column 0 is drawn on the x-axis and
            column 1 on the y-axis.
        labels: Array-like of integer cluster labels, one per row of ``X``.
            Points labelled ``-1`` are drawn as black ``x`` markers under the
            legend entry "Noise"; every other label gets its own colour from
            matplotlib's default cycle.
        title: Title shown above the plot.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Accept plain lists too; comparing a list with a scalar is not element-wise.
    labels = np.asarray(labels)

    plt.figure(figsize=(6, 5))

    # np.unique yields the labels sorted, so the legend order is stable
    # (noise first, then clusters in ascending order).
    for label in np.unique(labels):
        if label == -1:
            color = 'black'
            marker = 'x'
            label_name = 'Noise'
        else:
            color = None  # let matplotlib pick the next colour in its cycle
            marker = 'o'
            label_name = f'Cluster {label}'

        # Build the membership mask once and reuse it for both coordinates.
        members = labels == label
        plt.scatter(
            X[members, 0],
            X[members, 1],
            c=color,
            marker=marker,
            label=label_name,
            s=10
        )

    plt.title(title)
    plt.xlabel("Latitude (scaled)")
    plt.ylabel("Longitude (scaled)")
    # With no points there are no labelled artists to put in a legend.
    if labels.size:
        plt.legend()
    plt.show()

# Draw one figure per experiment.
for run_labels, run_title in (
    (labels_1, "Experiment 1 (eps=0.2)"),
    (labels_2, "Experiment 2 (eps=0.3)"),
    (labels_3, "Experiment 3 (eps=0.5)"),
):
    plot_clusters(X_scaled, run_labels, run_title)


In [None]:
# 🔟 Best Model Selection - Based on clusters, noise ratio & silhouette score
# NOTE: the ranking below uses the silhouette score only; cluster count and
# noise ratio are not factored in.

# Pair each eps value tried with the silhouette score it produced.
results = dict(zip((0.2, 0.3, 0.5), (score1, score2, score3)))

# Experiments whose score was not applicable (None) cannot be ranked.
valid_scores = {eps: score for eps, score in results.items() if score is not None}

if not valid_scores:
    print("Best eps value = Not Applicable")
else:
    # The eps with the highest silhouette score wins.
    best_eps = max(valid_scores, key=valid_scores.get)
    print("Best eps value =", best_eps)
