In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors


# Load the cross-set insurance table and discard the "id" column so it is
# not available to the feature pipeline below.
df_cross = pd.read_csv("cross_set_insurance.csv").drop(columns=["id"])

# Feature groups handed to the preprocessing pipeline.
categorical_cols = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
# NOTE(review): Region_Code and Policy_Sales_Channel look like identifier
# codes; standardizing them treats them as ordered quantities -- confirm
# that this is intended.
numerical_cols = [
    'Age',
    'Driving_License',
    'Region_Code',
    'Previously_Insured',
    'Annual_Premium',
    'Policy_Sales_Channel',
    'Vintage',
]

# One-hot encode the categorical group (dropping the first level of each
# feature) and standardize the numerical group. Only the columns named in
# the two lists are routed to a transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(drop='first'), categorical_cols),
        ("num", StandardScaler(), numerical_cols),
    ]
)

# Fit the pipeline on the full frame and keep the transformed feature matrix.
X_cross_preprocessed = preprocessor.fit_transform(df_cross)

# Two-component PCA projection of the preprocessed features. It is used for
# plotting only; the neighbor search below runs on the full feature matrix.
pca_cross = PCA(n_components=2)
X_cross_pca = pca_cross.fit_transform(X_cross_preprocessed)

# For every sample, find its 5 nearest points in the same matrix it was
# fitted on.
neighbors = NearestNeighbors(n_neighbors=5)
neighbors_fit = neighbors.fit(X_cross_preprocessed)
distances, indices = neighbors_fit.kneighbors(X_cross_preprocessed)

# Sort the largest of the 5 returned distances per sample (the last column)
# to get the k-distance curve.
# NOTE(review): because the query set is the fitted set, each sample is
# presumably returned as its own nearest neighbor (distance 0), so this
# column would be the 4th neighbor other than the sample itself -- confirm
# against the axis labels.
k_distances = np.sort(distances[:, -1])

# Draw the sorted k-distance curve on its own figure.
kdist_fig, kdist_ax = plt.subplots(figsize=(10, 5))
kdist_ax.plot(k_distances)
kdist_ax.set_title('K-distance Graph (5 Nearest Neighbors)')
kdist_ax.set_xlabel('Data Points sorted by distance')
kdist_ax.set_ylabel('5-NN Distance')
kdist_ax.grid(True)
plt.show()

# Run DBSCAN on the full preprocessed feature matrix (not on the 2-D PCA
# projection) and keep one label per sample.
dbscan = DBSCAN(eps=1.0, min_samples=5, algorithm='auto')
labels_cross = dbscan.fit_predict(X_cross_preprocessed)

# Scatter the samples in PCA space, colored by their DBSCAN label.
cluster_fig, cluster_ax = plt.subplots(figsize=(10, 8))
scatter = cluster_ax.scatter(
    X_cross_pca[:, 0],
    X_cross_pca[:, 1],
    c=labels_cross,
    cmap='viridis',
    s=30,
)

# Mark each cluster with a red "x" at the mean of its PCA-projected points.
# Label -1 is skipped, so points carrying it get no centroid marker.
unique_labels = set(labels_cross)
for label in unique_labels:
    if label == -1:
        continue
    cluster_points = X_cross_pca[labels_cross == label]
    centroid = cluster_points.mean(axis=0)
    cluster_ax.scatter(*centroid, color='red', marker='x', s=200)

cluster_fig.colorbar(scatter, ax=cluster_ax, label='Cluster Label')
cluster_ax.set_title("DBSCAN Clustering on Cross-Set Insurance Dataset (2D PCA Projection)")
cluster_ax.set_xlabel("Principal Component 1")
cluster_ax.set_ylabel("Principal Component 2")
cluster_ax.grid(True)
plt.show()

# Tally the clustering outcome: the number of distinct labels other than -1,
# and the number of samples labeled -1.
num_clusters = len(set(labels_cross) - {-1})
num_noise = int(np.count_nonzero(labels_cross == -1))

# Bar chart of how many samples received each label (the -1 label included).
size_fig, size_ax = plt.subplots(figsize=(8, 5))
sns.countplot(x=labels_cross, palette='Set2', ax=size_ax)
size_ax.set_title("DBSCAN Cluster Sizes")
size_ax.set_xlabel("Cluster Label")
size_ax.set_ylabel("Number of Points")
size_ax.grid(True)
plt.show()

# Silhouette score comparison between DBSCAN and a KMeans baseline.
#
# DBSCAN is scored on the samples whose label is not -1 only; samples
# labeled -1 are excluded from the score.
mask = labels_cross != -1
n_clustered = int(np.sum(mask))
n_dbscan_clusters = np.unique(labels_cross[mask]).size

# silhouette_score needs between 2 and n_samples - 1 distinct labels and
# raises ValueError otherwise. Checking only that more than one sample was
# clustered is not enough: a single DBSCAN cluster would still crash here.
# When the score is undefined it is reported as None.
if 2 <= n_dbscan_clusters < n_clustered:
    dbscan_silhouette = silhouette_score(X_cross_preprocessed[mask], labels_cross[mask])
else:
    dbscan_silhouette = None

# KMeans baseline: 5 clusters on the same feature matrix, scored on all
# samples. random_state is fixed so the baseline is reproducible.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X_cross_preprocessed)
kmeans_silhouette = silhouette_score(X_cross_preprocessed, kmeans_labels)

# Trailing expression so the notebook cell displays both scores.
dbscan_silhouette, kmeans_silhouette


FileNotFoundError: [Errno 2] No such file or directory: 'cross_set_insurance.csv'