In [1]:
# %% [markdown]
# # Driver Behavior Analysis - Clustering Analysis
#
# ## Overview
# This notebook performs clustering analysis on driver behavior data to identify distinct driving patterns and risk profiles.
#
# ### Objectives:
# 1. Determine optimal number of clusters
# 2. Apply K-means clustering algorithm
# 3. Analyze and interpret clusters
# 4. Validate clustering results
# 5. Create driver personas

# %% [markdown]
# ## 1. Setup and Data Preparation

# %%
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Clustering and ML libraries
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.manifold import TSNE

# Import custom modules
import sys
sys.path.append('..')
from src.clustering import DriverClustering
from src.data_processor import DataProcessor
from src.utils import save_results

# Notebook-wide presentation settings: pandas table limits first, then the
# plotting theme. The palette is applied after the matplotlib style sheet,
# in the same order as before.
for option_name, option_value in (('display.max_columns', 100),
                                  ('display.max_rows', 50)):
    pd.set_option(option_name, option_value)

plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("viridis")

# %%
# Read the YAML project configuration (parsed with the safe loader).
import yaml
with open('../config/config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Read the engineered feature table from CSV.
# NOTE(review): presumably one row per driver, given the message below - confirm.
print("Loading engineered features...")
df = pd.read_csv('../data/driver_features_engineered.csv')
n_rows = len(df)
print(f"Dataset shape: {df.shape}")
print(f"Number of drivers: {n_rows}")

# Show the first rows as a quick sanity check of the load.
print("\nSample data:", df.head(), sep="\n")

# %%
# Prepare features for clustering
# Builds `available_features`: the subset of the requested clustering
# features that actually exists as columns of `df`.
print("\nPreparing features for clustering...")

# Select key features for clustering (based on feature importance from previous analysis)
clustering_features = [
    # Safety metrics
    'safety_score',
    'harsh_accel_count',
    'harsh_brake_count',

    # Efficiency metrics
    'fuel_efficiency_composite',
    'rpm_efficiency_score',
    'smooth_driving_score',

    # Speed metrics
    'speed_mean',
    'speed_p90',
    'high_speed_ratio',

    # Aggressiveness
    'aggressive_index',

    # RPM metrics
    'rpm_mean',

    # Time metrics
    'time_response_std',
    'time_consistency'
]

# Filter to available features, and make any dropped feature visible instead
# of discarding it silently.
available_features = [f for f in clustering_features if f in df.columns]
missing_features = [f for f in clustering_features if f not in df.columns]
if missing_features:
    print(f"Warning: {len(missing_features)} requested features not found in the dataset: {missing_features}")

# Fail early with a clear message; nothing downstream can run without features.
if not available_features:
    raise ValueError("None of the requested clustering features are present in the dataset")

print(f"Selected {len(available_features)} features for clustering:")
for feature_name in available_features:
    print(f"  - {feature_name}")

# Display feature statistics
feature_stats = df[available_features].describe().T[['mean', 'std', 'min', '50%', 'max']]
print(feature_stats.round(3))

# %%
# Handle missing values
# Produces `df_cluster`: a copy of the selected features in which NaNs are
# replaced by the column median and +/-inf by the column's finite max/min.
print("\nHandling missing values...")
raw_features = df[available_features]
missing_before = raw_features.isnull().sum().sum()

# View of the data with +/-inf masked out. Fill statistics are taken from this
# view so that infinite entries cannot distort the median or the extrema.
finite_features = raw_features.replace([np.inf, -np.inf], np.nan)

# Fill missing values with the median of the finite observations of each column.
# fillna returns a new frame, so `df` itself is left untouched.
df_cluster = raw_features.fillna(finite_features.median())

missing_after = df_cluster.isnull().sum().sum()
print(f"Missing values before: {missing_before}")
print(f"Missing values after: {missing_after}")

# Check for infinite values
inf_count = np.isinf(df_cluster).sum().sum()
if inf_count > 0:
    print(f"Found {inf_count} infinite values, replacing with max/min...")
    finite_max = finite_features.max()
    finite_min = finite_features.min()
    for col in df_cluster.columns:
        df_cluster[col] = df_cluster[col].replace(np.inf, finite_max[col])
        df_cluster[col] = df_cluster[col].replace(-np.inf, finite_min[col])

# %%
# Put every clustering feature on a zero-mean / unit-variance scale.
print("Standardizing features...")
scaler = StandardScaler().fit(df_cluster)
X_scaled = scaler.transform(df_cluster)

# Re-wrap the scaled array so column names and the row index are preserved.
df_scaled = pd.DataFrame(
    X_scaled,
    index=df_cluster.index,
    columns=df_cluster.columns,
)

print("Feature standardization complete.")
print(f"Scaled data shape: {df_scaled.shape}")

# %% [markdown]
# ## 2. Optimal Cluster Determination

# %%
# Ask the project's clustering helper for its preferred number of clusters.
print("Determining optimal number of clusters...")

# Project-level analyzer, configured from the shared YAML file.
cluster_analyzer = DriverClustering('../config/config.yaml')

# Candidate k values examined by the validation cell that follows.
cluster_range = range(2, 11)
results = []

# NOTE(review): the message below calls this the elbow method; the helper's
# actual selection rule lives in src.clustering - confirm.
optimal_k = cluster_analyzer.find_optimal_clusters(df_scaled, max_k=10)
print(f"\nOptimal number of clusters (Elbow method): {optimal_k}")

# %%
# Additional validation methods
# Fits K-means for every candidate k, scores each fit with three internal
# validity indices, then plots each index against k and marks its preferred k.
print("\nPerforming additional cluster validation...")

# All three indices need at least two clusters. Filtering once here keeps the
# x values and every score list the same length (previously the silhouette
# scores were plotted against a list that was one element shorter).
k_values = [k for k in cluster_range if k > 1]

# Calculate metrics for different k values (one entry per element of k_values)
silhouette_scores = []
davies_bouldin_scores = []
calinski_harabasz_scores = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(df_scaled)

    silhouette_scores.append(silhouette_score(df_scaled, clusters))
    davies_bouldin_scores.append(davies_bouldin_score(df_scaled, clusters))
    calinski_harabasz_scores.append(calinski_harabasz_score(df_scaled, clusters))

# Preferred k per index: highest silhouette / Calinski-Harabasz, lowest Davies-Bouldin.
best_k_silhouette = k_values[int(np.argmax(silhouette_scores))]
best_k_db = k_values[int(np.argmin(davies_bouldin_scores))]
best_k_ch = k_values[int(np.argmax(calinski_harabasz_scores))]

# Create visualization
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# One row per panel:
# (scores, line format, y label, title, preferred k, y position of the annotation)
panels = [
    (silhouette_scores, 'bo-', 'Silhouette Score',
     'Silhouette Score (Higher is better)',
     best_k_silhouette, max(silhouette_scores) * 0.95),
    (davies_bouldin_scores, 'go-', 'Davies-Bouldin Score',
     'Davies-Bouldin Index (Lower is better)',
     best_k_db, min(davies_bouldin_scores) * 1.05),
    (calinski_harabasz_scores, 'mo-', 'Calinski-Harabasz Score',
     'Calinski-Harabasz Index (Higher is better)',
     best_k_ch, max(calinski_harabasz_scores) * 0.95),
]

for ax, (scores, line_format, y_label, title, best_k, text_y) in zip(axes, panels):
    ax.plot(k_values, scores, line_format, linewidth=2, markersize=8)
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
    # Dashed vertical line plus a text note at the preferred k.
    ax.axvline(x=best_k, color='r', linestyle='--', alpha=0.5)
    ax.text(best_k + 0.1, text_y, f'Best: k={best_k}', color='red')

plt.tight_layout()
plt.savefig('../results/clusters/cluster_validation_metrics.png', dpi=300, bbox_inches='tight')
plt.show()

# %%
# Display optimal k from different methods
# Prints the k preferred by each selection method and fixes `final_k`, the
# cluster count used by every later cell.
print("="*60)
print("OPTIMAL CLUSTER DETERMINATION SUMMARY")
print("="*60)
print(f"\nElbow Method (WCSS): k = {optimal_k}")
print(f"Silhouette Score:    k = {best_k_silhouette}")
print(f"Davies-Bouldin:      k = {best_k_db}")
print(f"Calinski-Harabasz:   k = {best_k_ch}")

# Choose final k (majority vote or business decision)
final_k = optimal_k  # Using elbow method result
print(f"\nSelected number of clusters: k = {final_k}")

# Business context considerations
# The cluster count is taken from final_k rather than a hard-coded "5", so the
# text always matches the value that was actually selected.
print("\nBusiness Considerations:")
print(f"- {final_k} clusters provide good differentiation for targeted interventions")
print("- Cluster sizes will be manageable for training programs")
print("- Enough granularity without being too complex")

# %% [markdown]
# ## 3. K-Means Clustering

# %%
# Final K-means fit with the selected number of clusters.
print(f"\nPerforming K-means clustering with k={final_k}...")

kmeans_params = dict(
    n_clusters=final_k,
    random_state=42,
    n_init=20,         # several restarts from different starting centroids
    max_iter=300,
    tol=1e-4,
    init='k-means++',  # smart initialization
)
kmeans = KMeans(**kmeans_params)

# Assign every row to a cluster and attach the assignment to a copy of the data.
clusters = kmeans.fit_predict(df_scaled)
df_clustered = df.copy()
df_clustered['cluster'] = clusters
# Human-readable label; cluster ids are 0-based, the labels are 1-based.
df_clustered['cluster_label'] = df_clustered['cluster'].map(lambda cluster_id: f'Cluster {cluster_id+1}')

print("Clustering completed successfully!")
print(f"\nCluster distribution:")
cluster_counts = df_clustered['cluster'].value_counts().sort_index()
n_drivers = len(df_clustered)
for cluster_id, n_members in cluster_counts.items():
    share = (n_members / n_drivers) * 100
    print(f"  Cluster {cluster_id+1}: {n_members} drivers ({share:.1f}%)")

# %%
# Internal validity indices for the final clustering.
print("\nCluster Validation Metrics:")
silhouette_avg = silhouette_score(df_scaled, clusters)
db_index = davies_bouldin_score(df_scaled, clusters)
ch_index = calinski_harabasz_score(df_scaled, clusters)

for metric_line in (
    f"Silhouette Score: {silhouette_avg:.3f}",
    f"Davies-Bouldin Index: {db_index:.3f}",
    f"Calinski-Harabasz Index: {ch_index:.3f}",
):
    print(metric_line)

# Verdict tables, checked top to bottom; the first satisfied bound wins and
# the fallback message is used when none is satisfied.
silhouette_verdicts = (
    (0.7, "✓ Excellent cluster structure (Silhouette > 0.7)"),
    (0.5, "✓ Reasonable cluster structure (Silhouette > 0.5)"),
    (0.25, "✓ Weak but possible cluster structure (Silhouette > 0.25)"),
)
separation_verdicts = (
    (0.7, "✓ Good cluster separation (DB Index < 0.7)"),
    (1.0, "✓ Acceptable cluster separation (DB Index < 1.0)"),
)

print("\nInterpretation:")
# Silhouette: higher is better, so the score must exceed the bound.
print(next(
    (message for lower_bound, message in silhouette_verdicts if silhouette_avg > lower_bound),
    "✗ No substantial cluster structure",
))
# Davies-Bouldin: lower is better, so the index must stay below the bound.
print(next(
    (message for upper_bound, message in separation_verdicts if db_index < upper_bound),
    "✗ Poor cluster separation",
))

# %% [markdown]
# ## 4. Cluster Analysis and Interpretation

# %%
# Per-cluster summary: mean of every clustering feature, plus the cluster's
# size and its share of all rows.
print("Analyzing cluster characteristics...")

by_cluster = df_clustered.groupby('cluster')
cluster_sizes = by_cluster.size()
cluster_profiles = by_cluster[available_features].mean().assign(
    cluster_size=cluster_sizes,
    percentage=(cluster_sizes / len(df_clustered) * 100).round(1),
)

# Rich table output; colours are graded column by column (axis=0).
print("\nCluster Profiles (Mean Values):")
styled_profiles = cluster_profiles.style.background_gradient(cmap='viridis', axis=0)
display(styled_profiles.format("{:.3f}"))

# %%
# Rescale the cluster means to [0, 1] per feature so that features measured
# on different scales can be compared side by side.
print("\nCreating normalized cluster profiles...")

scaler_minmax = MinMaxScaler()
features_normalized = scaler_minmax.fit_transform(cluster_profiles[available_features])
cluster_profiles_normalized = pd.DataFrame(
    features_normalized,
    index=cluster_profiles.index,
    columns=available_features,
)

# Carry the size columns over unchanged (they are not rescaled).
for size_column in ('cluster_size', 'percentage'):
    cluster_profiles_normalized[size_column] = cluster_profiles[size_column]

print("Normalized Cluster Profiles (0-1 scale):")
styled_normalized = cluster_profiles_normalized.style.background_gradient(cmap='viridis', axis=0)
display(styled_normalized.format("{:.3f}"))

# %%
# Visualize cluster characteristics
# Draws one radar (polar) chart per cluster from the 0-1 normalized cluster
# means of a handful of headline features, and saves the figure.
print("Creating cluster visualization...")

# Select top features for visualization
top_features = [
    'safety_score',
    'fuel_efficiency_composite',
    'aggressive_index',
    'harsh_accel_count',
    'harsh_brake_count',
    'speed_p90',
    'rpm_mean',
    'time_consistency'
]

# Filter to available top features
available_top_features = [f for f in top_features if f in cluster_profiles.columns]

from math import ceil, pi

# Normalized (0-1) values, one row per cluster
radar_data = cluster_profiles_normalized[available_top_features].copy()

# Size the grid from the number of clusters so that none is dropped. With 4-6
# clusters this gives the same 2x3 layout and figure size as before.
n_panels = len(radar_data)
n_cols = 3
n_rows = max(1, ceil(n_panels / n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows),
                         subplot_kw=dict(projection='polar'))
axes = np.atleast_1d(axes).flatten()

# Categories for radar chart
categories = available_top_features
N = len(categories)

# Angle of each feature axis; the first angle is repeated to close the polygon
angles = [n / float(N) * 2 * pi for n in range(N)]
angles += angles[:1]

# Plot each cluster on its own axes
for ax, (cluster_num, row) in zip(axes, radar_data.iterrows()):
    # Values for each category, closed the same way as the angles
    values = row.values.tolist()
    values += values[:1]

    # Draw polygon
    ax.plot(angles, values, 'o-', linewidth=2)
    ax.fill(angles, values, alpha=0.25)

    # Set category labels
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories, size=8)
    ax.set_ylim(0, 1)

    # Set title
    size = int(cluster_profiles.loc[cluster_num, 'cluster_size'])
    pct = cluster_profiles.loc[cluster_num, 'percentage']
    ax.set_title(f'Cluster {cluster_num+1}\n{size} drivers ({pct}%)',
                 size=10, fontweight='bold')

    # Add grid
    ax.grid(True)

# Hide grid cells that have no cluster to show instead of leaving empty axes.
for unused_ax in axes[n_panels:]:
    unused_ax.set_visible(False)

# Adjust layout
plt.tight_layout()
plt.suptitle('Cluster Profiles - Radar Charts', y=1.02, fontsize=14, fontweight='bold')
plt.savefig('../results/clusters/cluster_radar_charts.png', dpi=300, bbox_inches='tight')
plt.show()

# %%
# Create parallel coordinates plot
# One line per row across the headline features, coloured by cluster.
print("Creating parallel coordinates plot...")

from pandas.plotting import parallel_coordinates

# Prepare data for parallel coordinates.
# The standardized features are used (not the raw ones) because all features
# share a single y-axis; on raw values the largest-scale feature would flatten
# every other one. df_scaled shares its row index with df_clustered, so the
# cluster column aligns row by row.
parallel_data = df_scaled[available_top_features].copy()
parallel_data.insert(0, 'cluster', df_clustered['cluster'].astype(str))

plt.figure(figsize=(14, 8))
parallel_coordinates(parallel_data, 'cluster', colormap='viridis', alpha=0.5)
plt.title('Parallel Coordinates Plot of Driver Clusters', fontsize=14, pad=20)
plt.xlabel('Features', fontsize=12)
plt.ylabel('Standardized Values (z-scores)', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3)
plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig('../results/clusters/parallel_coordinates.png', dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# ## 5. Dimensionality Reduction for Visualization

# %%
# Perform PCA for 2D visualization
# Projects the standardized features onto the first two principal components
# and plots the rows coloured by cluster, with the K-means centroids overlaid.
print("Creating 2D visualization using PCA...")

pca = PCA(n_components=2)
X_pca = pca.fit_transform(df_scaled)

# Create PCA dataframe
pca_df = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])
pca_df['cluster'] = clusters
pca_df['cluster_label'] = pca_df['cluster'].apply(lambda x: f'Cluster {x+1}')

# Plot PCA results
plt.figure(figsize=(12, 8))
scatter = plt.scatter(pca_df['PC1'], pca_df['PC2'],
                      c=pca_df['cluster'], cmap='viridis',
                      alpha=0.7, s=50, edgecolor='black', linewidth=0.5)

# Add centroids. They are wrapped in a frame with the training column names so
# the projection receives the same feature names the PCA was fitted with.
centroids_frame = pd.DataFrame(kmeans.cluster_centers_, columns=df_scaled.columns)
centroids_pca = pca.transform(centroids_frame)
centroid_marks = plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1],
                             c='red', marker='X', s=200, label='Centroids',
                             edgecolor='black', linewidth=1.5)

plt.xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
plt.ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
plt.title('Driver Clusters - PCA Visualization', fontsize=14, pad=20)

# Passing explicit handles to legend() ignores artist labels, so the centroid
# marker is appended by hand; otherwise it never shows up in the legend.
legend_handles, legend_labels = scatter.legend_elements()
plt.legend(list(legend_handles) + [centroid_marks],
           list(legend_labels) + ['Centroids'],
           title="Clusters", loc='upper right')
plt.grid(True, alpha=0.3)
plt.colorbar(scatter, label='Cluster')
plt.tight_layout()
plt.savefig('../results/clusters/pca_cluster_visualization.png', dpi=300, bbox_inches='tight')
plt.show()

# %%
# Perform t-SNE for better separation visualization
# Embeds the standardized features in 2D with t-SNE and plots the rows
# coloured by cluster.
print("Creating t-SNE visualization...")

# scikit-learn requires perplexity < n_samples; cap the usual value of 30 so
# that small datasets do not raise.
tsne_perplexity = min(30, max(len(df_scaled) - 1, 1))

# The iteration count is left at the library default (1000, the value that was
# previously passed explicitly). The keyword was renamed from `n_iter` to
# `max_iter` in newer scikit-learn releases, so naming either one breaks on
# some versions.
tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity)
X_tsne = tsne.fit_transform(df_scaled)

# Create t-SNE dataframe
tsne_df = pd.DataFrame(data=X_tsne, columns=['TSNE1', 'TSNE2'])
tsne_df['cluster'] = clusters
tsne_df['cluster_label'] = tsne_df['cluster'].apply(lambda x: f'Cluster {x+1}')

# Plot t-SNE results
plt.figure(figsize=(12, 8))
scatter = plt.scatter(tsne_df['TSNE1'], tsne_df['TSNE2'],
                      c=tsne_df['cluster'], cmap='viridis',
                      alpha=0.7, s=50, edgecolor='black', linewidth=0.5)

plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.title('Driver Clusters - t-SNE Visualization', fontsize=14, pad=20)
plt.legend(*scatter.legend_elements(), title="Clusters", loc='upper right')
plt.grid(True, alpha=0.3)
plt.colorbar(scatter, label='Cluster')
plt.tight_layout()
plt.savefig('../results/clusters/tsne_cluster_visualization.png', dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# ## 6. Cluster Interpretation and Labeling

# %%
# Interpret and label clusters based on characteristics
# Builds `z_df` (how far each cluster mean lies from the overall mean, in
# overall standard deviations), then derives a short label and a one-line
# description per cluster from those z-scores.
print("Interpreting and labeling clusters...")

# z-score of each cluster's mean relative to the overall distribution
z_scores = {}
for feature in available_top_features:
    overall_mean = df_clustered[feature].mean()
    overall_std = df_clustered[feature].std()

    cluster_means = df_clustered.groupby('cluster')[feature].mean()
    z_scores[feature] = (cluster_means - overall_mean) / overall_std

z_df = pd.DataFrame(z_scores)

# Trait bands per feature, checked in order; the first band that matches wins.
# A positive cutoff means "z above the cutoff", a negative cutoff means
# "z below the cutoff". Features are listed in the order their traits should
# appear in the label.
trait_bands = {
    'safety_score': [
        (1.0, 'Very Safe'), (0.5, 'Safe'),
        (-1.0, 'Risky'), (-0.5, 'Moderately Risky'),
    ],
    'fuel_efficiency_composite': [
        (1.0, 'Highly Efficient'), (0.5, 'Efficient'),
        (-1.0, 'Fuel Inefficient'), (-0.5, 'Moderately Inefficient'),
    ],
    'aggressive_index': [
        (1.0, 'Highly Aggressive'), (0.5, 'Aggressive'),
        (-1.0, 'Very Cautious'),
    ],
    'speed_p90': [
        (1.0, 'Fast Driver'), (-1.0, 'Slow Driver'),
    ],
}

# Define cluster labels based on characteristics
cluster_labels = {}
cluster_descriptions = {}

for cluster_num in range(final_k):
    # Get z-scores for this cluster
    cluster_z = z_df.loc[cluster_num]

    # Collect at most one trait per feature; a feature that is not available
    # is treated as z = 0 and therefore contributes no trait.
    characteristics = []
    for trait_feature, bands in trait_bands.items():
        z_value = cluster_z.get(trait_feature, 0)
        for cutoff, trait in bands:
            matched = z_value > cutoff if cutoff > 0 else z_value < cutoff
            if matched:
                characteristics.append(trait)
                break

    # Create label from the first two traits, or fall back to the cluster number
    if len(characteristics) > 0:
        label = f"{', '.join(characteristics[:2])} Drivers"
    else:
        label = f"Cluster {cluster_num+1}"

    # Create detailed description
    description = f"Cluster {cluster_num+1} ({cluster_profiles.loc[cluster_num, 'percentage']}% of drivers): "
    description += f"Characterized by "

    # Three features with the largest absolute z-score. Kept in its own name so
    # the notebook-level `top_features` list is not overwritten by a Series.
    strongest_features = cluster_z.abs().sort_values(ascending=False).head(3)
    desc_features = []
    for feature, abs_z in strongest_features.items():
        if abs_z > 0.5:  # Only mention significant features
            direction = "high" if cluster_z[feature] > 0 else "low"
            desc_features.append(f"{direction} {feature}")

    if desc_features:
        description += ", ".join(desc_features)
    else:
        description += "average performance across most metrics"

    cluster_labels[cluster_num] = label
    cluster_descriptions[cluster_num] = description

# Display cluster interpretations
print("\n" + "="*60)
print("CLUSTER INTERPRETATION AND LABELING")
print("="*60)

for cluster_num in range(final_k):
    print(f"\n{cluster_labels[cluster_num]}:")
    print(f"  - {cluster_descriptions[cluster_num]}")
    print(f"  - Size: {cluster_profiles.loc[cluster_num, 'cluster_size']} drivers")

    # Show key metrics (first four headline features, only if |z| > 0.5)
    print("  - Key Metrics (z-scores):")
    for feature in available_top_features[:4]:
        z = z_df.loc[cluster_num, feature]
        if abs(z) > 0.5:
            print(f"    * {feature}: {z:+.2f}")

# %% [markdown]
# ## 7. Driver Personas Development

# %%
# Build one persona record per cluster from its profile and z-scores.
print("\nCreating driver personas...")

# Follow-up actions keyed by risk level; a level that is not listed uses the
# default supplied at the lookup site.
monitoring_by_risk = {'Critical': 'Daily', 'High': 'Daily', 'Moderate': 'Weekly'}
priority_by_risk = {'Critical': 'Immediate', 'High': 'High', 'Moderate': 'Medium'}

# Score bands, checked top to bottom; the first bound the mean exceeds wins.
risk_bands = ((0.7, 'Low'), (0.5, 'Moderate'), (0.3, 'High'))
efficiency_bands = ((0.7, 'High'), (0.5, 'Moderate'))

# (feature, multiplier applied to the overall mean, training topic): the topic
# is recommended when the cluster mean exceeds multiplier * overall mean.
training_rules = (
    ('harsh_accel_count', 1.0, 'Smooth Acceleration'),
    ('harsh_brake_count', 1.0, 'Defensive Braking'),
    ('speed_p90', 1.1, 'Speed Management'),
    ('rpm_mean', 1.1, 'Gear Optimization'),
)

personas = {}

for cluster_num in range(final_k):
    # Record skeleton; the key order here is the column order of the saved table.
    persona = {
        'cluster_number': cluster_num + 1,
        'label': cluster_labels[cluster_num],
        'size': int(cluster_profiles.loc[cluster_num, 'cluster_size']),
        'percentage': float(cluster_profiles.loc[cluster_num, 'percentage']),
        'key_characteristics': [],
        'risk_level': '',
        'efficiency_level': '',
        'training_needs': [],
        'monitoring_frequency': '',
        'intervention_priority': ''
    }

    # Rows belonging to this cluster
    cluster_data = df_clustered[df_clustered['cluster'] == cluster_num]

    # Risk level from the cluster's mean safety score
    safety_mean = cluster_profiles.loc[cluster_num, 'safety_score']
    persona['risk_level'] = next(
        (level for bound, level in risk_bands if safety_mean > bound), 'Critical')

    # Efficiency level from the cluster's mean efficiency composite
    efficiency_mean = cluster_profiles.loc[cluster_num, 'fuel_efficiency_composite']
    persona['efficiency_level'] = next(
        (level for bound, level in efficiency_bands if efficiency_mean > bound), 'Low')

    # Up to three features with |z| > 0.8, strongest first
    cluster_z = z_df.loc[cluster_num]
    significant_features = cluster_z[abs(cluster_z) > 0.8].sort_values(key=abs, ascending=False)
    persona['key_characteristics'].extend(
        f"High {feature}" if z_score > 0 else f"Low {feature}"
        for feature, z_score in significant_features.head(3).items()
    )

    # Training topics for features on which the cluster is above the overall mean
    for feature, multiplier, topic in training_rules:
        if cluster_profiles.loc[cluster_num, feature] > df_clustered[feature].mean() * multiplier:
            persona['training_needs'].append(topic)

    # Follow-up cadence and priority are pure functions of the risk level
    persona['monitoring_frequency'] = monitoring_by_risk.get(persona['risk_level'], 'Monthly')
    persona['intervention_priority'] = priority_by_risk.get(persona['risk_level'], 'Low')

    personas[cluster_num] = persona

# Display personas
print("\n" + "="*60)
print("DRIVER PERSONAS")
print("="*60)

for persona in personas.values():
    training_text = ', '.join(persona['training_needs']) if persona['training_needs'] else 'Minimal'
    print(f"\n{'='*40}")
    print(f"PERSONA: {persona['label']}")
    print(f"{'='*40}")
    print(f"Cluster: {persona['cluster_number']}")
    print(f"Size: {persona['size']} drivers ({persona['percentage']}%)")
    print(f"Risk Level: {persona['risk_level']}")
    print(f"Efficiency Level: {persona['efficiency_level']}")
    print(f"Key Characteristics: {', '.join(persona['key_characteristics'])}")
    print(f"Training Needs: {training_text}")
    print(f"Monitoring Frequency: {persona['monitoring_frequency']}")
    print(f"Intervention Priority: {persona['intervention_priority']}")

# %% [markdown]
# ## 8. Cluster Validation and Stability

# %%
# Stability check 1: does K-means give the same partition for different seeds?
print("Validating cluster stability...")

print("\n1. Stability across different random seeds:")
seeds = [42, 123, 456, 789, 999]

# One label vector per seed, in the order of `seeds`.
different_seeds_results = [
    KMeans(n_clusters=final_k, random_state=seed, n_init=10).fit_predict(df_scaled)
    for seed in seeds
]

# Pairwise agreement (adjusted Rand index) between every pair of runs.
from sklearn.metrics import adjusted_rand_score

agreement_scores = [
    adjusted_rand_score(different_seeds_results[first], different_seeds_results[second])
    for first in range(len(seeds))
    for second in range(first + 1, len(seeds))
]

mean_agreement = np.mean(agreement_scores)
print(f"  Average agreement between different seeds: {mean_agreement:.3f}")
print(f"  Minimum agreement: {np.min(agreement_scores):.3f}")
print(f"  Maximum agreement: {np.max(agreement_scores):.3f}")

# First bound that the mean agreement exceeds decides the verdict.
stability_verdicts = (
    (0.9, "  ✓ Excellent cluster stability"),
    (0.7, "  ✓ Good cluster stability"),
    (0.5, "  ✓ Acceptable cluster stability"),
)
print(next(
    (verdict for bound, verdict in stability_verdicts if mean_agreement > bound),
    "  ✗ Poor cluster stability",
))

# %%
# Method 2: Compare with different clustering algorithms
# Compares the K-means partition with DBSCAN and agglomerative clustering,
# then measures how well it is reproduced on bootstrap resamples.
print("\n2. Comparison with alternative clustering algorithms:")

# Try DBSCAN
print("  Testing DBSCAN...")
dbscan = DBSCAN(eps=0.5, min_samples=5)
clusters_dbscan = dbscan.fit_predict(df_scaled)
# DBSCAN marks noise points with the label -1; do not count it as a cluster.
n_clusters_dbscan = len(set(clusters_dbscan)) - (1 if -1 in clusters_dbscan else 0)
print(f"    DBSCAN found {n_clusters_dbscan} clusters")

# Try Agglomerative Clustering
print("  Testing Agglomerative Clustering...")
agglo = AgglomerativeClustering(n_clusters=final_k)
clusters_agglo = agglo.fit_predict(df_scaled)

# Compare with K-means results (adjusted Rand index)
ari_dbscan = adjusted_rand_score(clusters, clusters_dbscan)
ari_agglo = adjusted_rand_score(clusters, clusters_agglo)

print(f"    Agreement with DBSCAN: {ari_dbscan:.3f}")
print(f"    Agreement with Agglomerative: {ari_agglo:.3f}")

# Method 3: Bootstrap stability
print("\n3. Bootstrap stability analysis...")
n_bootstraps = 10
bootstrap_scores = []

# Dedicated seeded generator so the resamples, and therefore the reported
# stability, are identical on every run of the notebook.
bootstrap_rng = np.random.default_rng(42)
original_labels = np.asarray(clusters)

for _ in range(n_bootstraps):
    # Create bootstrap sample (row positions drawn with replacement)
    bootstrap_indices = bootstrap_rng.choice(len(df_scaled), size=len(df_scaled), replace=True)
    X_bootstrap = df_scaled.iloc[bootstrap_indices]

    # Cluster bootstrap sample; n_init is explicit because the library default
    # differs between scikit-learn versions.
    kmeans_bootstrap = KMeans(n_clusters=final_k, random_state=42, n_init=10)
    clusters_bootstrap = kmeans_bootstrap.fit_predict(X_bootstrap)

    # Distinct original rows in the resample and the position of each row's
    # first occurrence in it, found in one pass instead of a list search per row.
    common_indices, first_positions = np.unique(bootstrap_indices, return_index=True)
    if len(common_indices) > 10:
        original_clusters_subset = original_labels[common_indices]
        bootstrap_clusters_subset = clusters_bootstrap[first_positions]

        score = adjusted_rand_score(original_clusters_subset, bootstrap_clusters_subset)
        bootstrap_scores.append(score)

if bootstrap_scores:
    print(f"  Average bootstrap stability: {np.mean(bootstrap_scores):.3f}")
    print(f"  Bootstrap stability range: [{np.min(bootstrap_scores):.3f}, {np.max(bootstrap_scores):.3f}]")
else:
    print("  Not enough distinct rows per resample to assess bootstrap stability")

# %% [markdown]
# ## 9. Business Insights and Recommendations

# %%
# Generate business insights from clustering
# Prints overall risk/efficiency counts, per-cluster breakdowns, and the
# recommendations derived from them.
print("Generating business insights...")

# Calculate overall statistics
total_drivers = len(df_clustered)
high_risk_drivers = df_clustered[df_clustered['safety_score'] < 0.4].shape[0]
low_efficiency_drivers = df_clustered[df_clustered['fuel_efficiency_composite'] < 0.4].shape[0]

print("\n" + "="*60)
print("BUSINESS INSIGHTS FROM CLUSTERING ANALYSIS")
print("="*60)

print(f"\n1. Overall Statistics:")
print(f"   - Total drivers analyzed: {total_drivers}")
print(f"   - High-risk drivers identified: {high_risk_drivers} ({high_risk_drivers/total_drivers*100:.1f}%)")
print(f"   - Low-efficiency drivers identified: {low_efficiency_drivers} ({low_efficiency_drivers/total_drivers*100:.1f}%)")

print(f"\n2. Cluster Distribution:")
for cluster_num in range(final_k):
    size = cluster_profiles.loc[cluster_num, 'cluster_size']
    pct = cluster_profiles.loc[cluster_num, 'percentage']
    label = cluster_labels[cluster_num]
    print(f"   - {label}: {size} drivers ({pct}%)")

print(f"\n3. Risk Analysis by Cluster:")
# (cluster number, percentage of the cluster's rows with safety_score < 0.4)
risk_analysis = []
for cluster_num in range(final_k):
    cluster_data = df_clustered[df_clustered['cluster'] == cluster_num]
    high_risk_in_cluster = cluster_data[cluster_data['safety_score'] < 0.4].shape[0]
    cluster_risk_pct = (high_risk_in_cluster / len(cluster_data)) * 100 if len(cluster_data) > 0 else 0
    risk_analysis.append((cluster_num, cluster_risk_pct))

# Sort by risk percentage, highest first
risk_analysis.sort(key=lambda x: x[1], reverse=True)

for cluster_num, risk_pct in risk_analysis:
    print(f"   - Cluster {cluster_num+1}: {risk_pct:.1f}% high-risk drivers")

print(f"\n4. Efficiency Analysis by Cluster:")
# (cluster number, mean efficiency composite of the cluster)
efficiency_analysis = []
for cluster_num in range(final_k):
    cluster_data = df_clustered[df_clustered['cluster'] == cluster_num]
    avg_efficiency = cluster_data['fuel_efficiency_composite'].mean()
    efficiency_analysis.append((cluster_num, avg_efficiency))

# Sort by efficiency, least efficient first
efficiency_analysis.sort(key=lambda x: x[1])

for cluster_num, efficiency in efficiency_analysis:
    label = cluster_labels[cluster_num]
    print(f"   - {label}: Average efficiency = {efficiency:.2f}")

print(f"\n5. Actionable Recommendations:")

# Priority 1: Critical clusters (more than 30% high-risk drivers).
# risk_analysis has been re-ordered by the sort above, so the cluster number
# must be read from each tuple; a list position no longer identifies a cluster.
critical_clusters = [cluster_num for cluster_num, risk_pct in risk_analysis if risk_pct > 30]

if critical_clusters:
    print("   a. IMMEDIATE ACTION REQUIRED for clusters with >30% high-risk drivers:")
    for cluster_num in critical_clusters:
        print(f"      - {cluster_labels[cluster_num]} (Cluster {cluster_num+1})")
    print("      Recommended: Mandatory retraining program")

# Priority 2: Efficiency improvement (clusters with mean efficiency below 0.5)
least_efficient_clusters = [c for c, eff in efficiency_analysis if eff < 0.5]
if least_efficient_clusters:
    print("\n   b. EFFICIENCY IMPROVEMENT opportunities:")
    for cluster_num in least_efficient_clusters[:2]:  # Top 2 least efficient
        print(f"      - {cluster_labels[cluster_num]} (Cluster {cluster_num+1})")
    print("      Recommended: Fuel efficiency training")

# Priority 3: Best practices sharing. The list is sorted ascending, so walk it
# in reverse to pick the two MOST efficient clusters above 0.7.
best_clusters = [c for c, eff in reversed(efficiency_analysis) if eff > 0.7][:2]
if best_clusters:
    print("\n   c. BEST PRACTICES from top-performing clusters:")
    for cluster_num in best_clusters:
        print(f"      - {cluster_labels[cluster_num]} (Cluster {cluster_num+1})")
    print("      Recommended: Peer mentoring program")

# %% [markdown]
# ## 10. Results Export and Next Steps

# %%
# Persist the clustering outputs: labelled rows, cluster profiles, personas
# and a JSON file of quality/stability metrics.
print("Saving clustering results...")

# Original rows plus the cluster id and a 1-based text label.
df_final = df.copy()
df_final['cluster'] = clusters
df_final['cluster_label'] = df_final['cluster'].map(lambda cluster_id: f'Cluster {cluster_id+1}')

# Persona attributes copied onto every row of the matching cluster.
# Maps output column -> key in the persona record.
persona_columns = {
    'persona_label': 'label',
    'risk_level': 'risk_level',
    'efficiency_level': 'efficiency_level',
    'intervention_priority': 'intervention_priority',
}
for cluster_id, persona in personas.items():
    in_cluster = df_final['cluster'] == cluster_id
    for column_name, persona_key in persona_columns.items():
        df_final.loc[in_cluster, column_name] = persona[persona_key]

# Destination of every artefact written below.
output_files = {
    'clustered_data': '../data/driver_data_clustered.csv',
    'cluster_profiles': '../results/clusters/cluster_profiles.csv',
    'personas': '../results/clusters/driver_personas.csv',
    'clustering_metrics': '../results/clusters/clustering_metrics.json'
}

# Labelled rows (row index not written)
df_final.to_csv(output_files['clustered_data'], index=False)
print(f"✓ Clustered data saved to: {output_files['clustered_data']}")

# Cluster profiles (index = cluster id)
cluster_profiles.to_csv(output_files['cluster_profiles'])
print(f"✓ Cluster profiles saved to: {output_files['cluster_profiles']}")

# Personas, one row per cluster
personas_df = pd.DataFrame.from_dict(personas, orient='index')
personas_df.to_csv(output_files['personas'])
print(f"✓ Driver personas saved to: {output_files['personas']}")

# Metrics are converted to built-in numeric types before JSON serialisation.
pc1_ratio = pca.explained_variance_ratio_[0]
pc2_ratio = pca.explained_variance_ratio_[1]
clustering_metrics = {
    'optimal_clusters': int(final_k),
    'silhouette_score': float(silhouette_avg),
    'davies_bouldin_index': float(db_index),
    'calinski_harabasz_index': float(ch_index),
    'cluster_distribution': cluster_counts.to_dict(),
    'pca_variance_explained': {
        'pc1': float(pc1_ratio),
        'pc2': float(pc2_ratio),
        'total_2d': float(pc1_ratio + pc2_ratio)
    },
    'stability_metrics': {
        'average_seed_agreement': float(np.mean(agreement_scores)) if agreement_scores else 0,
        'bootstrap_stability': float(np.mean(bootstrap_scores)) if bootstrap_scores else 0
    }
}

import json
with open(output_files['clustering_metrics'], 'w') as metrics_file:
    json.dump(clustering_metrics, metrics_file, indent=2)
print(f"✓ Clustering metrics saved to: {output_files['clustering_metrics']}")

# %%
# Closing banner: headline numbers, follow-up actions and the files written.
print("\n" + "="*60)
print("CLUSTERING ANALYSIS COMPLETE")
print("="*60)

print("\nSummary of Results:")
for summary_line in (
    f"• Optimal clusters identified: {final_k}",
    f"• Cluster quality: Silhouette = {silhouette_avg:.3f}",
    f"• High-risk drivers identified: {high_risk_drivers} ({high_risk_drivers/total_drivers*100:.1f}%)",
    f"• Driver personas created: {final_k} distinct profiles",
):
    print(summary_line)

print("\nNext Steps:")
for next_step in (
    "1. Review cluster profiles and personas",
    "2. Design targeted intervention programs",
    "3. Implement monitoring dashboards",
    "4. Schedule follow-up analysis in 3 months",
    "5. Integrate with existing safety programs",
):
    print(next_step)

print("\nFiles Created:")
for key, path in output_files.items():
    print(f"  • {key}: {path}")

SyntaxError: incomplete input (ipython-input-2308304716.py, line 688)