In [1]:
# %% [markdown]
# # Driver Behavior Analysis - Feature Engineering
#
# ## Overview
# This notebook focuses on creating meaningful features from raw telematics data for driver behavior analysis.
#
# ### Objectives:
# 1. Extract time-based behavior patterns
# 2. Calculate acceleration/deceleration metrics
# 3. Derive speed and RPM features
# 4. Create composite safety and efficiency scores
# 5. Prepare data for clustering analysis

# %% [markdown]
# ## 1. Setup and Configuration

# %%
# Import libraries
# Third-party stack used throughout the notebook: pandas/numpy for data,
# matplotlib/seaborn for figures, scipy/sklearn for statistics and modelling.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.signal import savgol_filter
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation warnings raised by the plotting calls further down.
warnings.filterwarnings('ignore')

# Set display options
# Caps how many columns/rows pandas prints, and fixes the global plot style
# and default colour palette for all figures that follow.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 50)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("viridis")

# Import custom modules
# The parent directory is appended to sys.path so the `src` package resolves;
# presumably the notebook is run from its own folder - TODO confirm.
import sys
sys.path.append('..')
from src.feature_extractor import FeatureExtractor
from src.data_processor import DataProcessor

# %%
# Load configuration and the processed dataset
from pathlib import Path

import yaml

# Single source of truth for the config location: it is read here and also
# handed to DataProcessor below.
CONFIG_PATH = '../config/config.yaml'
with open(CONFIG_PATH, 'r') as f:
    config = yaml.safe_load(f)

# Every figure in this notebook and the final metadata JSON are written to
# ../results/features. Create it up front so the first savefig() does not
# fail with FileNotFoundError on a fresh checkout.
Path('../results/features').mkdir(parents=True, exist_ok=True)

# Load processed data
print("Loading processed data...")
processor = DataProcessor(CONFIG_PATH)
df = processor.load_data()
df = processor.calculate_basic_metrics(df)

print(f"Dataset shape: {df.shape}")
# NOTE(review): len(df) is the row count; it equals the number of drivers only
# if the frame holds exactly one row per driver - confirm against load_data().
print(f"Number of drivers: {len(df)}")

# %% [markdown]
# ## 2. Time-Based Feature Extraction

# %%
# Build the extractor that every later feature-extraction cell reuses
extractor = FeatureExtractor()

# Derive the time-based features from a copy so `df` itself is not handed
# to the extractor
print("Extracting time-based features...")
df_time = extractor.extract_time_based_features(df.copy())

# Collect every column whose name mentions "time" (case-insensitive)
time_features = []
for column_name in df_time.columns:
    if 'time' in column_name.lower():
        time_features.append(column_name)

print(f"\nTime-based features created: {len(time_features)}")
print("\nSample time features:")
# One row per feature, rounded to 3 decimals; only the first 10 are shown
time_summary = df_time[time_features].describe().round(3).T
print(time_summary.head(10))

# %%
# Visualize time-based features on a 2x3 grid of panels and save the figure
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Time response distribution (dashed red line marks the mean)
axes[0, 0].hist(df_time['time_response_mean'].dropna(), bins=30, alpha=0.7)
axes[0, 0].axvline(df_time['time_response_mean'].mean(), color='red', linestyle='--', label=f'Mean: {df_time["time_response_mean"].mean():.2f}')
axes[0, 0].set_xlabel('Response Time (seconds)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Driver Response Time Distribution')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Time consistency score
axes[0, 1].hist(df_time['time_consistency'].dropna(), bins=30, alpha=0.7)
axes[0, 1].axvline(df_time['time_consistency'].mean(), color='red', linestyle='--', label=f'Mean: {df_time["time_consistency"].mean():.2f}')
axes[0, 1].set_xlabel('Time Consistency Score')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Time Consistency Distribution')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Response time vs consistency scatter, coloured by response-time std
scatter = axes[0, 2].scatter(df_time['time_response_mean'], df_time['time_consistency'],
                             alpha=0.6, c=df_time['time_response_std'], cmap='viridis')
axes[0, 2].set_xlabel('Mean Response Time')
axes[0, 2].set_ylabel('Time Consistency')
axes[0, 2].set_title('Response Time vs Consistency')
plt.colorbar(scatter, ax=axes[0, 2], label='Response Time STD')
axes[0, 2].grid(True, alpha=0.3)

# Percentile analysis
time_percentiles = ['time_p95', 'time_p99']
percentile_data = df_time[time_percentiles]
box_data = [percentile_data[col].dropna() for col in time_percentiles]
# boxplot's `labels=` keyword is deprecated (renamed `tick_labels` in
# Matplotlib 3.9); labelling the ticks afterwards works on every version.
axes[1, 0].boxplot(box_data)
axes[1, 0].set_xticklabels(['95th %ile', '99th %ile'])
axes[1, 0].set_ylabel('Time (seconds)')
axes[1, 0].set_title('Response Time Percentiles')
axes[1, 0].grid(True, alpha=0.3)

# Correlation heatmap of the time features
time_corr_features = ['time_response_mean', 'time_response_std', 'time_reaction_mean',
                      'time_reaction_std', 'time_consistency', 'time_p95', 'time_p99']
time_corr = df_time[time_corr_features].corr()
sns.heatmap(time_corr, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, ax=axes[1, 1])
axes[1, 1].set_title('Time Feature Correlations')

# Time variability distribution
axes[1, 2].hist(df_time['time_response_std'].dropna(), bins=30, alpha=0.7)
axes[1, 2].axvline(df_time['time_response_std'].mean(), color='red', linestyle='--',
                   label=f'Mean: {df_time["time_response_std"].mean():.2f}')
axes[1, 2].set_xlabel('Response Time STD')
axes[1, 2].set_ylabel('Frequency')
axes[1, 2].set_title('Response Time Variability')
axes[1, 2].legend()
axes[1, 2].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../results/features/time_features_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# ## 3. Acceleration Feature Extraction

# %%
# Add the acceleration features on top of the time-feature frame
print("Extracting acceleration features...")
df_accel = extractor.extract_acceleration_features(df_time)

# A column counts as an acceleration feature when its name contains any of
# these substrings
accel_keywords = ['accel', 'decel', 'jerk', 'harsh', 'smooth']
accel_features = [
    name for name in df_accel.columns
    if any(keyword in name for keyword in accel_keywords)
]
print(f"\nAcceleration features created: {len(accel_features)}")
print("\nSample acceleration features:")
# One row per feature, rounded to 3 decimals; only the first 15 are shown
accel_summary = df_accel[accel_features].describe().round(3).T
print(accel_summary.head(15))

# %%
# Visualize acceleration features on a 2x3 grid of panels and save the figure
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Harsh acceleration count distribution (dashed red line marks the mean)
axes[0, 0].hist(df_accel['harsh_accel_count'].dropna(), bins=30, alpha=0.7)
axes[0, 0].axvline(df_accel['harsh_accel_count'].mean(), color='red', linestyle='--',
                   label=f'Mean: {df_accel["harsh_accel_count"].mean():.2f}')
axes[0, 0].set_xlabel('Harsh Acceleration Count')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Harsh Acceleration Distribution')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Harsh braking count distribution
axes[0, 1].hist(df_accel['harsh_brake_count'].dropna(), bins=30, alpha=0.7)
axes[0, 1].axvline(df_accel['harsh_brake_count'].mean(), color='red', linestyle='--',
                   label=f'Mean: {df_accel["harsh_brake_count"].mean():.2f}')
axes[0, 1].set_xlabel('Harsh Braking Count')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Harsh Braking Distribution')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Aggressiveness vs smooth driving, coloured by jerk score
scatter = axes[0, 2].scatter(df_accel['accel_aggressiveness'], df_accel['smooth_driving_score'],
                             alpha=0.6, c=df_accel['jerk_score'], cmap='plasma')
axes[0, 2].set_xlabel('Acceleration Aggressiveness')
axes[0, 2].set_ylabel('Smooth Driving Score')
axes[0, 2].set_title('Aggressiveness vs Smooth Driving')
plt.colorbar(scatter, ax=axes[0, 2], label='Jerk Score')
axes[0, 2].grid(True, alpha=0.3)

# Extreme events comparison
extreme_events = ['very_harsh_accel_count', 'harsh_accel_count',
                  'very_harsh_brake_count', 'harsh_brake_count']
extreme_data = df_accel[extreme_events]
box_data = [extreme_data[col].dropna() for col in extreme_events]
# boxplot's `labels=` keyword is deprecated (renamed `tick_labels` in
# Matplotlib 3.9); labelling the ticks afterwards works on every version.
axes[1, 0].boxplot(box_data)
axes[1, 0].set_xticklabels(['V.H. Accel', 'H. Accel', 'V.H. Brake', 'H. Brake'])
axes[1, 0].set_ylabel('Event Count')
axes[1, 0].set_title('Extreme Event Distribution')
axes[1, 0].tick_params(axis='x', rotation=45)
axes[1, 0].grid(True, alpha=0.3)

# Correlation heatmap of the acceleration features
accel_corr_features = ['accel_aggressiveness', 'decel_aggressiveness', 'jerk_score',
                       'smooth_driving_score', 'harsh_accel_count', 'harsh_brake_count']
accel_corr = df_accel[accel_corr_features].corr()
sns.heatmap(accel_corr, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, ax=axes[1, 1])
axes[1, 1].set_title('Acceleration Feature Correlations')

# Smooth driving score distribution
axes[1, 2].hist(df_accel['smooth_driving_score'].dropna(), bins=30, alpha=0.7)
axes[1, 2].axvline(df_accel['smooth_driving_score'].mean(), color='red', linestyle='--',
                   label=f'Mean: {df_accel["smooth_driving_score"].mean():.2f}')
axes[1, 2].set_xlabel('Smooth Driving Score')
axes[1, 2].set_ylabel('Frequency')
axes[1, 2].set_title('Smooth Driving Distribution')
axes[1, 2].legend()
axes[1, 2].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../results/features/acceleration_features_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# ## 4. Speed Feature Extraction

# %%
# Add the speed features on top of the acceleration-feature frame
print("Extracting speed features...")
df_speed = extractor.extract_speed_features(df_accel)

# Keep columns that mention "speed" (case-insensitive) and were not already
# present in the frame as loaded
loaded_columns = df.columns
speed_features = []
for name in df_speed.columns:
    if 'speed' in name.lower() and name not in loaded_columns:
        speed_features.append(name)

print(f"\nSpeed features created: {len(speed_features)}")
print("\nSample speed features:")
# One row per feature, rounded to 3 decimals; only the first 15 are shown
speed_summary = df_speed[speed_features].describe().round(3).T
print(speed_summary.head(15))

# %%
# Visualize speed features on a 2x3 grid of panels and save the figure
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Average speed distribution (dashed red line marks the mean)
axes[0, 0].hist(df_speed['speed_mean'].dropna(), bins=30, alpha=0.7)
axes[0, 0].axvline(df_speed['speed_mean'].mean(), color='red', linestyle='--',
                   label=f'Mean: {df_speed["speed_mean"].mean():.2f} mph')
axes[0, 0].set_xlabel('Average Speed (mph)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Average Speed Distribution')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Speed consistency distribution
axes[0, 1].hist(df_speed['speed_consistency'].dropna(), bins=30, alpha=0.7)
axes[0, 1].axvline(df_speed['speed_consistency'].mean(), color='red', linestyle='--',
                   label=f'Mean: {df_speed["speed_consistency"].mean():.2f}')
axes[0, 1].set_xlabel('Speed Consistency Score')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Speed Consistency Distribution')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# High speed ratio distribution
axes[0, 2].hist(df_speed['high_speed_ratio'].dropna(), bins=30, alpha=0.7)
axes[0, 2].axvline(df_speed['high_speed_ratio'].mean(), color='red', linestyle='--',
                   label=f'Mean: {df_speed["high_speed_ratio"].mean():.2f}')
axes[0, 2].set_xlabel('High Speed Ratio')
axes[0, 2].set_ylabel('Frequency')
axes[0, 2].set_title('High Speed Driving Ratio')
axes[0, 2].legend()
axes[0, 2].grid(True, alpha=0.3)

# Speed percentiles comparison
speed_percentiles = ['speed_p50', 'speed_p75', 'speed_p90', 'speed_p95', 'speed_p99']
percentile_data = df_speed[speed_percentiles]
box_data = [percentile_data[col].dropna() for col in speed_percentiles]
# boxplot's `labels=` keyword is deprecated (renamed `tick_labels` in
# Matplotlib 3.9); labelling the ticks afterwards works on every version.
axes[1, 0].boxplot(box_data)
axes[1, 0].set_xticklabels(['50%', '75%', '90%', '95%', '99%'])
axes[1, 0].set_ylabel('Speed (mph)')
axes[1, 0].set_title('Speed Percentile Distribution')
axes[1, 0].grid(True, alpha=0.3)

# Correlation heatmap of the speed features
speed_corr_features = ['speed_mean', 'speed_std', 'speed_cv', 'speed_consistency',
                       'high_speed_ratio', 'speed_p90', 'speed_p95']
speed_corr = df_speed[speed_corr_features].corr()
sns.heatmap(speed_corr, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, ax=axes[1, 1])
axes[1, 1].set_title('Speed Feature Correlations')

# Speed variability vs consistency, coloured by coefficient of variation
scatter = axes[1, 2].scatter(df_speed['speed_std'], df_speed['speed_consistency'],
                             alpha=0.6, c=df_speed['speed_cv'], cmap='coolwarm')
axes[1, 2].set_xlabel('Speed Standard Deviation')
axes[1, 2].set_ylabel('Speed Consistency Score')
axes[1, 2].set_title('Speed Variability vs Consistency')
plt.colorbar(scatter, ax=axes[1, 2], label='Coefficient of Variation')
axes[1, 2].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../results/features/speed_features_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# ## 5. RPM Feature Extraction

# %%
# Add the RPM features on top of the speed-feature frame
print("Extracting RPM features...")
df_rpm = extractor.extract_rpm_features(df_speed)

# Keep columns that mention "rpm" (case-insensitive) and were not already
# present in the frame as loaded
loaded_columns = df.columns
rpm_features = []
for name in df_rpm.columns:
    if 'rpm' in name.lower() and name not in loaded_columns:
        rpm_features.append(name)

print(f"\nRPM features created: {len(rpm_features)}")
print("\nSample RPM features:")
# One row per feature, rounded to 3 decimals; only the first 10 are shown
rpm_summary = df_rpm[rpm_features].describe().round(3).T
print(rpm_summary.head(10))

# %%
# Draw the six RPM panels (2 rows x 3 columns) and save the figure
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Panel (0, 0): average RPM histogram with its mean and the 2000-2500 band
ax = axes[0, 0]
rpm_mean_avg = df_rpm['rpm_mean'].mean()
ax.hist(df_rpm['rpm_mean'].dropna(), bins=30, alpha=0.7)
ax.axvline(rpm_mean_avg, color='red', linestyle='--',
           label=f'Mean: {rpm_mean_avg:.0f} RPM')
ax.axvline(2000, color='green', linestyle='-', alpha=0.5, label='Optimal Min (2000)')
ax.axvline(2500, color='green', linestyle='-', alpha=0.5, label='Optimal Max (2500)')
ax.set(xlabel='Average RPM', ylabel='Frequency', title='Average RPM Distribution')
ax.legend()
ax.grid(True, alpha=0.3)

# Panel (0, 1): RPM efficiency score histogram
ax = axes[0, 1]
efficiency_avg = df_rpm['rpm_efficiency_score'].mean()
ax.hist(df_rpm['rpm_efficiency_score'].dropna(), bins=30, alpha=0.7)
ax.axvline(efficiency_avg, color='red', linestyle='--',
           label=f'Mean: {efficiency_avg:.2f}')
ax.set(xlabel='RPM Efficiency Score', ylabel='Frequency', title='RPM Efficiency Distribution')
ax.legend()
ax.grid(True, alpha=0.3)

# Panel (0, 2): RPM standard deviation histogram
ax = axes[0, 2]
rpm_std_avg = df_rpm['rpm_std'].mean()
ax.hist(df_rpm['rpm_std'].dropna(), bins=30, alpha=0.7)
ax.axvline(rpm_std_avg, color='red', linestyle='--',
           label=f'Mean: {rpm_std_avg:.0f}')
ax.set(xlabel='RPM Standard Deviation', ylabel='Frequency', title='RPM Variability Distribution')
ax.legend()
ax.grid(True, alpha=0.3)

# Panel (1, 0): RPM range histogram
ax = axes[1, 0]
rpm_range_avg = df_rpm['rpm_range'].mean()
ax.hist(df_rpm['rpm_range'].dropna(), bins=30, alpha=0.7)
ax.axvline(rpm_range_avg, color='red', linestyle='--',
           label=f'Mean: {rpm_range_avg:.0f}')
ax.set(xlabel='RPM Range', ylabel='Frequency', title='RPM Range Distribution')
ax.legend()
ax.grid(True, alpha=0.3)

# Panel (1, 1): pairwise correlations between the RPM features
rpm_corr_features = ['rpm_mean', 'rpm_std', 'rpm_range', 'rpm_efficiency_score',
                     'rpm_variability_score']
rpm_corr = df_rpm[rpm_corr_features].corr()
ax = axes[1, 1]
sns.heatmap(rpm_corr, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, ax=ax)
ax.set_title('RPM Feature Correlations')

# Panel (1, 2): efficiency against variability, coloured by average RPM
ax = axes[1, 2]
scatter = ax.scatter(df_rpm['rpm_efficiency_score'], df_rpm['rpm_variability_score'],
                     alpha=0.6, c=df_rpm['rpm_mean'], cmap='plasma')
ax.set(xlabel='RPM Efficiency Score', ylabel='RPM Variability Score',
       title='RPM Efficiency vs Variability')
fig.colorbar(scatter, ax=ax, label='Average RPM')
ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('../results/features/rpm_features_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# ## 6. Composite Feature Engineering

# %%
# Add the composite features on top of the RPM-feature frame
print("Creating composite features...")
df_composite = extractor.extract_composite_features(df_rpm)

# A column counts as composite when its name contains any of these substrings
composite_keywords = ['score', 'index', 'style', 'composite']
composite_features = [
    name for name in df_composite.columns
    if any(keyword in name for keyword in composite_keywords)
]
print(f"\nComposite features created: {len(composite_features)}")
print("\nComposite features:")
# One row per feature, rounded to 3 decimals; the full table is printed
composite_summary = df_composite[composite_features].describe().round(3).T
print(composite_summary)

# %%
# Draw the six composite-feature panels (2 rows x 3 columns) and save the figure
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Panel (0, 0): safety score histogram with its mean
ax = axes[0, 0]
safety_avg = df_composite['safety_score'].mean()
ax.hist(df_composite['safety_score'].dropna(), bins=30, alpha=0.7)
ax.axvline(safety_avg, color='red', linestyle='--',
           label=f'Mean: {safety_avg:.2f}')
ax.set(xlabel='Safety Score', ylabel='Frequency', title='Safety Score Distribution')
ax.legend()
ax.grid(True, alpha=0.3)

# Panel (0, 1): fuel efficiency composite histogram with its mean
ax = axes[0, 1]
fuel_avg = df_composite['fuel_efficiency_composite'].mean()
ax.hist(df_composite['fuel_efficiency_composite'].dropna(), bins=30, alpha=0.7)
ax.axvline(fuel_avg, color='red', linestyle='--',
           label=f'Mean: {fuel_avg:.2f}')
ax.set(xlabel='Fuel Efficiency Composite Score', ylabel='Frequency',
       title='Fuel Efficiency Distribution')
ax.legend()
ax.grid(True, alpha=0.3)

# Panel (0, 2): aggressive index histogram with its mean
ax = axes[0, 2]
aggressive_avg = df_composite['aggressive_index'].mean()
ax.hist(df_composite['aggressive_index'].dropna(), bins=30, alpha=0.7)
ax.axvline(aggressive_avg, color='red', linestyle='--',
           label=f'Mean: {aggressive_avg:.2f}')
ax.set(xlabel='Aggressive Index', ylabel='Frequency', title='Aggressive Driving Index')
ax.legend()
ax.grid(True, alpha=0.3)

# Panel (1, 0): one bar per driver style, each annotated with its count
ax = axes[1, 0]
style_counts = df_composite['driver_style'].value_counts()
bar_positions = range(len(style_counts))
ax.bar(bar_positions, style_counts.values)
ax.set_xticks(bar_positions)
ax.set_xticklabels(style_counts.index, rotation=45, ha='right')
ax.set(ylabel='Number of Drivers', title='Driver Style Classification')
for position, count in zip(bar_positions, style_counts.values):
    ax.text(position, count, str(count), ha='center', va='bottom')
ax.grid(True, alpha=0.3)

# Panel (1, 1): correlations between composite scores and selected raw features
composite_corr_features = ['safety_score', 'fuel_efficiency_composite',
                           'aggressive_index', 'harsh_accel_count',
                           'harsh_brake_count', 'speed_p90']
composite_corr = df_composite[composite_corr_features].corr()
ax = axes[1, 1]
sns.heatmap(composite_corr, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, ax=ax)
ax.set_title('Composite Feature Correlations')

# Panel (1, 2): safety against fuel efficiency, one colour per driver style
ax = axes[1, 2]
styles = df_composite['driver_style'].unique()
colors = plt.cm.Set3(np.linspace(0, 1, len(styles)))
for style_name, style_color in zip(styles, colors):
    mask = df_composite['driver_style'] == style_name
    drivers_in_style = df_composite[mask]
    ax.scatter(drivers_in_style['safety_score'],
               drivers_in_style['fuel_efficiency_composite'],
               alpha=0.6, label=style_name, color=style_color, s=50)
ax.set(xlabel='Safety Score', ylabel='Fuel Efficiency Score',
       title='Safety vs Fuel Efficiency')
# Legend is anchored outside the axes, to the right of the panel
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('../results/features/composite_features_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# ## 7. Feature Selection and Correlation Analysis

# %%
# Key features for the downstream analysis, grouped by theme. The flattened
# list keeps the groups in this order and the names in the order written.
key_feature_groups = {
    'safety': [
        'safety_score',
        'harsh_accel_count',
        'harsh_brake_count',
        'very_harsh_accel_count',
        'very_harsh_brake_count',
    ],
    'efficiency': [
        'fuel_efficiency_composite',
        'rpm_efficiency_score',
        'smooth_driving_score',
        'speed_consistency',
        'time_consistency',
    ],
    'speed': [
        'speed_mean',
        'speed_std',
        'speed_p90',
        'speed_p95',
        'high_speed_ratio',
    ],
    'aggressiveness': [
        'aggressive_index',
        'accel_aggressiveness',
        'decel_aggressiveness',
        'jerk_score',
    ],
    'rpm': [
        'rpm_mean',
        'rpm_std',
        'rpm_range',
    ],
    'time': [
        'time_response_mean',
        'time_response_std',
        'time_p95',
    ],
}
key_features = [name for group in key_feature_groups.values() for name in group]

# Keep only the names that actually exist as columns of the composite frame
available_features = []
for name in key_features:
    if name in df_composite.columns:
        available_features.append(name)
print(f"Selected {len(available_features)} key features for analysis")

# %%
# Pairwise correlations between the selected key features
print("Creating correlation matrix...")
correlation_matrix = df_composite[available_features].corr()

# Heatmap of the lower triangle only: the boolean mask hides the diagonal
# and everything above it
plt.figure(figsize=(16, 14))
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))
heatmap_options = dict(
    mask=mask,
    annot=False,
    cmap='coolwarm',
    center=0,
    square=True,
    cbar_kws={"shrink": 0.8},
)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Comprehensive Feature Correlation Matrix', fontsize=16, pad=20)
plt.tight_layout()
plt.savefig('../results/features/comprehensive_correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# %%
# Collect every feature pair whose absolute correlation exceeds 0.8
print("Identifying highly correlated features...")
corr_names = correlation_matrix.columns
corr_pairs = []
# triu_indices with k=1 visits the strict upper triangle row by row, i.e.
# each unordered pair once, in the same order as a nested i < j loop
for row, col in zip(*np.triu_indices(len(corr_names), k=1)):
    coefficient = correlation_matrix.iloc[row, col]
    if abs(coefficient) > 0.8:  # High correlation threshold
        corr_pairs.append((corr_names[row], corr_names[col], coefficient))

print(f"\nFound {len(corr_pairs)} feature pairs with |correlation| > 0.8")
if corr_pairs:
    print("\nHighly correlated feature pairs:")
    # Strongest pairs first, limited to ten lines of output
    strongest_pairs = sorted(corr_pairs, key=lambda pair: abs(pair[2]), reverse=True)[:10]
    for first_name, second_name, strength in strongest_pairs:
        print(f"{first_name} vs {second_name}: {strength:.3f}")

# %%
# Feature importance using mutual information (simulated)
print("\nCalculating feature importance...")
from sklearn.feature_selection import mutual_info_regression

# Feature matrix: missing values are replaced by the column mean.
X = df_composite[available_features].fillna(df_composite[available_features].mean())

# Simulated target ("safety incidents"). It is kept in its own Series rather
# than written into df_composite, so the synthetic column cannot leak into the
# saved feature file or the feature counts reported later.
# The inputs are mean-imputed like X; otherwise a single missing score would
# put NaN in the target and mutual_info_regression would reject it.
target_inputs = df_composite[['safety_score', 'harsh_accel_count', 'harsh_brake_count']]
target_inputs = target_inputs.fillna(target_inputs.mean())

# A local generator avoids reseeding NumPy's global RNG; RandomState(42)
# yields the same draws as np.random.seed(42) followed by np.random.normal.
rng = np.random.RandomState(42)
safety_incidents = (
    (1 - target_inputs['safety_score']) * 10 +
    target_inputs['harsh_accel_count'] * 0.5 +
    target_inputs['harsh_brake_count'] * 0.5 +
    rng.normal(0, 2, len(df_composite))
).clip(0, 20)

# NOTE(review): the target is built from safety_score and the harsh-event
# counts, which are themselves in key_features, so their high scores are
# partly true by construction.
mi_scores = mutual_info_regression(X, safety_incidents, random_state=42)

# Create feature importance DataFrame, most informative feature first
feature_importance = pd.DataFrame({
    'feature': available_features,
    'importance': mi_scores
}).sort_values('importance', ascending=False)

# Plot feature importance as horizontal bars, top feature at the top
plt.figure(figsize=(12, 8))
bars = plt.barh(range(len(feature_importance)), feature_importance['importance'])
plt.yticks(range(len(feature_importance)), feature_importance['feature'])
plt.xlabel('Mutual Information Score')
plt.title('Feature Importance for Safety Incident Prediction')
plt.gca().invert_yaxis()

# Add value labels just to the right of each bar
for i, importance in enumerate(feature_importance['importance']):
    plt.text(importance + 0.001, i, f'{importance:.3f}', va='center')

plt.tight_layout()
plt.savefig('../results/features/feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# ## 8. Dimensionality Reduction Analysis

# %%
# Perform PCA for dimensionality reduction
print("Performing PCA analysis...")

# Impute with the column mean (the same rule the mutual-information step
# uses) instead of zero: after standardisation a mean-filled value sits at 0,
# whereas a literal 0 can lie far outside a feature's real range and inflate
# its variance. The trailing fillna(0) only affects columns that are entirely
# missing and therefore have no mean.
pca_input = df_composite[available_features]
pca_input = pca_input.fillna(pca_input.mean()).fillna(0)

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(pca_input)

# Fit PCA keeping every component
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Plot cumulative explained variance against the number of components
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o')
plt.axhline(y=0.95, color='r', linestyle='--', alpha=0.5, label='95% Variance')
plt.axhline(y=0.90, color='g', linestyle='--', alpha=0.5, label='90% Variance')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA Explained Variance')
plt.grid(True, alpha=0.3)
plt.legend()

# Annotate every second point among the first ten components (1, 3, 5, 7, 9)
for i, (n, var) in enumerate(zip(range(1, 11), cumulative_variance[:10])):
    if i % 2 == 0:
        plt.annotate(f'{var:.1%}', (n, var), textcoords="offset points",
                     xytext=(0,10), ha='center')

plt.tight_layout()
plt.savefig('../results/features/pca_explained_variance.png', dpi=300, bbox_inches='tight')
plt.show()

# %%
# Report how many components reach the variance thresholds, then the loadings
print("\nPrincipal Components Analysis:")
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# First position where the running total exceeds the threshold, as a 1-based count
n_components_95 = np.flatnonzero(cumulative_variance > 0.95)[0] + 1
n_components_90 = np.flatnonzero(cumulative_variance > 0.90)[0] + 1

print(f"Components needed for 90% variance: {n_components_90}")
print(f"Components needed for 95% variance: {n_components_95}")

# For each of the first (up to) five components, list the ten features with
# the largest absolute loading
print("\nTop 5 Principal Component Loadings:")
n_components_shown = min(5, len(available_features))
for pc_number in range(1, n_components_shown + 1):
    component_index = pc_number - 1
    variance_share = pca.explained_variance_ratio_[component_index]
    print(f"\nPC{pc_number} (Explained Variance: {variance_share:.2%}):")
    loadings = pd.DataFrame({
        'feature': available_features,
        'loading': pca.components_[component_index]
    })
    # Row order by descending |loading|; the signed values are what get printed
    ranked_index = loadings['loading'].abs().sort_values(ascending=False).index
    top_features = loadings.loc[ranked_index]
    print(top_features.head(10).to_string(index=False))

# %% [markdown]
# ## 9. Feature Engineering Summary

# %%
# Thematic grouping of the engineered features used for the printed summary
feature_categories = {
    'Safety Features': ['safety_score', 'harsh_accel_count', 'harsh_brake_count',
                       'very_harsh_accel_count', 'very_harsh_brake_count'],
    'Efficiency Features': ['fuel_efficiency_composite', 'rpm_efficiency_score',
                          'smooth_driving_score', 'speed_consistency'],
    'Speed Features': ['speed_mean', 'speed_std', 'speed_p90', 'speed_p95',
                      'high_speed_ratio', 'speed_cv'],
    'Aggressiveness Features': ['aggressive_index', 'accel_aggressiveness',
                              'decel_aggressiveness', 'jerk_score'],
    'RPM Features': ['rpm_mean', 'rpm_std', 'rpm_range', 'rpm_variability_score'],
    'Time Features': ['time_response_mean', 'time_response_std', 'time_consistency',
                     'time_p95', 'time_p99'],
    'Composite Features': ['driver_style']
}

# Restrict each category to columns that exist; categories left empty are dropped
existing_by_category = {
    category: [name for name in names if name in df_composite.columns]
    for category, names in feature_categories.items()
}
available_categories = {
    category: names for category, names in existing_by_category.items() if names
}

# Headline counts
banner = "=" * 60
print(banner)
print("FEATURE ENGINEERING SUMMARY")
print(banner)
n_total_columns = len(df_composite.columns)
print(f"\nTotal features created: {n_total_columns - len(df.columns)}")
print(f"Total features available: {n_total_columns}")

# Per-category listing: at most five names, with an ellipsis when truncated
print("\nFeature Categories:")
for category, features in available_categories.items():
    print(f"\n{category} ({len(features)} features):")
    preview = ', '.join(features[:5])
    ellipsis = '...' if len(features) > 5 else ''
    print(f"  {preview}{ellipsis}")

# Descriptive statistics for the first two features of every category
print("\n\nKey Feature Statistics:")
key_stats_features = [
    name
    for features in available_categories.values()
    for name in features[:2]
]

stat_rows = ['mean', 'std', 'min', '50%', 'max']
key_stats = df_composite[key_stats_features].describe().loc[stat_rows].T
print(key_stats.round(3))

# %%
# Save engineered features and a JSON description of the analysis
import json
from pathlib import Path

print("\nSaving engineered features...")
output_path = '../data/driver_features_engineered.csv'
metadata_path = '../results/features/feature_metadata.json'

# Make sure both destination folders exist before writing
for destination in (output_path, metadata_path):
    Path(destination).parent.mkdir(parents=True, exist_ok=True)

# 'safety_incidents' is a simulated target drawn from random noise, not an
# engineered feature. If it is present in the frame, keep it out of the
# exported dataset and out of the feature counts.
engineered_frame = df_composite.drop(columns=['safety_incidents'], errors='ignore')
engineered_frame.to_csv(output_path, index=False)
print(f"Engineered features saved to: {output_path}")

# Record the ten strongest correlated pairs (largest |r| first), matching the
# ranking printed earlier, rather than the first ten in discovery order.
strongest_pairs = sorted(corr_pairs, key=lambda pair: abs(pair[2]), reverse=True)[:10]

# Save feature metadata
feature_metadata = {
    'total_features': len(engineered_frame.columns),
    'engineered_features': len(engineered_frame.columns) - len(df.columns),
    'feature_categories': available_categories,
    'key_features': available_features,
    'correlation_insights': {
        'highly_correlated_pairs': strongest_pairs,
        'feature_importance_top10': feature_importance.head(10).to_dict()
    },
    'pca_insights': {
        'components_90_variance': int(n_components_90),
        'components_95_variance': int(n_components_95),
        'total_variance_explained': float(np.sum(pca.explained_variance_ratio_))
    }
}

with open(metadata_path, 'w') as f:
    # default=str stringifies any value json cannot encode natively
    json.dump(feature_metadata, f, indent=2, default=str)
print(f"Feature metadata saved to: {metadata_path}")

# %% [markdown]
# ## 10. Next Steps

# %%
# Printed checklist of follow-up work: (section heading, bullet items) in order
next_steps = [
    ("Feature Selection for Clustering", [
        "Remove highly correlated features",
        "Select top features based on importance",
        "Consider domain knowledge",
    ]),
    ("Data Preparation", [
        "Handle missing values",
        "Normalize/standardize features",
        "Consider outlier treatment",
    ]),
    ("Clustering Analysis", [
        "Determine optimal number of clusters",
        "Apply K-means or other algorithms",
        "Validate cluster quality",
    ]),
    ("Interpretation", [
        "Profile each cluster",
        "Identify risk patterns",
        "Develop driver personas",
    ]),
    ("Actionable Insights", [
        "Design targeted interventions",
        "Create training programs",
        "Develop monitoring dashboards",
    ]),
]

banner = "=" * 60
print(banner)
print("NEXT STEPS FOR ANALYSIS")
print(banner)

# Each section is preceded by a blank line and numbered from 1
for number, (heading, items) in enumerate(next_steps, start=1):
    print(f"\n{number}. {heading}:")
    for item in items:
        print(f"   - {item}")

SyntaxError: incomplete input (ipython-input-2308304716.py, line 688)