In [None]:
# This would be a complete Jupyter notebook. Here's the structure:

"""
# Driver Behavior Analysis - Data Exploration Notebook
#
# Notebook outline kept as text. Every line below is plain Python: the
# markdown-style '##' headings are ordinary comments, so the cells can be
# pasted into a notebook or run as a script without editing.

## 1. Import Libraries
import json
import os
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# NOTE: this silences every warning for the rest of the session, including
# the ones pandas and matplotlib raise while plotting.
warnings.filterwarnings('ignore')

# savefig() and open(..., 'w') do not create missing directories, so make
# sure the output folder exists before any result is written.
os.makedirs('../results', exist_ok=True)

## 2. Load Data
df = pd.read_csv('../data/data_cleaned.csv')
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

## 3. Data Understanding
# Column categories: a column belongs to a group when its name contains the
# (case-sensitive) substring, so one column may land in several groups.
speed_cols = [col for col in df.columns if 'speed' in col]
accel_cols = [col for col in df.columns if 'accel' in col]
tpos_cols = [col for col in df.columns if 'tPos' in col]
rpm_cols = [col for col in df.columns if 'rpm' in col]

print(f"Speed columns: {len(speed_cols)}")
print(f"Acceleration columns: {len(accel_cols)}")
print(f"Time position columns: {len(tpos_cols)}")
print(f"RPM columns: {len(rpm_cols)}")

## 4. Missing Values Analysis
# Per-column count and percentage of nulls; only columns that actually have
# missing values are kept, worst first.
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage': missing_percentage
})
missing_df = missing_df[missing_df['Missing Values'] > 0].sort_values('Percentage', ascending=False)

## 5. Statistical Summary
# describe() transposed gives one row per column; 'cv' is std divided by mean.
statistical_summary = df.describe().T
statistical_summary['cv'] = statistical_summary['std'] / statistical_summary['mean']
statistical_summary = statistical_summary.round(3)

## 6. Distribution Analysis
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# The grid-style DataFrame histogram method wants one Axes per column; given
# a single Axes for several columns, pandas clears the whole figure, which
# would wipe this 2x2 grid. DataFrame.plot.hist overlays every column on the
# Axes it is given and adds the column-name legend itself.

# Speed distribution (first four speed columns)
sample_speed_cols = speed_cols[:4]
df[sample_speed_cols].plot.hist(ax=axes[0, 0], bins=30, alpha=0.7)
axes[0, 0].set_title('Speed Distributions')

# Acceleration distribution (first four acceleration columns)
sample_accel_cols = accel_cols[:4]
df[sample_accel_cols].plot.hist(ax=axes[0, 1], bins=30, alpha=0.7)
axes[0, 1].set_title('Acceleration Distributions')

# Correlation matrix (sample: first five speed and acceleration columns)
corr_cols = speed_cols[:5] + accel_cols[:5]
correlation_matrix = df[corr_cols].corr()
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            ax=axes[1, 0], center=0)
axes[1, 0].set_title('Feature Correlation Matrix')

# Outlier detection using boxplot: one row-wise aggregate per column group.
sample_data = pd.DataFrame({
    'Max Speed': df[speed_cols].max(axis=1),
    'Avg Acceleration': df[accel_cols].mean(axis=1),
    'RPM Variability': df[[col for col in rpm_cols if 'std' in col]].mean(axis=1)
})
sample_data.boxplot(ax=axes[1, 1])
axes[1, 1].set_title('Boxplot of Key Metrics')
axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('../results/exploratory_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Time-Series Pattern Analysis
# Extract percentile values
percentiles = [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Speed percentiles over time: one line per percentile, one point per
# speed column.
speed_percentiles = df[speed_cols].quantile(percentiles)
axes[0, 0].plot(speed_percentiles.T)
axes[0, 0].set_xlabel('Speed Measurement Index')
axes[0, 0].set_ylabel('Speed (mph)')
axes[0, 0].set_title('Speed Distribution Percentiles')
# The '%' format type multiplies by 100 and rounds, so 0.1 is shown as '10%'
# instead of the float artefact '10.000000000000002%'.
axes[0, 0].legend([f'{p:.0%}' for p in percentiles])
axes[0, 0].grid(True, alpha=0.3)

# Acceleration pattern: column means of the first 20 acceleration columns
accel_pattern = df[accel_cols[:20]].mean()
axes[0, 1].plot(accel_pattern.index, accel_pattern.values, marker='o')
axes[0, 1].set_xlabel('Acceleration Measurement Index')
axes[0, 1].set_ylabel('Acceleration (m/s²)')
axes[0, 1].set_title('Average Acceleration Pattern')
axes[0, 1].tick_params(axis='x', rotation=45)
axes[0, 1].grid(True, alpha=0.3)

# RPM distribution: column means of the first 20 RPM columns
rpm_sample = rpm_cols[:20]
rpm_data = df[rpm_sample].mean()
axes[1, 0].plot(rpm_data.index, rpm_data.values, marker='s', color='green')
axes[1, 0].set_xlabel('RPM Measurement Index')
axes[1, 0].set_ylabel('RPM')
axes[1, 0].set_title('Average RPM Pattern')
axes[1, 0].tick_params(axis='x', rotation=45)
axes[1, 0].grid(True, alpha=0.3)

# Time position analysis: column means of the first 15 'tPos' columns
tpos_sample = tpos_cols[:15]
tpos_data = df[tpos_sample].mean()
axes[1, 1].plot(tpos_data.index, tpos_data.values, marker='^', color='purple')
axes[1, 1].set_xlabel('Time Position Index')
axes[1, 1].set_ylabel('Time Position')
axes[1, 1].set_title('Average Time Position Pattern')
axes[1, 1].tick_params(axis='x', rotation=45)
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../results/time_series_patterns.png', dpi=300, bbox_inches='tight')
plt.show()

## 8. Save Exploration Results
# Only the first 20 rows of the statistical summary are exported.
exploration_results = {
    'dataset_shape': df.shape,
    'column_categories': {
        'speed': len(speed_cols),
        'acceleration': len(accel_cols),
        'time_position': len(tpos_cols),
        'rpm': len(rpm_cols)
    },
    'missing_values_summary': missing_df.to_dict(),
    'statistical_summary': statistical_summary.head(20).to_dict()
}

with open('../results/exploration_summary.json', 'w') as f:
    json.dump(exploration_results, f, indent=4)

print("Exploration complete. Results saved to ../results/")
"""