# SafeDriver-IQ: Data Exploration

**Notebook 1: Initial Data Exploration**

This notebook explores the CRSS (Crash Report Sampling System) dataset to understand:
- Data structure and quality
- Vulnerable Road User (VRU) crash patterns
- Key features and distributions
- Data preparation for modeling

In [None]:
# Import libraries
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data_loader import CRSSDataLoader
from preprocessing import CrashPreprocessor
from visualization import CrashVisualizer

# Notebook-wide display configuration: let pandas show up to 50 columns and
# apply seaborn's 'whitegrid' style to the figures drawn below.
pd.options.display.max_columns = 50
sns.set_style(style='whitegrid')

# Printed once the setup statements above have all run.
print("Libraries loaded successfully!")

## 1. Load CRSS Data

Load data from the CRSS_Data directory for years 2016-2023.

In [None]:
# Build the loader over the raw CRSS directory for years 2016 through 2023
# (range() excludes its upper bound, hence 2024).
crss_years = [*range(2016, 2024)]
loader = CRSSDataLoader(years=crss_years, data_dir='../CRSS_Data')

# Status messages for the load steps in the following cells
print("Loading CRSS data...")
print("This may take several minutes...\n")

In [None]:
# Pull only the 2023 accident table first so its structure can be inspected
# before the full multi-year load.
sample_accident = loader.load_accident_data(2023)

record_count = len(sample_accident)
column_labels = sample_accident.columns

print(f"Sample accident data (2023): {record_count:,} records")
print(f"Columns: {len(column_labels)}")

# Show at most the first twenty column names
print("\nFirst few columns:")
print(column_labels[:20].tolist())

In [None]:
# Load complete dataset (all years, all files)
datasets = loader.load_complete_dataset()

# Print a per-table summary: row count, column count, years present and
# in-memory size.
print("\n=== Dataset Summary ===")
for name, df in datasets.items():
    # .tolist() converts NumPy scalars to plain Python values so the list
    # prints as [2016, 2017, ...] instead of [np.int64(2016), ...] on
    # NumPy >= 2, where scalar reprs include the type name.
    years_present = sorted(df['YEAR'].unique().tolist())

    # memory_usage() reports bytes; dividing by 1024**2 gives the value
    # printed with the "MB" label.
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2

    print(f"\n{name.upper()}:")
    print(f"  Records: {len(df):,}")
    print(f"  Columns: {len(df.columns)}")
    print(f"  Years: {years_present}")
    print(f"  Memory: {memory_mb:.1f} MB")

## 2. Data Quality Assessment

Check for missing values, data types, and quality issues.

In [None]:
# Create the preprocessor used for the quality checks
preprocessor = CrashPreprocessor()

# Run the quality check on every loaded table, with a 60-character rule
# printed before and after each call.
separator = '=' * 60
for table_name, table_df in datasets.items():
    print(f"\n{separator}")
    # Rebound on every pass, so only the last table's result stays available
    quality_metrics = preprocessor.check_data_quality(table_df, table_name)
    print(separator)

In [None]:
# Pull the accident table out of the loaded datasets; later cells reuse it
accident_df = datasets['accident']

# Preview the leading rows of the table
preview_rows = accident_df.head()
print("Accident Data Sample:")
print(preview_rows)

In [None]:
# Tally how many accident columns use each dtype
dtype_counts = accident_df.dtypes.value_counts()
print("Data Types:")
print(dtype_counts)

# List up to ten of the numeric column names
numeric_columns = accident_df.select_dtypes(include=[np.number]).columns
print("\nSample numeric columns:")
print(numeric_columns[:10].tolist())

## 3. VRU Crash Analysis

Focus on Vulnerable Road User (VRU) crashes, i.e. crashes involving pedestrians or bicyclists.

In [None]:
# Person table from the loaded datasets
person_df = datasets['person']

print("Person Type Distribution:")
print(person_df['PER_TYP'].value_counts())

# Keep only VRU rows: PER_TYP 5 and 6, which the summary below labels as
# pedestrians and bicyclists respectively.
is_vru = person_df['PER_TYP'].isin([5, 6])
vru_persons = person_df[is_vru]

pedestrian_count = (vru_persons['PER_TYP'] == 5).sum()
bicyclist_count = (vru_persons['PER_TYP'] == 6).sum()

print(f"\nTotal VRU persons: {len(vru_persons):,}")
print(f"  Pedestrians (PER_TYP=5): {pedestrian_count:,}")
print(f"  Bicyclists (PER_TYP=6): {bicyclist_count:,}")

In [None]:
# Distinct case numbers that have at least one VRU person record
vru_case_ids = vru_persons['CASENUM'].unique()
print(f"Unique VRU crashes: {len(vru_case_ids):,}")

# Restrict the accident table to those cases.
# NOTE(review): matching on CASENUM alone assumes case numbers are unique
# across survey years - confirm against the CRSS documentation.
in_vru_case = accident_df['CASENUM'].isin(vru_case_ids)
vru_accidents = accident_df[in_vru_case]
print(f"VRU accident records: {len(vru_accidents):,}")

In [None]:
# Count VRU accident records per YEAR value, ordered by year
vru_by_year = vru_accidents['YEAR'].value_counts().sort_index()

# Line chart of the yearly counts
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    vru_by_year.index,
    vru_by_year.values,
    marker='o',
    linewidth=2,
    markersize=8,
)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Number of VRU Crashes', fontsize=12)
ax.set_title('VRU Crashes Over Time (2016-2023)', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# The same counts as text, below the figure
print("\nVRU Crashes by Year:")
print(vru_by_year)

## 4. Key Feature Exploration

Explore important features for safety modeling.

In [None]:
# Enumerate every accident-table column, four names per printed row
column_names = accident_df.columns

print("Available columns in accident data:")
print(f"Total: {len(column_names)}")
print("\nColumn names:")
for position, column_name in enumerate(column_names, start=1):
    # Every fourth entry also ends the row; all entries are followed by
    # the same three-space gap.
    row_complete = position % 4 == 0
    print(f"{position:3d}. {column_name}", end="   \n" if row_complete else "   ")

In [None]:
# Temporal pattern: number of VRU accident records for each HOUR value.
# NOTE(review): every HOUR code is plotted as-is, including any non-clock
# codes (e.g. an "unknown" value) - confirm against the CRSS codebook.
has_hour = 'HOUR' in vru_accidents.columns
if has_hour:
    print("VRU Crashes by Hour of Day:")
    hour_dist = vru_accidents['HOUR'].value_counts().sort_index()

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(hour_dist.index, hour_dist.values, color='steelblue')
    ax.set_xlabel('Hour of Day', fontsize=12)
    ax.set_ylabel('Number of VRU Crashes', fontsize=12)
    ax.set_title('VRU Crashes by Hour of Day', fontsize=14, fontweight='bold')
    # Horizontal grid lines only
    ax.grid(True, alpha=0.3, axis='y')
    fig.tight_layout()
    plt.show()

In [None]:
# Side-by-side horizontal bar charts of the ten most frequent WEATHER and
# LGT_COND codes among VRU accident records. A panel stays empty when its
# column is missing from the table.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
weather_ax, light_ax = axes

if 'WEATHER' in vru_accidents.columns:
    weather_dist = vru_accidents['WEATHER'].value_counts().head(10)
    weather_positions = range(len(weather_dist))
    weather_ax.barh(weather_positions, weather_dist.values)
    weather_ax.set_yticks(weather_positions)
    weather_ax.set_yticklabels(weather_dist.index)
    weather_ax.set_xlabel('Count')
    weather_ax.set_title('Top 10 Weather Conditions')
    # value_counts() is sorted descending; flip so the largest bar is on top
    weather_ax.invert_yaxis()

if 'LGT_COND' in vru_accidents.columns:
    light_dist = vru_accidents['LGT_COND'].value_counts().head(10)
    light_positions = range(len(light_dist))
    light_ax.barh(light_positions, light_dist.values, color='orange')
    light_ax.set_yticks(light_positions)
    light_ax.set_yticklabels(light_dist.index)
    light_ax.set_xlabel('Count')
    light_ax.set_title('Top 10 Lighting Conditions')
    # Same ordering as the weather panel: largest bar on top
    light_ax.invert_yaxis()

fig.tight_layout()
plt.show()

## 5. Injury Severity Analysis

Examine injury outcomes for VRU crashes.

In [None]:
# Distribution of INJ_SEV codes over VRU person records, followed by the
# count and share of records coded 4 (reported below as fatal).
has_injury_severity = 'INJ_SEV' in vru_persons.columns
if has_injury_severity:
    severity_codes = vru_persons['INJ_SEV']

    print("VRU Injury Severity Distribution:")
    injury_dist = severity_codes.value_counts().sort_index()
    print(injury_dist)

    # Bar chart with one bar per severity code, in code order
    fig, ax = plt.subplots(figsize=(10, 6))
    injury_dist.plot(kind='bar', color='coral', ax=ax)
    ax.set_xlabel('Injury Severity Code', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.set_title('VRU Injury Severity Distribution', fontsize=14, fontweight='bold')
    plt.xticks(rotation=0)  # keep the code labels upright
    ax.grid(True, alpha=0.3, axis='y')
    fig.tight_layout()
    plt.show()

    # These are person-level counts (VRU persons with INJ_SEV == 4), not
    # crash-level counts.
    is_fatal = severity_codes == 4
    fatal_vru = is_fatal.sum()
    print(f"\nFatal VRU injuries: {fatal_vru:,}")
    print(f"Fatality rate: {fatal_vru/len(vru_persons)*100:.2f}%")

## 6. Save Processed Data

Save the filtered VRU subsets (accidents, persons, and case IDs) for use in the following notebooks.

In [None]:
# Save the VRU subsets (accident rows, person rows and case IDs) to
# ../data/processed.
from pathlib import Path

print("Saving processed data...")

# Create the output directory first: the writers below do not create missing
# parent directories, so a fresh checkout without data/processed would
# otherwise fail here after the long load above.
output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

vru_accidents.to_parquet(output_dir / 'vru_accidents.parquet', index=False)
vru_persons.to_parquet(output_dir / 'vru_persons.parquet', index=False)

# Save VRU case IDs as a single-column CSV with the header CASENUM
pd.Series(vru_case_ids).to_csv(output_dir / 'vru_case_ids.csv', index=False, header=['CASENUM'])

print("✓ Saved VRU accidents")
print("✓ Saved VRU persons")
print("✓ Saved VRU case IDs")

print("\nData exploration complete!")

## Summary

### Key Findings:
1. **Dataset Size**: Successfully loaded 8 years of CRSS data (2016-2023)
2. **VRU Crashes**: Identified thousands of VRU crashes involving pedestrians and bicyclists
3. **Data Quality**: Assessed missing values and data completeness
4. **Key Features**: Identified temporal, environmental, and location features
5. **Injury Patterns**: Analyzed severity distributions for VRU crashes

### Next Steps:
- **Notebook 2**: Feature engineering for safety modeling
- **Notebook 3**: Crash pattern clustering
- **Notebook 4**: Inverse safety model training
- **Notebook 5**: Good driver profile extraction