# RaceIQ Pro - Data Exploration

This notebook demonstrates how to load and explore the race data using RaceIQ Pro's data pipeline.

In [None]:
# Import required libraries
import sys
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path.cwd().parent / "src"))

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pipeline.data_loader import DataLoader, load_data_for_track
from pipeline.validator import DataValidator
from pipeline.feature_engineer import FeatureEngineer
from utils.metrics import *
from utils.visualization import *

# Pandas display options for the rest of the notebook: no cap on the number
# of columns shown, and a cap of 100 on the number of rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

%matplotlib inline

## 1. Load Sample Data

In [None]:
# Build the loader and pull in every sample dataset it knows about.
loader = DataLoader()
data = loader.load_all_sample_data()

# One summary line per dataset: its key, row count and column count.
dataset_summaries = [
    f"  {name}: {len(frame)} records, {len(frame.columns)} columns"
    for name, frame in data.items()
]
print("Loaded datasets:", *dataset_summaries, sep="\n")

## 2. Validate Data Quality

In [None]:
# Run every loaded dataset through the validator.
validator = DataValidator()
validation_results = validator.validate_all(data)

# Show the validator's own summary of the run.
summary_report = validator.get_summary_report()
print(summary_report)

## 3. Explore Lap Time Data

In [None]:
# Lap time dataset: first rows, column dtypes and summary statistics.
# Skipped entirely when the loader did not return a 'lap_time' entry.
if 'lap_time' in data:
    lap_df = data['lap_time']
    print("Lap Time Data:", lap_df.head(), sep="\n")
    print("\nData types:", lap_df.dtypes, sep="\n")
    print("\nBasic statistics:", lap_df.describe(), sep="\n")

## 4. Explore Section Analysis Data

In [None]:
# Section analysis dataset: first rows, the distinct driver numbers, and
# summary statistics of the lap time column.
# Skipped entirely when the loader did not return a 'section_analysis' entry.
if 'section_analysis' in data:
    section_df = data['section_analysis']
    print("Section Analysis Data:", section_df.head(), sep="\n")

    driver_numbers = section_df['DRIVER_NUMBER'].unique()
    print("\nDrivers in dataset:", driver_numbers, sep="\n")

    lap_time_stats = section_df['LAP_TIME_SECONDS'].describe()
    print("\nLap time statistics:", lap_time_stats, sep="\n")

## 5. Feature Engineering

In [None]:
# Initialize feature engineer
engineer = FeatureEngineer()

# Snapshot each dataset's column names *before* engineering. Comparing against
# `data` afterwards would report no new columns if the engineer adds them to
# the input frames in place (NOTE(review): not verifiable from this notebook),
# and would raise KeyError for a dataset that only exists in the output.
columns_before = {name: set(frame.columns) for name, frame in data.items()}

# Engineer features for all datasets
engineered_data = engineer.engineer_all_features(data)

print("Feature engineering complete!")
if 'section_analysis' in engineered_data:
    print("\nNew columns in section analysis:")
    # Fall back to an empty set when the dataset was not part of the input,
    # in which case every engineered column is listed as new.
    original_cols = columns_before.get('section_analysis', set())
    new_cols = set(engineered_data['section_analysis'].columns) - original_cols
    for col in sorted(new_cols):
        print(f"  - {col}")

## 6. Visualizations

In [None]:
# Lap time plot for a handful of drivers; skipped when the engineered
# section analysis dataset is not available.
if 'section_analysis' in engineered_data:
    section_df = engineered_data['section_analysis']

    # Keep only the first five driver numbers, in the order unique() returns them.
    drivers = section_df['DRIVER_NUMBER'].unique()[:5]

    # Column mapping and labelling handed to the plotting helper.
    lap_plot_options = dict(
        driver_col='DRIVER_NUMBER',
        lap_col='LAP_NUMBER',
        time_col='LAP_TIME_SECONDS',
        drivers=drivers,
        title="Lap Times - Barber Motorsports Park Race 1",
    )
    fig = plot_lap_times(section_df, **lap_plot_options)
    plt.show()

In [None]:
# Plot sector comparison
if 'section_analysis' in engineered_data:
    # Re-read the engineered frame here rather than trusting whatever
    # `section_df` currently points at. The same name is bound to the raw
    # (un-engineered) frame in the section analysis exploration cell, so a
    # stale binding after out-of-order execution would silently plot raw data.
    section_df = engineered_data['section_analysis']

    # `drivers` is the selection made in the lap time plotting cell.
    fig = plot_sector_comparison(
        section_df,
        driver_col='DRIVER_NUMBER',
        drivers=drivers
    )
    plt.show()

## 7. Performance Metrics

In [None]:
# Calculate theoretical best laps
if 'section_analysis' in engineered_data:
    # Bind the engineered frame explicitly. `section_df` is also assigned the
    # raw frame in the section analysis exploration cell, so relying on the
    # leftover kernel variable makes the result depend on cell execution order.
    section_df = engineered_data['section_analysis']

    theoretical_best = calculate_theoretical_best_lap(
        section_df,
        driver_col='DRIVER_NUMBER'
    )

    # Show the ten rows with the smallest 'theoretical_best_lap' value.
    print("Theoretical Best Lap Times:")
    print(theoretical_best.sort_values('theoretical_best_lap').head(10))

In [None]:
# Calculate driver consistency
if 'section_analysis' in engineered_data:
    # Bind the engineered frame explicitly. `section_df` is also assigned the
    # raw frame in the section analysis exploration cell, so relying on the
    # leftover kernel variable makes the result depend on cell execution order.
    section_df = engineered_data['section_analysis']
    consistency = calculate_driver_consistency(section_df)

    # Ten rows with the highest 'consistency_score', limited to the columns
    # of interest for this report.
    report_cols = ['DRIVER_NUMBER', 'consistency_score', 'lap_time_cv']
    top_consistency = (
        consistency[report_cols]
        .sort_values('consistency_score', ascending=False)
        .head(10)
    )
    print("Driver Consistency Scores:")
    print(top_consistency)