In [1]:
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
import sys
from pathlib import Path

# Locate the project root by walking up from the notebook's working directory
# until a directory containing one of the marker files/folders is found.
current = Path.cwd()

indicators = [
    '.git', 'pyproject.toml'
]

for parent in [current, *current.parents]:
    if any((parent / indicator).exists() for indicator in indicators):
        project_root = parent
        # Stop at the nearest match; without this the loop keeps climbing and
        # a marker in some unrelated ancestor directory would win instead.
        break
else:
    # Fail with an actionable message instead of a NameError further down.
    raise FileNotFoundError(
        f"Could not find a project root at or above {current} (looked for {indicators})"
    )

# Make the project's packages importable. Skip the insert when the entry is
# already present so re-running this cell does not pile up duplicates.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

In [None]:
filepath = Path(project_root / "data/raw/2023/Australian Grand Prix/FP1/session_info.json")

In [None]:
print(filepath)

In [None]:
import json

In [None]:
with open(filepath, "r", encoding="utf-8") as f:
    data = json.load(f)

print(data)

In [None]:
type(data)

In [None]:
copy = data.copy()

In [None]:
copy

In [None]:
# Convert to str *before* calling string methods so a non-string location value
# does not raise AttributeError (same order as the session_name cleanup).
location_clean = str(copy['location']).strip().title()
location_clean

In [None]:
copy['location'].title()

In [None]:
off_event_name = copy["official_event_name"]
off_event_name

In [None]:
event_name_cleaning = {
    "FORMULA 1": "",
    "GRAND PRIX": "GP",
    "EMIRATES": "",  # Sponsor names
    "ARAMCO": "",  # Sponsor names
}

In [None]:
off_event_name = "FORMULA 1 ROLEX AUSTRALIAN GRAND PRIX 2023"
off_event_name

In [None]:
# Apply every cleaning rule in turn, echoing each rule for inspection.
for pattern, replacement in event_name_cleaning.items():
    print(f"Pattern = {pattern}")
    print(f"Replacement = {replacement}")
    off_event_name = off_event_name.replace(pattern, replacement)

# Removing a token leaves its surrounding spaces behind (e.g. a leading space
# once "FORMULA 1" is dropped); collapse whitespace runs and trim both ends.
off_event_name = " ".join(off_event_name.split())
off_event_name

In [None]:
off_event_name = "FORMULA 1 ROLEX AUSTRALIAN GRAND PRIX 2023"
off_event_name = off_event_name.replace("FORMULA 1", "")
off_event_name

In [None]:
# Short codes for session names. The lookup that uses this mapping falls back
# to the original session name when it is not listed here.
session_type_mapping = {
    "Practice 1": "FP1",
    "Practice 2": "FP2",
    "Practice 3": "FP3",
    "Qualifying": "Q",
    "Race": "R",
    "Sprint": "S",
    "Sprint Qualifying": "SQ",
    "Sprint Shootout": "SS",
}

In [None]:
session_name = str(copy["session_name"]).strip()
session_name

In [None]:
session_type = session_type_mapping.get(session_name, session_name)
session_type


In [None]:
"""
Test script for SessionProcessor
"""

from config.logging import setup_logging, get_logger
from src.data_processing.core.session_processor import SessionProcessor
from src.data_processing.base.processing_context import ProcessingContext
import json


def test_session_processor():
    """Run SessionProcessor on a hand-written session record.

    Logs and prints the processed frame, the processor statistics and the
    updated context serialised as JSON.

    Returns:
        bool: True when processing and reporting finish without raising,
        False otherwise (the traceback is printed).
    """

    setup_logging()
    log = get_logger("test_session_processor")

    log.info("=== Testing SessionProcessor ===")

    # Hand-built record meant to mirror what the ingestion step produces
    sample_session_data = dict(
        event_name="Monaco Grand Prix",
        location="Monte Carlo",
        country="Monaco",
        session_name="Qualifying",
        session_date="2023-05-27",
        official_event_name="FORMULA 1 GRAND PRIX DE MONACO 2023",
        event_format="conventional",
        round_number=6,
    )

    try:
        session_processor = SessionProcessor()
        ctx = ProcessingContext(year=2023, event_name="Monaco", session_type="Q")

        log.info("Processing sample session data...")
        result_df, updated_context = session_processor.process(sample_session_data, ctx)

        # Processed frame
        log.info("✅ Processing completed successfully!")
        log.info(f"Output shape: {result_df.shape}")
        log.info(f"Output columns: {list(result_df.columns)}")
        log.info("Sample output:")
        print(result_df.to_string())

        # Processor statistics
        stats = session_processor.get_processing_stats()
        log.info(f"Processing stats: {stats}")

        # Context metadata; default=str covers values json cannot encode natively
        log.info("Context metadata:")
        print(json.dumps(updated_context.to_dict(), indent=2, default=str))
    except Exception as e:
        log.error(f"❌ Test failed: {str(e)}")
        import traceback

        traceback.print_exc()
        return False

    return True


In [None]:
test1_passed = test_session_processor()
# Report the real outcome: the function returns False when processing raised,
# so the success message must not be printed unconditionally.
print("Test 1 passed successfully." if test1_passed else "Test 1 failed.")

In [None]:
def test_with_real_data():
    """Run SessionProcessor on the ingested Monaco 2023 qualifying session info.

    Returns:
        bool: True when processing succeeds or when the raw file is absent
        (the check is skipped); False when anything raises.
    """

    setup_logging()
    log = get_logger('test_session_processor_real')

    log.info("=== Testing SessionProcessor with Real Data ===")

    try:
        sample_file = project_root / "data/raw/2023/Monaco Grand Prix/Q/session_info.json"

        # Without the ingested file there is nothing to check: skip, don't fail
        if not sample_file.exists():
            log.warning("Real data file not found: %s", sample_file)
            log.info("Skipping real data test")
            return True

        real_session_info = json.loads(sample_file.read_text(encoding='utf-8'))

        session_processor = SessionProcessor()
        ctx = ProcessingContext(year=2023, event_name='Monaco', session_type='Q')
        result_df, updated_context = session_processor.process(real_session_info, ctx)

        log.info("✅ Real data processing completed!")
        log.info(f"Output shape: {result_df.shape}")
        print(result_df.to_string())
    except Exception as e:
        log.error(f"❌ Real data test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

    return True

In [None]:
test_with_real_data()

In [None]:
sample_file = Path(project_root / "data/raw/2023/Monaco Grand Prix/Q/session_info.json")

In [None]:
with open(sample_file, 'r', encoding='utf-8') as f:
            real_session_info = json.load(f)

In [None]:
real_session_info

In [None]:
processor = SessionProcessor()
context = ProcessingContext(year=2023, event_name='Monaco Grand Prix', session_type='Q')

In [None]:
result_df, updated_context = processor.process(real_session_info, context)

In [None]:
result_df

In [None]:
# Build the path from project_root rather than a machine-specific absolute
# path, so the cell runs on any checkout of the project.
laps_data = pd.read_parquet(project_root / "data/raw/2023/Australian Grand Prix/FP1/laps.parquet")
laps_data.head()

In [None]:
list(laps_data.columns)

In [None]:
laps_data.iloc[0, :]

In [None]:
# Toy frame with gaps in LapNumber for experimenting with per-driver lap counters
drivers = ["HAM", "HAM", "VER", "VER", "HAM", "VER"]
raw_lap_numbers = [1, None, None, 2, None, None]

test_laps = pd.DataFrame({"Driver": drivers, "LapNumber": raw_lap_numbers})
test_laps

In [None]:
test_laps["lap_number"] = test_laps.groupby("Driver").cumcount() + 1

In [None]:
test_laps

In [None]:
# Positional lookup: first row, fourth column. Presumably the lap-time column —
# TODO confirm against list(laps_data.columns).
test_laptime = laps_data.iloc[0, 3]
test_laptime

In [None]:
isinstance(test_laptime, pd.Timedelta)

In [None]:
test_laptime2 = 107.43
isinstance(test_laptime2, (int, float))

In [3]:
from config.logging import setup_logging, get_logger
from src.data_processing.core.lap_processor import LapProcessor
from src.data_processing.base.processing_context import ProcessingContext
import pandas as pd
import json


def test_lap_processor_with_sample_data():
    """Run LapProcessor on a small synthetic lap table (3 drivers x 3 laps).

    Logs and prints a preview of the processed laps, summary counts, the
    processor statistics and the error/warning counts of the updated context.

    Returns:
        bool: True when processing and reporting finish without raising,
        False otherwise (the traceback is printed).
    """

    def _timedeltas(seconds):
        # Wrap each number of seconds in a pandas Timedelta
        return [pd.Timedelta(seconds=value) for value in seconds]

    setup_logging()
    log = get_logger('test_lap_processor')

    log.info("=== Testing LapProcessor with Sample Data ===")

    # Synthetic laps meant to mirror what the ingestion step produces.
    # Rows are grouped by driver: VER laps 1-3, HAM laps 1-3, LEC laps 1-3.
    lap_seconds = [78.5, 77.2, 76.8, 79.1, 78.3, 77.9, 78.8, 77.8, 77.5]
    n_laps = len(lap_seconds)

    sample_laps = pd.DataFrame({
        'Driver': ['VER'] * 3 + ['HAM'] * 3 + ['LEC'] * 3,
        'DriverNumber': [1] * 3 + [44] * 3 + [16] * 3,
        'Team': ['Red Bull Racing'] * 3 + ['Mercedes'] * 3 + ['Ferrari'] * 3,
        'LapNumber': [1, 2, 3] * 3,
        'LapTime': _timedeltas(lap_seconds),
        'LapTimeSeconds': lap_seconds,
        'Position': [1, 1, 1, 3, 2, 2, 2, 3, 3],
        'Compound': ['SOFT'] * 3 + ['MEDIUM'] * 3 + ['SOFT'] * 3,
        'TyreLife': [1, 2, 3] * 3,
        'Stint': [1] * n_laps,
        'FreshTyre': [True, False, False] * 3,
        'Sector1Time': _timedeltas([25.1, 24.8, 24.6, 25.4, 25.1, 25.0, 25.2, 24.9, 24.7]),
        'Sector2Time': _timedeltas([28.2, 27.8, 27.5, 28.5, 28.1, 27.9, 28.3, 27.9, 27.7]),
        'Sector3Time': _timedeltas([25.2, 24.6, 24.7, 25.2, 25.1, 25.0, 25.3, 25.0, 25.1]),
        'EventName': ['Monaco'] * n_laps,
        'SessionName': ['Qualifying'] * n_laps,
        'SessionDate': ['2023-05-27'] * n_laps,
        "Deleted": [False] * n_laps,
        "DeletedReason": [""] * n_laps,
    })

    # Only the 'laps' entry carries data; the other entries are left empty
    session_data = dict(
        session_info={},
        laps=sample_laps,
        results=None,
        weather=None,
        telemetry={},
    )

    try:
        lap_processor = LapProcessor()
        ctx = ProcessingContext(year=2023, event_name='Monaco', session_type='Q')

        log.info("Processing sample lap data...")
        result_df, updated_context = lap_processor.process(session_data, ctx)

        log.info("✅ Processing completed successfully!")
        log.info(f"Output shape: {result_df.shape}")
        log.info(f"Output columns: {list(result_df.columns)}")

        # Preview of selected output columns
        log.info("\nSample output (first 5 laps):")
        preview_cols = [
            'lap_id', 'driver_clean', 'lap_number', 'lap_time_seconds',
            'lap_time_delta_to_fastest', 'is_fastest_lap', 'is_valid_lap',
        ]
        print(result_df[preview_cols].head())

        # Summary counts
        log.info("\nLap statistics:")
        print(f"Total laps: {len(result_df)}")
        print(f"Unique drivers: {result_df['driver_clean'].nunique()}")
        print(f"Valid laps: {result_df['is_valid_lap'].sum()}")
        print(f"Fastest lap time: {result_df['lap_time_seconds'].min():.3f}s")
        print(f"Slowest lap time: {result_df['lap_time_seconds'].max():.3f}s")

        # Processor statistics
        stats = lap_processor.get_processing_stats()
        log.info(f"\nProcessing stats: {stats}")

        # Validation counters recorded on the context
        log.info("\nValidation summary:")
        log.info(f"Errors: {updated_context.metadata.errors_count}")
        log.info(f"Warnings: {updated_context.metadata.warnings_count}")

        # if updated_context.has_warnings():
        #     log.info("Warnings:")
        #     for warning in updated_context.warnings[:5]:  # Show first 5
        #         log.info(f"  - {warning}")
    except Exception as e:
        log.error(f"❌ Test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

    return True

In [None]:
print("=" * 80)
print("LAP PROCESSOR TESTS")
print("=" * 80)

test1_passed = test_lap_processor_with_sample_data()
print("\n" + "=" * 80 + "\n")

In [4]:
def test_lap_processor_with_real_data():
    """Run LapProcessor on the ingested Monaco 2023 qualifying laps.

    Prints lap counts, lap-time statistics for valid laps, the fastest lap and
    a ten-row preview of the processed frame.

    Returns:
        bool: True when processing succeeds or when the parquet file is absent
        (the check is skipped); False when anything raises.
    """

    setup_logging()
    log = get_logger('test_lap_processor_real')

    log.info("=== Testing LapProcessor with Real Data ===")

    try:
        sample_file = project_root / "data/raw/2023/Monaco Grand Prix/Q/laps.parquet"

        # Without the ingested file there is nothing to check: skip, don't fail
        if not sample_file.exists():
            log.warning(f"Real data file not found: {sample_file}")
            log.info("Skipping real data test")
            return True

        real_laps = pd.read_parquet(sample_file)
        log.info(f"Loaded {len(real_laps)} real laps from {sample_file}")

        # Only the 'laps' entry carries data; the other entries are left empty
        real_session_data = dict(
            session_info={},
            laps=real_laps,
            results=None,
            weather=None,
            telemetry={},
        )

        lap_processor = LapProcessor()
        ctx = ProcessingContext(year=2023, event_name='Monaco', session_type='Q')

        log.info("Processing real lap data...")
        result_df, updated_context = lap_processor.process(real_session_data, ctx)

        log.info("✅ Real data processing completed!")
        log.info(f"Output shape: {result_df.shape}")

        # Lap counts
        log.info("\nReal data statistics:")
        print(f"Total laps processed: {len(result_df)}")
        print(f"Unique drivers: {result_df['driver_clean'].nunique()}")
        print(f"Valid laps: {result_df['is_valid_lap'].sum()}")
        print(f"Invalid laps: {(~result_df['is_valid_lap']).sum()}")

        # Lap-time summary, restricted to laps flagged as valid
        if 'lap_time_seconds' in result_df.columns:
            valid_laps = result_df[result_df['is_valid_lap']]
            if len(valid_laps) > 0:
                print("\nLap time statistics (valid laps only):")
                print(f"Fastest: {valid_laps['lap_time_seconds'].min():.3f}s")
                print(f"Slowest: {valid_laps['lap_time_seconds'].max():.3f}s")
                print(f"Mean: {valid_laps['lap_time_seconds'].mean():.3f}s")
                print(f"Median: {valid_laps['lap_time_seconds'].median():.3f}s")

        # First row flagged as the fastest lap, if any
        if 'is_fastest_lap' in result_df.columns:
            fastest = result_df[result_df['is_fastest_lap']]
            if len(fastest) > 0:
                print("\nFastest lap:")
                print(f"Driver: {fastest.iloc[0]['driver_clean']}")
                print(f"Time: {fastest.iloc[0]['lap_time_seconds']:.3f}s")
                print(f"Lap Number: {fastest.iloc[0]['lap_number']}")

        # Preview; columns missing from the output are silently left out
        log.info("\nSample of processed laps:")
        wanted_cols = (
            'driver_clean', 'lap_number', 'lap_time_seconds',
            'compound_clean', 'is_valid_lap', 'is_fastest_lap',
        )
        present_cols = [name for name in wanted_cols if name in result_df.columns]
        print(result_df[present_cols].head(10).to_string())
    except Exception as e:
        log.error(f"❌ Real data test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

    return True

In [5]:
print("=" * 80)
print("LAP PROCESSOR TESTS")
print("=" * 80)

test2_passed = test_lap_processor_with_real_data()
print("\n" + "=" * 80 + "\n")

LAP PROCESSOR TESTS
2025-09-29 16:04:09 - config.logging - INFO - setup_logging:100 - Logging configured. Log directory: /Volumes/ExternalSSD/My Projects/Data Science Projects/f1_ds_project/notebooks/EDA_and_DataProcessing/monitoring/logs
2025-09-29 16:04:09 - config.logging - INFO - setup_logging:101 - Environment: development
2025-09-29 16:04:09 - test_lap_processor_real - INFO - test_lap_processor_with_real_data:7 - === Testing LapProcessor with Real Data ===
2025-09-29 16:04:09 - test_lap_processor_real - INFO - test_lap_processor_with_real_data:20 - Loaded 420 real laps from /Volumes/ExternalSSD/My Projects/Data Science Projects/f1_ds_project/data/raw/2023/Monaco Grand Prix/Q/laps.parquet
2025-09-29 16:04:09 - test_lap_processor_real - INFO - test_lap_processor_with_real_data:35 - Processing real lap data...
2025-09-29 16:04:09 - data_processing.lap_processor - INFO - process:91 - Starting processing with lap_processor
2025-09-29 16:04:09 - data_processing.lap_processor - INFO - _

In [None]:
quali_data = pd.read_parquet(Path(project_root / "data/raw/2023/Monaco Grand Prix/Q/laps.parquet"))

In [None]:
print(quali_data.shape)
quali_data.head()

In [None]:
race_data = pd.read_parquet(Path(project_root / "data/raw/2023/Monaco Grand Prix/R/laps.parquet"))
print(race_data.shape)
race_data.head()