# %%
"""
Dataset Validation Notebook
Purpose: Verify all configuration values match actual dataset
"""

# %%
import pandas as pd
import numpy as np
import sys
import json
import yaml
from pathlib import Path

# %%
# Add src to path
sys.path.append('../src')

from common.constants import *
from data_processing.data_loader import DataLoader
from data_processing.unit_converter import UnitConverter
from data_processing.window_generator import WindowGenerator

# %% [markdown]
# # 1. Load and Inspect Raw Data

# %%
# Read both raw report CSVs as-is so their on-disk structure can be inspected.
cell_df_raw = pd.read_csv('../data/raw/CellReports.csv')
ue_df_raw = pd.read_csv('../data/raw/UEReports.csv')

# Report shape and column names of each table in one call: sep="\n" puts
# every item on its own line, and the leading "\n" on the UE shape line
# leaves a blank line between the two tables.
cell_columns = list(cell_df_raw.columns)
ue_columns = list(ue_df_raw.columns)
print(
    "=== RAW DATA STRUCTURE ===",
    f"Cell data shape: {cell_df_raw.shape}",
    f"Cell columns: {cell_columns}",
    f"\nUE data shape: {ue_df_raw.shape}",
    f"UE columns: {ue_columns}",
    sep="\n",
)

# %% 
# Compare the number of distinct cell / UE names in the raw data with the
# counts declared in EXPECTED_ENTITIES, and stop the run on any mismatch.
print("\n=== ENTITY COUNTS ===")

expected_cell_count = EXPECTED_ENTITIES['cells']
expected_ue_count = EXPECTED_ENTITIES['ues']
print(f"Config expects: {expected_cell_count} cells, {expected_ue_count} UEs")

# Each distinct-name count is computed once and reused for the report and the checks.
actual_cell_count = cell_df_raw['Viavi.Cell.Name'].nunique()
actual_ue_count = ue_df_raw['Viavi.UE.Name'].nunique()
print(f"Actual: {actual_cell_count} cells, {actual_ue_count} UEs")

assert actual_cell_count == expected_cell_count, "Cell count mismatch!"
assert actual_ue_count == expected_ue_count, "UE count mismatch!"

# %%
# Check timestamp intervals: measure the gap between consecutive reports of
# each cell and compare it with the configured measurement interval.
print("\n=== TIMESTAMP ANALYSIS ===")
# 'timestamp' is parsed as epoch seconds (unit='s'); the parsed column is
# stored on the frame itself.
cell_df_raw['timestamp_dt'] = pd.to_datetime(cell_df_raw['timestamp'], unit='s')
# diff() subtracts the previous *row* inside each group, so the rows must be
# in time order first; otherwise file order leaks into the result as wrong or
# negative intervals. Sorting happens on a temporary copy, and sort_index()
# puts the result back in the frame's row order.
intervals = (
    cell_df_raw.sort_values('timestamp_dt', kind='mergesort')
    .groupby('Viavi.Cell.Name')['timestamp_dt']
    .diff()
    .dt.total_seconds()
    .sort_index()
)
# The first report of every cell has no predecessor and yields NaN, which
# median()/mean() skip by default.
print(f"Timestamp intervals: median={intervals.median()}, mean={intervals.mean():.1f}")
print(f"Expected from config: {VIAVI_CONFIG['measurement_interval_seconds']} seconds")

# %%
# Verify band configuration: derive the band of every cell from its name and
# compare the bands (and their largest observed RRU.PrbAvailDl) with BAND_SPECS.
print("\n=== BAND ANALYSIS ===")
# The band is the middle segment of a name shaped like S<digits>/<band>/C<digits>.
# Names that do not contain that pattern produce NaN in the 'Band' column.
cell_df_raw['Band'] = cell_df_raw['Viavi.Cell.Name'].str.extract(r'S\d+/([^/]+)/C\d+')[0]

unparsed_rows = int(cell_df_raw['Band'].isna().sum())
if unparsed_rows:
    print(f"✗ WARNING: {unparsed_rows} cell rows have a name that does not match the band pattern")

# NaN must be dropped before sorting: sorted() raises TypeError when it has to
# compare a float NaN with band-name strings.
bands_in_data = cell_df_raw['Band'].dropna().unique()
print(f"Bands in data: {sorted(bands_in_data)}")
print(f"Bands in config: {list(BAND_SPECS.keys())}")

for band in bands_in_data:
    if band not in BAND_SPECS:
        # Make the gap visible instead of silently skipping the band.
        print(f"  {band}: ✗ WARNING: band present in data but missing from config")
        continue
    max_prb = cell_df_raw.loc[cell_df_raw['Band'] == band, 'RRU.PrbAvailDl'].max()
    config_prb = BAND_SPECS[band]['prb_count']
    print(f"  {band}: Data max PRB={max_prb}, Config={config_prb}")

# %%
# Verify PrbTot is percentage (critical!): both the downlink and the uplink
# column must stay inside the closed range 0-100.
print("\n=== PRB TOT VERIFICATION ===")
prb_tot = cell_df_raw[['RRU.PrbTotDl', 'RRU.PrbTotUl']]
prb_tot_min = prb_tot.min()
prb_tot_max = prb_tot.max()
print("PrbTotDl range:", prb_tot_min['RRU.PrbTotDl'], "-", prb_tot_max['RRU.PrbTotDl'])
print("PrbTotUl range:", prb_tot_min['RRU.PrbTotUl'], "-", prb_tot_max['RRU.PrbTotUl'])

# Check both columns and both bounds, so an out-of-range UL value or a
# negative value cannot pass unnoticed. An all-NaN column compares False on
# both bounds and therefore triggers the warning.
if (prb_tot_min >= 0).all() and (prb_tot_max <= 100).all():
    print("✓ PrbTot appears to be percentage (0-100)")
else:
    print("✗ WARNING: PrbTot may not be percentage!")

# %%
# Check energy is cumulative: for one sample cell (the cell named in the first
# row), PEE.Energy ordered by timestamp should never decrease.
print("\n=== ENERGY ANALYSIS ===")
sample_cell = cell_df_raw['Viavi.Cell.Name'].iloc[0]
cell_energy = cell_df_raw[cell_df_raw['Viavi.Cell.Name'] == sample_cell].sort_values('timestamp')
energy_diff = cell_energy['PEE.Energy'].diff()
# diff() always yields NaN for the first row (no predecessor) and NaN > 0 is
# False, so that row must be excluded or "All positive" can never be True.
# Later NaNs (missing readings) are kept and still count as "not positive".
energy_steps = energy_diff.iloc[1:]
energy_is_monotonic = cell_energy['PEE.Energy'].is_monotonic_increasing
print(f"Energy differences for {sample_cell}:")
print(f"  All positive: {(energy_steps > 0).all()}")
print(f"  Monotonic increasing: {energy_is_monotonic}")
print("✓ Energy appears cumulative" if energy_is_monotonic else "✗ Energy may not be cumulative")

# %%
# Compare observed zero-value rates with the rates recorded in EXPECTED_PATTERNS.
print("\n=== PATTERN VALIDATION ===")

# Share of UE rows whose DRB.UECqiDl equals zero.
cqi_is_zero = ue_df_raw['DRB.UECqiDl'].eq(0)
cqi_zero_rate = cqi_is_zero.mean()
expected_cqi_zero_rate = EXPECTED_PATTERNS['cqi_zero_rate']
print(f"CQI zero rate: {cqi_zero_rate:.3f} (expected: {expected_cqi_zero_rate})")

# The layers column is optional; the check is skipped when it is absent.
mimo_column = 'CARR.AverageLayersDl'
if mimo_column in cell_df_raw.columns:
    mimo_zero_rate = cell_df_raw[mimo_column].eq(0).mean()
    expected_mimo_zero_rate = EXPECTED_PATTERNS.get('mimo_zero_rate', 'N/A')
    print(f"MIMO zero rate: {mimo_zero_rate:.3f} (expected: {expected_mimo_zero_rate})")

# Share of all UE rows where TB.TotNbrDl is zero although DRB.UEThpDl is
# positive; the report line treats this as evidence that the TB counter is
# unreliable.
tb_count_is_zero = ue_df_raw['TB.TotNbrDl'].eq(0)
throughput_is_positive = ue_df_raw['DRB.UEThpDl'].gt(0)
tb_zero_with_thp = (tb_count_is_zero & throughput_is_positive).mean()
print(f"TB zero despite throughput: {tb_zero_with_thp:.3f} (confirms TB unreliability)")

# %%
# Window completeness: count the cell and UE rows that fall on the five
# earliest distinct cell timestamps and compare them with WINDOW_SPECS.
# NOTE(review): the output calls this a "5-minute window", which presumes one
# report per minute — confirm against the configured measurement interval.
print("\n=== WINDOW COMPLETENESS ===")
timestamps = sorted(cell_df_raw['timestamp'].unique())[:5]  # five earliest distinct timestamps

cell_in_window = cell_df_raw['timestamp'].isin(timestamps)
ue_in_window = ue_df_raw['timestamp'].isin(timestamps)
window_cells = cell_df_raw.loc[cell_in_window]
window_ues = ue_df_raw.loc[ue_in_window]
cell_record_count = len(window_cells)
ue_record_count = len(window_ues)

print("5-minute window would have:")
expected_records = WINDOW_SPECS['expected_records']
print(f"  Cell records: {cell_record_count} (expected: {expected_records['cells_per_window']})")
print(f"  UE records: {ue_record_count} (expected: {expected_records['ues_per_window']})")

# Completeness is the ratio of observed rows (cells + UEs) to the expected total.
observed_total = cell_record_count + ue_record_count
completeness = observed_total / expected_records['total_per_window']
print(f"  Completeness: {completeness:.3f}")