## Usage
1. Place ShellData.csv in the same directory as this notebook
2. Run all cells sequentially
3. Checkpoints are saved automatically after each step, so an interrupted run can resume from the most recent one. The checkpoint-saving lines can be commented out if they are not needed.

In [None]:
import pandas as pd
import gc
import os

def resume_from_checkpoint():
    """Load the most advanced checkpoint that exists in the working directory.

    Checkpoints are probed from the latest pipeline step to the earliest and
    the first file found is read with ``pd.read_parquet``.

    Returns:
        ``(frame, variable_name)`` where ``variable_name`` is the notebook
        variable the frame is meant to be bound to, or ``(None, None)`` when
        no checkpoint file exists.
    """
    # (file name, notebook variable name, human-readable description), newest step first
    stages = (
        ("checkpoint_05_missing_handled.parquet", "dt_df_clean", "Step 5: Missing values handled"),
        ("checkpoint_04_time_features.parquet", "dt_df", "Step 4: Time features added"),
        ("checkpoint_03_one_hot_encoded.parquet", "ohe_df", "Step 3: One-hot encoded"),
        ("checkpoint_02_datetime_converted.parquet", "dt_df", "Step 2: Datetime converted"),
        ("checkpoint_01_value_separated.parquet", "value_sep_df", "Step 1: Values separated"),
    )

    # The generator is lazy, so probing stops at the first file that exists
    latest = next((stage for stage in stages if os.path.exists(stage[0])), None)
    if latest is None:
        print("No checkpoints found. Load data from CSV.")
        return None, None

    path, target_name, summary = latest
    frame = pd.read_parquet(path)
    print(f"✓ Resumed from {path}")
    print(f"  {summary}")
    print(f"  Loaded {len(frame):,} rows, {len(frame.columns)} columns")
    return frame, target_name

# Load and assign to the correct variable name
df, var_name = resume_from_checkpoint()

if df is not None:
    if var_name == "dt_df_clean":
        dt_df_clean = df
    elif var_name == "dt_df":
        dt_df = df
    elif var_name == "ohe_df":
        ohe_df = df
    elif var_name == "value_sep_df":
        value_sep_df = df
    
    print(f"\n▶ Variable '{var_name}' is ready. Continue from next step.")
else:
    print("\n▶ Load data from CSV first.")

✓ Resumed from checkpoint_04_time_features.parquet
  Step 4: Time features added
  Loaded 214,991,102 rows, 14 columns

▶ Variable 'dt_df' is ready. Continue from next step.


In [None]:
import pandas as pd
# Read the raw export from the notebook's working directory (see Usage step 1)
df = pd.read_csv("ShellData.csv")
# Last expression of the cell: shows the first rows as a quick sanity check
df.head()

In [None]:
# Remove the "Unnamed: 0" column (presumably a saved row index from the CSV export - TODO confirm).
# NOTE: re-running this cell raises KeyError because the column is already gone.
df = df.drop(columns=["Unnamed: 0"])

In [None]:
def separate_textual_attrivutes_from_numerical_column(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Split a mixed text/number column into a numeric column and a text column.

    Working on a copy of ``df``:
      - strings that ``float()`` accepts (e.g., "3.14", "1e-5") become floats
      - strings that ``float()`` rejects are copied into ``<column>_str``
        and replaced by -1 in ``<column>``
      - every other entry of ``<column>_str`` is the literal string "NaN"

    Raises ValueError when ``column`` is not present. The input DataFrame is
    left unchanged.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    def _as_number(cell):
        # Only strings are candidates; anything float() rejects is returned untouched
        if isinstance(cell, str):
            try:
                return float(cell)
            except ValueError:
                pass
        return cell

    def _is_text(cell):
        # After _as_number has run, any remaining string is non-numeric by construction
        return isinstance(cell, str)

    working = df.copy()

    # Pass 1: numeric-looking strings -> floats
    converted = working[column].apply(_as_number)
    working[column] = converted

    # Pass 2: keep the leftover text in its own column, "NaN" everywhere else
    text_mask = converted.apply(_is_text)
    working[f"{column}_str"] = converted.where(text_mask).fillna("NaN")

    # Pass 3: the main column carries -1 wherever text was found
    working[column] = converted.apply(lambda cell: -1 if _is_text(cell) else cell)

    return working

In [None]:
# Step 1: split the "Value" column into numeric values and a "Value_str" text column
value_sep_df = separate_textual_attrivutes_from_numerical_column(df, "Value")


# Persist the result so an interrupted run can resume from here
value_sep_df.to_parquet("checkpoint_01_value_separated.parquet", index=False)
print("Saved checkpoint 1")

# The raw frame is no longer needed; drop the reference before the next step
del df
import gc
gc.collect()

# Last expression of the cell: display the frame
value_sep_df

In [None]:
# Frequency of each text value extracted from "Value" (rows without text carry the string "NaN")
value_sep_df["Value_str"].value_counts()

In [None]:
def convert_string_column_to_datetime_inplace(
    df: pd.DataFrame,
    column: str,
    fmt_with_frac: str = "%Y-%m-%d %H:%M:%S.%f",
    out_col: str | None = None,
    strip_trailing_000: bool = True,
) -> pd.DataFrame:
    """
    Converts a string-based timestamp column to datetime (in-place result).
    - If 'strip_trailing_000' is True, values ending with '.000' are trimmed
      and parsed without fractional seconds.
    - All other values are parsed using the provided 'fmt_with_frac' format.
    - Invalid entries are converted to NaT.
    """

    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
    out_col = out_col or f"{column}_DT"

    # Convert column to string once (avoids dtype mix issues)
    s = df[column].astype(str)
    # Prepare result column
    result = pd.Series(pd.NaT, index=df.index, dtype="datetime64[ns]")

    if strip_trailing_000:
        # Identify entries that end with '.000'
        mask000 = s.str.endswith(".000")

        # Fast path: remove '.000' and parse without microseconds
        if mask000.any():
            result.loc[mask000] = pd.to_datetime(
                s.loc[mask000].str[:-4],
                format="%Y-%m-%d %H:%M:%S",
                errors="coerce",
            )

        # Remaining values: parse using the full format (with %f)
        other = ~mask000
        if other.any():
            result.loc[other] = pd.to_datetime(
                s.loc[other],
                format=fmt_with_frac,
                errors="coerce",
            )
    else:
        # Single-pass parsing if we do not strip trailing zeros
        result[:] = pd.to_datetime(s, format=fmt_with_frac, errors="coerce")

    # Assign result column directly into the same DataFrame (in-place)
    df[out_col] = result
    return df


In [None]:
# Step 2: parse "EventTime" strings into a new "EventTime_DT" column.
# The function writes into value_sep_df and returns that same object, so dt_df is an alias.
dt_df = convert_string_column_to_datetime_inplace(value_sep_df, "EventTime", "%Y-%m-%d %H:%M:%S.%f")

# Persist the result so an interrupted run can resume from here
dt_df.to_parquet("checkpoint_02_datetime_converted.parquet", index=False)
print("Saved checkpoint 2")

# Drops only the old name; the data stays alive through dt_df
del value_sep_df
gc.collect()

# Last expression of the cell: display the frame
dt_df

In [None]:
# Inspect the parsed timestamps (rows that failed to parse show as NaT)
dt_df["EventTime_DT"]

In [None]:
def one_hot_encode(
    df: pd.DataFrame,
    column: str,
    sparse: bool = False,
) -> pd.DataFrame:
    """Return ``df`` with ``column`` replaced by one indicator column per distinct value.

    Thin wrapper around ``pd.get_dummies``; ``sparse`` is forwarded unchanged.
    """
    encoded = pd.get_dummies(data=df, columns=[column], sparse=sparse)
    return encoded

In [None]:
# Step 3: replace the "Status" column with one indicator column per status value
ohe_df = one_hot_encode(dt_df, "Status")

# Persist the result so an interrupted run can resume from here
ohe_df.to_parquet("checkpoint_03_one_hot_encoded.parquet", index=False)
print("Saved checkpoint 3")

# The pre-encoding frame is no longer needed
del dt_df
gc.collect()

# Last expression of the cell: display the frame
ohe_df

In [None]:
def add_time_features(df: pd.DataFrame, datetime_col: str) -> pd.DataFrame:
    """
    Return a copy of ``df`` extended with calendar features derived from ``datetime_col``.

    If the column is not already a datetime dtype it is parsed in the copy,
    trying ISO8601 first, then pandas' "mixed" per-element inference, and
    finally a coercing parse that turns unparseable entries into NaT.

    Added columns (all prefixed with ``datetime_col``): ``_day``, ``_week``
    (ISO week, Int16), ``_seconds`` (seconds since midnight, Int32),
    ``_month`` and ``_weekday`` (ordered categoricals; NaT maps to a missing
    category).

    Raises KeyError when ``datetime_col`` is not a column of ``df``.
    """
    if datetime_col not in df.columns:
        raise KeyError(f"Column '{datetime_col}' not found in DataFrame.")

    def _ordered_labels(zero_based, labels):
        # Nullable Int8 tolerates missing positions; -1 is the "no category" code
        nullable_codes = zero_based.astype("Int8")
        filled_codes = nullable_codes.where(nullable_codes.notna(), other=-1)
        return pd.Categorical.from_codes(
            codes=filled_codes.astype("int8"),
            categories=labels,
            ordered=True,
        )

    enriched = df.copy()

    if not pd.api.types.is_datetime64_any_dtype(enriched[datetime_col]):
        # Strict-to-lenient parse attempts; the first one that succeeds wins
        for parse_options in ({"format": "ISO8601"}, {"format": "mixed"}):
            try:
                enriched[datetime_col] = pd.to_datetime(enriched[datetime_col], **parse_options)
            except Exception:
                continue
            break
        else:
            # Last resort: bad rows become NaT so processing can continue
            enriched[datetime_col] = pd.to_datetime(enriched[datetime_col], errors="coerce")

    parts = enriched[datetime_col].dt

    # Plain numeric parts
    enriched[f"{datetime_col}_day"] = parts.day
    enriched[f"{datetime_col}_week"] = parts.isocalendar().week.astype("Int16")

    # Seconds elapsed since midnight
    since_midnight = parts.hour * 3600 + parts.minute * 60 + parts.second
    enriched[f"{datetime_col}_seconds"] = since_midnight.astype("Int32")

    # Month and weekday as ordered categoricals (weekday: 0=Mon .. 6=Sun)
    enriched[f"{datetime_col}_month"] = _ordered_labels(
        parts.month - 1,
        [
            "January", "February", "March", "April", "May", "June",
            "July", "August", "September", "October", "November", "December",
        ],
    )
    enriched[f"{datetime_col}_weekday"] = _ordered_labels(
        parts.dayofweek,
        ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"],
    )

    return enriched


In [None]:
# Step 4: derive day/week/seconds/month/weekday features.
# NOTE(review): this passes the raw "EventTime" column rather than "EventTime_DT", so
# add_time_features re-parses it in its copy and the new columns are named "EventTime_*".
# Confirm this is intended before switching columns; downstream names depend on it.
dt_df = add_time_features(ohe_df, "EventTime")

# Persist the result so an interrupted run can resume from here
dt_df.to_parquet("checkpoint_04_time_features.parquet", index=False)
print("Saved checkpoint 4")

# add_time_features returned a copy, so the pre-feature frame can be released
del ohe_df
gc.collect()

# Last expression of the cell: display the frame
dt_df

In [None]:
def recover_failed_timestamps(df: pd.DataFrame, datetime_str_col: str, datetime_col: str) -> pd.DataFrame:
    """
    Attempt to re-parse timestamps that failed initial conversion.

    Rows where ``datetime_col`` is missing are re-parsed from
    ``datetime_str_col`` with a fixed list of formats, tried in order. Each
    format is applied only to rows that are still missing, and only the rows
    it parses are written back.

    Args:
        df: Input frame; it is not modified.
        datetime_str_col: Column holding the original timestamp values.
        datetime_col: Datetime column whose missing entries should be filled.

    Returns:
        A copy of ``df`` with recovered timestamps filled in.
    """
    df_fixed = df.copy()

    # Plain ints keep the percentage maths below free of numpy divide warnings
    nat_count_before = int(df_fixed[datetime_col].isna().sum())

    print(f"Attempting to recover {nat_count_before:,} failed timestamps")

    # Try multiple formats
    formats_to_try = [
        "%Y-%m-%d %H:%M:%S",           # Without milliseconds
        "%Y-%m-%d %H:%M:%S.%f",        # With milliseconds
        "%Y/%m/%d %H:%M:%S",           # Different separator
        "%d-%m-%Y %H:%M:%S",           # DD-MM-YYYY format
        "ISO8601",                      # ISO standard
    ]

    for fmt in formats_to_try:
        # Only try to fix rows that are still NaT
        still_nat = df_fixed[datetime_col].isna()
        if not still_nat.any():
            break

        print(f"Trying format: {fmt}")

        try:
            # "ISO8601" is passed through as a format keyword like any other entry
            parsed = pd.to_datetime(
                df_fixed.loc[still_nat, datetime_str_col],
                format=fmt,
                errors="coerce",
            )
        except Exception as e:
            print(f"Failed: {e}")
            continue

        # Write back only the rows this format parsed. Assigning by their own
        # index avoids combining boolean masks that have different indexes.
        parsed_ok = parsed.dropna()
        if not parsed_ok.empty:
            df_fixed.loc[parsed_ok.index, datetime_col] = parsed_ok
            print(f"Recovered {len(parsed_ok):,} timestamps")

    nat_count_after = int(df_fixed[datetime_col].isna().sum())
    total_recovered = nat_count_before - nat_count_after

    # Guard the denominators: nothing to recover or an empty frame would print "nan%"
    recovered_pct = total_recovered / nat_count_before * 100 if nat_count_before else 0.0
    missing_pct = nat_count_after / len(df) * 100 if len(df) else 0.0

    print(f"\nRecovered {total_recovered:,} timestamps ({recovered_pct:.2f}%)")
    print(f"Still missing: {nat_count_after:,} timestamps ({missing_pct:.4f}%)")

    return df_fixed

In [None]:
def handle_missing_values_final(df: pd.DataFrame, drop_threshold: float = 0.001) -> pd.DataFrame:
    """
    Final missing value handling after timestamp recovery.

    Strategy:
    - Drop rows with unrecoverable timestamps (can't use for time-series)
    - Drop rows with missing Value if they make up less than
      ``drop_threshold`` of the remaining rows; otherwise fill them with -1

    Args:
        df: Frame with 'EventTime_DT' and 'Value' columns; it is not modified.
        drop_threshold: Fraction of rows below which missing Values are
            dropped rather than filled (default 0.001, i.e. 0.1%).

    Returns:
        A cleaned copy of ``df``. An empty input yields an empty result
        instead of a ZeroDivisionError in the summary.
    """

    def _pct(part, whole):
        # Percentage that stays defined (0.0) when the denominator is zero
        return part / whole * 100 if whole else 0.0

    initial_rows = len(df)

    print("FINAL MISSING VALUE HANDLING")

    # Drop rows with NaT timestamps (unrecoverable)
    nat_count = int(df['EventTime_DT'].isna().sum())
    df_clean = df.dropna(subset=['EventTime_DT']).copy()

    print(f"\nDropped {nat_count:,} rows with unrecoverable timestamps ({_pct(nat_count, initial_rows):.4f}%)")

    # Handle missing Values
    value_missing = int(df_clean['Value'].isna().sum())

    if value_missing > 0:
        # value_missing > 0 implies df_clean is non-empty, so this division is safe
        if value_missing / len(df_clean) < drop_threshold:
            # Very few missing: just drop them
            df_clean = df_clean.dropna(subset=['Value'])
            print(f"Dropped {value_missing:,} rows with missing Value ({_pct(value_missing, initial_rows):.4f}%)")
        else:
            # Otherwise, fill with -1 (the sentinel used for error values)
            df_clean['Value'] = df_clean['Value'].fillna(-1)
            print(f"Filled {value_missing:,} missing Values with -1 (error indicator)")
    else:
        print("No missing Values to handle")

    final_rows = len(df_clean)
    total_dropped = initial_rows - final_rows
    print(f"Initial rows: {initial_rows:,}")
    print(f"Final rows: {final_rows:,}")
    print(f"Dropped: {total_dropped:,} ({_pct(total_dropped, initial_rows):.2f}%)")
    print(f"Retention rate: {_pct(final_rows, initial_rows):.2f}%")

    return df_clean

In [None]:
# Step 5a: re-parse rows whose "EventTime_DT" is missing, using the "EventTime" column as source
dt_df_recovered = recover_failed_timestamps(dt_df, 'EventTime', 'EventTime_DT')
# Step 5b: drop rows without a timestamp and drop or fill rows without a Value
dt_df_clean = handle_missing_values_final(dt_df_recovered)

# Persist the result so an interrupted run can resume from here
dt_df_clean.to_parquet("checkpoint_05_missing_handled.parquet", index=False)
print("\nSaved checkpoint 5: Missing values handled")

# Both helpers return copies, so the intermediate frames can be released
del dt_df, dt_df_recovered
import gc
gc.collect()
# Last expression of the cell: preview the cleaned frame
dt_df_clean.head()

Attempting to recover 4,600,000 failed timestamps...
  Trying format: %Y-%m-%d %H:%M:%S...
    ✓ Recovered 4,600,000 timestamps

✓ Recovered 4,600,000 timestamps (100.00%)
✗ Still missing: 0 timestamps (0.0000%)

FINAL MISSING VALUE HANDLING

1. Dropped 0 rows with unrecoverable timestamps (0.0000%)
2. Dropped 74,875 rows with missing Value (0.0348%)

SUMMARY:
  Initial rows: 214,991,102
  Final rows: 214,916,227
  Dropped: 74,875 (0.03%)
  Retention rate: 99.97%

✓ Saved checkpoint 5: Missing values handled


Unnamed: 0,TagName,EventTime,Value,Value_str,EventTime_DT,Status_Bad,Status_Good,Status_Questionable,"Status_Substituted, Good",EventTime_day,EventTime_week,EventTime_seconds,EventTime_month,EventTime_weekday
0,A2PS64V0J.:ZUX09R,2024-01-02 20:03:46,0.34,,2024-01-02 20:03:46,False,True,False,False,2,1,72226,January,Tuesday
1,A2PS64V0J.:ZUX09R,2024-01-02 16:00:12,0.15,,2024-01-02 16:00:12,False,True,False,False,2,1,57612,January,Tuesday
2,A2PS64V0J.:ZUX09R,2024-01-02 11:56:42,0.13,,2024-01-02 11:56:42,False,True,False,False,2,1,43002,January,Tuesday
3,A2PS64V0J.:ZUX09R,2024-01-02 07:53:11,0.12,,2024-01-02 07:53:11,False,True,False,False,2,1,28391,January,Tuesday
4,A2PS64V0J.:ZUX09R,2024-01-02 03:49:45,0.13,,2024-01-02 03:49:45,False,True,False,False,2,1,13785,January,Tuesday


In [None]:
def sort_by_datetime(df: pd.DataFrame, datetime_col: str, ascending: bool = True) -> pd.DataFrame:
    """
    Order ``df`` chronologically by ``datetime_col``, mutating it in place.

    The sort is stable (mergesort), missing timestamps go last, and the index
    is replaced by a fresh 0..n-1 range. The same DataFrame object is
    returned for convenience.

    Raises KeyError when ``datetime_col`` is not a column of ``df``.
    """
    if datetime_col not in df.columns:
        raise KeyError(f"Column '{datetime_col}' not found in DataFrame.")

    row_total = len(df)
    print(f"Sorting {row_total:,} rows by '{datetime_col}'")

    sort_options = {
        "by": datetime_col,
        "ascending": ascending,
        "na_position": 'last',
        "kind": 'mergesort',   # stable: ties keep their current relative order
        "inplace": True,       # no second copy of the frame
    }
    df.sort_values(**sort_options)

    # Discard the shuffled labels and renumber rows, also without copying
    df.reset_index(drop=True, inplace=True)

    print("Sorted in-place")

    return df

In [None]:
# sort_by_datetime sorts its argument in place and returns it, so sorted_df and
# dt_df_clean refer to the same, now chronologically ordered, frame
sorted_df = sort_by_datetime(dt_df_clean, "EventTime_DT")

# Overwrite checkpoint 5 with sorted data
sorted_df.to_parquet("checkpoint_05_missing_handled.parquet", index=False)
print("Updated checkpoint 5 with sorted data")

# Last expression of the cell: display the frame
sorted_df

Sorting 214,916,227 rows by 'EventTime_DT'...
✓ Sorted in-place
✓ Updated checkpoint 5 with sorted data


Unnamed: 0,TagName,EventTime,Value,Value_str,EventTime_DT,Status_Bad,Status_Good,Status_Questionable,"Status_Substituted, Good",EventTime_day,EventTime_week,EventTime_seconds,EventTime_month,EventTime_weekday
0,20PX.D20V:04SXA,2023-12-31 00:00:00.000,4414.219727,,2023-12-31 00:00:00.000,False,True,False,False,31,52,0,December,Sunday
1,S:.2P8RVXA39UX06,2023-12-31 00:00:00.000,15.995943,,2023-12-31 00:00:00.000,False,True,False,False,31,52,0,December,Sunday
2,X8VPHB16088.0:V,2023-12-31 00:00:00.000,247.606781,,2023-12-31 00:00:00.000,False,True,False,False,31,52,0,December,Sunday
3,NGX0.11S:7EXN07R,2023-12-31 00:00:00.000,-2777.647461,,2023-12-31 00:00:00.000,False,True,False,False,31,52,0,December,Sunday
4,X0N71R1S7PV:.X0,2023-12-31 00:00:00.000,666.784058,,2023-12-31 00:00:00.000,False,True,False,False,31,52,0,December,Sunday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214916222,:0CCTP530TNYOHS1.,2024-07-27 23:59:57.101,234.000290,,2024-07-27 23:59:57.101,False,True,False,False,27,30,86397,July,Saturday
214916223,:NC1HPT44Y0OC5T.S,2024-07-27 23:59:57.101,143.000000,,2024-07-27 23:59:57.101,False,True,False,False,27,30,86397,July,Saturday
214916224,NPSH593TCOY01.:CT,2024-07-27 23:59:57.101,42.000000,,2024-07-27 23:59:57.101,False,True,False,False,27,30,86397,July,Saturday
214916225,COHP35YTT1CN21:S.,2024-07-27 23:59:57.101,130.000000,,2024-07-27 23:59:57.101,False,True,False,False,27,30,86397,July,Saturday


In [None]:
#### Optional, just for information ####

def investigate_extreme_values(df: pd.DataFrame, n_sigma: float = 6.0) -> None:
    """
    Investigate extreme values in the Value column.

    Prints summary statistics, percentiles, the number of values further than
    ``n_sigma`` standard deviations from the mean, up to ten sample rows per
    tail, and value ranges for the five most frequent TagNames. Rows whose
    Value equals the -1 error sentinel are excluded from every statistic and
    from the printed samples.

    Args:
        df: Frame with 'TagName', 'EventTime_DT', 'Value', 'Value_str' columns.
        n_sigma: Distance from the mean, in standard deviations, beyond which
            a value is reported as extreme (default 6).
    """
    print("EXTREME VALUE INVESTIGATION")

    # Filter out error values
    is_valid = df['Value'] != -1
    valid_values = df.loc[is_valid, 'Value']

    print(f"\nValid values statistics:")
    print(f"  Count: {len(valid_values):,}")
    print(f"  Min: {valid_values.min():.2f}")
    print(f"  Max: {valid_values.max():.2f}")
    print(f"  Mean: {valid_values.mean():.2f}")
    print(f"  Median: {valid_values.median():.2f}")
    print(f"  Std: {valid_values.std():.2f}")

    # Check percentiles
    print(f"\nPercentiles:")
    for p in [0.01, 0.1, 1, 5, 25, 50, 75, 95, 99, 99.9, 99.99]:
        val = valid_values.quantile(p/100)
        print(f"  {p:5.2f}%: {val:20,.2f}")

    # Find extreme outliers (beyond n_sigma)
    mean = valid_values.mean()
    std = valid_values.std()
    upper_limit = mean + n_sigma * std
    lower_limit = mean - n_sigma * std

    # Masks span the whole frame but stay restricted to valid rows, so the
    # sample rows printed below always match the counts (no -1 sentinels).
    extreme_high = is_valid & (df['Value'] > upper_limit)
    extreme_low = is_valid & (df['Value'] < lower_limit)
    high_count = extreme_high.sum()
    low_count = extreme_low.sum()

    print(f"\nExtreme outliers (> {n_sigma:g} sigma):")
    print(f"Extremely high: {high_count:,}")
    print(f"Extremely low: {low_count:,}")

    # examples of extreme values
    sample_columns = ['TagName', 'EventTime_DT', 'Value', 'Value_str']
    if high_count > 0:
        print(f"\nSample extremely high values:")
        print(df.loc[extreme_high, sample_columns].head(10))

    if low_count > 0:
        print(f"\nSample extremely low values:")
        print(df.loc[extreme_low, sample_columns].head(10))

    # Check value ranges for the five most frequent sensors
    print(f"\nValue ranges for sample sensors:")
    sample_sensors = df['TagName'].value_counts().head(5).index
    for sensor in sample_sensors:
        sensor_valid = df.loc[is_valid & (df['TagName'] == sensor), 'Value']
        if len(sensor_valid) > 0:
            print(f"\n  {sensor}:")
            print(f"    Count: {len(sensor_valid):,}")
            print(f"    Range: {sensor_valid.min():.2f} to {sensor_valid.max():.2f}")
            print(f"    Mean: {sensor_valid.mean():.2f}")

# Run investigation (optional, informational only: prints statistics and returns nothing)
import numpy as np
investigate_extreme_values(dt_df_clean)


EXTREME VALUE INVESTIGATION

Valid values statistics:
  Count: 214,419,974
  Min: -736671805447501512704.00
  Max: 6265861308416.00
  Mean: -6902653122450.62
  Median: 42.52
  Std: 71147326878169032.00

Percentiles:
   0.01%:           -57,568.29
   0.10%:           -14,841.97
   1.00%:            -2,213.50
   5.00%:                -0.11
  25.00%:                 3.23
  50.00%:                42.52
  75.00%:               142.50
  95.00%:             2,499.45
  99.00%:            43,509.33
  99.90%:           395,519.74
  99.99%:         7,120,637.80

Extreme outliers (> 6 sigma):
  Extremely high: 0
  Extremely low: 6

Sample extremely low values:
                  TagName        EventTime_DT         Value Value_str
40985198  0N:P7RXV031S0.X 2024-02-08 02:30:17 -1.643073e+18       NaN
40985201  VX0XN07P1.1:3RS 2024-02-08 02:30:17 -7.366718e+20       NaN
40985240  V0R0.:0X715PNSX 2024-02-08 02:30:22 -1.643073e+18       NaN
40985954  0N:P7RXV031S0.X 2024-02-08 02:31:13 -1.643073e+18    

In [None]:
def handle_extreme_outliers(df: pd.DataFrame, n_sigma: float = 10.0) -> pd.DataFrame:
    """
    Handle extreme outliers that are likely data corruption.
    Uses robust statistics (median + MAD) to avoid being influenced by outliers.

    Values further than ``n_sigma`` MAD-based standard deviations from the
    median are replaced by the -1 error sentinel, and their 'Value_str' is set
    to 'Extreme Outlier' when it holds no real text (missing, or the literal
    "NaN" placeholder written by the value-separation step). Rows that already
    carry the -1 sentinel are ignored.

    NOTE(review): median and MAD are computed over all TagNames pooled
    together, so sensors that normally operate outside the pooled range are
    flagged wholesale. Confirm this is intended, or compute bounds per tag.

    Args:
        df: Frame with 'TagName', 'EventTime_DT', 'Value', 'Value_str' columns;
            it is not modified.
        n_sigma: Width of the accepted band in MAD-based standard deviations.

    Returns:
        A copy of ``df`` with outliers marked.
    """
    print("HANDLING EXTREME OUTLIERS")

    df_clean = df.copy()

    # Get valid values (exclude existing errors)
    valid_mask = df_clean['Value'] != -1
    valid_values = df_clean.loc[valid_mask, 'Value']

    # MAD = Median Absolute Deviation
    median = valid_values.median()
    mad = (valid_values - median).abs().median()

    # Convert MAD to std equivalent (for normal distribution)
    std_equivalent = mad * 1.4826

    print(f"\nRobust statistics (using median and MAD):")
    print(f"Median: {median:.2f}")
    print(f"MAD-based std: {std_equivalent:.2f}")

    # Define outlier thresholds
    lower_bound = median - (n_sigma * std_equivalent)
    upper_bound = median + (n_sigma * std_equivalent)

    print(f"\nOutlier thresholds ({n_sigma} sigma):")
    print(f"Lower bound: {lower_bound:,.2f}")
    print(f"Upper bound: {upper_bound:,.2f}")

    # Find outliers
    outlier_mask = valid_mask & ((df_clean['Value'] < lower_bound) | (df_clean['Value'] > upper_bound))
    outlier_count = int(outlier_mask.sum())
    outlier_pct = outlier_count / len(df) * 100 if len(df) else 0.0

    print(f"\nOutliers detected: {outlier_count:,} ({outlier_pct:.4f}%)")

    if outlier_count > 0:
        # Show examples
        print(f"\nSample outliers to be marked as errors:")
        outlier_samples = df_clean[outlier_mask][['TagName', 'EventTime_DT', 'Value', 'Value_str']].head(10)
        print(outlier_samples)

        # Mark outliers as errors
        df_clean.loc[outlier_mask, 'Value'] = -1

        # Label the reason. 'Value_str' is normally pre-filled with the string
        # "NaN" rather than a real missing value, so a plain fillna() would
        # never label anything; treat that placeholder as "no text" as well.
        unlabeled = df_clean['Value_str'].isna() | (df_clean['Value_str'] == 'NaN')
        df_clean.loc[outlier_mask & unlabeled, 'Value_str'] = 'Extreme Outlier'

        print(f"\nMarked {outlier_count:,} extreme outliers as errors (Value=-1)")

        # Update statistics
        new_valid = df_clean[df_clean['Value'] != -1]['Value']
        print(f"\nUpdated statistics (after outlier removal):")
        print(f"Count: {len(new_valid):,}")
        print(f"Min: {new_valid.min():.2f}")
        print(f"Max: {new_valid.max():.2f}")
        print(f"Mean: {new_valid.mean():.2f}")
        print(f"Median: {new_valid.median():.2f}")
        print(f"Std: {new_valid.std():.2f}")
    else:
        print("\nNo extreme outliers found")

    return df_clean

# Apply outlier handling; the function returns a modified copy, which replaces dt_df_clean
dt_df_clean = handle_extreme_outliers(dt_df_clean, n_sigma=10.0)

# Save updated checkpoint again (same file name as the sorted checkpoint 5, which is overwritten)
dt_df_clean.to_parquet("checkpoint_05_missing_handled.parquet", index=False)
print("\nUpdated checkpoint 5 with outliers handled")

HANDLING EXTREME OUTLIERS

Robust statistics (using median and MAD):
  Median: 42.52
  MAD-based std: 62.81

Outlier thresholds (10.0 sigma):
  Lower bound: -585.60
  Upper bound: 670.64

Outliers detected: 26,780,921 (12.4611%)

Sample outliers to be marked as errors:
               TagName EventTime_DT         Value Value_str
0      20PX.D20V:04SXA   2023-12-31   4414.219727       NaN
3     NGX0.11S:7EXN07R   2023-12-31  -2777.647461       NaN
5     X07:NX.7110RSOPS   2023-12-31   3444.431396       NaN
6      NXVPX7R013S.1:0   2023-12-31  -9966.505859       NaN
8      XNX0R:31SV0.P57   2023-12-31   2366.526123       NaN
11    V500PDXP7AX9I:6.   2023-12-31   1056.000000       NaN
12          092O.VL5PI   2023-12-31  32950.367188       NaN
17     CVT55:101DP4T5.   2023-12-31    876.863647       NaN
19  80CWC520PDMU0:U.5S   2023-12-31   3102.424805       NaN
24     50V0FP259:D.4TC   2023-12-31   1047.814819       NaN

✓ Marked 26,780,921 extreme outliers as errors (Value=-1)

Updated st

In [None]:
def validate_data_quality(df: pd.DataFrame) -> dict:
    """
    Comprehensive data quality validation for model readiness.

    Runs eight read-only checks on ``df`` and prints a report for each:
    missing values, infinite values, object-typed columns, timestamp validity
    and ordering ('EventTime_DT'), value distribution ('Value', with -1 treated
    as the error marker), status column totals ('Status_*'), sensor coverage
    ('TagName') and memory usage. Checks 4, 5 and 7 are skipped when their
    column is absent.

    Findings are collected in two lists: ``issues`` (missing values, infinite
    values, NaT timestamps) and ``warnings`` (object columns, unsorted
    timestamps, more than 10% error values, sensors with fewer than 10
    readings).

    Requires ``np`` (numpy) to be available in the enclosing scope when called.

    Returns:
        dict with keys 'status' ("READY", "READY_WITH_WARNINGS" or
        "NEEDS_ATTENTION"), 'issues', 'warnings', 'total_rows',
        'total_columns' and 'memory_mb'.
    """
    print("DATA QUALITY VALIDATION")
    
    # issues -> status "NEEDS_ATTENTION"; warnings alone -> "READY_WITH_WARNINGS"
    issues = []
    warnings = []
    
    # 1. Check for missing values
    print("\n1. Missing Values Check:")
    # Total count of null cells across every column
    missing = df.isnull().sum().sum()
    if missing > 0:
        issues.append(f"Found {missing:,} missing values")
        print(f"Found {missing:,} missing values")
    else:
        print(f"No missing values")
    
    # 2. Check for infinite values
    print("\n2. Infinite Values Check:")
    # Only float64 and int64 columns are scanned; other numeric dtypes are not inspected here
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    inf_count = 0
    for col in numeric_cols:
        inf_in_col = np.isinf(df[col]).sum()
        if inf_in_col > 0:
            inf_count += inf_in_col
            issues.append(f"Column '{col}' has {inf_in_col:,} infinite values")
    
    if inf_count > 0:
        print(f"Found {inf_count:,} infinite values")
    else:
        print(f"No infinite values")
    
    # 3. Check data types
    print("\n3. Data Type Check:")
    # Object columns are reported as warnings, not issues
    object_cols = df.select_dtypes(include=['object']).columns.tolist()
    if len(object_cols) > 0:
        print(f"Found {len(object_cols)} object columns:")
        for col in object_cols:
            print(f"    - {col} (object type)")
            warnings.append(f"Column '{col}' is object type, may need encoding, check contents")
    else:
        print(f"All columns have appropriate numeric/datetime types")
    
    # 4. Check timestamp validity
    print("\n4. Timestamp Validity Check:")
    if 'EventTime_DT' in df.columns:
        nat_count = df['EventTime_DT'].isna().sum()
        if nat_count > 0:
            issues.append(f"Found {nat_count:,} NaT timestamps")
            print(f"Found {nat_count:,} NaT timestamps")
        else:
            # Range and ordering are only reported when every timestamp is present
            print(f"All timestamps valid")
            min_date = df['EventTime_DT'].min()
            max_date = df['EventTime_DT'].max()
            print(f"Date range: {min_date} to {max_date}")
            
            # Check if sorted
            is_sorted = df['EventTime_DT'].is_monotonic_increasing
            if is_sorted:
                print(f"Timestamps are sorted chronologically")
            else:
                warnings.append("Timestamps are not sorted")
                print(f"Timestamps are NOT sorted chronologically")
    
    # 5. Check value distribution
    print("\n5. Value Distribution Check:")
    if 'Value' in df.columns:
        # -1 is the error marker; everything else counts as a valid reading
        valid_values = (df['Value'] != -1).sum()
        error_values = (df['Value'] == -1).sum()
        total = len(df)
        
        print(f"Valid numeric values: {valid_values:,} ({valid_values/total*100:.2f}%)")
        print(f"Error values (-1): {error_values:,} ({error_values/total*100:.2f}%)")
        
        if error_values / total > 0.1:  # More than 10% errors
            warnings.append(f"{error_values/total*100:.2f}% of values are errors")
        
        # Check for extreme outliers in valid values
        valid_vals = df[df['Value'] != -1]['Value']
        if len(valid_vals) > 0:
            q99 = valid_vals.quantile(0.99)
            q01 = valid_vals.quantile(0.01)
            mean_val = valid_vals.mean()
            std_val = valid_vals.std()
            
            # Informational only: these statistics never add an issue or warning
            print(f"Value statistics (excluding -1):")
            print(f"Mean: {mean_val:.2f}, Std: {std_val:.2f}")
            print(f"1st percentile: {q01:.2f}, 99th percentile: {q99:.2f}")
    
    # 6. Check status distribution
    print("\n6. Status Distribution Check:")
    # Every column whose name starts with 'Status_' is summed and reported
    status_cols = [col for col in df.columns if col.startswith('Status_')]
    if status_cols:
        print(f"  Status columns:")
        for col in status_cols:
            count = df[col].sum()
            pct = count / len(df) * 100
            print(f"    {col}: {count:,} ({pct:.2f}%)")
    
    # 7. Check sensor count
    print("\n7. Sensor Coverage Check:")
    if 'TagName' in df.columns:
        unique_sensors = df['TagName'].nunique()
        print(f"  Unique sensors (TagNames): {unique_sensors:,}")
        
        # Check for sensors with very few readings
        sensor_counts = df['TagName'].value_counts()
        low_count_sensors = (sensor_counts < 10).sum()
        if low_count_sensors > 0:
            warnings.append(f"{low_count_sensors:,} sensors have fewer than 10 readings")
            print(f"  {low_count_sensors:,} sensors have < 10 readings")
    
    # 8. Memory usage
    print("\n8. Memory Usage:")
    # deep=True also measures the contents of object columns
    memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
    print(f"  Total memory: {memory_mb:.2f} MB ({memory_mb/1024:.2f} GB)")
    
    # Summary
    print("VALIDATION SUMMARY")
    
    # Status is decided by the lists alone: any issue outranks any number of warnings
    if len(issues) == 0 and len(warnings) == 0:
        print("All checks passed.")
        status = "READY"
    elif len(issues) == 0:
        print(f"No critical issues found.")
        print(f"{len(warnings)} warnings (non-critical):")
        for warning in warnings:
            print(f"  - {warning}")
        status = "READY_WITH_WARNINGS"
    else:
        print(f"Found {len(issues)} critical issues:")
        for issue in issues:
            print(f"  - {issue}")
        if len(warnings) > 0:
            print(f"Additional {len(warnings)} warnings:")
            for warning in warnings:
                print(f"  - {warning}")
        status = "NEEDS_ATTENTION"
    
    return {
        'status': status,
        'issues': issues,
        'warnings': warnings,
        'total_rows': len(df),
        'total_columns': len(df.columns),
        'memory_mb': memory_mb
    }

# Run validation; numpy must be imported before the call because the function uses np.isinf
import numpy as np
# The returned dict (status, issues, warnings, size figures) is kept for later inspection
validation_results = validate_data_quality(dt_df_clean)

DATA QUALITY VALIDATION

1. Missing Values Check:
  ✓ No missing values

2. Infinite Values Check:
  ✓ No infinite values

3. Data Type Check:
  ⚠ Found 2 object columns:
    - TagName (object type)
    - Value_str (object type)

4. Timestamp Validity Check:
  ✓ All timestamps valid
    Date range: 2023-12-31 00:00:00 to 2024-07-27 23:59:57.573000
  ✓ Timestamps are sorted chronologically

5. Value Distribution Check:
  Valid numeric values: 187,639,053 (87.31%)
  Error values (-1): 27,277,174 (12.69%)
  Value statistics (excluding -1):
    Mean: 80.00, Std: 129.38
    1st percentile: -32.50, 99th percentile: 564.69

6. Status Distribution Check:
  Status columns:
    Status_Bad: 1,273,117 (0.59%)
    Status_Good: 213,642,667 (99.41%)
    Status_Questionable: 190 (0.00%)
    Status_Substituted, Good: 253 (0.00%)

7. Sensor Coverage Check:
  Unique sensors (TagNames): 7,759
  ⚠ 976 sensors have < 10 readings

8. Memory Usage:
  Total memory: 36574.14 MB (35.72 GB)

VALIDATION SUMMARY
✓ 

In [14]:
def finalize_preprocessing(df: pd.DataFrame) -> None:
    """
    Write the final artefacts of the preprocessing run to the working directory.

    Produces three files: the full dataset as snappy-compressed parquet, a
    reproducible random sample of at most 100,000 rows as CSV, and a JSON
    metadata file describing the dataset (size, date range, Value statistics
    with -1 treated as the error marker, error breakdown by 'Value_str',
    'Status_*' totals, column names/dtypes and the list of preprocessing
    steps). Progress is printed; nothing is returned.
    """
    import json
    import os

    print("FINALIZING PREPROCESSING")

    # 1. Full dataset
    print("\n1. Saving final preprocessed dataset...")
    df.to_parquet("ShellData_preprocessed_final.parquet", index=False, compression='snappy')
    size_on_disk_mb = os.path.getsize("ShellData_preprocessed_final.parquet") / 1024 / 1024
    print(f"Saved to ShellData_preprocessed_final.parquet ({size_on_disk_mb:.2f} MB)")

    # 2. CSV sample for manual inspection (fixed seed keeps it reproducible)
    print("\n2. Saving sample dataset...")
    sample_size = min(100_000, len(df))
    df.sample(n=sample_size, random_state=42).to_csv("ShellData_preprocessed_sample.csv", index=False)
    print(f"Saved {sample_size:,} row sample to ShellData_preprocessed_sample.csv")

    # 3. Metadata
    print("\n3. Generating metadata...")

    row_total = len(df)
    is_valid = df['Value'] != -1
    is_error = df['Value'] == -1
    valid_count = is_valid.sum()
    error_count = is_error.sum()
    valid_values = df.loc[is_valid, 'Value']
    error_breakdown = df.loc[is_error, 'Value_str'].value_counts().to_dict()

    dataset_info = {
        "total_rows": int(row_total),
        "total_columns": int(len(df.columns)),
        "date_range_start": str(df['EventTime_DT'].min()),
        "date_range_end": str(df['EventTime_DT'].max()),
        "unique_sensors": int(df['TagName'].nunique()),
        "memory_usage_mb": float(df.memory_usage(deep=True).sum() / 1024 / 1024),
    }

    value_statistics = {
        "valid_values_count": int(valid_count),
        "valid_values_percent": float(valid_count / row_total * 100),
        "error_values_count": int(error_count),
        "error_values_percent": float(error_count / row_total * 100),
        "mean": float(valid_values.mean()),
        "median": float(valid_values.median()),
        "std": float(valid_values.std()),
        "min": float(valid_values.min()),
        "max": float(valid_values.max()),
    }

    status_distribution = {}
    for col in df.columns:
        if col.startswith('Status_'):
            status_distribution[col] = int(df[col].sum())

    metadata = {
        "dataset_info": dataset_info,
        "value_statistics": value_statistics,
        "error_breakdown": {str(label): int(count) for label, count in error_breakdown.items()},
        "status_distribution": status_distribution,
        "columns": list(df.columns),
        "column_types": {col: str(dtype) for col, dtype in df.dtypes.items()},
        # Fixed description of the pipeline; the figures are not derived from df
        "preprocessing_steps": [
            "1. Separated text values from numeric values in Value column",
            "2. Converted EventTime strings to datetime objects",
            "3. One-hot encoded Status column",
            "4. Extracted time features (day, week, seconds, month, weekday)",
            "5. Recovered 4.6M failed timestamp parsings",
            "6. Removed 74,875 rows with missing values (0.03%)",
            "7. Sorted chronologically by EventTime_DT",
            "8. Marked 26.78M extreme outliers as errors (12.46%)",
        ],
    }

    with open("ShellData_preprocessed_metadata.json", "w") as f:
        json.dump(metadata, f, indent=2)

    print("Saved metadata to ShellData_preprocessed_metadata.json")

    print("\nFiles created:")
    print("  - ShellData_preprocessed_final.parquet (full dataset)")
    print("  - ShellData_preprocessed_sample.csv (100k sample)")
    print("  - ShellData_preprocessed_metadata.json (metadata)")
# Finalize: write the final parquet file, the CSV sample and the JSON metadata
finalize_preprocessing(dt_df_clean)

FINALIZING PREPROCESSING

1. Saving final preprocessed dataset...
   ✓ Saved to ShellData_preprocessed_final.parquet (2022.29 MB)

2. Saving sample dataset...
Saved 100,000 row sample to ShellData_preprocessed_sample.csv

3. Generating metadata...
Saved metadata to ShellData_preprocessed_metadata.json

Files created:
  - ShellData_preprocessed_final.parquet (full dataset)
  - ShellData_preprocessed_sample.csv (100k sample)
  - ShellData_preprocessed_metadata.json (metadata)
