In [1]:
# notebooks/neo_pipeline.ipynb

# Setup cell: install the one runtime dependency this notebook needs.
#
# Local imports such as `scripts.*` do not require installing the project itself: the next
# cell inserts the project root (the parent of the notebook's working directory) into sys.path.
# Only openpyxl is installed here; it is the runtime dependency needed to read .xlsx files.
# If you prefer an editable install (pip install -e .), first add a minimal pyproject.toml or
# setup.py to the project root and run that command manually -- an editable install often
# fails when the project is not packaged.
# NOTE(review): the version is unpinned; consider pinning it (openpyxl==<version>) so that
# re-runs on a fresh environment are reproducible.

%pip install openpyxl

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# --- Standard library ---
import importlib
import sys
from pathlib import Path

# Make the project root importable. The notebook is expected to run from notebooks/,
# so the parent of the current working directory is taken as the project root.
project_root = Path.cwd().parent
project_root_str = str(project_root)
if project_root_str not in sys.path:
    # Front of the list so the local `scripts` package wins over any installed namesake.
    sys.path.insert(0, project_root_str)

# --- Project-local pipeline stages (resolvable only after the sys.path tweak above) ---
from scripts.ingestion import load_excel
from scripts.cleaning import clean_performance_data
from scripts.transformation import transform_data

# Reload scripts.utils before pulling names out of it, so that edits made to the module
# after the kernel first imported it are visible in this session.
import scripts.utils as utils
importlib.reload(utils)
from scripts.utils import log_message, save_output, setup_logger

# --- Third-party ---
import pandas as pd

In [3]:
# Run the NeoStats pipeline end to end: ingest -> combine -> clean -> transform -> save.
# Every stage reports completion through log_message. Any failure is caught so the notebook
# keeps running; the short reason AND the full traceback are written to the log.
# All paths are relative to the working directory (expected to be notebooks/).
import traceback  # imported here so this cell works on its own

# Initialize logger
setup_logger("../logs/pipeline.log")

log_message("Starting NeoStats Data Pipeline...")

try:
    # Step 1: Load Excel data (metadata sheet plus the two station sheets)
    file_path = "../data/Data Engineering Use Case Dataset.xlsx"
    metadata, st1, st2 = load_excel(file_path)

    # Guard against a loader that signals a missing sheet by returning None
    if metadata is None or st1 is None or st2 is None:
        raise ValueError("Failed to load one or more sheets from the Excel file.")

    log_message("Data ingestion completed.")

    # Step 2: Combine station performance tables into one DataFrame.
    # ignore_index=True renumbers rows 0..n-1; columns present in only one table are
    # kept and filled with NaN for the rows of the other table.
    perf_data = pd.concat([st1, st2], ignore_index=True)
    print('Combined station data shape:', perf_data.shape)
    log_message('Station data combined')

    # Step 3: Clean performance data (call the cleaning function)
    cleaned_data = clean_performance_data(perf_data)
    log_message("Data cleaning completed.")

    # Step 4: Transform and merge with metadata
    final_data = transform_data(cleaned_data, metadata)
    log_message("Data transformation completed.")

    # Step 5: Save final output to CSV
    save_output(final_data, "../output/structured_data.csv")
    log_message("Pipeline executed successfully!")

except Exception as e:
    # Log a short summary first, then the full traceback. The console message below sends
    # the reader to the logs, so the logs must actually contain the failure details
    # (exception type and the line that raised), not just str(e).
    log_message("Pipeline failed: " + str(e), level="error")
    log_message(traceback.format_exc(), level="error")
    print('Pipeline failed. See logs for details.')

2025-10-30 17:15:12,522 | INFO | Logger initialized.
2025-10-30 17:15:12,525 | INFO | Starting NeoStats Data Pipeline...


INFO: Starting NeoStats Data Pipeline...


2025-10-30 17:15:17,702 | INFO | Data ingestion completed.
2025-10-30 17:15:17,726 | INFO | Station data combined
2025-10-30 17:15:17,804 | INFO | Data cleaning completed.
2025-10-30 17:15:17,836 | INFO | Data transformation completed.


Loaded successfully:
 - Metadata shape: (100, 9)
 - Station1 shape: (3000, 14)
 - Station2 shape: (2000, 11)
INFO: Data ingestion completed.
Combined station data shape: (5000, 14)
INFO: Station data combined
Dropped irrelevant columns (if any).
Filled 520 missing values in Memory_Usage (%) with 0.
Filled 271 missing values in Disk_IO (%) with 0.
Filled 208 missing values in Network_Traffic_Out (MB/s) with 0.
Unified schema and sorted columns.
Final cleaned data: 5000 rows, 11 columns
No major anomalies detected.
INFO: Data cleaning completed.
Starting transformation...
CPU Utilization column ready.
Memory Utilization column ready.
Disk I/O Rate column ready.
Calculated Network Throughput (average of In/Out).
Merged performance data with metadata on Server_ID.
Added Anomaly_Flag based on CPU Utilization > 85%.
Transformation complete: rows = 5000 columns = 21
INFO: Data transformation completed.


2025-10-30 17:15:18,047 | INFO | Output saved successfully: ../output/structured_data.csv
2025-10-30 17:15:18,049 | INFO | Pipeline executed successfully!


Output saved to ../output/structured_data.csv
INFO: Pipeline executed successfully!
