# Load ETH Pipeline Parquet Files

This notebook opens the parquet outputs from the ETH pipeline and loads each into a pandas DataFrame.
It also prints shape/columns and previews a few rows for quick sanity checks.


In [1]:
import os
from pathlib import Path

import pandas as pd
from IPython.display import display

try:
    from dotenv import load_dotenv
    load_dotenv()
except Exception:
    pass

# Work out where the project lives. An explicit PROJECT_ROOT environment
# variable takes precedence; without it, start from the current working
# directory and step up one level when only the parent holds a "data" folder.
PROJECT_ROOT = os.getenv("PROJECT_ROOT", "").strip()
if not PROJECT_ROOT:
    project_root = Path.cwd().resolve()
    if not (project_root / "data").exists():
        if (project_root.parent / "data").exists():
            project_root = project_root.parent
else:
    project_root = Path(PROJECT_ROOT).expanduser().resolve()
data_path = project_root.joinpath("data")

# Each pipeline output is stored as "<dataset key>.parquet", so the mapping is
# derived from the ordered list of dataset keys.
FILENAMES = {
    dataset: f"{dataset}.parquet"
    for dataset in (
        "chainlink_rounds",
        "ethusd_hourly",
        "deribit_eth_dvol_raw",
        "eth_iv_index_hourly",
        "deribit_eth_option_surface_raw",
        "eth_iv_surface_hourly",
        "eth_training_hourly",
    )
}

def resolve_parquet(name: str, filename: str) -> Path:
    candidate = data_path / filename
    if candidate.exists():
        return candidate

# Pair every dataset key with the location of its parquet file; the value is
# None for any file that resolve_parquet could not find.
PARQUET_PATHS = dict(
    (dataset, resolve_parquet(dataset, parquet_file))
    for dataset, parquet_file in FILENAMES.items()
)
PARQUET_PATHS


{'chainlink_rounds': None,
 'ethusd_hourly': None,
 'deribit_eth_dvol_raw': None,
 'eth_iv_index_hourly': None,
 'deribit_eth_option_surface_raw': None,
 'eth_iv_surface_hourly': None,
 'eth_training_hourly': None}

In [3]:
# Show the directory in which the parquet files are looked up.
data_path

PosixPath('/home/carlos/investing/notebooks/data')

In [2]:


def resolve_parquet(name: str, filename: str) -> Path | None:
    candidate = data_path / filename
    if candidate.exists():
        return candidate
    return None

def load_parquet(name: str, path: Path | None):
    if path is None:
        print(f"Missing: {name} (expected at {data_path / FILENAMES[name]})")
        return None
    if not path.exists():
        print(f"Missing: {path}")
        return None
    df = pd.read_parquet(path)
    print(f"Loaded {path.name}: rows={len(df):,}, cols={len(df.columns)} from {path}")
    return df

# Load each pipeline output into its own DataFrame variable. A variable is None
# when load_parquet reports the corresponding file as missing.
chainlink_rounds_df = load_parquet("chainlink_rounds", PARQUET_PATHS["chainlink_rounds"])
ethusd_hourly_df = load_parquet("ethusd_hourly", PARQUET_PATHS["ethusd_hourly"])
deribit_eth_dvol_raw_df = load_parquet("deribit_eth_dvol_raw", PARQUET_PATHS["deribit_eth_dvol_raw"])
eth_iv_index_hourly_df = load_parquet("eth_iv_index_hourly", PARQUET_PATHS["eth_iv_index_hourly"])
deribit_eth_option_surface_raw_df = load_parquet("deribit_eth_option_surface_raw", PARQUET_PATHS["deribit_eth_option_surface_raw"])
eth_iv_surface_hourly_df = load_parquet("eth_iv_surface_hourly", PARQUET_PATHS["eth_iv_surface_hourly"])
eth_training_hourly_df = load_parquet("eth_training_hourly", PARQUET_PATHS["eth_training_hourly"])


AttributeError: 'NoneType' object has no attribute 'exists'

In [None]:
# Preview only the first rows rather than rendering the whole frame. The
# variable is None when its parquet file was missing, in which case the cell
# shows nothing.
eth_training_hourly_df.head() if eth_training_hourly_df is not None else None