In [2]:
import os
import pandas as pd
from dotenv import load_dotenv

In [3]:
# Load variables from a .env file into the process environment before the
# os.getenv lookups that follow.
# NOTE(review): the recorded output is False - presumably no .env file was
# found, so the hard-coded defaults are what actually get used; confirm.
load_dotenv()

False

In [4]:
# Data locations can be overridden through environment variables; when a
# variable is unset the relative default path is used instead.
DATA_DIR_RAW = os.environ.get("DATA_DIR_RAW", "data/raw")
DATA_DIR_PROCESSED = os.environ.get("DATA_DIR_PROCESSED", "data/processed")

In [5]:
# Make sure both data directories exist before anything is written to them;
# exist_ok=True keeps the cell safe to re-run.
for data_dir in (DATA_DIR_RAW, DATA_DIR_PROCESSED):
    os.makedirs(data_dir, exist_ok=True)


In [6]:
# Small demo table used to exercise the CSV / Parquet round trip below.
sample_rows = [
    (1, "Alice", 85),
    (2, "Bob", 92),
    (3, "Charlie", 78),
]
df = pd.DataFrame(sample_rows, columns=["id", "name", "score"])

In [7]:
pip install pyarrow


Note: you may need to restart the kernel to use updated packages.


In [8]:
# Write the demo frame as CSV into the raw-data directory, without the index.
csv_path = os.path.join(DATA_DIR_RAW, "sample.csv")
df.to_csv(path_or_buf=csv_path, index=False)

In [9]:
# Write the same frame as Parquet into the processed-data directory, without
# the index.
parquet_path = os.path.join(DATA_DIR_PROCESSED, "sample.parquet")
df.to_parquet(path=parquet_path, index=False)

In [24]:
def validate_files(csv_path: str, parquet_path: str, critical_cols: dict) -> dict:
    """Compare a CSV file and a Parquet file that should hold the same table.

    Both files are loaded with pandas; their shapes are compared and the
    dtype of every column listed in ``critical_cols`` is checked against the
    expected dtype string.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file (read with ``pd.read_csv``).
    parquet_path : str
        Path of the Parquet file (read with ``pd.read_parquet``).
    critical_cols : dict
        Mapping of column name -> expected dtype string, e.g. ``"int64"``.

    Returns
    -------
    dict
        ``shape_match`` (bool), ``csv_shape`` and ``parquet_shape`` (tuples),
        and ``dtype_check``: one entry per critical column with the keys
        ``csv``, ``parquet``, ``expected``, ``ok_csv`` and ``ok_parquet``.
        A column that is absent from a file is reported with dtype ``None``
        and a ``False`` flag rather than aborting the validation with a
        ``KeyError``.
    """
    df_csv = pd.read_csv(csv_path)
    df_parquet = pd.read_parquet(parquet_path)

    def _dtype_name(frame, col):
        # None marks a column that is missing from the frame, so the caller
        # gets a complete report instead of a KeyError on the first gap.
        return str(frame[col].dtype) if col in frame.columns else None

    results = {}

    # Shape check
    results["shape_match"] = df_csv.shape == df_parquet.shape
    results["csv_shape"] = df_csv.shape
    results["parquet_shape"] = df_parquet.shape

    # Critical dtype checks (each dtype string is computed once per file)
    dtype_check = {}
    for col, expected in critical_cols.items():
        csv_dtype = _dtype_name(df_csv, col)
        parquet_dtype = _dtype_name(df_parquet, col)
        dtype_check[col] = {
            "csv": csv_dtype,
            "parquet": parquet_dtype,
            "expected": expected,
            "ok_csv": csv_dtype == expected,
            "ok_parquet": parquet_dtype == expected,
        }
    results["dtype_check"] = dtype_check

    return results

In [25]:
# Expected dtype (as its string name) for each column that must survive the
# CSV / Parquet round trip unchanged.
critical_columns = dict(
    [
        ("id", "int64"),
        ("name", "object"),
        ("score", "int64"),
    ]
)

In [26]:
def write_df(df: pd.DataFrame, path: str, **kwargs):
    """Save a DataFrame to ``path``, choosing the format from the extension.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to save.
    path : str
        Destination file. ``.csv`` and ``.parquet`` (case-insensitive) are
        supported. Missing parent directories are created.
    **kwargs
        Forwarded to ``DataFrame.to_csv`` / ``DataFrame.to_parquet``.
        ``index`` defaults to ``False`` but may be overridden by the caller.

    Raises
    ------
    ValueError
        If the extension is neither ``.csv`` nor ``.parquet``.
    ImportError
        If a Parquet write is requested and no Parquet engine is installed.
    """
    # os.path.dirname returns "" for a bare filename, and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    suffix = os.path.splitext(path)[1].lower()

    # Default rather than hard-code index=False, so a caller passing index=...
    # no longer triggers a duplicate-keyword TypeError.
    kwargs.setdefault("index", False)

    if suffix == ".csv":
        df.to_csv(path, **kwargs)
    elif suffix == ".parquet":
        try:
            df.to_parquet(path, **kwargs)
        except ImportError as e:
            raise ImportError(
                "Parquet support requires `pyarrow` or `fastparquet`. "
                "Install one with:\n"
                "  pip install pyarrow   OR   pip install fastparquet"
            ) from e
    else:
        raise ValueError(f"Unsupported file format: {suffix}")

    print(f"✅ Saved DataFrame to {path}")


In [27]:
def read_df(path: str, **kwargs) -> pd.DataFrame:
    """Load a DataFrame from ``path``, picking the reader from the extension.

    ``.csv`` files go through ``pd.read_csv`` and ``.parquet`` files through
    ``pd.read_parquet``; the extension match is case-insensitive and any
    ``kwargs`` are forwarded to the chosen reader.

    Raises
    ------
    ValueError
        If the extension is neither ``.csv`` nor ``.parquet``.
    ImportError
        If a Parquet read is requested and no Parquet engine is installed.
    """
    _, ext = os.path.splitext(path)
    ext = ext.lower()

    # Reject unknown formats up front, then handle the two supported ones.
    if ext not in (".csv", ".parquet"):
        raise ValueError(f"Unsupported file format: {ext}")

    if ext == ".csv":
        return pd.read_csv(path, **kwargs)

    try:
        frame = pd.read_parquet(path, **kwargs)
    except ImportError as err:
        raise ImportError(
            "Parquet support requires `pyarrow` or `fastparquet`. "
            "Install one with:\n"
            "  pip install pyarrow   OR   pip install fastparquet"
        ) from err
    return frame

In [29]:
# Save the demo frame through the format-aware helper: CSV first, then Parquet.
for target_path in (csv_path, parquet_path):
    write_df(df, target_path)

✅ Saved DataFrame to data/raw/sample.csv
✅ Saved DataFrame to data/processed/sample.parquet


In [30]:
# Compare the two saved files and check the dtypes of the critical columns.
results = validate_files(
    csv_path=csv_path,
    parquet_path=parquet_path,
    critical_cols=critical_columns,
)

In [31]:
# pprint is imported in this cell because no earlier cell brings it into
# scope; without it the call below raises NameError on a fresh kernel.
import pprint

print("\n--- Validation Results ---")
pprint.pprint(results)


--- Validation Results ---
{'csv_shape': (3, 3),
 'dtype_check': {'id': {'csv': 'int64',
                        'expected': 'int64',
                        'ok_csv': True,
                        'ok_parquet': True,
                        'parquet': 'int64'},
                 'name': {'csv': 'object',
                          'expected': 'object',
                          'ok_csv': True,
                          'ok_parquet': True,
                          'parquet': 'object'},
                 'score': {'csv': 'int64',
                           'expected': 'int64',
                           'ok_csv': True,
                           'ok_parquet': True,
                           'parquet': 'int64'}},
 'parquet_shape': (3, 3),
 'shape_match': True}
