# Homework Starter — Stage 05: Data Storage
Name: Souhil Khiat
Date: 8/20/25

Objectives:
- Env-driven paths to `data/raw/` and `data/processed/`
- Save CSV and Parquet; reload and validate
- Abstract IO with utility functions; document choices

In [7]:
import os, pathlib, datetime as dt
import pandas as pd
from dotenv import load_dotenv

# Read settings from a .env file (python-dotenv) before looking up the paths.
load_dotenv()

# Storage roots come from the environment, falling back to relative defaults.
RAW = pathlib.Path(os.environ.get('DATA_DIR_RAW', 'data/raw'))
PROC = pathlib.Path(os.environ.get('DATA_DIR_PROCESSED', 'data/processed'))

# Create both folders (and any missing parents) so the save steps have
# somewhere to write; existing folders are left untouched.
for _dir in (RAW, PROC):
    _dir.mkdir(parents=True, exist_ok=True)

# Show the absolute locations so it is obvious where files will land.
print('RAW ->', RAW.resolve())
print('PROC ->', PROC.resolve())

RAW -> /Users/souhil/bootcamp_souhil_khiat/homework/homework5/data/raw
PROC -> /Users/souhil/bootcamp_souhil_khiat/homework/homework5/data/processed


## 1) Create or Load a Sample DataFrame
You may reuse data from prior stages or create a small synthetic dataset.

In [8]:
import numpy as np
# Synthetic sample: 20 consecutive daily rows for one ticker, with the price
# modelled as a random walk (cumulative sum of standard-normal steps) around 150.
dates = pd.date_range(start='2024-01-01', periods=20, freq='D')
df = pd.DataFrame(
    {
        'date': dates,
        'ticker': ['AAPL'] * len(dates),
        'price': np.random.randn(len(dates)).cumsum() + 150,
    }
)
df.head()

Unnamed: 0,date,ticker,price
0,2024-01-01,AAPL,150.086672
1,2024-01-02,AAPL,149.302689
2,2024-01-03,AAPL,149.558998
3,2024-01-04,AAPL,150.113076
4,2024-01-05,AAPL,150.92051


## 2) Save CSV to data/raw/ and Parquet to data/processed/ (TODO)
- Use timestamped filenames.
- Handle missing Parquet engine gracefully.

In [9]:
def ts():
    """Return the current local time as a ``YYYYMMDD-HHMMSS`` string.

    Used to build timestamped filenames; the resolution is one second.
    """
    now = dt.datetime.now()
    return f"{now:%Y%m%d-%H%M%S}"

# Save the sample frame as CSV under the raw-data folder, using a
# timestamped filename so repeated runs do not overwrite each other.
csv_path = RAW / f"sample_{ts()}.csv"
df.to_csv(csv_path, index=False)

# Save the same frame as Parquet under the processed-data folder.
# Parquet needs an optional engine (pyarrow or fastparquet); pandas raises
# ImportError when none is installed, so only that case gets the
# "install an engine" hint.
pq_path = PROC / f"sample_{ts()}.parquet"
try:
    df.to_parquet(pq_path)
except ImportError:
    print('Parquet engine not available. Install pyarrow or fastparquet to complete this step.')
    pq_path = None
except Exception as e:
    # Any other failure is reported with its actual cause rather than being
    # blamed on a missing engine. Still best-effort: the cell does not crash.
    print(f'Parquet write failed: {e}')
    pq_path = None
# Last expression of the cell: displays the Parquet path (or None on failure).
pq_path

PosixPath('data/processed/sample_20250820-204417.parquet')

## 3) Reload and Validate (TODO)
- Compare shapes and key dtypes.

In [10]:
def validate_loaded(original, reloaded):
    """Run basic round-trip checks on a reloaded DataFrame.

    Args:
        original: The frame that was written out.
        reloaded: The frame read back from storage.

    Returns:
        A dict of booleans with the keys ``shape_equal`` (both frames have
        the same shape), ``date_is_datetime`` (``reloaded`` has a ``date``
        column with a datetime64 dtype) and ``price_is_numeric``
        (``reloaded`` has a ``price`` column with a numeric dtype). A missing
        column yields ``False`` for its check.
    """
    same_shape = original.shape == reloaded.shape

    present = reloaded.columns
    date_ok = 'date' in present and pd.api.types.is_datetime64_any_dtype(reloaded['date'])
    price_ok = 'price' in present and pd.api.types.is_numeric_dtype(reloaded['price'])

    return {
        'shape_equal': same_shape,
        'date_is_datetime': date_ok,
        'price_is_numeric': price_ok,
    }

# Read the CSV back in, asking pandas to parse 'date' so the column comes
# back as datetime, then compare the result against the in-memory frame.
df_csv = pd.read_csv(
    csv_path,
    parse_dates=['date'],
)
csv_checks = validate_loaded(df, df_csv)
print("Validation for CSV:", csv_checks)

Validation for CSV: {'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True}


In [11]:
# pq_path is None when the Parquet save step failed, so guard on both the
# path being set and the file actually being on disk.
if not (pq_path and pq_path.exists()):
    print("Parquet file not found, skipping validation.")
else:
    try:
        # Reload and run the same checks that were applied to the CSV copy.
        df_pq = pd.read_parquet(pq_path)
        print("Validation for Parquet:", validate_loaded(df, df_pq))
    except Exception as e:
        # Best-effort step: report the cause and carry on with the notebook.
        print(f"Parquet read failed: {e}")

Validation for Parquet: {'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True}


## 4) Utilities (TODO)
- Implement `detect_format`, `write_df`, `read_df`.
- Use suffix to route; create parent dirs if needed; friendly errors for Parquet.

In [12]:
import typing as t
import pathlib
import time 
def detect_format(path: t.Union[str, pathlib.Path]) -> str:
    """Map a file path to a storage format name based on its ending.

    The comparison is case-insensitive.

    Args:
        path: File path (string or ``Path``) to inspect.

    Returns:
        ``'csv'`` for ``.csv``; ``'parquet'`` for ``.parquet``, ``.pq`` or
        ``.parq``.

    Raises:
        ValueError: If the path ends with none of the supported suffixes.
    """
    lowered = str(path).lower()
    # Checked in order; each entry pairs the accepted endings with a format.
    suffix_table = (
        (('.csv',), 'csv'),
        (('.parquet', '.pq', '.parq'), 'parquet'),
    )
    for endings, fmt in suffix_table:
        if lowered.endswith(endings):
            return fmt
    raise ValueError('Unsupported file format: ' + str(path))

def write_df(df: pd.DataFrame, path: t.Union[str, pathlib.Path]) -> pathlib.Path:
    """Write *df* to *path*, choosing CSV or Parquet from the file suffix.

    The format is resolved before anything touches the filesystem, so an
    unsupported suffix does not leave newly created directories behind.
    Missing parent directories are created for supported formats.

    Args:
        df: Frame to persist. For CSV the index is not written.
        path: Destination file; its suffix is interpreted by
            ``detect_format``.

    Returns:
        The destination as a ``pathlib.Path``.

    Raises:
        ValueError: From ``detect_format`` when the suffix is unsupported.
        RuntimeError: If the Parquet write fails. The message says whether
            the cause was a missing engine or some other error; the original
            exception is chained as ``__cause__``.
    """
    p = pathlib.Path(path)
    # Validate the suffix first so a bad path has no filesystem side effects.
    fmt = detect_format(p)
    p.parent.mkdir(parents=True, exist_ok=True)

    if fmt == 'csv':
        df.to_csv(p, index=False)
    elif fmt == 'parquet':
        try:
            df.to_parquet(p)
        except ImportError as e:
            # pandas raises ImportError when neither pyarrow nor fastparquet
            # can be imported; only this case is an "engine" problem.
            raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
        except Exception as e:
            # Keep RuntimeError for callers, but report the real cause.
            raise RuntimeError(f'Failed to write Parquet file {p}: {e}') from e
    print(f"Successfully wrote {fmt} to {p}")
    return p

def read_df(path: t.Union[str, pathlib.Path]) -> pd.DataFrame:
    """Read a DataFrame from *path*, choosing the reader from the file suffix.

    For CSV files, a column named ``date`` (if present) is parsed as
    datetime; other columns are left to pandas' default inference.

    Args:
        path: File to read; its suffix is interpreted by ``detect_format``.

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: From ``detect_format`` when the suffix is unsupported.
        FileNotFoundError: If *path* does not exist.
        RuntimeError: If the Parquet read fails. The message says whether
            the cause was a missing engine or some other error; the original
            exception is chained as ``__cause__``.
    """
    p = pathlib.Path(path)
    fmt = detect_format(p)

    if not p.exists():
        raise FileNotFoundError(f"File not found at: {p}")

    if fmt == 'csv':
        # Read only the header row to learn the column names, then decide
        # whether there is a 'date' column to parse.
        header = pd.read_csv(p, nrows=0)
        if 'date' in header.columns:
            return pd.read_csv(p, parse_dates=['date'])
        else:
            return pd.read_csv(p)
    elif fmt == 'parquet':
        try:
            return pd.read_parquet(p)
        except ImportError as e:
            # pandas raises ImportError when neither pyarrow nor fastparquet
            # can be imported; only this case is an "engine" problem.
            raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
        except Exception as e:
            # Keep RuntimeError for callers, but report the real cause
            # (corrupt file, permissions, ...) instead of blaming the engine.
            raise RuntimeError(f'Failed to read Parquet file {p}: {e}') from e

# --- Demo Usage ---
print("\n--- Testing Utilities ---")
# ts() has one-second resolution; a short sleep keeps this demo's timestamp
# distinct from the files written by the earlier cells.
time.sleep(1)
# Take the timestamp once so the CSV/Parquet pair always share the same stamp,
# even if the clock ticks over between the two path constructions.
stamp = ts()
p_csv = RAW / f"util_{stamp}.csv"
p_pq  = PROC / f"util_{stamp}.parquet"

# Test write and read for CSV
write_df(df, p_csv)
reloaded_csv = read_df(p_csv)
print("Reloaded CSV via util, shape:", reloaded_csv.shape)
print("CSV validation via util:", validate_loaded(df, reloaded_csv))
print("-" * 20)

# Test write and read for Parquet. write_df/read_df raise RuntimeError when
# the Parquet step cannot be completed, so the demo reports that and moves on.
# (No sleep here: the path is already fixed, so waiting would change nothing.)
try:
    write_df(df, p_pq)
    reloaded_pq = read_df(p_pq)
    print("Reloaded Parquet via util, shape:", reloaded_pq.shape)
    print("Parquet validation via util:", validate_loaded(df, reloaded_pq))
except RuntimeError as e:
    print('Skipping Parquet util demo:', e)


--- Testing Utilities ---
Successfully wrote csv to data/raw/util_20250820-204418.csv
Reloaded CSV via util, shape: (20, 3)
CSV validation via util: {'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True}
--------------------
Successfully wrote parquet to data/processed/util_20250820-204418.parquet
Reloaded Parquet via util, shape: (20, 3)
Parquet validation via util: {'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True}


## 5) Documentation (TODO)
- Update README with a **Data Storage** section (folders, formats, env usage).
- Summarize validation checks and any assumptions.