# HW 5
Goal: save and load CSV and Parquet files using environment-driven paths, organize data into `data/raw/` and `data/processed/`, and document the storage choices.

Name: Dhairya Gouchwal 

Date: 19th August 2025

Objectives:
- Env-driven paths to `data/raw/` and `data/processed/`
- Save CSV and Parquet; reload and validate
- Abstract IO with utility functions; document choices

In [13]:
import os, pathlib
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import datetime as dt

In [19]:
# Step 1: read the data directories from the environment (.env one level up).
dotenv_path = "../.env"
load_dotenv(dotenv_path)
# Fall back to the conventional layout when the .env file is missing or a key is
# unset, so the settings are never None.
DATA_DIR_RAW = os.getenv("DATA_DIR_RAW", "data/raw")
DATA_DIR_PROCESSED = os.getenv("DATA_DIR_PROCESSED", "data/processed")
print(DATA_DIR_RAW, ",", DATA_DIR_PROCESSED)


data/raw , data/processed


In [33]:
pip install fastparquet

Note: you may need to restart the kernel to use updated packages.


In [41]:
# Small demo table used for the CSV / Parquet round-trip.
df = pd.DataFrame({
    "Name": ["A", "B", "C"],
    "Roll No.": [1, 2, 3],
    "Class Taken": ["Math", "English", "Science"],
})

# Build the raw-data location from the DATA_DIR_RAW setting instead of a
# machine-specific absolute path, so the notebook runs on any checkout.
# NOTE(review): assumes the data directories sit next to the .env file (one level
# above the notebook's working directory) - confirm against the repo layout.
raw_dir = os.path.join(os.path.dirname(dotenv_path), DATA_DIR_RAW)
os.makedirs(raw_dir, exist_ok=True)  # a fresh clone may not have the folder yet
csv_path = os.path.join(raw_dir, "sample.csv")
df.to_csv(csv_path, index=False)



In [42]:
# Build the processed-data location from the DATA_DIR_PROCESSED setting instead of a
# machine-specific absolute path.
# NOTE(review): assumes the data directories sit next to the .env file (one level
# above the notebook's working directory) - confirm against the repo layout.
processed_dir = os.path.join(os.path.dirname(dotenv_path), DATA_DIR_PROCESSED)
os.makedirs(processed_dir, exist_ok=True)  # a fresh clone may not have the folder yet
parquet_path = os.path.join(processed_dir, "sample.parquet")
df.to_parquet(parquet_path, engine="fastparquet", index=False)

In [44]:
# Step 2 Validate and reload

# Read both files back from disk so the round-trip can be checked.
df_csv = pd.read_csv(csv_path)
df_parquet = pd.read_parquet(parquet_path, index=False, engine="fastparquet")

print(f"CSV shape: {df_csv.shape}")
print(f"Parquet shape: {df_parquet.shape}")

def validate_df(df1, df2, key_cols):
    """Compare two DataFrames on overall shape and on the dtypes of selected columns.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        The frames to compare.
    key_cols : iterable of column labels
        Columns whose dtypes are compared; each must exist in both frames.

    Returns
    -------
    dict
        ``{"same_shape": bool, "same_dtypes": bool}``.
    """
    shapes_match = df1.shape == df2.shape

    # Stop at the first mismatching column; later columns are not inspected.
    dtypes_match = True
    for col in key_cols:
        if not (df1[col].dtype == df2[col].dtype):
            dtypes_match = False
            break

    return {"same_shape": shapes_match, "same_dtypes": dtypes_match}

# Check the CSV and Parquet reloads against each other on two key columns.
key_columns = ["Roll No.", "Class Taken"]
results = validate_df(df_csv, df_parquet, key_cols=key_columns)
print(results)

CSV shape: (3, 3)
Parquet shape: (3, 3)
{'same_shape': True, 'same_dtypes': True}


In [51]:
#Step 3 Utilities
def write_df(df, path, **kwargs):
    """Write a DataFrame to ``path``, choosing the format from the file extension.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    path : str or os.PathLike
        Destination ending in ``.csv`` or ``.parquet`` (matched case-insensitively).
        Missing parent directories are created.
    **kwargs
        Forwarded to ``DataFrame.to_csv`` / ``DataFrame.to_parquet``. ``index``
        defaults to ``False`` but can be overridden by the caller.

    Raises
    ------
    ValueError
        If the extension is neither ``.csv`` nor ``.parquet``.
    RuntimeError
        If writing Parquet fails with ``ImportError`` (no Parquet engine available).
    """
    path = os.fspath(path)  # accept pathlib.Path as well as plain strings
    lowered = path.lower()
    is_csv = lowered.endswith(".csv")
    is_parquet = lowered.endswith(".parquet")
    if not (is_csv or is_parquet):
        # Reject before touching the filesystem so no stray directories are created.
        raise ValueError("Unsupported file type. Use .csv or .parquet")

    parent = os.path.dirname(path)
    if parent:
        # A bare filename has an empty dirname, and os.makedirs("") would raise.
        os.makedirs(parent, exist_ok=True)

    # Default rather than hard-code, so an explicit index= does not collide.
    kwargs.setdefault("index", False)

    if is_csv:
        df.to_csv(path, **kwargs)
    else:
        try:
            df.to_parquet(path, **kwargs)
        except ImportError as exc:
            raise RuntimeError(
                "Missing Parquet engine. Install pyarrow or fastparquet."
            ) from exc

def read_df(path, **kwargs):
    """Load a DataFrame from ``path``, choosing the reader from the file extension.

    Parameters
    ----------
    path : str or os.PathLike
        Source ending in ``.csv`` or ``.parquet`` (matched case-insensitively).
    **kwargs
        Forwarded to ``pandas.read_csv`` / ``pandas.read_parquet``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the extension is neither ``.csv`` nor ``.parquet``.
    """
    path = os.fspath(path)  # accept pathlib.Path as well as plain strings
    lowered = path.lower()
    if lowered.endswith(".csv"):
        return pd.read_csv(path, **kwargs)
    if lowered.endswith(".parquet"):
        return pd.read_parquet(path, **kwargs)
    raise ValueError("Unsupported file type. Use .csv or .parquet")

In [53]:
# Demo - using utilities
# Round-trip the sample frame through both formats using the helper functions.
parquet_options = {"engine": "fastparquet"}

write_df(df, csv_path)
write_df(df, parquet_path, **parquet_options)

df_csv = read_df(csv_path)
df_parquet = read_df(parquet_path, **parquet_options)

print(f"CSV shape: {df_csv.shape}")
print(f"Parquet shape: {df_parquet.shape}")

CSV shape: (3, 3)
Parquet shape: (3, 3)
