# NYC Trip Data - Data Validation (Raw vs Cleaned)

This notebook compares RAW vs CLEANED parquet files and is now resilient to timestamp overflow issues.

It will:
- Load raw and cleaned parquet files
- Safely handle problematic timestamps
- Compute per-file summary stats
- Save JSON + CSV summaries

Run cells from top to bottom.


In [None]:
import pandas as pd
from pathlib import Path
import json
import csv

RAW_DATA_DIR = "/Users/vidushi/Documents/bubu/Raw_data/raw_data"
CLEANED_DATA_DIR = "/Users/vidushi/Documents/bubu/cleaned_data"

print("RAW_DATA_DIR:", RAW_DATA_DIR)
print("CLEANED_DATA_DIR:", CLEANED_DATA_DIR)


In [None]:
# ---- Helper functions (with safe parquet reader) ----

def safe_read_parquet(path: Path) -> pd.DataFrame:
    """Read parquet file with timestamp overflow handling.

    This mirrors the safe_read_parquet used in analyze_and_clean.ipynb.
    """
    import pyarrow.parquet as pq

    try:
        # Fast path: normal pandas reader
        return pd.read_parquet(path)
    except Exception as e:
        error_type = type(e).__name__
        try:
            msg = str(e)
        except Exception:
            msg = ""

        is_ts_error = (
            'ArrowInvalid' in error_type or
            'out of bounds' in msg.lower() or
            'timestamp' in msg.lower() or
            'casting' in msg.lower()
        )

        if not is_ts_error:
            # Not a timestamp problem, re-raise
            raise

        print(f"    ‚ö†Ô∏è  Timestamp issue for {path.name}, using safe mode...")
        table = pq.read_table(path)
        df = table.to_pandas(timestamp_as_object=True)

        # Try to convert obvious datetime-like columns back to datetime
        for col in df.columns:
            if df[col].dtype == 'object':
                if any(x in col.lower() for x in ['time', 'datetime', 'date']):
                    try:
                        df[col] = pd.to_datetime(df[col], errors='coerce')
                    except Exception:
                        pass
        return df


def get_parquet_files(directory: str):
    """Return dict[file_name] = Path for parquet files in a directory."""
    base = Path(directory)
    return {p.name: p for p in base.glob("*.parquet")}


def compute_basic_stats(df: pd.DataFrame):
    """Basic stats for a dataframe."""
    return {
        "rows": int(len(df)),
        "cols": int(len(df.columns)),
        "total_nulls": int(df.isnull().sum().sum()),
    }


def compute_column_nulls(df: pd.DataFrame):
    """Per-column null counts and percentages."""
    total_rows = len(df) or 1
    result = {}
    for col in df.columns:
        null_count = int(df[col].isnull().sum())
        result[col] = {
            "null_count": null_count,
            "null_pct": round(null_count / total_rows * 100, 2),
            "dtype": str(df[col].dtype),
        }
    return result


def validate_one_file(raw_path: Path, cleaned_path: Path):
    """Compare one raw vs cleaned file, using safe parquet reader."""
    raw_df = safe_read_parquet(raw_path)
    clean_df = safe_read_parquet(cleaned_path)

    raw_stats = compute_basic_stats(raw_df)
    clean_stats = compute_basic_stats(clean_df)

    raw_nulls = compute_column_nulls(raw_df)
    clean_nulls = compute_column_nulls(clean_df)

    all_cols = sorted(set(raw_df.columns) | set(clean_df.columns))

    per_column_diff = {}
    for col in all_cols:
        r = raw_nulls.get(col)
        c = clean_nulls.get(col)
        per_column_diff[col] = {
            "raw_null_count": r["null_count"] if r else None,
            "raw_null_pct": r["null_pct"] if r else None,
            "clean_null_count": c["null_count"] if c else None,
            "clean_null_pct": c["null_pct"] if c else None,
            "raw_dtype": r["dtype"] if r else None,
            "clean_dtype": c["dtype"] if c else None,
        }

    summary = {
        "file_name": raw_path.name,
        "raw_rows": raw_stats["rows"],
        "clean_rows": clean_stats["rows"],
        "rows_diff": raw_stats["rows"] - clean_stats["rows"],
        "raw_cols": raw_stats["cols"],
        "clean_cols": clean_stats["cols"],
        "cols_diff": raw_stats["cols"] - clean_stats["cols"],
        "raw_total_nulls": raw_stats["total_nulls"],
        "clean_total_nulls": clean_stats["total_nulls"],
        "total_nulls_reduced": raw_stats["total_nulls"] - clean_stats["total_nulls"],
    }

    return summary, per_column_diff

print("‚úÖ Validation helper functions loaded (with safe parquet reader).")


In [None]:
# ---- Run validation across all common files ----

raw_files = get_parquet_files(RAW_DATA_DIR)
clean_files = get_parquet_files(CLEANED_DATA_DIR)

common_names = sorted(set(raw_files.keys()) & set(clean_files.keys()))

print("=" * 100)
print("üîé DATA VALIDATION: RAW vs CLEANED")
print(f"Raw dir    : {RAW_DATA_DIR}")
print(f"Cleaned dir: {CLEANED_DATA_DIR}")
print(f"Common files: {len(common_names)}")
print("=" * 100)

per_file_summary = []
per_file_columns = {}

for i, name in enumerate(common_names, 1):
    print(f"\n[{i}/{len(common_names)}] {name}")
    raw_path = raw_files[name]
    clean_path = clean_files[name]

    try:
        summary, per_col = validate_one_file(raw_path, clean_path)
        per_file_summary.append(summary)
        per_file_columns[name] = per_col

        print(
            f"  Rows: raw={summary['raw_rows']:,}  clean={summary['clean_rows']:,} "
            f"(Œî {summary['rows_diff']:,})"
        )
        print(
            f"  Cols: raw={summary['raw_cols']}  clean={summary['clean_cols']} "
            f"(Œî {summary['cols_diff']})"
        )
        print(
            f"  Total nulls: raw={summary['raw_total_nulls']:,}  "
            f"clean={summary['clean_total_nulls']:,} "
            f"(reduced by {summary['total_nulls_reduced']:,})"
        )
    except Exception as e:
        print(f"  ‚ùå ERROR validating {name}: {e}")

# Save results to JSON + CSV in the current directory
out_json = Path("validation_results.json")
out_json.write_text(
    json.dumps(
        {
            "per_file_summary": per_file_summary,
            "per_file_columns": per_file_columns,
        },
        indent=2,
    )
)

out_csv = Path("validation_summary_for_excel.csv")
if per_file_summary:
    fieldnames = list(per_file_summary[0].keys())
    with out_csv.open("w", newline="") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        w.writerows(per_file_summary)

print("\n" + "=" * 100)
print("üíæ VALIDATION RESULTS SAVED")
print(f"- JSON: {out_json}")
print(f"- CSV : {out_csv}")
print("=" * 100)


# NYC Trip Data - Data Validation (Raw vs Cleaned)

This notebook mirrors the logic of `data_validation.py`.

It will:
- Compare RAW vs CLEANED parquet files
- Compute per-file and per-column stats
- Save JSON + CSV summaries for further analysis

Run cells from top to bottom.


In [1]:
import pandas as pd
from pathlib import Path
import json
import csv

RAW_DATA_DIR = "/Users/vidushi/Documents/bubu/Raw_data/raw_data"
CLEANED_DATA_DIR = "/Users/vidushi/Documents/bubu/cleaned_data"

print("RAW_DATA_DIR:", RAW_DATA_DIR)
print("CLEANED_DATA_DIR:", CLEANED_DATA_DIR)


RAW_DATA_DIR: /Users/vidushi/Documents/bubu/Raw_data/raw_data
CLEANED_DATA_DIR: /Users/vidushi/Documents/bubu/cleaned_data


In [2]:
# ---- Helper functions from data_validation.py ----

def get_parquet_files(directory: str):
    """Return dict[file_name] = Path for parquet files in a directory."""
    base = Path(directory)
    return {p.name: p for p in base.glob("*.parquet")}


def compute_basic_stats(df: pd.DataFrame):
    """Basic stats for a dataframe."""
    return {
        "rows": int(len(df)),
        "cols": int(len(df.columns)),
        "total_nulls": int(df.isnull().sum().sum()),
    }


def compute_column_nulls(df: pd.DataFrame):
    """Per-column null counts and percentages."""
    total_rows = len(df) or 1
    result = {}
    for col in df.columns:
        null_count = int(df[col].isnull().sum())
        result[col] = {
            "null_count": null_count,
            "null_pct": round(null_count / total_rows * 100, 2),
            "dtype": str(df[col].dtype),
        }
    return result


def validate_one_file(raw_path: Path, cleaned_path: Path):
    """Compare one raw vs cleaned file."""
    raw_df = pd.read_parquet(raw_path)
    clean_df = pd.read_parquet(cleaned_path)

    raw_stats = compute_basic_stats(raw_df)
    clean_stats = compute_basic_stats(clean_df)

    raw_nulls = compute_column_nulls(raw_df)
    clean_nulls = compute_column_nulls(clean_df)

    all_cols = sorted(set(raw_df.columns) | set(clean_df.columns))

    per_column_diff = {}
    for col in all_cols:
        r = raw_nulls.get(col)
        c = clean_nulls.get(col)
        per_column_diff[col] = {
            "raw_null_count": r["null_count"] if r else None,
            "raw_null_pct": r["null_pct"] if r else None,
            "clean_null_count": c["null_count"] if c else None,
            "clean_null_pct": c["null_pct"] if c else None,
            "raw_dtype": r["dtype"] if r else None,
            "clean_dtype": c["dtype"] if c else None,
        }

    summary = {
        "file_name": raw_path.name,
        "raw_rows": raw_stats["rows"],
        "clean_rows": clean_stats["rows"],
        "rows_diff": raw_stats["rows"] - clean_stats["rows"],
        "raw_cols": raw_stats["cols"],
        "clean_cols": clean_stats["cols"],
        "cols_diff": raw_stats["cols"] - clean_stats["cols"],
        "raw_total_nulls": raw_stats["total_nulls"],
        "clean_total_nulls": clean_stats["total_nulls"],
        "total_nulls_reduced": raw_stats["total_nulls"] - clean_stats["total_nulls"],
    }

    return summary, per_column_diff

print("‚úÖ Validation helper functions loaded.")


‚úÖ Validation helper functions loaded.


In [None]:
# ---- Run validation across all common files ----

raw_files = get_parquet_files(RAW_DATA_DIR)
clean_files = get_parquet_files(CLEANED_DATA_DIR)

common_names = sorted(set(raw_files.keys()) & set(clean_files.keys()))

print("=" * 100)
print("üîé DATA VALIDATION: RAW vs CLEANED")
print(f"Raw dir    : {RAW_DATA_DIR}")
print(f"Cleaned dir: {CLEANED_DATA_DIR}")
print(f"Common files: {len(common_names)}")
print("=" * 100)

per_file_summary = []
per_file_columns = {}

for i, name in enumerate(common_names, 1):
    print(f"\n[{i}/{len(common_names)}] {name}")
    raw_path = raw_files[name]
    clean_path = clean_files[name]

    try:
        summary, per_col = validate_one_file(raw_path, clean_path)
        per_file_summary.append(summary)
        per_file_columns[name] = per_col

        print(
            f"  Rows: raw={summary['raw_rows']:,}  clean={summary['clean_rows']:,} "
            f"(Œî {summary['rows_diff']:,})"
        )
        print(
            f"  Cols: raw={summary['raw_cols']}  clean={summary['clean_cols']} "
            f"(Œî {summary['cols_diff']})"
        )
        print(
            f"  Total nulls: raw={summary['raw_total_nulls']:,}  "
            f"clean={summary['clean_total_nulls']:,} "
            f"(reduced by {summary['total_nulls_reduced']:,})"
        )
    except Exception as e:
        print(f"  ‚ùå ERROR validating {name}: {e}")

# Save results to JSON + CSV in the current directory
out_json = Path("validation_results.json")
out_json.write_text(
    json.dumps(
        {
            "per_file_summary": per_file_summary,
            "per_file_columns": per_file_columns,
        },
        indent=2,
    )
)

out_csv = Path("validation_summary_for_excel.csv")
if per_file_summary:
    fieldnames = list(per_file_summary[0].keys())
    with out_csv.open("w", newline="") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        w.writerows(per_file_summary)

print("\n" + "=" * 100)
print("üíæ VALIDATION RESULTS SAVED")
print(f"- JSON: {out_json}")
print(f"- CSV : {out_csv}")
print("=" * 100)


üîé DATA VALIDATION: RAW vs CLEANED
Raw dir    : /Users/vidushi/Documents/bubu/Raw_data/raw_data
Cleaned dir: /Users/vidushi/Documents/bubu/cleaned_data
Common files: 100

[1/100] fhv_tripdata_2015-02.parquet
  Rows: raw=3,053,183  clean=3,053,183 (Œî 0)
  Cols: raw=7  clean=6 (Œî 1)
  Total nulls: raw=6,577,667  clean=0 (reduced by 6,577,667)

[2/100] fhv_tripdata_2015-12.parquet
  Rows: raw=8,888,809  clean=8,888,809 (Œî 0)
  Cols: raw=7  clean=6 (Œî 1)
  Total nulls: raw=20,515,852  clean=0 (reduced by 20,515,852)

[3/100] fhv_tripdata_2019-02.parquet
  ‚ùå ERROR validating fhv_tripdata_2019-02.parquet: Casting from timestamp[us] to timestamp[ns] would result in out of bounds timestamp: 33106123800000000

[4/100] fhv_tripdata_2019-12.parquet
  Rows: raw=2,044,196  clean=2,044,196 (Œî 0)
  Cols: raw=7  clean=6 (Œî 1)
  Total nulls: raw=2,113,294  clean=13,805 (reduced by 2,099,489)

[5/100] fhv_tripdata_2024-01.parquet
  Rows: raw=1,290,116  clean=1,290,116 (Œî 0)
  Cols: raw=7  cle