# Clean vs Raw Summary

This notebook compares every parquet file that exists in both the RAW and the CLEANED directory (matched by file name) and produces a compact tabular summary with the following columns:

File, Raw Rows, Clean Rows, Raw - Clean, cleaned %, removed %, Raw Cols, Clean Cols, Dropped Cols, Added Cols, Major Null Reduction (True when the total null count drops by at least 50% from raw to clean).

Run cells from top to bottom.


In [1]:
import os
from pathlib import Path

import pandas as pd

# Data locations. The defaults are the original author's local folders; set the
# RAW_DATA_DIR / CLEANED_DATA_DIR environment variables to point the notebook at
# another copy of the data without editing this cell. Both values stay plain
# strings, which is what get_parquet_files() is annotated to receive.
RAW_DATA_DIR = os.environ.get(
    "RAW_DATA_DIR", "/Users/vidushi/Documents/bubu/Raw_data/raw_data"
)
CLEANED_DATA_DIR = os.environ.get(
    "CLEANED_DATA_DIR", "/Users/vidushi/Documents/bubu/cleaned_data"
)

# Echo the effective locations so the rendered notebook records what was compared.
print("RAW_DATA_DIR:", RAW_DATA_DIR)
print("CLEANED_DATA_DIR:", CLEANED_DATA_DIR)


RAW_DATA_DIR: /Users/vidushi/Documents/bubu/Raw_data/raw_data
CLEANED_DATA_DIR: /Users/vidushi/Documents/bubu/cleaned_data


In [2]:
# Helper functions (with safe parquet reader) and file listing

import pyarrow.parquet as pq


def safe_read_parquet(path: Path) -> pd.DataFrame:
    """Read a parquet file, falling back to a timestamp-tolerant reader on overflow errors.

    The fast path is plain ``pd.read_parquet``. If it raises an error that looks
    timestamp-related (the exception class name contains ``ArrowInvalid``, or the
    message mentions "out of bounds", "timestamp" or "casting"), the file is read
    again through ``pq.read_table`` and converted with ``timestamp_as_object=True``.
    Object-dtype columns whose name contains "time" or "date" are then converted
    back with ``pd.to_datetime(..., errors='coerce')``.

    Note: because of ``errors='coerce'``, values that cannot be converted become
    ``NaT`` and are therefore counted as nulls by any later null statistics.

    Args:
        path: Location of the parquet file. A ``Path`` is expected; a plain string
            is accepted as well and is handed to the readers unchanged.

    Returns:
        The file contents as a DataFrame.

    Raises:
        Exception: Any fast-path error that is not classified as a timestamp
            problem is re-raised unchanged. Errors raised by the fallback reader
            itself propagate to the caller.
    """
    try:
        # Fast path: normal pandas reader
        return pd.read_parquet(path)
    except Exception as e:
        error_type = type(e).__name__
        try:
            # Lower-case once; every marker below is compared case-insensitively.
            msg = str(e).lower()
        except Exception:
            msg = ""

        is_ts_error = (
            'ArrowInvalid' in error_type
            or any(marker in msg for marker in ('out of bounds', 'timestamp', 'casting'))
        )

        if not is_ts_error:
            # Not a timestamp problem, re-raise
            raise

        # Path(...) only for display, so a str path does not crash this branch
        # with an AttributeError that would hide the real read problem.
        print(f"    ⚠️  Timestamp issue for {Path(path).name}, using safe mode...")
        table = pq.read_table(path)
        df = table.to_pandas(timestamp_as_object=True)

        # Try to convert obvious datetime-like columns back to datetime.
        # 'time' and 'date' also cover any name containing 'datetime'.
        for col in df.columns:
            label = str(col).lower()  # column labels are not guaranteed to be str
            if df[col].dtype == 'object' and ('time' in label or 'date' in label):
                try:
                    df[col] = pd.to_datetime(df[col], errors='coerce')
                except Exception:
                    # Best effort: leave the column as objects if conversion fails.
                    continue
        return df


def get_parquet_files(directory: str):
    """Index the ``*.parquet`` entries directly inside ``directory`` by file name.

    The glob pattern is not recursive, so sub-directories are not searched.

    Args:
        directory: Folder to scan.

    Returns:
        A dict mapping each matching entry's file name to its ``Path``.
    """
    found = {}
    for entry in Path(directory).glob("*.parquet"):
        found[entry.name] = entry
    return found


# Index both directories by file name.
raw_files = get_parquet_files(RAW_DATA_DIR)
clean_files = get_parquet_files(CLEANED_DATA_DIR)

# Only names found on both sides can be compared; dict key views support set
# intersection directly. Sorting keeps the processing order stable between runs.
common_names = sorted(raw_files.keys() & clean_files.keys())
print(f"Found {len(common_names)} files present in both RAW and CLEANED.")


Found 100 files present in both RAW and CLEANED.


In [3]:
# Build compact summary table
#
# For every file name present in both directories, load the raw and the cleaned
# frame, print a per-file log and append one record to `summary_rows`. The records
# are turned into `summary_df` once, after the loop.

# A drop in total null count of at least this many percent is flagged as "major".
MAJOR_NULL_REDUCTION_PCT = 50

summary_rows = []
# (file name, error text) for every pair that could not be compared, so failures
# are reported together at the end instead of being buried in the per-file log.
failed_files = []

print("\n================ CLEAN vs RAW SUMMARY ================")
print(f"Total common files: {len(common_names)}")

for idx, name in enumerate(common_names, 1):
    raw_path = raw_files[name]
    clean_path = clean_files[name]

    print(f"\n[{idx}/{len(common_names)}] {name}")
    print(f"  Raw path   : {raw_path}")
    print(f"  Clean path : {clean_path}")

    # Release the previous iteration's frames before loading the next pair, so
    # at most one raw/clean pair is held in memory at a time.
    df_raw = df_clean = None

    try:
        df_raw = safe_read_parquet(raw_path)
        df_clean = safe_read_parquet(clean_path)

        raw_rows = len(df_raw)
        clean_rows = len(df_clean)
        raw_cols = len(df_raw.columns)
        clean_cols = len(df_clean.columns)

        print(f"  Raw rows   : {raw_rows:,}, cols: {raw_cols}")
        print(f"  Clean rows : {clean_rows:,}, cols: {clean_cols}")

        # Row stats (percentages are relative to the raw row count; an empty raw
        # file reports 0 for both to avoid dividing by zero)
        diff_rows = raw_rows - clean_rows
        cleaned_pct = (clean_rows / raw_rows * 100) if raw_rows > 0 else 0
        removed_pct = (diff_rows / raw_rows * 100) if raw_rows > 0 else 0

        print(f"  Rows removed: {diff_rows:,} ({removed_pct:.2f}% removed, {cleaned_pct:.2f}% kept)")

        # Column stats
        raw_cols_set = set(df_raw.columns)
        clean_cols_set = set(df_clean.columns)
        dropped_cols = sorted(raw_cols_set - clean_cols_set)
        added_cols = sorted(clean_cols_set - raw_cols_set)

        if dropped_cols:
            print(f"  Dropped cols: {', '.join(dropped_cols)}")
        if added_cols:
            print(f"  Added cols  : {', '.join(added_cols)}")

        # Major null reduction flag: True if the total null count fell by at least
        # MAJOR_NULL_REDUCTION_PCT percent. Totals cover every cell of each frame,
        # so nulls that disappear with a dropped column or removed rows count as
        # a reduction too. With no raw nulls the reduction is reported as 0.
        raw_nulls = df_raw.isnull().sum().sum()
        clean_nulls = df_clean.isnull().sum().sum()
        major_null_reduction = False
        reduction = 0.0
        if raw_nulls > 0:
            reduction = (raw_nulls - clean_nulls) / raw_nulls * 100
            major_null_reduction = reduction >= MAJOR_NULL_REDUCTION_PCT

        print(f"  Nulls raw   : {raw_nulls:,}")
        print(f"  Nulls clean : {clean_nulls:,}")
        print(f"  Null reduction: {reduction:.2f}%  (major={major_null_reduction})")

        summary_rows.append({
            "File": name,
            "Raw Rows": raw_rows,
            "Clean Rows": clean_rows,
            "Raw - Clean": diff_rows,
            "cleaned %": round(cleaned_pct, 2),
            "removed %": round(removed_pct, 2),
            "Raw Cols": raw_cols,
            "Clean Cols": clean_cols,
            # join() on an empty list already yields "", no conditional needed
            "Dropped Cols": ", ".join(dropped_cols),
            "Added Cols": ", ".join(added_cols),
            "Major Null Reduction": major_null_reduction,
        })

    except Exception as e:
        # Best effort: one unreadable pair must not abort the whole run, but it is
        # remembered so its absence from the summary is visible afterwards.
        print(f"  ❌ ERROR comparing {name}: {e}")
        failed_files.append((name, f"{type(e).__name__}: {e}"))

summary_df = pd.DataFrame(summary_rows)

if failed_files:
    print(f"\n⚠️  {len(failed_files)} file(s) could not be compared and are missing from the summary:")
    for failed_name, reason in failed_files:
        print(f"  - {failed_name}: {reason}")

print("\nSummary (first 10 files):")
display(summary_df.head(10))



Total common files: 100

[1/100] fhv_tripdata_2015-02.parquet
  Raw path   : /Users/vidushi/Documents/bubu/Raw_data/raw_data/fhv_tripdata_2015-02.parquet
  Clean path : /Users/vidushi/Documents/bubu/cleaned_data/fhv_tripdata_2015-02.parquet
  Raw rows   : 3,053,183, cols: 7
  Clean rows : 3,053,183, cols: 6
  Rows removed: 0 (0.00% removed, 100.00% kept)
  Dropped cols: SR_Flag
  Nulls raw   : 6,577,667
  Nulls clean : 0
  Null reduction: 100.00%  (major=True)

[2/100] fhv_tripdata_2015-12.parquet
  Raw path   : /Users/vidushi/Documents/bubu/Raw_data/raw_data/fhv_tripdata_2015-12.parquet
  Clean path : /Users/vidushi/Documents/bubu/cleaned_data/fhv_tripdata_2015-12.parquet
  Raw rows   : 8,888,809, cols: 7
  Clean rows : 8,888,809, cols: 6
  Rows removed: 0 (0.00% removed, 100.00% kept)
  Dropped cols: SR_Flag
  Nulls raw   : 20,515,852
  Nulls clean : 0
  Null reduction: 100.00%  (major=True)

[3/100] fhv_tripdata_2019-02.parquet
  Raw path   : /Users/vidushi/Documents/bubu/Raw_data/

  Nulls raw   : 4,496,235
  Nulls clean : 0
  Null reduction: 100.00%  (major=True)

[20/100] fhv_tripdata_2025-04.parquet
  Raw path   : /Users/vidushi/Documents/bubu/Raw_data/raw_data/fhv_tripdata_2025-04.parquet
  Clean path : /Users/vidushi/Documents/bubu/cleaned_data/fhv_tripdata_2025-04.parquet
  Raw rows   : 1,699,478, cols: 7
  Clean rows : 1,699,478, cols: 6
  Rows removed: 0 (0.00% removed, 100.00% kept)
  Dropped cols: SR_Flag
  Nulls raw   : 3,311,979
  Nulls clean : 0
  Null reduction: 100.00%  (major=True)

[21/100] fhv_tripdata_2025-05.parquet
  Raw path   : /Users/vidushi/Documents/bubu/Raw_data/raw_data/fhv_tripdata_2025-05.parquet
  Clean path : /Users/vidushi/Documents/bubu/cleaned_data/fhv_tripdata_2025-05.parquet
  Raw rows   : 2,210,721, cols: 7
  Clean rows : 2,210,721, cols: 6
  Rows removed: 0 (0.00% removed, 100.00% kept)
  Dropped cols: SR_Flag
  Nulls raw   : 4,358,470
  Nulls clean : 0
  Null reduction: 100.00%  (major=True)

[22/100] fhv_tripdata_2025-06.p

  Raw rows   : 21,068,851, cols: 24
  Clean rows : 21,068,851, cols: 24
  Rows removed: 0 (0.00% removed, 100.00% kept)
  Nulls raw   : 10,243,285
  Nulls clean : 10,243,285
  Null reduction: 0.00%  (major=False)

[39/100] fhvhv_tripdata_2025-01.parquet
  Raw path   : /Users/vidushi/Documents/bubu/Raw_data/raw_data/fhvhv_tripdata_2025-01.parquet
  Clean path : /Users/vidushi/Documents/bubu/cleaned_data/fhvhv_tripdata_2025-01.parquet
  Raw rows   : 20,405,666, cols: 25
  Clean rows : 20,405,666, cols: 25
  Rows removed: 0 (0.00% removed, 100.00% kept)
  Nulls raw   : 10,070,270
  Nulls clean : 10,070,270
  Null reduction: 0.00%  (major=False)

[40/100] fhvhv_tripdata_2025-02.parquet
  Raw path   : /Users/vidushi/Documents/bubu/Raw_data/raw_data/fhvhv_tripdata_2025-02.parquet
  Clean path : /Users/vidushi/Documents/bubu/cleaned_data/fhvhv_tripdata_2025-02.parquet
  Raw rows   : 19,339,461, cols: 25
  Clean rows : 19,339,461, cols: 25
  Rows removed: 0 (0.00% removed, 100.00% kept)
  Null

  Nulls raw   : 63,721
  Nulls clean : 0
  Null reduction: 100.00%  (major=True)

[67/100] green_tripdata_2025-03.parquet
  Raw path   : /Users/vidushi/Documents/bubu/Raw_data/raw_data/green_tripdata_2025-03.parquet
  Clean path : /Users/vidushi/Documents/bubu/cleaned_data/green_tripdata_2025-03.parquet
  Raw rows   : 51,539, cols: 21
  Clean rows : 50,756, cols: 20
  Rows removed: 783 (1.52% removed, 98.48% kept)
  Dropped cols: ehail_fee
  Nulls raw   : 72,840
  Nulls clean : 0
  Null reduction: 100.00%  (major=True)

[68/100] green_tripdata_2025-04.parquet
  Raw path   : /Users/vidushi/Documents/bubu/Raw_data/raw_data/green_tripdata_2025-04.parquet
  Clean path : /Users/vidushi/Documents/bubu/cleaned_data/green_tripdata_2025-04.parquet
  Raw rows   : 52,132, cols: 21
  Clean rows : 51,322, cols: 20
  Rows removed: 810 (1.55% removed, 98.45% kept)
  Dropped cols: ehail_fee
  Nulls raw   : 71,267
  Nulls clean : 0
  Null reduction: 100.00%  (major=True)

[69/100] green_tripdata_2025-0

  Nulls raw   : 2,042,880
  Nulls clean : 0
  Null reduction: 100.00%  (major=True)

[85/100] yellow_tripdata_2024-05.parquet
  Raw path   : /Users/vidushi/Documents/bubu/Raw_data/raw_data/yellow_tripdata_2024-05.parquet
  Clean path : /Users/vidushi/Documents/bubu/cleaned_data/yellow_tripdata_2024-05.parquet
  Raw rows   : 3,723,833, cols: 19
  Clean rows : 3,623,970, cols: 19
  Rows removed: 99,863 (2.68% removed, 97.32% kept)
  Nulls raw   : 2,023,330
  Nulls clean : 0
  Null reduction: 100.00%  (major=True)

[86/100] yellow_tripdata_2024-06.parquet
  Raw path   : /Users/vidushi/Documents/bubu/Raw_data/raw_data/yellow_tripdata_2024-06.parquet
  Clean path : /Users/vidushi/Documents/bubu/cleaned_data/yellow_tripdata_2024-06.parquet
  Raw rows   : 3,539,193, cols: 19
  Clean rows : 3,441,506, cols: 19
  Rows removed: 97,687 (2.76% removed, 97.24% kept)
  Nulls raw   : 2,053,905
  Nulls clean : 0
  Null reduction: 100.00%  (major=True)

[87/100] yellow_tripdata_2024-07.parquet
  Raw pa

Unnamed: 0,File,Raw Rows,Clean Rows,Raw - Clean,cleaned %,removed %,Raw Cols,Clean Cols,Dropped Cols,Added Cols,Major Null Reduction
0,fhv_tripdata_2015-02.parquet,3053183,3053183,0,100.0,0.0,7,6,SR_Flag,,True
1,fhv_tripdata_2015-12.parquet,8888809,8888809,0,100.0,0.0,7,6,SR_Flag,,True
2,fhv_tripdata_2019-02.parquet,1707650,1707650,0,100.0,0.0,7,7,,,False
3,fhv_tripdata_2019-12.parquet,2044196,2044196,0,100.0,0.0,7,6,SR_Flag,,True
4,fhv_tripdata_2024-01.parquet,1290116,1290116,0,100.0,0.0,7,6,SR_Flag,,True
5,fhv_tripdata_2024-02.parquet,1176093,1176093,0,100.0,0.0,7,6,SR_Flag,,True
6,fhv_tripdata_2024-03.parquet,1469352,1469352,0,100.0,0.0,7,6,SR_Flag,,True
7,fhv_tripdata_2024-04.parquet,1444626,1444626,0,100.0,0.0,7,6,SR_Flag,,True
8,fhv_tripdata_2024-05.parquet,1352502,1352502,0,100.0,0.0,7,6,SR_Flag,,True
9,fhv_tripdata_2024-06.parquet,1386539,1386539,0,100.0,0.0,7,6,SR_Flag,,True


In [4]:
# Persist the complete summary next to the notebook as CSV (without the index
# column) so it can be opened in Excel or loaded by later analyses.

output_path = Path("clean_vs_raw_summary.csv")
summary_df.to_csv(path_or_buf=output_path, index=False)

print("\nSaved compact summary to:", output_path)

# Bare last expression: rendered by the notebook as the cell's rich output.
summary_df



Saved compact summary to: clean_vs_raw_summary.csv


Unnamed: 0,File,Raw Rows,Clean Rows,Raw - Clean,cleaned %,removed %,Raw Cols,Clean Cols,Dropped Cols,Added Cols,Major Null Reduction
0,fhv_tripdata_2015-02.parquet,3053183,3053183,0,100.00,0.00,7,6,SR_Flag,,True
1,fhv_tripdata_2015-12.parquet,8888809,8888809,0,100.00,0.00,7,6,SR_Flag,,True
2,fhv_tripdata_2019-02.parquet,1707650,1707650,0,100.00,0.00,7,7,,,False
3,fhv_tripdata_2019-12.parquet,2044196,2044196,0,100.00,0.00,7,6,SR_Flag,,True
4,fhv_tripdata_2024-01.parquet,1290116,1290116,0,100.00,0.00,7,6,SR_Flag,,True
...,...,...,...,...,...,...,...,...,...,...,...
95,yellow_tripdata_2025-04.parquet,3970553,3760427,210126,94.71,5.29,20,20,,,True
96,yellow_tripdata_2025-05.parquet,4591845,4242250,349595,92.39,7.61,20,20,,,True
97,yellow_tripdata_2025-06.parquet,4322960,4024343,298617,93.09,6.91,20,20,,,True
98,yellow_tripdata_2025-07.parquet,3898963,3631053,267910,93.13,6.87,20,20,,,True
