In [None]:
from pathlib import Path

import polars as pl

In [None]:
# Root folder of the processed household-survey outputs; every other path in
# this notebook is derived from it.
survey_processed_dir = Path("Q:/Data/Surveys/HouseholdSurveys/MTC-SFCTA2022/Processed_20240329")

# Output folders of the upstream processing steps whose CSVs are counted below.
taz_spatial_join_dir = survey_processed_dir.joinpath("01-taz_spatial_join")
reformat_dir = survey_processed_dir.joinpath("02-reformat")
tour_extract_dir = survey_processed_dir.joinpath("03-tour_extract")
# Two sub-folders of the tour-extract step (names suggest 7-day vs. weekday
# weighting -- TODO confirm).
tour_extract_allwk_dir = tour_extract_dir.joinpath("wt_7day")
tour_extract_wkday_dir = tour_extract_dir.joinpath("wt_wkday")
merge_skims_dir = survey_processed_dir.joinpath("04a-merge_skims")

# Destination of this notebook's summary. mkdir is called without parents=True,
# so the survey root must already exist; exist_ok=True lets the cell be re-run.
out_dir = survey_processed_dir.joinpath("04b-summary_notebooks")
out_dir.mkdir(exist_ok=True)
out_filepath = out_dir.joinpath("output_csv_lengths.csv")

In [None]:
def get_csv_len(csv_filepath, separator=","):
    """Return the number of rows polars reads from a delimited text file.

    The file is opened lazily with ``pl.scan_csv`` and only a row count
    (``pl.len()``) is collected, so the full table is never materialised as a
    DataFrame in this function.

    Parameters
    ----------
    csv_filepath
        Path of the file to count.
    separator : str, default ","
        Field delimiter forwarded to ``pl.scan_csv``.

    Returns
    -------
    The single value of the one-row, one-column count frame (the row count).
    With ``scan_csv``'s default settings the first line is treated as the
    header and is not part of the count.
    """
    lazy_table = pl.scan_csv(csv_filepath, separator=separator)
    count_frame = lazy_table.select(pl.len()).collect()
    return count_frame.item()

In [None]:
# Build a summary table with one row per processing step and one column per
# survey table, holding the row count of that table's output file at that step
# (null where the step does not produce that table).
#
# The files are described once, in `step_files`, instead of as parallel lists
# padded with None by position; adding a step or a table is a single edit.
# TODO once filenames are more standardized, maybe we can simplify this


def _stage_files(directory, suffix, table_names):
    """Map each table name to ``(path, separator)`` for comma-separated files
    named ``<table><suffix>`` inside ``directory``."""
    return {name: (directory / f"{name}{suffix}", ",") for name in table_names}


def _count_rows(step_outputs, table):
    """Row count of ``table``'s file at one step, or None if the step has no such file."""
    if table not in step_outputs:
        return None
    path, separator = step_outputs[table]
    return get_csv_len(path, separator=separator)


# Tables present from the raw data onward, and those present from tour extraction onward.
core_tables = ["hh", "person", "trip"]
tour_tables = core_tables + ["personday", "tour"]
# Output column order (after the leading "step" column).
table_columns = tour_tables + ["trip_detailed", "accegr_week"]

# step label -> {table name: (file path, separator)}; insertion order is the row order.
step_files = {
    "0-raw": _stage_files(survey_processed_dir, ".csv", core_tables),
    "1-taz_spatial_join": _stage_files(
        taz_spatial_join_dir, "-taz_spatial_join.csv", core_tables
    ),
    "2a-reformat": _stage_files(reformat_dir, "-reformat.csv", core_tables),
    # This step's files do not follow the "<table>-<step>.csv" naming, and the
    # linked-trip file is space-delimited, so they are listed explicitly.
    "2b-link_trips_week": {
        "trip": (reformat_dir / "temp_tripx_linked_week.dat", " "),
        "trip_detailed": (reformat_dir / "temp_tripx_linked_detail_week.csv", ","),
        "accegr_week": (reformat_dir / "accegr_week.csv", ","),
    },
    "3a-tour_extract": _stage_files(
        tour_extract_dir, "-tour_extract_week.csv", tour_tables
    ),
    "3b-tour_extract-wt_7day": _stage_files(
        tour_extract_allwk_dir, "-assign_day.csv", tour_tables
    ),
    "3b-tour_extract-wt_wkday": _stage_files(
        tour_extract_wkday_dir, "-assign_day.csv", tour_tables
    ),
    "4a-merge_skims": _stage_files(merge_skims_dir, "-merge_skims.csv", tour_tables),
}

df = pl.from_dict(
    {
        "step": list(step_files),
        **{
            table: [_count_rows(outputs, table) for outputs in step_files.values()]
            for table in table_columns
        },
    }
)

In [None]:
# Bare last expression: lets Jupyter render the frame with its rich table
# display instead of the plain-text dump that print() produces.
df

In [None]:
# Save the per-step row-count summary as a CSV at out_filepath.
df.write_csv(out_filepath)