Sort Order Verification (verify_sort.py)<br>
<small>Verifies that a gzipped CSV file is sorted by (cenyear, stateicp, countyicp, serial)</small>

In [1]:
from pathlib import Path
import duckdb
import polars as pl
from tabulate import tabulate
import time

# Define base path for input files
BASE_PATH = Path("D:/source")

Verification Functions

In [2]:
def create_sort_view(con, csv_path: Path):
    query = f"""
        CREATE VIEW src AS
        SELECT cenyear, stateicp, countyicp, serial
        FROM read_csv_auto('{csv_path}', header=true, compression='gzip');
    """
    con.execute(query)

In [3]:
def compute_inversions(con) -> tuple[int, int]:
    query = """
        SELECT COUNT(*) AS total,
               SUM(year_inv + state_inv + county_inv + serial_inv) AS inversions
        FROM (
            SELECT
                cenyear,
                stateicp,
                countyicp,
                serial,
                CASE WHEN cenyear   < LAG(cenyear)   OVER () THEN 1 ELSE 0 END AS year_inv,
                CASE WHEN stateicp < LAG(stateicp) OVER () THEN 1 ELSE 0 END AS state_inv,
                CASE WHEN countyicp< LAG(countyicp)OVER () THEN 1 ELSE 0 END AS county_inv,
                CASE WHEN serial   < LAG(serial)   OVER () THEN 1 ELSE 0 END AS serial_inv
            FROM src
        ) AS lagged
    """
    con.execute(query)
    return con.fetchone()

In [4]:
def run_sort_verification(filename: str) -> tuple[int, int]:
    csv_path = BASE_PATH / filename
    con = duckdb.connect()
    create_sort_view(con, csv_path)
    return compute_inversions(con)

Tabulation Functions

In [5]:
def collect_sort_results(filelist: list[str]) -> pl.DataFrame:
    rows = []
    for filename in filelist:
        print(f"Verifying: {filename}")
        csv_path = Path(filename)
        total, inversions = run_sort_verification(csv_path)
        rows.append([filename, total, inversions])
    return pl.DataFrame(rows, schema=["Filename", "Total rows", "Sort violations"], orient='row')

def print_table(df: pl.DataFrame):
    print("\n" + tabulate(df.rows(), headers=df.columns, tablefmt="github") + "\n")

def save_sort_summary(df: pl.DataFrame):
    output_filename = SUMMARY_PATH / "source_table_sort_verification.csv"
    df.write_csv(str(output_filename))

In [7]:
SUMMARY_PATH = Path("D:/source/summaries")
SUMMARY_PATH.mkdir(parents=True, exist_ok=True)

filelist = ["cs1850.csv.gz","cs1860.csv.gz"]
summary_table = collect_sort_results(filelist)
print_table(summary_table)
save_sort_summary(summary_table)

Verifying: cs1850.csv.gz


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Verifying: cs1860.csv.gz


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))


| Filename      |   Total rows |   Sort violations |
|---------------|--------------|-------------------|
| cs1850.csv.gz |     15176114 |                73 |
| cs1860.csv.gz |     21050108 |               256 |

