In [None]:
import duckdb
import helpers as h
from pathlib import Path

import importlib
importlib.reload(h)

# Parquet sample that the inspection cells in this notebook read from.
parquet_path = Path('data/blob_data_100k.parquet')

# Ask DuckDB to DESCRIBE a SELECT * over the parquet file, which yields one
# row per column (name, type, nullability, ...) instead of the data itself.
schema_query = f"""
        describe select *  
        FROM read_parquet('{parquet_path}')
        """
check = duckdb.sql(schema_query).to_df()

# Bare last expression so the notebook renders the schema table.
check

Unnamed: 0,column_name,column_type,null,key,default,extra
0,BLOB_ID,VARCHAR,YES,,,
1,FULL_IMG_ID,VARCHAR,YES,,,
2,POINT,VARCHAR,YES,,,
3,SIZE,BIGINT,YES,,,
4,POLYGON_BOUNDRY_BOX,VARCHAR,YES,,,
5,BLOB_POLYGON,VARCHAR,YES,,,
6,GEO_HASHES,VARCHAR,YES,,,
7,CONSTRUCTION_STAGE,VARCHAR,YES,,,
8,BUILDING_TYPE,VARCHAR,YES,,,
9,CS_MODEL_ID,VARCHAR,YES,,,


In [8]:
# Are there repeated blob IDs? Count the rows behind every BLOB_ID.
#
# count(*) is used instead of count("BLOB_ID"): count(column) skips NULLs, so
# rows whose BLOB_ID is NULL (the column is nullable per the schema above)
# would always be reported with ct = 0 no matter how many of them exist.
#
# The result has one row per ID and the notebook only shows its head and
# tail, so sort by ct descending to make any repeated ID appear first.
check = duckdb.sql(
    f"""
    SELECT "BLOB_ID", count(*) AS ct
    FROM read_parquet('{parquet_path}')
    GROUP BY "BLOB_ID"
    ORDER BY ct DESC
    """
).to_df()

check

Unnamed: 0,BLOB_ID,ct
0,BID_a7258c95-7e44-424d-a227-1348f791ba39,1
1,BID_da5a3dbb-d451-4830-b988-df0ba199dff4,1
2,BID_fbd99840-ac72-4b9c-ad58-4136bad98287,1
3,BID_5d94c887-f046-44f2-8e6b-d8438eca69b2,1
4,BID_dae19b32-5e19-49b1-84e2-7e25cb6a5e7e,1
...,...,...
99995,BID_0a450ffd-0647-40d2-8152-e4d58b4e4e5b,1
99996,BID_0b0940d0-04cb-495f-9b64-0119c362f10c,1
99997,BID_0bd740e5-ab26-4367-8767-6179fd5c733e,1
99998,BID_0d405a56-b0c9-4caa-8855-8ceb5f3d3e89,1


In [10]:
# Are there repeated blob IDs? Keep only the IDs that occur more than once;
# an empty result means every BLOB_ID is unique.
#
# count(*) is used instead of count("BLOB_ID"): count(column) skips NULLs, so
# several rows with a NULL BLOB_ID would be counted as 0 and could never pass
# the "> 1" filter. The filter is written on the aggregate itself rather than
# on the select-list alias, which is the portable SQL form.
check = duckdb.sql(
    f"""
    SELECT "BLOB_ID", count(*) AS ct
    FROM read_parquet('{parquet_path}')
    GROUP BY "BLOB_ID"
    HAVING count(*) > 1
    ORDER BY ct DESC
    """
).to_df()

check

Unnamed: 0,BLOB_ID,ct


In [7]:
# Check whether any polygons match each other: count the rows behind every
# distinct BLOB_POLYGON value. BLOB_POLYGON is stored as VARCHAR (see the
# schema above), so "matching" here means identical text, not geometric
# equality.
#
# count(*) is used instead of count("BLOB_POLYGON"): count(column) skips
# NULLs, so rows with a NULL polygon would always show ct = 0.
#
# The result has one row per polygon and the notebook only shows its head and
# tail, so sort by ct descending to make any repeated polygon appear first.
check = duckdb.sql(
    f"""
    SELECT "BLOB_POLYGON", count(*) AS ct
    FROM read_parquet('{parquet_path}')
    GROUP BY "BLOB_POLYGON"
    ORDER BY ct DESC
    """
).to_df()

check

Unnamed: 0,BLOB_POLYGON,ct
0,POLYGON ((-96.85164120370374 32.86601157407404...,1
1,POLYGON ((-96.71746064814813 32.89479398148145...,1
2,POLYGON ((-98.44759490740739 29.49336805555554...,1
3,POLYGON ((-96.71990046296295 32.88871990740737...,1
4,POLYGON ((-98.42369675925924 29.51381249999998...,1
...,...,...
99995,POLYGON ((-96.87851157407411 32.73482175925929...,1
99996,POLYGON ((-96.88253472222225 32.78133101851853...,1
99997,POLYGON ((-96.81310416666669 32.70414583333335...,1
99998,"POLYGON ((-96.8319328703704 32.71398379629632,...",1


In [12]:
# Check how many polygons match exactly: keep only the BLOB_POLYGON values
# that occur more than once. The comparison is on the VARCHAR text of the
# polygon, so only character-for-character identical polygons are grouped.
# An empty result means no two rows share the same polygon text.
#
# count(*) is used instead of count("BLOB_POLYGON"): count(column) skips
# NULLs, so several rows with a NULL polygon would be counted as 0 and could
# never pass the "> 1" filter.
check = duckdb.sql(
    f"""
    SELECT "BLOB_POLYGON", count(*) AS ct
    FROM read_parquet('{parquet_path}')
    GROUP BY "BLOB_POLYGON"
    HAVING count(*) > 1
    ORDER BY ct DESC
    """
).to_df()

check

Unnamed: 0,BLOB_POLYGON,ct


In [18]:
import duckdb
import pandas as pd

# Find the parquet chunk files matching the blob_20250307_chunk* pattern.
# glob() yields paths in an unspecified order, so sort for a reproducible run.
parquets = sorted(Path('data').glob('blob_20250307_chunk*.parquet'))

# One small (MONTH, count) frame per parquet file.
dfs = []

for parquet in parquets:
    # Each file is aggregated on its own so only the per-month counts, not the
    # rows, are pulled into pandas. No ORDER BY is needed here because the
    # pandas groupby below sorts by MONTH again.
    df = duckdb.sql(
        f"""
        SELECT "MONTH", count(*) AS count
        FROM read_parquet('{parquet}')
        GROUP BY "MONTH"
        """
    ).to_df()
    dfs.append(df)

# Combine the per-file results into a single total per month.
if dfs:
    final_df = (
        pd.concat(dfs, ignore_index=True)
        # dropna=False keeps a NULL MONTH group; by default groupby drops
        # missing keys and those counts would silently disappear.
        .groupby("MONTH", as_index=False, dropna=False)["count"]
        .sum()
    )
    print(final_df)
else:
    # The glob matched nothing, so there is nothing to aggregate.
    print("No matching files found or no data extracted.")

  MONTH   count
0    03    1557
1    04      14
2    05     118
3    06  516711
4    07   81600
