In [1]:
from pathlib import Path
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine

import helpers as h

def _read_secret(path):
    """Return the text stored in the file at `path`, minus leading/trailing newlines."""
    return Path(path).read_text().strip('\n')


@h.Timer()
def retrieve_data():
    """Download a slice of the `blob` table from PostgreSQL and save it as parquet.

    The connection settings (user, password, host, port, database name) are
    each read from a single-value text file under ``data/``. The query result
    is written to ``data/blob_data_100k.parquet`` using pyarrow, and the
    output path is printed.

    Returns:
        None.

    Raises:
        FileNotFoundError: if any of the connection-setting files is missing.
    """
    user = _read_secret('data/user.txt')
    pw = _read_secret('data/pass.txt')
    host = _read_secret('data/db_host.txt')
    port = _read_secret('data/db_port.txt')
    name = _read_secret('data/db_name.txt')

    # Percent-encode the credentials: a raw '@', ':', '/' or '%' in the user
    # name or password would otherwise corrupt the connection URL.
    # NOTE(review): assumes the files hold raw (not already URL-encoded) values.
    safe_user = quote(user, safe='')
    safe_pw = quote(pw, safe='')

    engine = create_engine(f'postgresql://{safe_user}:{safe_pw}@{host}:{port}/{name}')

    # NOTE(review): LIMIT without ORDER BY lets PostgreSQL return any 100000
    # matching rows, so the extract is not guaranteed to cover every month in
    # the requested range -- confirm that an arbitrary subset is intended.
    query = """SELECT * FROM blob WHERE "YEAR" = '2024' AND "MONTH" BETWEEN '03' AND '07' limit 100000;"""

    try:
        df = pd.read_sql(query, engine)
    finally:
        # Release the engine's pooled connections even if the query fails.
        engine.dispose()

    parquet_path = Path('data/blob_data_100k.parquet')
    df.to_parquet(parquet_path, engine='pyarrow')

    print(f'Data saved to: {parquet_path}')

In [2]:
# Run the extraction; the recorded run of this cell took roughly 11 minutes.
retrieve_data()

Data saved to: data/blob_data_100k.parquet
Function `retrieve_data` executed in 654.5031 sec, CPU: 2.50%, Memory: 1362.84MB


21,946,235 rows match the date-range criteria (before the 100,000-row limit is applied).

In [4]:
# Count the rows stored in the parquet extract.
import duckdb

parquet_path = Path('data/blob_data_100k.parquet')

row_count_sql = f"SELECT count(*) FROM read_parquet('{parquet_path}')"
check = duckdb.sql(row_count_sql).to_df()

check.head()

Unnamed: 0,count_star()
0,100000


In [6]:
# Count the extracted rows that belong to YEAR '2024', MONTH '03'.
parquet_path = Path('data/blob_data_100k.parquet')

march_count_sql = f"""SELECT count(*) FROM read_parquet('{parquet_path}') where "YEAR" = '2024' and "MONTH" = '03'"""
check = duckdb.sql(march_count_sql).to_df()

check.head()

Unnamed: 0,count_star()
0,0


In [9]:
# Preview the first five rows of the parquet extract.
parquet_path = Path('data/blob_data_100k.parquet')

preview_sql = f"""select * from read_parquet('{parquet_path}') limit 5;"""
df = duckdb.sql(preview_sql).to_df()
df

Unnamed: 0,BLOB_ID,FULL_IMG_ID,POINT,SIZE,POLYGON_BOUNDRY_BOX,BLOB_POLYGON,GEO_HASHES,CONSTRUCTION_STAGE,BUILDING_TYPE,CS_MODEL_ID,...,IS_DUPLICATE,IS_BLOB_ON_IMAGE_EDGE,IS_VALID,IS_IMPUTED,IS_EXCLUDED,COUNTY,IS_OVERLAPPING_FOOTPRINT,IS_OVERLAPPING_ROAD,FOOTPRINT_ID,ROAD_DATA_ID
0,BID_a7258c95-7e44-424d-a227-1348f791ba39,FIID_58e9d2e6-11aa-4bc1-b1b0-951d312e72f5_2024...,POINT (-98.4602132804818 29.46763817191083),452,POLYGON ((-98.4602662037037 29.467706018518502...,POLYGON ((-98.4602662037037 29.467645833333318...,9v1zxkr,ROOF,NON-SF,MID_6bf26a1d-f36e-4364-bc9d-d759d5c75d74,...,False,False,True,False,False,BEXAR,True,False,FP_ID_1a055cc6-21f7-416a-90a3-f8afd3e13b1c,
1,BID_b433a935-c506-4937-9be8-9b63b87ca696,FIID_58e9d2e6-11aa-4bc1-b1b0-951d312e72f5_2024...,POINT (-98.42449395672409 29.509311504753402),935,POLYGON ((-98.42457175925924 29.50939120370369...,POLYGON ((-98.42457175925924 29.50937268518517...,9v4pb7c|9v4pbk1,ROOF,RESIDENTAL SF DETACHED,MID_6bf26a1d-f36e-4364-bc9d-d759d5c75d74,...,False,False,True,False,False,BEXAR,True,False,FP_ID_0e97e0c6-451c-4a83-a304-22d6a698cb19,
2,BID_d9fa48dd-5ee7-4258-9d56-264d38f2eecc,FIID_87c905a3-ddd6-4a93-84cb-f07be70e1327_2024...,POINT (-96.87599626765075 32.86770283696455),4396,POLYGON ((-96.87620601851854 32.86782175925923...,POLYGON ((-96.87620601851854 32.86776157407404...,9vg4gx3|9vg4gx2,ROOF,NON-SF,MID_6bf26a1d-f36e-4364-bc9d-d759d5c75d74,...,True,False,True,False,False,DALLAS,True,False,FP_ID_41e0c77e-db4b-4702-bae9-17f823ef2cd8,
3,BID_44517c93-b035-42b5-90dd-de2e341aa1d8,FIID_58e9d2e6-11aa-4bc1-b1b0-951d312e72f5_2024...,POINT (-98.42545971924973 29.455358060608745),623,POLYGON ((-98.4255162037037 29.455418981481465...,POLYGON ((-98.4255162037037 29.455418981481465...,9v4p860,ROOF,BACKGROUND,MID_6bf26a1d-f36e-4364-bc9d-d759d5c75d74,...,False,False,True,False,False,BEXAR,False,False,,
4,BID_cd928abd-adbf-4481-bc2f-627c7e0e27f9,FIID_58e9d2e6-11aa-4bc1-b1b0-951d312e72f5_2024...,POINT (-98.43521898353451 29.502309820768644),451,POLYGON ((-98.43526620370369 29.50235879629628...,POLYGON ((-98.43526620370369 29.50234953703702...,9v4pb49,ROOF,RESIDENTAL SF DETACHED,MID_6bf26a1d-f36e-4364-bc9d-d759d5c75d74,...,False,False,True,False,False,BEXAR,False,False,,


In [3]:
# Share of the 21,946,235 matching rows held by the 100,000-row extract.
100_000 / 21_946_235

0.004556590230624979