In [2]:
# URL template for the monthly yellow-trip Parquet files; "{}" is filled with a "YYYY-MM" label
base_url = (
    "https://d37ci6vzurychx.cloudfront.net/trip-data/"
    "yellow_tripdata_{}.parquet"
)

# First and last month ("YYYY-MM") of the range to process
start_date, end_date = "2009-01", "2024-08"

In [3]:
! pip3 install duckdb



In [4]:
import duckdb

# Open a DuckDB connection with default settings (no database path is given).
con = duckdb.connect()
# httpfs is the DuckDB extension for reading files over HTTP(S); the data URLs
# used by this notebook are https:// links. Install it first, then load it on
# this connection.
con.install_extension("httpfs")
con.load_extension("httpfs")

In [5]:
import pandas as pd
import duckdb
import time
import logging

# Reuse the `con` opened in the earlier setup cell: that connection already has
# the httpfs extension installed and loaded. Creating a fresh duckdb.connect()
# here would throw that setup away and leave the earlier connection unused.

# One "YYYY-MM" label per month start from start_date through end_date
dates = pd.date_range(start_date, end_date, freq="MS").strftime("%Y-%m").tolist()

# One parquet_schema() query per monthly file. The labels are already in the
# "YYYY-MM" form the URL template expects, so they are inserted unchanged.
sql_commands = [
    f"SELECT * FROM parquet_schema('{base_url.format(date)}')" for date in dates
]

def load_with_backoff(con, sql_command, max_retries=10, initial_delay=15, backoff_factor=2):
    """
    Run a SQL command on a DuckDB connection, retrying with exponential backoff.

    Every exception raised while executing the query is treated as retryable.
    The wait before the next attempt starts at ``initial_delay`` seconds and is
    multiplied by ``backoff_factor`` after each failed attempt. No exception is
    propagated to the caller; the last failure is reported through ``logging``.

    Args:
        con (duckdb.DuckDBPyConnection): DuckDB connection object.
        sql_command (str): The SQL command to execute.
        max_retries (int): Total number of attempts, the first one included.
        initial_delay (int): Delay in seconds before the second attempt.
        backoff_factor (int): Factor by which the delay increases after each retry.

    Returns:
        pd.DataFrame: DataFrame containing the query result, or None if every
        attempt failed (also None when ``max_retries`` is 0 or negative, in
        which case no attempt is made).
    """
    delay = initial_delay
    for attempt in range(max_retries):
        try:
            print(f"Executing SQL: {sql_command} (Attempt {attempt + 1})...")
            return con.sql(sql_command).df()
        except Exception as e:
            if attempt < max_retries - 1:
                # Report the cause on every failed attempt, not only the last one,
                # so a permanent error is visible before the waits grow long.
                print(
                    f"Failed to execute SQL, retrying in {delay} seconds... "
                    f"({attempt + 1}/{max_retries}): {e}"
                )
                time.sleep(delay)
                delay *= backoff_factor
            else:
                logging.error(f"Failed to execute SQL after {max_retries} retries: {e}")
    return None


# Run every query; one that exhausted its retries comes back as None
dfs = [load_with_backoff(con, sql) for sql in sql_commands]

# Filter out None results
dfs = [df for df in dfs if df is not None]

if dfs:
    # Concatenate all DataFrames into one
    final_df = pd.concat(dfs, ignore_index=True)
    print(f"Loaded {len(final_df)} rows from {len(dfs)} files.")

    # Save to a single Parquet file. This stays inside the branch because
    # final_df is only built when at least one query succeeded; saving
    # unconditionally raised NameError on a fresh kernel when nothing loaded.
    final_df.to_parquet("combined_tripdata_200901_to_202408.parquet")
else:
    # Keep final_df defined (as an empty frame) so later cells that display it
    # still run; nothing is written to disk in this case.
    final_df = pd.DataFrame()
    print("No data loaded.")


Executing SQL: SELECT * FROM parquet_schema('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-01.parquet') (Attempt 1)...
Executing SQL: SELECT * FROM parquet_schema('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-02.parquet') (Attempt 1)...
Executing SQL: SELECT * FROM parquet_schema('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-03.parquet') (Attempt 1)...
Executing SQL: SELECT * FROM parquet_schema('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-04.parquet') (Attempt 1)...
Executing SQL: SELECT * FROM parquet_schema('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-05.parquet') (Attempt 1)...
Executing SQL: SELECT * FROM parquet_schema('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-06.parquet') (Attempt 1)...
Executing SQL: SELECT * FROM parquet_schema('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-07.parquet') (Attempt 1)...
Execut

In [6]:
# Display the combined frame; pandas abbreviates long frames in the rendered output.
final_df

Unnamed: 0,file_name,name,type,type_length,repetition_type,num_children,converted_type,scale,precision,field_id,logical_type
0,https://d37ci6vzurychx.cloudfront.net/trip-dat...,schema,,,REQUIRED,18.0,,,,,
1,https://d37ci6vzurychx.cloudfront.net/trip-dat...,vendor_name,BYTE_ARRAY,,OPTIONAL,,UTF8,,,,StringType()
2,https://d37ci6vzurychx.cloudfront.net/trip-dat...,Trip_Pickup_DateTime,BYTE_ARRAY,,OPTIONAL,,UTF8,,,,StringType()
3,https://d37ci6vzurychx.cloudfront.net/trip-dat...,Trip_Dropoff_DateTime,BYTE_ARRAY,,OPTIONAL,,UTF8,,,,StringType()
4,https://d37ci6vzurychx.cloudfront.net/trip-dat...,Passenger_Count,INT64,,OPTIONAL,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
3733,https://d37ci6vzurychx.cloudfront.net/trip-dat...,tolls_amount,DOUBLE,,OPTIONAL,,,,,,
3734,https://d37ci6vzurychx.cloudfront.net/trip-dat...,improvement_surcharge,DOUBLE,,OPTIONAL,,,,,,
3735,https://d37ci6vzurychx.cloudfront.net/trip-dat...,total_amount,DOUBLE,,OPTIONAL,,,,,,
3736,https://d37ci6vzurychx.cloudfront.net/trip-dat...,congestion_surcharge,DOUBLE,,OPTIONAL,,,,,,
