In [1]:
# Base URL and dynamic iteration range
base_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month}.parquet"
start_year, start_month = 2009, 1
end_year, end_month = 2024, 8

In [2]:
import pyarrow.parquet as pq
import requests
from io import BytesIO
from datetime import datetime
import pandas as pd
import time

def get_parquet_schema_with_dynamic_range(url, max_retries=15, backoff_factor=2):
    retries = 0
    session = requests.Session()  # Create the session outside the loop so it can be reused

    while retries <= max_retries:
        try:
            # Get the file size using a HEAD request
            head_response = session.head(url)  # Use session for the head request
            if head_response.status_code == 403:
                raise requests.exceptions.RequestException("Rate limit hit (403).")

            if head_response.status_code != 200:
                raise Exception(f"Failed to access {url}: {head_response.status_code} {head_response.reason}")

            file_size = int(head_response.headers.get('Content-Length', 0))
            if file_size == 0:
                raise Exception(f"Unable to retrieve file size for {url}")

            # Fetch the last 64 KB or up to the file size (whichever is smaller)
            range_size = min(65536, file_size)
            headers = {'Range': f'bytes=-{range_size}'}
            response = session.get(url, headers=headers)  # Use session for the GET request
            response.raise_for_status()

            # Read footer and parse schema
            footer = BytesIO(response.content)
            parquet_file = pq.ParquetFile(footer)
            schema = parquet_file.schema
            return schema

        except requests.exceptions.RequestException as e:
            # Handle rate-limit or other request errors
            if retries < max_retries:
                wait_time = backoff_factor ** retries
                print(f"Rate limit or error for {url}. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
                retries += 1
                # Create a new session on retry
                print("Creating a new session...")
                session = requests.Session()  # Recreate the session on retry
            else:
                raise Exception(f"Max retries exceeded for {url}: {e}")

In [3]:
def main():

    # List to store schema details for concatenation
    schema_records = []

    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            if (year == end_year and month > end_month):
                break

            # Format the URL dynamically
            month_str = f"{month:02d}"
            url = base_url.format(year=year, month=month_str)
            print(f"Fetching schema for {url}...")

            try:
                # Get schema for the current file
                schema = get_parquet_schema_with_dynamic_range(url)
                # Add schema details to the records list
                for i, n in enumerate(schema.names):
                    col = schema.column(i)
                    schema_records.append(
                        {
                            "YearMonth": f"{year}{month_str}",
                            "Name": n,
                            "Type": col.physical_type
                        }
                    )
            except Exception as e:
                print(f"Error fetching schema for {url}: {e}")
                raise e

    # Convert the list of records to a DataFrame
    schema_df = pd.DataFrame(schema_records)

    return schema_df

schema_df = main()

Fetching schema for https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-01.parquet...
Rate limit or error for https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-01.parquet. Retrying in 1 seconds...
Creating a new session...
Rate limit or error for https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-01.parquet. Retrying in 2 seconds...
Creating a new session...
Rate limit or error for https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-01.parquet. Retrying in 4 seconds...
Creating a new session...
Rate limit or error for https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-01.parquet. Retrying in 8 seconds...
Creating a new session...
Rate limit or error for https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-01.parquet. Retrying in 16 seconds...
Creating a new session...
Rate limit or error for https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-01.parquet. Retrying in

In [4]:
schema_df

Unnamed: 0,YearMonth,Name,Type
0,200901,vendor_name,BYTE_ARRAY
1,200901,Trip_Pickup_DateTime,BYTE_ARRAY
2,200901,Trip_Dropoff_DateTime,BYTE_ARRAY
3,200901,Passenger_Count,INT64
4,200901,Trip_Distance,DOUBLE
...,...,...,...
3545,202408,tolls_amount,DOUBLE
3546,202408,improvement_surcharge,DOUBLE
3547,202408,total_amount,DOUBLE
3548,202408,congestion_surcharge,DOUBLE


In [5]:
# Sort by YearMonth to ensure chronological order
schema_df['YearMonth'] = schema_df['YearMonth'].astype(str)
schema_df = schema_df.sort_values(by=['YearMonth']).reset_index(drop=True)

# Function to print schema evolution
def print_schema_evolution(df):
    previous_schema = pd.DataFrame(columns=['Name', 'Type'])
    
    for year_month in df['YearMonth'].unique():
        # Filter the schema for the current YearMonth
        current_schema = df[df['YearMonth'] == year_month]
        
        # Print the schema evolution compared to the previous schema
        print(f"\nSchema changes for {year_month}:")
        
        if not previous_schema.empty:
            # Compare new fields (present in current but not in previous)
            new_fields = current_schema[~current_schema['Name'].isin(previous_schema['Name'])]
            if not new_fields.empty:
                print(f"New fields added: \n{new_fields[['Name', 'Type']]}")
            
            # Compare removed fields (present in previous but not in current)
            removed_fields = previous_schema[~previous_schema['Name'].isin(current_schema['Name'])]
            if not removed_fields.empty:
                print(f"Fields removed: \n{removed_fields[['Name', 'Type']]}")
            
            # Compare changed types (same Name but different Type)
            merged = pd.merge(previous_schema, current_schema, on='Name', how='outer', suffixes=('_prev', '_current'))
            changed_types = merged[merged['Type_prev'] != merged['Type_current']]
            if not changed_types.empty:
                print(f"Fields with changed types: \n{changed_types[['Name', 'Type_prev', 'Type_current']]}")
        else:
            print("No previous schema to compare with.")
        
        # Update previous schema to current schema for next iteration
        previous_schema = current_schema[['Name', 'Type']]

# Print the schema evolution
print_schema_evolution(schema_df)



Schema changes for 200901:
No previous schema to compare with.

Schema changes for 200902:

Schema changes for 200903:

Schema changes for 200904:

Schema changes for 200905:

Schema changes for 200906:

Schema changes for 200907:

Schema changes for 200908:

Schema changes for 200909:

Schema changes for 200910:

Schema changes for 200911:

Schema changes for 200912:

Schema changes for 201001:
New fields added: 
                   Name        Type
216        payment_type  BYTE_ARRAY
217         fare_amount      DOUBLE
219        total_amount      DOUBLE
220          tip_amount      DOUBLE
221        tolls_amount      DOUBLE
222    dropoff_latitude      DOUBLE
224   dropoff_longitude      DOUBLE
225    pickup_longitude      DOUBLE
226           rate_code  BYTE_ARRAY
227           vendor_id  BYTE_ARRAY
228     pickup_datetime  BYTE_ARRAY
229    dropoff_datetime  BYTE_ARRAY
230  store_and_fwd_flag  BYTE_ARRAY
231       trip_distance      DOUBLE
232     passenger_count       INT64
233  