# Yellow Taxi Data - Schema Analysis

Specify the files to analyse.

In [1]:
# Base URL and dynamic iteration range
base_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month}.parquet"
start_year, start_month = 2009, 1
end_year, end_month = 2024, 8

## Retrieve schemas from the parquet files

Don't read the whole parquet file, just the footer.

In [2]:
import pyarrow.parquet as pq
import requests
from io import BytesIO
from datetime import datetime
import pandas as pd
import time

def get_parquet_schema_with_dynamic_range(url, max_retries=15, backoff_factor=2):
    retries = 0
    session = requests.Session()  # Create the session outside the loop so it can be reused

    while retries <= max_retries:
        try:
            # Get the file size using a HEAD request
            head_response = session.head(url)  # Use session for the head request
            if head_response.status_code == 403:
                raise requests.exceptions.RequestException("Rate limit hit (403).")

            if head_response.status_code != 200:
                raise Exception(f"Failed to access {url}: {head_response.status_code} {head_response.reason}")

            file_size = int(head_response.headers.get('Content-Length', 0))
            if file_size == 0:
                raise Exception(f"Unable to retrieve file size for {url}")

            # Fetch the last 64 KB or up to the file size (whichever is smaller)
            range_size = min(65536, file_size)
            headers = {'Range': f'bytes=-{range_size}'}
            response = session.get(url, headers=headers)  # Use session for the GET request
            response.raise_for_status()

            # Read footer and parse schema
            footer = BytesIO(response.content)
            parquet_file = pq.ParquetFile(footer)
            schema = parquet_file.schema
            return schema

        except requests.exceptions.RequestException as e:
            # Handle rate-limit or other request errors
            if retries < max_retries:
                wait_time = backoff_factor ** retries
                print(f"Rate limit or error for {url}. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
                retries += 1
                # Create a new session on retry
                print("Creating a new session...")
                session = requests.Session()  # Recreate the session on retry
            else:
                raise Exception(f"Max retries exceeded for {url}: {e}")

In [3]:
def main():

    # List to store schema details for concatenation
    schema_records = []

    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            if (year == end_year and month > end_month):
                break

            # Format the URL dynamically
            month_str = f"{month:02d}"
            url = base_url.format(year=year, month=month_str)
            print(f"Fetching schema for {url}...")

            try:
                # Get schema for the current file
                schema = get_parquet_schema_with_dynamic_range(url)
                # Add schema details to the records list
                for i, n in enumerate(schema.names):
                    col = schema.column(i)
                    schema_records.append(
                        {
                            "YearMonth": f"{year}{month_str}",
                            "Name": n,
                            "Type": col.physical_type
                        }
                    )
            except Exception as e:
                print(f"Error fetching schema for {url}: {e}")
                raise e

    # Convert the list of records to a DataFrame
    schema_df = pd.DataFrame(schema_records)

    return schema_df

schema_df = main()

Fetching schema for https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-01.parquet...
Rate limit or error for https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-01.parquet. Retrying in 1 seconds...
Creating a new session...
Rate limit or error for https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-01.parquet. Retrying in 2 seconds...
Creating a new session...
Rate limit or error for https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-01.parquet. Retrying in 4 seconds...
Creating a new session...
Rate limit or error for https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-01.parquet. Retrying in 8 seconds...
Creating a new session...
Rate limit or error for https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-01.parquet. Retrying in 16 seconds...
Creating a new session...
Rate limit or error for https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-01.parquet. Retrying in

## Dataframe with Case-Sensitive field names and types in each of the files

In [4]:
schema_df

Unnamed: 0,YearMonth,Name,Type
0,200901,vendor_name,BYTE_ARRAY
1,200901,Trip_Pickup_DateTime,BYTE_ARRAY
2,200901,Trip_Dropoff_DateTime,BYTE_ARRAY
3,200901,Passenger_Count,INT64
4,200901,Trip_Distance,DOUBLE
...,...,...,...
3545,202408,tolls_amount,DOUBLE
3546,202408,improvement_surcharge,DOUBLE
3547,202408,total_amount,DOUBLE
3548,202408,congestion_surcharge,DOUBLE


In [9]:
# Sort by YearMonth to ensure chronological order
schema_df['YearMonth'] = schema_df['YearMonth'].astype(str)
schema_df = schema_df.sort_values(by=['YearMonth']).reset_index(drop=True)

def print_schema_evolution(df):
    previous_schema = pd.DataFrame(columns=['Name', 'Type'])

    for year_month in df['YearMonth'].unique():
        # Filter the schema for the current YearMonth
        current_schema = df[df['YearMonth'] == year_month]

        # Print the schema evolution compared to the previous schema
        print(f"\nSchema changes for {year_month}:")

        if not previous_schema.empty:
            # Compare new fields (present in current but not in previous)
            new_fields = current_schema[~current_schema['Name'].isin(previous_schema['Name'])]
            if not new_fields.empty:
                print(f"\nNew fields added: \n\n{new_fields[['Name', 'Type']]}")

            # Compare removed fields (present in previous but not in current)
            removed_fields = previous_schema[~previous_schema['Name'].isin(current_schema['Name'])]
            if not removed_fields.empty:
                print(f"\nFields removed: \n\n{removed_fields[['Name', 'Type']]}")

            # Compare changed types (same Name but different Type)
            merged = pd.merge(previous_schema, current_schema, on='Name', how='outer', suffixes=('_prev', '_current'))
            changed_types = merged[merged['Type_prev'] != merged['Type_current']]
            if not changed_types.empty:
                print(f"\nFields with changed types: \n\n{changed_types[['Name', 'Type_prev', 'Type_current']]}")
        else:
            # Print the initial schema
            print("Initial schema:\n\n", current_schema[['Name', 'Type']])

        # Update previous schema to current schema for next iteration
        previous_schema = current_schema[['Name', 'Type']]


In [10]:
# Print the schema evolution
print_schema_evolution(schema_df)


Schema changes for 200901:
Initial schema:

                      Name        Type
0             vendor_name  byte_array
1       store_and_forward      double
2    trip_pickup_datetime  byte_array
3   trip_dropoff_datetime  byte_array
4         passenger_count       int64
5           trip_distance      double
6               start_lat      double
7               rate_code      double
8                fare_amt      double
9               start_lon      double
10                end_lat      double
11           payment_type  byte_array
12              surcharge      double
13                mta_tax      double
14                tip_amt      double
15              tolls_amt      double
16              total_amt      double
17                end_lon      double

Schema changes for 200902:

Schema changes for 200903:

Schema changes for 200904:

Schema changes for 200905:

Schema changes for 200906:

Schema changes for 200907:

Schema changes for 200908:

Schema changes for 200909:

Schema 

## Dataframe with Case-Insensitive field names and types in each of the files

In [11]:
# case insensitive field comparisons
schema_df_ci = schema_df
schema_df_ci['Name'] = schema_df_ci['Name'].str.lower()
schema_df_ci['Type'] = schema_df_ci['Type'].str.lower()
schema_df_ci

Unnamed: 0,YearMonth,Name,Type
0,200901,vendor_name,byte_array
1,200901,store_and_forward,double
2,200901,trip_pickup_datetime,byte_array
3,200901,trip_dropoff_datetime,byte_array
4,200901,passenger_count,int64
...,...,...,...
3545,202408,tip_amount,double
3546,202408,mta_tax,double
3547,202408,extra,double
3548,202408,congestion_surcharge,double


In [12]:
# Print the schema evolution
print_schema_evolution(schema_df_ci)


Schema changes for 200901:
Initial schema:

                      Name        Type
0             vendor_name  byte_array
1       store_and_forward      double
2    trip_pickup_datetime  byte_array
3   trip_dropoff_datetime  byte_array
4         passenger_count       int64
5           trip_distance      double
6               start_lat      double
7               rate_code      double
8                fare_amt      double
9               start_lon      double
10                end_lat      double
11           payment_type  byte_array
12              surcharge      double
13                mta_tax      double
14                tip_amt      double
15              tolls_amt      double
16              total_amt      double
17                end_lon      double

Schema changes for 200902:

Schema changes for 200903:

Schema changes for 200904:

Schema changes for 200905:

Schema changes for 200906:

Schema changes for 200907:

Schema changes for 200908:

Schema changes for 200909:

Schema 

## ChatGPT suggested field mapping

**Disclaimer:** While I can provide a general mapping strategy based on the given schema evolution, a precise mapping requires a deeper understanding of your specific use case, data quality, and desired analysis. It's always recommended to carefully examine the data and consult with domain experts to ensure the mapping aligns with your goals.

**General Mapping Strategy:**

Given the significant schema changes over time, a direct one-to-one mapping might not be feasible for all fields. A more flexible approach is to identify core fields and handle changes in field names, data types, and additions/removals.

**Core Fields:**

These fields are likely to be consistent across most schemas and can be used as a foundation for mapping:

* `trip_distance`
* `passenger_count`
* `payment_type`
* `fare_amount`
* `extra`
* `mta_tax`
* `tip_amount`
* `tolls_amount`
* `total_amount`

**Mapping Logic:**

1. **Direct Mapping:**
   * For fields that have remained consistent in name and data type, a direct mapping can be applied.
   * **Example:** `trip_distance` in the older schema can be directly mapped to `trip_distance` in the newer schema.

2. **Field Name Changes:**
   * Identify fields with similar meanings but different names.
   * **Example:** `vendor_name` in older schemas might map to `vendor_id` in newer schemas.

3. **Data Type Changes:**
   * Convert data types as needed, ensuring data integrity.
   * **Example:** If `rate_code` changed from `double` to `int64`, convert the values accordingly.

4. **Field Additions and Removals:**
   * For added fields, consider imputing missing values or using default values.
   * For removed fields, you might need to drop them or create new features based on related fields.

**Example Mapping Table:**

| Old Schema Field | New Schema Field | Mapping Logic | Notes |
|---|---|---|---|
| vendor_name | vendor_id | Direct mapping | Assuming 'vendor_name' and 'vendor_id' represent the same entity. |
| trip_pickup_datetime | pickup_datetime | Direct mapping | |
| trip_dropoff_datetime | dropoff_datetime | Direct mapping | |
| passenger_count | passenger_count | Direct mapping | |
| trip_distance | trip_distance | Direct mapping | |
| start_lat | pickup_latitude | Direct mapping | Assuming 'start_lat' and 'pickup_latitude' represent the same location. |
| start_lon | pickup_longitude | Direct mapping | Assuming 'start_lon' and 'pickup_longitude' represent the same location. |
| rate_code | ratecodeid | Convert to integer if necessary. | |
| fare_amt | fare_amount | Direct mapping | |
| end_lat | dropoff_latitude | Direct mapping | Assuming 'end_lat' and 'dropoff_latitude' represent the same location. |
| end_lon | dropoff_longitude | Direct mapping | Assuming 'end_lon' and 'dropoff_longitude' represent the same location. |
| payment_type | payment_type | Direct mapping | |
| surcharge | surcharge | Direct mapping | |
| mta_tax | mta_tax | Direct mapping | |
| tip_amt | tip_amount | Direct mapping | |
| tolls_amt | tolls_amount | Direct mapping | |
| total_amt | total_amount | Direct mapping | |
| store_and_forward (removed) | - | Drop this field. | |
| ... | ... | ... | ... |

**Additional Considerations:**

* **Data Quality Assessment:** Thoroughly analyze the data for inconsistencies, missing values, and outliers.
* **Data Cleaning and Preprocessing:** Clean and preprocess the data to handle missing values, outliers, and inconsistent data formats.
* **Data Integration Tools:** Utilize tools like dbt or Apache Airflow to automate the mapping and transformation process.
* **Data Validation:** Implement data validation checks to ensure data integrity after the mapping process.
* **Domain Expertise:** Consult with domain experts to understand the nuances of the data and make informed decisions about mapping and data quality.
* **Iterative Approach:** The mapping process might require adjustments as you gain deeper insights into the data and your analysis goals.
* **Documentation:** Document the mapping process, including assumptions, challenges, and solutions, for future reference and collaboration.

**To provide a more accurate and tailored mapping table, please share the specific years and months you're interested in analyzing. This will allow me to provide more specific mapping instructions.**
