In [1]:
import pandas as pd

In [2]:
# Display the installed pandas version for reference.
pd.__version__

'2.2.3'

In [None]:
# Optional one-time setup: uncomment to install pyarrow into this kernel's
# environment (`%pip` targets the running kernel, unlike `!pip`).
# %pip install pyarrow

In [None]:
# Troubleshooting cell: make sure pyarrow is installed and importable in this
# kernel, then try to load the green trip-data parquet file and report, step by
# step, what went wrong if it cannot be read.
#
# Approach 1: Verify installation and troubleshoot
import sys
import subprocess
import importlib
import os

# Single source of truth for the file this cell tries to read.
GREEN_TRIPDATA_PATH = "./green_tripdata_2025-01.parquet"

# Show Python environment info
print(f"Python version: {sys.version}")
print(f"Python executable: {sys.executable}")

# Install through the kernel's own interpreter (sys.executable) so the package
# lands in the environment this notebook is actually running in.
try:
    print("Installing pyarrow...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pyarrow", "--upgrade"])
    print("Installation completed.")
except Exception as e:
    # Best effort: report the failure and keep diagnosing.
    print(f"Installation error: {e}")

# Modules installed while the interpreter is running may be invisible to the
# import system until its finder caches are refreshed.
importlib.invalidate_caches()

# Check if pyarrow can be imported now
try:
    import pyarrow
    print(f"Successfully imported pyarrow version: {pyarrow.__version__}")
except ImportError as e:
    print(f"Import error: {e}")

# Check the path first: a missing file should be reported as such rather than
# surfacing as two unrelated-looking reader errors.
if not os.path.exists(GREEN_TRIPDATA_PATH):
    print("File does not exist! Check your file path.")
else:
    # Try to read the parquet file through pandas
    try:
        import pandas as pd
        print("Attempting to read parquet file...")
        df = pd.read_parquet(GREEN_TRIPDATA_PATH)
        print(f"Success! DataFrame shape: {df.shape}")
        print(df.head())
    except Exception as e:
        print(f"Error reading parquet file: {e}")

        # Approach 2: Alternative method using pyarrow directly
        try:
            print("\nTrying alternative approach with pyarrow directly...")
            import pyarrow.parquet as pq
            table = pq.read_table(GREEN_TRIPDATA_PATH)
            df = table.to_pandas()
            print(f"Success with alternative method! DataFrame shape: {df.shape}")
            print(df.head())
        except Exception as e2:
            print(f"Alternative approach failed: {e2}")
            # Approach 3: the file is known to exist at this point, so the
            # remaining suspects are its contents or format.
            print("File exists, but cannot be read. Possible file corruption or format issues.")

In [3]:
# Load the local green trip-data parquet file into a DataFrame.
GREEN_TRIPDATA_PATH = "./green_tripdata_2025-01.parquet"
df = pd.read_parquet(GREEN_TRIPDATA_PATH)

In [4]:
# Preview the first five rows of the green trip data.
df.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,cbd_congestion_fee
0,2,2025-01-01 00:03:01,2025-01-01 00:17:12,N,1.0,75,235,1.0,5.93,24.7,...,0.5,6.8,0.0,,1.0,34.0,1.0,1.0,0.0,0.0
1,2,2025-01-01 00:19:59,2025-01-01 00:25:52,N,1.0,166,75,1.0,1.32,8.6,...,0.5,0.0,0.0,,1.0,11.1,2.0,1.0,0.0,0.0
2,2,2025-01-01 00:05:29,2025-01-01 00:07:21,N,5.0,171,73,1.0,0.41,25.55,...,0.0,0.0,0.0,,1.0,26.55,2.0,2.0,0.0,0.0
3,2,2025-01-01 00:52:24,2025-01-01 01:07:52,N,1.0,74,223,1.0,4.12,21.2,...,0.5,6.13,6.94,,1.0,36.77,1.0,1.0,0.0,0.0
4,2,2025-01-01 00:25:05,2025-01-01 01:01:10,N,1.0,66,158,1.0,4.71,33.8,...,0.5,7.81,0.0,,1.0,46.86,1.0,1.0,2.75,0.0


In [5]:
# Summarize the columns: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48326 entries, 0 to 48325
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   VendorID               48326 non-null  int32         
 1   lpep_pickup_datetime   48326 non-null  datetime64[us]
 2   lpep_dropoff_datetime  48326 non-null  datetime64[us]
 3   store_and_fwd_flag     46490 non-null  object        
 4   RatecodeID             46490 non-null  float64       
 5   PULocationID           48326 non-null  int32         
 6   DOLocationID           48326 non-null  int32         
 7   passenger_count        46490 non-null  float64       
 8   trip_distance          48326 non-null  float64       
 9   fare_amount            48326 non-null  float64       
 10  extra                  48326 non-null  float64       
 11  mta_tax                48326 non-null  float64       
 12  tip_amount             48326 non-null  float64       
 13  t

In [6]:
# (rows, columns) of the green trip data.
df.shape

(48326, 21)

## Read the file directly from the web

In [7]:
# Same reader, pointed at an HTTPS URL instead of a local path.
# Note: this rebinds `df` from the green trip data to the yellow trip data.
YELLOW_TRIPDATA_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet"
df = pd.read_parquet(YELLOW_TRIPDATA_URL)

In [8]:
# Preview the first five rows of the yellow trip data.
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee
0,1,2025-01-01 00:18:38,2025-01-01 00:26:59,1.0,1.6,1.0,N,229,237,1,10.0,3.5,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0
1,1,2025-01-01 00:32:40,2025-01-01 00:35:13,1.0,0.5,1.0,N,236,237,1,5.1,3.5,0.5,2.02,0.0,1.0,12.12,2.5,0.0,0.0
2,1,2025-01-01 00:44:04,2025-01-01 00:46:01,1.0,0.6,1.0,N,141,141,1,5.1,3.5,0.5,2.0,0.0,1.0,12.1,2.5,0.0,0.0
3,2,2025-01-01 00:14:27,2025-01-01 00:20:01,3.0,0.52,1.0,N,244,244,2,7.2,1.0,0.5,0.0,0.0,1.0,9.7,0.0,0.0,0.0
4,2,2025-01-01 00:21:34,2025-01-01 00:25:06,3.0,0.66,1.0,N,244,116,2,5.8,1.0,0.5,0.0,0.0,1.0,8.3,0.0,0.0,0.0


In [9]:
# Summarize the columns and their dtypes for the yellow trip data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3475226 entries, 0 to 3475225
Data columns (total 20 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int32         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int32         
 8   DOLocationID           int32         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee           

In [10]:
# (rows, columns) of the yellow trip data.
df.shape

(3475226, 20)