# Download NYC TLC Uber and Lyft Trip Data

## Imports

In [1]:
import pandas as pd

# these two imports are for the get_DataFrame function
import os
import urllib.request

In [2]:
def get_DataFrame(file_name, local_path='data/'):
    """
    Load a Parquet file from a local directory, downloading it first if it is missing.

    The file is looked up at ``local_path/file_name``. If it is not there, it is
    fetched from TLC's cloudfront URL, saved under that path, and then loaded.
    The download is written to a temporary ``.part`` file and only moved to the
    final path once it has completed, so an interrupted download never leaves a
    truncated file that later calls would mistake for a valid local copy.

    Parameters:
    - file_name (str): Name of the file to load/download (e.g., 'yellow_tripdata_2024-07.parquet').
    - local_path (str): Path to the local directory where the file is stored or will be saved. Default is 'data/'.

    Returns:
    - pd.DataFrame: DataFrame containing the data from the Parquet file.

    Raises:
    - Exception: Any error raised while downloading is printed and re-raised unchanged.
      Errors raised by pd.read_parquet (other than a missing local file) propagate as-is.
    """
    # Ensure the local path exists
    os.makedirs(local_path, exist_ok=True)

    # Construct file paths
    local_file = os.path.join(local_path, file_name)
    tlc_url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file_name}"

    try:
        # Try reading the file from the local path
        print(f"Trying to load {file_name} from {local_file}")
        df = pd.read_parquet(local_file)
        return df

    except FileNotFoundError:
        print(f"{file_name} not found locally. Attempting to download from {tlc_url}")

        # Download next to the final location so the rename below stays on one filesystem.
        partial_file = local_file + ".part"
        try:
            urllib.request.urlretrieve(tlc_url, partial_file)
            # Promote the finished download in a single step; until this line runs,
            # nothing exists at local_file, so a retry will download again.
            os.replace(partial_file, local_file)
        except Exception as e:
            print(f"Failed to download {file_name}: {e}")
            raise
        finally:
            # After a successful os.replace the partial file is gone and this is a no-op.
            # On any failure (including a kernel interrupt) it removes the leftover bytes.
            if os.path.exists(partial_file):
                os.remove(partial_file)

        print(f"Downloaded {file_name} to {local_file}")
        # Load the downloaded file. This sits outside the download try-block so that a
        # parse error is not misreported as a download failure.
        df = pd.read_parquet(local_file)
        return df

In [3]:
# Monthly file names follow the pattern 'fhvhv_tripdata_<YYYY>-<MM>.parquet'.

# All twelve months of 2023.
(jan_2023, feb_2023, mar_2023, apr_2023, may_2023, jun_2023,
 jul_2023, aug_2023, sep_2023, oct_2023, nov_2023, dec_2023) = (
    f'fhvhv_tripdata_2023-{month:02d}.parquet' for month in range(1, 13)
)

# January through July of 2024.
(jan_2024, feb_2024, mar_2024, apr_2024, may_2024, jun_2024,
 jul_2024) = (
    f'fhvhv_tripdata_2024-{month:02d}.parquet' for month in range(1, 8)
)

In [4]:
# Try the loader on a single month and print the frame's column/dtype summary.
test = get_DataFrame(file_name=jan_2024)
test.info()

Trying to load fhvhv_tripdata_2024-01.parquet from data/fhvhv_tripdata_2024-01.parquet
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19663930 entries, 0 to 19663929
Data columns (total 24 columns):
 #   Column                Dtype         
---  ------                -----         
 0   hvfhs_license_num     object        
 1   dispatching_base_num  object        
 2   originating_base_num  object        
 3   request_datetime      datetime64[us]
 4   on_scene_datetime     datetime64[us]
 5   pickup_datetime       datetime64[us]
 6   dropoff_datetime      datetime64[us]
 7   PULocationID          int32         
 8   DOLocationID          int32         
 9   trip_miles            float64       
 10  trip_time             int64         
 11  base_passenger_fare   float64       
 12  tolls                 float64       
 13  bcf                   float64       
 14  sales_tax             float64       
 15  congestion_surcharge  float64       
 16  airport_fee           float64       


In [5]:
# Every month from January 2023 through July 2024, in chronological order.
file_names = [
    jan_2023, feb_2023, mar_2023, apr_2023, may_2023, jun_2023,
    jul_2023, aug_2023, sep_2023, oct_2023, nov_2023, dec_2023,
    jan_2024, feb_2024, mar_2024, apr_2024, may_2024, jun_2024,
    jul_2024,
]

# Load (and, when missing locally, download) each month in turn.
for file in file_names:
    df = get_DataFrame(file)
    print(f'Success: {df.shape}')
    # Release the frame before the next iteration: there is not enough
    # room in memory for all of them at once.
    del df

Trying to load fhvhv_tripdata_2023-01.parquet from data/fhvhv_tripdata_2023-01.parquet
fhvhv_tripdata_2023-01.parquet not found locally. Attempting to download from https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2023-01.parquet
Downloaded fhvhv_tripdata_2023-01.parquet to data/fhvhv_tripdata_2023-01.parquet
Success: (18479031, 24)
Trying to load fhvhv_tripdata_2023-02.parquet from data/fhvhv_tripdata_2023-02.parquet
fhvhv_tripdata_2023-02.parquet not found locally. Attempting to download from https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2023-02.parquet
Downloaded fhvhv_tripdata_2023-02.parquet to data/fhvhv_tripdata_2023-02.parquet
Success: (17960971, 24)
Trying to load fhvhv_tripdata_2023-03.parquet from data/fhvhv_tripdata_2023-03.parquet
fhvhv_tripdata_2023-03.parquet not found locally. Attempting to download from https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2023-03.parquet
Downloaded fhvhv_tripdata_2023-03.parquet to data/fhvhv_tr

In [6]:
# List the contents of the 'data' directory (IPython shell escape).
!ls 'data'

fhvhv_tripdata_2023-01.parquet fhvhv_tripdata_2023-11.parquet
fhvhv_tripdata_2023-02.parquet fhvhv_tripdata_2023-12.parquet
fhvhv_tripdata_2023-03.parquet fhvhv_tripdata_2024-01.parquet
fhvhv_tripdata_2023-04.parquet fhvhv_tripdata_2024-02.parquet
fhvhv_tripdata_2023-05.parquet fhvhv_tripdata_2024-03.parquet
fhvhv_tripdata_2023-06.parquet fhvhv_tripdata_2024-04.parquet
fhvhv_tripdata_2023-07.parquet fhvhv_tripdata_2024-05.parquet
fhvhv_tripdata_2023-08.parquet fhvhv_tripdata_2024-06.parquet
fhvhv_tripdata_2023-09.parquet fhvhv_tripdata_2024-07.parquet
fhvhv_tripdata_2023-10.parquet
