In [1]:
import requests
import time
import pandas as pd
import os
from tqdm import tqdm

TAXI DATASET — download yellow and green taxi data from the NYC Open Data API (data.cityofnewyork.us)

In [2]:
# JSON endpoints on NYC Open Data (data.cityofnewyork.us), one per taxi dataset.
Yellow_API_Endpoint = "https://data.cityofnewyork.us/resource/m6nq-qud6.json"
Green_API_Endpoint = "https://data.cityofnewyork.us/resource/djnb-wcxt.json"

# Page size: number of rows requested per API call (sent as `$limit`).
LIMIT = 1_000

# Upper bound on the number of rows to request from each dataset.
YELLOW_ROWS = 30_000_000
GREEN_ROWS = 1_000_000

In [3]:
def load_last_offset(offset_file):
    """Return the offset stored in ``offset_file``, or 0 if the file is absent.

    The file is expected to contain a single integer written as text;
    leading and trailing whitespace is ignored.

    Args:
        offset_file: Path of the checkpoint file to read.

    Returns:
        The stored offset as an ``int``, or 0 when no checkpoint file exists.

    Raises:
        ValueError: If the file content cannot be parsed as an integer.
    """
    if not os.path.exists(offset_file):
        # No checkpoint yet: start from the beginning.
        return 0

    with open(offset_file, 'r') as handle:
        stored = handle.read()
    return int(stored.strip())

def save_last_offset(offset_file, offset):
    """Persist ``offset`` to ``offset_file`` as text, replacing any previous value.

    The value is written to a sibling temporary file first and then moved
    over ``offset_file`` with ``os.replace``. Opening the checkpoint directly
    in ``'w'`` mode truncates it before the new value is written, so an
    interruption at that moment would leave an empty file that cannot be
    parsed as an integer on the next run.

    Args:
        offset_file: Path of the checkpoint file to write.
        offset: Value to store; it is converted with ``str``.
    """
    tmp_file = f"{offset_file}.tmp"
    with open(tmp_file, 'w') as f:
        f.write(str(offset))
        # Push the bytes to disk before the rename makes them visible.
        f.flush()
        os.fsync(f.fileno())
    os.replace(tmp_file, offset_file)

In [4]:
def append_to_csv(df, output_path=None):
    """Append ``df`` to a CSV file, keeping columns aligned with its header.

    The header row is written only when the file is new (missing or empty).
    On later calls the frame is reindexed to the header already on disk, so a
    page whose columns arrive in a different order, or with some columns
    absent, is still written under the right headings. Columns that are not
    in the existing header cannot be represented and are dropped with a
    warning.

    Args:
        df: DataFrame holding one page of rows.
        output_path: Destination CSV. Defaults to
            ``../raw_data/yellow_data.csv``.
    """
    if output_path is None:
        output_path = os.path.join("../raw_data", "yellow_data.csv")

    # An empty page has nothing to append; writing it to a new file would
    # only produce a bogus header line.
    if df.empty:
        return

    parent = os.path.dirname(output_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    is_new_file = not os.path.exists(output_path) or os.path.getsize(output_path) == 0

    if not is_new_file:
        # Read only the header row to learn the on-disk column order.
        existing_columns = pd.read_csv(output_path, nrows=0).columns.tolist()
        extra_columns = [col for col in df.columns if col not in existing_columns]
        if extra_columns:
            print(f"Warning: dropping columns not present in {output_path}: {extra_columns}")
        # Missing columns become empty cells; order follows the header.
        df = df.reindex(columns=existing_columns)

    df.to_csv(output_path, mode='a', header=is_new_file, index=False)
    print(f"Appended data to {output_path}")

In [5]:
def fetch_taxi_data(api, limit, rows, max_retries=5, retry_delay=10):
    """Download up to ``rows`` records from ``api`` in pages, with resume support.

    Each page is appended to the CSV via ``append_to_csv`` and the next
    offset is checkpointed in ``../raw_data/last_offset.txt``, so an
    interrupted run continues where it stopped.

    Args:
        api: URL of the JSON endpoint to query.
        limit: Page size (rows per request).
        rows: Upper bound on the total number of rows to fetch.
        max_retries: Attempts per page before giving up on the run.
        retry_delay: Seconds to wait between attempts.
    """
    folder = "../raw_data"
    os.makedirs(folder, exist_ok=True)
    offset_file = os.path.join(folder, "last_offset.txt")
    offset = load_last_offset(offset_file)

    completed = True

    # Progress bar starts at the resumed offset.
    with tqdm(total=rows, desc="Fetching Data", initial=offset, position=0) as pbar:
        for i in range(offset, rows, limit):
            params = {
                # Cap the final page so no more than `rows` records are fetched.
                "$limit": min(limit, rows - i),
                "$offset": i,
                # Paging is only stable with an explicit order; without it the
                # same row may appear on two pages or on none.
                "$order": ":id",
            }

            data = None
            for attempt in range(1, max_retries + 1):
                try:
                    response = requests.get(api, params=params, timeout=30)
                    if response.status_code == 200:
                        data = response.json()
                        break
                    print(f"Error: Status Code {response.status_code}. Retrying...")
                except requests.exceptions.ConnectionError as e:
                    print(f"Connection error: {e}. Retrying ({attempt}/{max_retries})...")
                except requests.exceptions.Timeout as e:
                    print(f"Timeout error: {e}. Retrying ({attempt}/{max_retries})...")
                except (requests.exceptions.RequestException, ValueError) as e:
                    # Covers other transport failures and a body that is not valid JSON.
                    print(f"Request error: {e}. Retrying ({attempt}/{max_retries})...")

                # No point sleeping after the last attempt.
                if attempt < max_retries:
                    time.sleep(retry_delay)

            if data is None:
                print(f"Failed to fetch data after {max_retries} retries. Offset: {i}")
                completed = False
                break

            if not data:
                # Empty page: the dataset has fewer rows than requested.
                break

            append_to_csv(pd.DataFrame(data))

            # Advance the bar and checkpoint only after the page is on disk.
            pbar.update(len(data))
            save_last_offset(offset_file, i + len(data))

    if completed:
        print("Data fetching complete!")
    else:
        print("Data fetching stopped early; re-run to resume from the saved offset.")

def fetch_simple(api, limit, rows, output_file, max_retries=5, retry_delay=10):
    """Download up to ``rows`` records from ``api`` in pages into ``output_file``.

    Unlike ``fetch_taxi_data`` this keeps no checkpoint: every call starts at
    offset 0 and appends to ``output_file``, so running it again on an
    existing file duplicates rows.

    Args:
        api: URL of the JSON endpoint to query.
        limit: Page size (rows per request).
        rows: Upper bound on the total number of rows to fetch.
        output_file: CSV path to append to; the header is written only when
            the file does not exist yet.
        max_retries: Attempts per page before that page is skipped.
        retry_delay: Seconds to wait between attempts.
    """
    folder = "../raw_data"
    os.makedirs(folder, exist_ok=True)
    # NOTE(review): `output_file` is used exactly as given and is not joined
    # to `folder`; confirm whether the CSV is meant to live in ../raw_data.

    with tqdm(total=rows, desc="Fetching Data") as pbar:
        for i in range(0, rows, limit):
            params = {
                # Cap the final page so no more than `rows` records are fetched.
                "$limit": min(limit, rows - i),
                "$offset": i,
                # Paging is only stable with an explicit order.
                "$order": ":id",
            }

            data = None
            for attempt in range(1, max_retries + 1):
                try:
                    response = requests.get(api, params=params, timeout=30)
                    if response.status_code == 200:
                        data = response.json()
                        break
                    print(f"Error: Status Code {response.status_code}. Retrying ({attempt}/{max_retries})...")
                except (requests.exceptions.RequestException, ValueError) as e:
                    # A single network blip should not abort the whole download.
                    print(f"Request error: {e}. Retrying ({attempt}/{max_retries})...")

                if attempt < max_retries:
                    time.sleep(retry_delay)

            if data is None:
                print(f"Error: no successful response after {max_retries} attempts. Skipping offset {i}...")
                continue

            if not data:
                # Empty page: the dataset has fewer rows than requested.
                break

            temp_df = pd.DataFrame(data)

            # Write the header only when creating the file.
            temp_df.to_csv(
                output_file,
                mode='a',
                index=False,
                header=not os.path.exists(output_file),
            )

            pbar.update(len(data))

    print("Data fetching complete!")

In [14]:
# Yellow taxi data: paged download with retries and an offset checkpoint.
fetch_taxi_data(api=Yellow_API_Endpoint, limit=LIMIT, rows=YELLOW_ROWS)

# Green taxi data: plain paged download into green_data.csv.
fetch_simple(
    api=Green_API_Endpoint,
    limit=LIMIT,
    rows=GREEN_ROWS,
    output_file="green_data.csv",
)

Fetching Data: 100%|██████████| 1000000/1000000 [31:49<00:00, 523.62it/s]

Data fetching complete!



