# Understanding Hired Rides in NYC

## Project Setup

In [372]:
# all import statements needed for the project

import os
import bs4
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd
import requests
import sqlalchemy as db
import re
import geopandas as gpd
from math import ceil
from urllib.parse import unquote
import glob
from sqlalchemy import text
import folium
from folium.plugins import HeatMap

In [243]:

# Page that is fetched and scanned for trip-data Parquet links.
TLC_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

# NOTE(review): the paths below are absolute and machine-specific; anyone else
# running this notebook has to edit them (or derive them from one base path).
dataset_directory = "/Users/shaoziheng/Desktop/4501/project/datasets"
TAXI_ZONES_DIR = "/Users/shaoziheng/Desktop/4501/project/datasets/taxi_zones"
TAXI_ZONES_SHAPEFILE = f"{TAXI_ZONES_DIR}/taxi_zones.shp"
WEATHER_CSV_DIR = "/Users/shaoziheng/Desktop/4501/project/datasets/weather"

CRS = 4326  # coordinate reference system

# (lat, lon)
# Each box is a pair of corners; in every pair the first corner has the smaller
# latitude and longitude, i.e. (south-west corner, north-east corner).
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

# SQLite database URL, schema file name and directory used for saved queries.
DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [244]:
# Make sure the QUERY_DIRECTORY exists.
# exist_ok=True turns "directory already there" into a no-op. This replaces the
# previous `e.errno == 17` check, which raised AttributeError for any exception
# that carries no `errno` attribute and relied on a magic number.
os.makedirs(QUERY_DIRECTORY, exist_ok=True)

## Part 1: Data Preprocessing

### Load Taxi Zones

In [245]:
def load_taxi_zones(shapefile, crs=4326):
    """
    Load a taxi zone shapefile and attach centroid coordinates to every zone.

    Args:
        shapefile (str): Path to the shapefile containing taxi zone boundaries.
        crs: Target coordinate reference system passed to `to_crs`
             (default 4326, so the added columns are degrees).

    Returns:
        A GeoDataFrame reprojected to `crs` with added `longitude` and
        `latitude` columns holding the x and y of each zone's centroid.
    """
    zones = gpd.read_file(shapefile)

    source_crs = zones.crs
    if source_crs is not None and source_crs.is_projected:
        # Centroids computed on degrees are distorted (geopandas warns about
        # it), so compute them in the planar source CRS and reproject the
        # resulting points instead.
        centroids = zones.geometry.centroid.to_crs(crs)
        zones = zones.to_crs(crs)
    else:
        # No projected source CRS available: keep the original behaviour of
        # reprojecting first and taking the centroid afterwards.
        zones = zones.to_crs(crs)
        centroids = zones.geometry.centroid

    zones['longitude'] = centroids.x
    zones['latitude'] = centroids.y
    return zones

### Web scraping links for downloading files

In [333]:
# Fetch the TLC page and parse its HTML
def get_all_urls_from_tlc_page(taxi_page, timeout=30):
    """
    Fetch the HTML content from the provided TLC page URL and parse it with BeautifulSoup.

    Despite its name, this function does not return URLs: it returns the parsed
    page, from which links can be extracted afterwards.

    Args:
        taxi_page (str): The URL of the TLC webpage containing the data links.
        timeout (float): Seconds to wait for the server before giving up
                         (default 30). Without a timeout the request could
                         block indefinitely on an unresponsive server.

    Returns:
        BeautifulSoup: A parsed BeautifulSoup object containing the HTML content
                       of the webpage for further processing.

    Raises:
        Exception: If the response status code is not 200.
    """
    response = requests.get(taxi_page, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch page content: {response.status_code}")
    return BeautifulSoup(response.text, "html.parser")

In [247]:
# Extract URLs for yellow taxi and HVFHV data from the TLC page

def filter_parquet_urls(soup):
    """
    Collect the yellow taxi and HVFHV Parquet download links found on the TLC webpage.

    Args:
        soup (BeautifulSoup): Parsed HTML of the TLC webpage.

    Returns:
        tuple: `(yellow_taxi_links, hvfhv_links)`, two lists of URL strings in
        the order the matching anchors are returned by `soup.find_all`.

    Description:
        - An anchor is kept when its `href` matches the file-name pattern for
          any month of 2020-2023 or for January-August 2024.
        - Each `href` is stripped of surrounding whitespace and then
          percent-decoded (e.g., `%20` -> space).
    """
    def _matching_hrefs(href_pattern):
        # Same lookup for both datasets; only the file-name pattern differs.
        anchors = soup.find_all('a', {'href': re.compile(href_pattern)})
        return [unquote(anchor['href'].strip()) for anchor in anchors]

    yellow_pattern = r"yellow_tripdata_202[0-3]-\d{2}\.parquet|yellow_tripdata_2024-(0[1-8])\.parquet"
    hvfhv_pattern = r"fhvhv_tripdata_202[0-3]-\d{2}\.parquet|fhvhv_tripdata_2024-(0[1-8])\.parquet"

    yellow_taxi_links = _matching_hrefs(yellow_pattern)
    hvfhv_links = _matching_hrefs(hvfhv_pattern)
    return yellow_taxi_links, hvfhv_links


In [248]:
# Download the Yellow Taxi and High-Volume For-Hire Vehicle (HVFHV) trip data Parquet files and save them to a directory

def download_parquet_file(urls, output_directory, timeout=60):
    """
    Downloads Parquet files from a list of URLs and saves them to the specified directory.

    Downloading is best-effort: a failure for one URL is printed and the loop
    moves on to the next URL; nothing is raised for a failed download.

    Args:
        urls (list): List of URLs to download.
        output_directory (str): Path to the directory where files will be saved.
                                It is created if it does not exist.
        timeout (float): Seconds to wait on the connection / on each read
                         before the request is abandoned (default 60).

    Returns:
        None
    """
    # Ensure the output directory exists
    os.makedirs(output_directory, exist_ok=True)

    for url in urls:
        file_name = os.path.basename(url)
        output_path = os.path.join(output_directory, file_name)
        # Stream into a temporary name first so an interrupted download never
        # leaves a truncated file under the final name.
        partial_path = f"{output_path}.part"

        try:
            print(f"Downloading {url}...")
            # The context manager releases the streamed connection even when
            # the transfer fails midway.
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()  # Raise an error for failed requests

                # Write the file content to disk
                with open(partial_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:  # Filter out keep-alive chunks
                            file.write(chunk)

            os.replace(partial_path, output_path)
            print(f"Saved to {output_path}")
        except Exception as e:
            # Deliberately broad: one bad URL must not abort the whole batch.
            print(f"Failed to download {url}: {e}")
            try:
                os.remove(partial_path)
            except OSError:
                # Nothing was written (or it is already gone); nothing to clean up.
                pass

In [249]:
# Fetch and parse the TLC page (the helper returns the parsed page, not URLs),
# then pull the yellow taxi and HVFHV Parquet links out of it.
soup=get_all_urls_from_tlc_page(TLC_URL)
yellow_taxi_links, hvfhv_links = filter_parquet_urls(soup)

In [26]:
# Download Yellow Taxi files into <dataset_directory>/yellow_tripdata
download_parquet_file(yellow_taxi_links, os.path.join(dataset_directory, "yellow_tripdata"))

# Download High-Volume For-Hire Vehicle (HVFHV) files into
# <dataset_directory>/fhvhv_tripdata (the original note labels these as Uber data)
download_parquet_file(hvfhv_links, os.path.join(dataset_directory, "fhvhv_tripdata"))


Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet...
Saved to /Users/shaoziheng/Desktop/4501/project/datasets/yellow_tripdata/yellow_tripdata_2024-01.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet...
Saved to /Users/shaoziheng/Desktop/4501/project/datasets/yellow_tripdata/yellow_tripdata_2024-02.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-03.parquet...
Saved to /Users/shaoziheng/Desktop/4501/project/datasets/yellow_tripdata/yellow_tripdata_2024-03.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-04.parquet...
Saved to /Users/shaoziheng/Desktop/4501/project/datasets/yellow_tripdata/yellow_tripdata_2024-04.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-05.parquet...
Saved to /Users/shaoziheng/Desktop/4501/project/datasets/yellow_tripdata/yellow_tripdata_2024-05.parquet


Saved to /Users/shaoziheng/Desktop/4501/project/datasets/yellow_tripdata/yellow_tripdata_2021-10.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-11.parquet...
Saved to /Users/shaoziheng/Desktop/4501/project/datasets/yellow_tripdata/yellow_tripdata_2021-11.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-12.parquet...
Saved to /Users/shaoziheng/Desktop/4501/project/datasets/yellow_tripdata/yellow_tripdata_2021-12.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-01.parquet...
Saved to /Users/shaoziheng/Desktop/4501/project/datasets/yellow_tripdata/yellow_tripdata_2020-01.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-02.parquet...
Saved to /Users/shaoziheng/Desktop/4501/project/datasets/yellow_tripdata/yellow_tripdata_2020-02.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-03.parquet...


Saved to /Users/shaoziheng/Desktop/4501/project/datasets/fhvhv_tripdata/fhvhv_tripdata_2022-08.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2022-09.parquet...
Saved to /Users/shaoziheng/Desktop/4501/project/datasets/fhvhv_tripdata/fhvhv_tripdata_2022-09.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2022-10.parquet...
Saved to /Users/shaoziheng/Desktop/4501/project/datasets/fhvhv_tripdata/fhvhv_tripdata_2022-10.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2022-11.parquet...
Saved to /Users/shaoziheng/Desktop/4501/project/datasets/fhvhv_tripdata/fhvhv_tripdata_2022-11.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2022-12.parquet...
Saved to /Users/shaoziheng/Desktop/4501/project/datasets/fhvhv_tripdata/fhvhv_tripdata_2022-12.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2021-01.parquet...
Saved to /Users

### Calculate Sample Size

In [250]:
# Calculate the sample size using Cochran's formula with a 95% confidence level and a 5% margin of error.

def calculate_sample_size(population, confidence_level=0.95, margin_of_error=0.05):
    """
    Calculate the sample size using Cochran's formula, with finite population correction
    for populations below one million.

    Args:
        population (int): Total population size for which the sample size needs to be calculated.
        confidence_level (float): Desired confidence level (default is 0.95 for 95% CI).
            0.9, 0.95 and 0.99 use the conventional rounded Z-scores; any other
            value strictly between 0 and 1 has its Z-score computed from the
            standard normal distribution.
        margin_of_error (float): Allowable margin of error (default is 0.05 for 5%).

    Returns:
        int: The calculated sample size, rounded up to the nearest integer and
        never larger than `population`. An empty population yields 0.

    Raises:
        ValueError: If `population` is negative, or `confidence_level` is not
            one of the tabulated values and not strictly between 0 and 1.

    Formula:
        Cochran's formula for infinite population:
            n0 = (Z^2 * p * (1 - p)) / e^2
        Where:
            - Z: Z-score corresponding to the confidence level.
            - p: Estimated proportion of the population (default: 0.5 for maximum variability).
            - e: Margin of error.

        Finite population correction for population size N (applied when N < 1e6):
            n = n0 / (1 + (n0 - 1) / N)
    """
    if population < 0:
        raise ValueError(f"population must be non-negative, got {population}")
    if population == 0:
        # Nothing to sample from; also avoids dividing by zero in the correction.
        return 0

    tabulated_z = {0.9: 1.645, 0.95: 1.96, 0.99: 2.576}
    z_score = tabulated_z.get(confidence_level)
    if z_score is None:
        if not 0 < confidence_level < 1:
            raise ValueError(
                f"confidence_level must be strictly between 0 and 1, got {confidence_level}"
            )
        # Two-sided interval: the Z-score is the (1 + CL) / 2 quantile.
        from statistics import NormalDist
        z_score = NormalDist().inv_cdf((1 + confidence_level) / 2)

    proportion = 0.5  # maximum variability
    sample_size = (z_score**2 * proportion * (1 - proportion)) / margin_of_error**2
    if population < 1e6:  # Finite population correction
        sample_size = sample_size / (1 + (sample_size - 1) / population)

    # Float rounding (or a very small margin of error on a large population)
    # can push the rounded-up value past the population itself; cap it.
    return min(ceil(sample_size), population)