## Project Setup

In [2]:
# all import statements needed for the project, for example:

import os
import re
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import requests
import sqlalchemy as db
from bs4 import BeautifulSoup
from scipy.stats import norm

In [3]:
# Project-wide constants: data locations, coordinate settings and database
# paths shared by the cells below.

# Page that is scraped for links to the monthly trip-record files.
TLC_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

TAXI_ZONES_DIR = "taxi_zones"
TAXI_ZONES_SHAPEFILE = os.path.join(TAXI_ZONES_DIR, "taxi_zones.shp")
WEATHER_CSV_DIR = "weather_data"

CRS = 4326  # EPSG code the zone centroids are converted to before reading lat/lon

# Bounding boxes written as ((lat_min, lon_min), (lat_max, lon_max)).
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [4]:
# Create the folder that holds the taxi-zone shapefile. The shared constant is
# used instead of a repeated literal so this path cannot drift away from
# TAXI_ZONES_SHAPEFILE.
os.makedirs(TAXI_ZONES_DIR, exist_ok=True)
print(f"Created '{TAXI_ZONES_DIR}' directory.")

Created 'taxi_zones' directory.


In [5]:
# Make sure the QUERY_DIRECTORY exists.
# exist_ok=True tolerates an already-present directory without having to
# inspect `errno` on an arbitrary exception (a generic Exception does not
# necessarily carry that attribute, and the numeric code is platform-specific).
os.makedirs(QUERY_DIRECTORY, exist_ok=True)

In [6]:
# Switch the kernel's working directory so that the relative paths used in
# this notebook resolve under the project folder.
# NOTE(review): this is an absolute, machine-specific path and will fail on
# any other computer. It also runs after the directory-creation cells above,
# so those folders were created relative to the previous working directory —
# confirm that ordering is intended.
new_directory = r"E:\4501 final"
os.chdir(new_directory)
print(f"New working directory: {os.getcwd()}")

New working directory: E:\4501 final


In [7]:
# Smoke-test that the shapefile can be read; a failure is reported rather
# than raised so the notebook keeps running.
try:
    taxi_zones = gpd.read_file(TAXI_ZONES_SHAPEFILE)
except Exception as load_error:
    print(f"Error loading taxi zones: {load_error}")
else:
    print("Taxi zones loaded successfully.")

Taxi zones loaded successfully.


## Part 1: Data Preprocessing

### Load Taxi Zones

In [10]:
def load_taxi_zones(shapefile):
    """Read the taxi-zone shapefile and return it as loaded by geopandas.

    Args:
        shapefile: Path handed straight to ``gpd.read_file``.

    Returns:
        Whatever ``gpd.read_file`` produces for that path.
    """
    return gpd.read_file(shapefile)

In [11]:
# Verify that the shapefile folder and file exist before loading the zones,
# printing a hint about what is missing otherwise.
if os.path.exists(TAXI_ZONES_DIR):
    TAXI_ZONES_SHAPEFILE = os.path.join(TAXI_ZONES_DIR, "taxi_zones.shp")
    if os.path.exists(TAXI_ZONES_SHAPEFILE):
        taxi_zones = load_taxi_zones(TAXI_ZONES_SHAPEFILE)
        print("Taxi zones loaded successfully.")
    else:
        print(f"Shapefile '{TAXI_ZONES_SHAPEFILE}' does not exist. Please ensure it's in the directory.")
else:
    print(f"Directory '{TAXI_ZONES_DIR}' does not exist. Please create it and add the shapefile.")

Taxi zones loaded successfully.


In [12]:
# Loaded up front because the function below captures this frame as the
# default value of `loaded_taxi_zones` at definition time.
taxi_zones = load_taxi_zones(TAXI_ZONES_SHAPEFILE)


def lookup_coords_for_taxi_zone_id(zone_loc_id, loaded_taxi_zones=taxi_zones):
    """Return the (latitude, longitude) of a taxi zone's centroid.

    The centroid of the first row whose ``LocationID`` equals ``zone_loc_id``
    is computed in the frame's own CRS and then converted to EPSG ``CRS``.

    Args:
        zone_loc_id: Value compared against the ``LocationID`` column.
        loaded_taxi_zones: Zone frame to search; defaults to the frame that
            was loaded when this function was defined.

    Returns:
        Tuple ``(latitude, longitude)`` taken from the converted point's
        ``y`` and ``x``.

    Raises:
        IndexError: If no row matches ``zone_loc_id`` (positional access on an
            empty selection).
    """
    matching_rows = loaded_taxi_zones[loaded_taxi_zones["LocationID"] == zone_loc_id]
    first_centroid = matching_rows.geometry.centroid.iloc[0]

    # Wrap the single point so it can be reprojected with the frame's CRS.
    as_series = gpd.GeoSeries([first_centroid], crs=loaded_taxi_zones.crs)
    point = as_series.to_crs(epsg=CRS).iloc[0]

    return (point.y, point.x)

In [13]:
def make_loc_id_coords_dict(loaded_taxi_zones):
    """Map every ``LocationID`` in the frame to its centroid coordinates.

    Args:
        loaded_taxi_zones: Zone frame; its ``LocationID`` column is iterated.

    Returns:
        Dict of ``LocationID`` -> value returned by
        ``lookup_coords_for_taxi_zone_id`` for that ID.
    """
    return {
        zone_id: lookup_coords_for_taxi_zone_id(zone_id, loaded_taxi_zones)
        for zone_id in loaded_taxi_zones["LocationID"]
    }

In [14]:
# Reload the zones and precompute the LocationID -> (latitude, longitude)
# lookup table used later when cleaning trip data.
taxi_zones = load_taxi_zones(TAXI_ZONES_SHAPEFILE)
ID_COORDS_DICT = make_loc_id_coords_dict(taxi_zones)

### Calculate Sample Size

In [16]:
def calculate_sample_size(population, confidence_level=0.95, margin_of_error=0.05, proportion=0.5):
    """Compute a sample size with Cochran's formula and finite-population correction.

    Args:
        population: Number of rows available to sample from (non-negative).
        confidence_level: Two-sided confidence level, strictly between 0 and 1.
        margin_of_error: Acceptable margin of error, greater than 0.
        proportion: Assumed population proportion (0.5 is the most
            conservative choice because it maximises p * (1 - p)).

    Returns:
        The rounded, population-adjusted sample size as an ``int``; ``0`` when
        ``population`` is 0.

    Raises:
        ValueError: If ``population`` is negative, ``confidence_level`` is not
            in (0, 1), or ``margin_of_error`` is not positive.
    """
    if population < 0:
        raise ValueError("population must be non-negative")
    if not 0 < confidence_level < 1:
        raise ValueError("confidence_level must be between 0 and 1 (exclusive)")
    if margin_of_error <= 0:
        raise ValueError("margin_of_error must be positive")

    # Nothing to sample from; also avoids dividing by zero in the correction.
    if population == 0:
        return 0

    # Two-sided critical value of the standard normal distribution.
    z_score = norm.ppf(1 - (1 - confidence_level) / 2)

    # Cochran's formula for an infinite population.
    n = (z_score**2 * proportion * (1 - proportion)) / (margin_of_error**2)

    # Adjust for finite population.
    n_adj = n / (1 + (n - 1) / population)

    return int(round(n_adj))

### Common Functions

In [18]:
def get_all_urls_from_taxi_page(taxi_page):
    """Scrape the trip-record page for Yellow Taxi and HVFHV file links.

    Args:
        taxi_page: URL of the page to download and parse.

    Returns:
        Tuple ``(yellow_urls, fhvhv_urls)``: two lists of whitespace-stripped
        ``href`` values taken from anchors titled "Yellow Taxi Trip Records"
        and "High Volume For-Hire Vehicle Trip Records" respectively.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # A timeout keeps the notebook from hanging forever on a stalled server,
    # and raise_for_status stops us from parsing an error page as if it were
    # the real listing.
    response = requests.get(taxi_page, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # href=True skips anchors that carry the title but no link target.
    yellow_tags = soup.find_all("a", attrs={"title": "Yellow Taxi Trip Records"}, href=True)
    fhvhv_tags = soup.find_all("a", attrs={"title": "High Volume For-Hire Vehicle Trip Records"}, href=True)

    yellow_urls = [a["href"].strip() for a in yellow_tags]
    fhvhv_urls = [a["href"].strip() for a in fhvhv_tags]

    return yellow_urls, fhvhv_urls

In [19]:
def find_parquet_urls(urls):
    """Return the URLs that match the ``.parquet`` suffix pattern.

    Args:
        urls: Iterable of URL strings.

    Returns:
        List of the matching URLs, in their original order.
    """
    parquet_suffix = re.compile(r"\.parquet$")
    matching = []
    for candidate in urls:
        if parquet_suffix.search(candidate):
            matching.append(candidate)
    return matching

In [20]:
def download_parquet(urls, output_dir):
    """Download each URL into ``output_dir``, skipping files already present.

    Each file is streamed to a temporary ``.part`` file and only renamed to
    its final name after the download finished successfully, so an interrupted
    or failed download never leaves a truncated ``.parquet`` behind that later
    runs would mistake for a complete file.

    Args:
        urls: Iterable of file URLs; the URL's base name is used as file name.
        output_dir: Target directory (created if missing).

    Raises:
        requests.HTTPError: If a URL responds with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    os.makedirs(output_dir, exist_ok=True)
    for url in urls:
        filename = os.path.basename(url)
        output_path = os.path.join(output_dir, filename)
        if os.path.exists(output_path):
            continue

        partial_path = output_path + ".part"
        try:
            # The context manager releases the connection even on errors.
            with requests.get(url, stream=True, timeout=60) as response:
                # Do not save an HTML error page under a .parquet name.
                response.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)
            os.replace(partial_path, output_path)
        finally:
            # Only still present when the download did not complete.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print(f"Downloaded {filename} to {output_dir}")

In [21]:
def get_and_clean_month(urls):
    """Keep only URLs that mention a month from 2020-01 through 2024-08.

    NOTE(review): a later cell in this notebook defines another function with
    this same name that takes a dataframe; once that cell has run, this
    URL-filtering version is shadowed.

    Args:
        urls: Iterable of URL strings containing a ``YYYY-MM`` fragment.

    Returns:
        List of the URLs whose text matches the month pattern, in order.
    """
    month_in_range = re.compile(r"(202[0-3]-(0[1-9]|1[0-2])|2024-(0[1-8]))")
    kept = []
    for candidate in urls:
        if month_in_range.search(candidate) is not None:
            kept.append(candidate)
    return kept

In [22]:
def read_parquet_to_df(directory):
    """Load every ``.parquet`` file in a directory into one dataframe.

    Args:
        directory: Folder whose entries ending in ``.parquet`` are read.

    Returns:
        The row-wise concatenation of all files with a fresh index, or
        ``None`` when the folder contains no ``.parquet`` files.
    """
    parquet_paths = [
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.endswith(".parquet")
    ]
    frames = [pd.read_parquet(path) for path in parquet_paths]
    if not frames:
        return None
    return pd.concat(frames, ignore_index=True)

In [23]:
def clean_parquet_column(file_path, columns_to_keep):
    """Read one parquet file and return only the requested columns.

    Args:
        file_path: Path of the parquet file to read.
        columns_to_keep: Column selector applied to the loaded frame.

    Returns:
        The selection ``loaded_frame[columns_to_keep]``.
    """
    return pd.read_parquet(file_path)[columns_to_keep]

In [24]:
# Scrape the TLC page, keep only parquet links, then keep only the months
# accepted by the URL filter.
# NOTE: `get_and_clean_month` here is the URL-filtering version defined above;
# a later cell redefines that name to take a dataframe, so this cell must run
# before that redefinition.
yellow_urls, fhvhv_urls = get_all_urls_from_taxi_page(TLC_URL)
taxi_parquet = find_parquet_urls(yellow_urls)
uber_parquet = find_parquet_urls(fhvhv_urls)
taxi_urls = get_and_clean_month(taxi_parquet)
uber_urls = get_and_clean_month(uber_parquet)

In [25]:
def sample_monthly(directory):
    """Draw the same-sized random sample from every monthly parquet file.

    The sample size is derived from the largest month's row count via
    ``calculate_sample_size``. Months with fewer rows than that size
    contribute all of their rows instead of raising.

    Args:
        directory: Folder whose entries ending in ``.parquet`` are sampled.

    Returns:
        The concatenated samples with a fresh index, or ``None`` when the
        folder contains no ``.parquet`` files.
    """
    # Sorted so the row order of the result does not depend on the order in
    # which the operating system lists the directory.
    files = sorted(
        os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.parquet')
    )
    if not files:
        return None

    # First pass: find the largest month to size the sample.
    max_rows = 0
    for file in files:
        max_rows = max(max_rows, len(pd.read_parquet(file)))

    # An all-empty dataset has nothing to sample; skip the formula entirely.
    sample_size = calculate_sample_size(max_rows) if max_rows else 0
    print(f"Sample size for all months: {sample_size}")

    # Second pass: sample each month with a fixed seed for reproducibility.
    sampled_dataframes = []
    for file in files:
        df = pd.read_parquet(file)
        # Without replacement we cannot draw more rows than the month has.
        rows_to_draw = min(sample_size, len(df))
        sampled_dataframes.append(df.sample(n=rows_to_draw, random_state=30, replace=False))
    print("Finished sampling")

    return pd.concat(sampled_dataframes, ignore_index=True)

### Process Taxi Data

In [27]:
# Folder that receives the monthly Yellow Taxi parquet files; files already
# present there are not downloaded again.
taxi_data_dir = "taxi_data"
download_parquet(taxi_urls, taxi_data_dir)

In [28]:
# Draw the per-month random sample from the downloaded taxi files.
sampled_taxi_df = sample_monthly(taxi_data_dir)

Sample size for all months: 384
Finished sampling


  combined_sampled_df = pd.concat(sampled_dataframes, ignore_index=True)


In [29]:
# Preview the first ten sampled rows.
sampled_taxi_df.head(10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,Airport_fee
0,2,2020-01-21 15:41:03,2020-01-21 15:54:32,2.0,0.71,1.0,N,233,164,1,9.0,0.0,0.5,1.84,0.0,0.3,14.14,2.5,,
1,2,2020-01-29 21:06:16,2020-01-29 21:11:34,1.0,0.76,1.0,N,141,140,2,5.5,0.5,0.5,0.0,0.0,0.3,9.3,2.5,,
2,2,2020-01-26 20:19:48,2020-01-26 20:25:39,3.0,0.77,1.0,N,161,163,2,5.5,0.5,0.5,0.0,0.0,0.3,9.3,2.5,,
3,2,2020-01-16 13:17:34,2020-01-16 13:24:01,1.0,0.99,1.0,N,143,239,1,6.5,0.0,0.5,1.96,0.0,0.3,11.76,2.5,,
4,2,2020-01-30 20:20:32,2020-01-30 20:28:30,1.0,0.84,1.0,N,233,162,1,6.5,0.5,0.5,1.2,0.0,0.3,11.5,2.5,,
5,2,2020-01-29 19:21:58,2020-01-29 19:35:52,1.0,1.54,1.0,N,90,161,1,10.0,1.0,0.5,3.58,0.0,0.3,17.88,2.5,,
6,1,2020-01-12 17:42:09,2020-01-12 17:47:26,2.0,1.1,1.0,N,140,236,2,6.0,2.5,0.5,0.0,0.0,0.3,9.3,2.5,,
7,1,2020-01-29 22:03:48,2020-01-29 22:08:19,1.0,0.9,1.0,N,142,239,2,5.5,3.0,0.5,0.0,0.0,0.3,9.3,2.5,,
8,2,2020-01-21 21:17:23,2020-01-21 21:36:29,4.0,3.78,1.0,N,161,148,1,15.0,0.5,0.5,3.76,0.0,0.3,22.56,2.5,,
9,1,2020-01-15 01:27:14,2020-01-15 01:32:45,1.0,0.9,1.0,N,161,233,1,6.0,3.0,0.5,2.9,0.0,0.3,12.7,2.5,,


In [30]:
def get_and_clean_month(dataframe, id_coords=None, box_coords=None):
    """Clean one month of Yellow Taxi trips and attach zone coordinates.

    Steps: look up pickup/dropoff coordinates from the location IDs, drop
    trips with unknown zones or zero distance, reduce to the analysis columns,
    normalise column names and dtypes, and drop trips that start or end
    outside the bounding box. The input frame is left unmodified.

    NOTE(review): an earlier cell defines a URL-filtering function with this
    same name; defining this one shadows it.

    Args:
        dataframe: Raw trip frame with ``PULocationID``, ``DOLocationID``,
            the ``tpep_*`` timestamps, ``trip_distance`` and the fare columns.
        id_coords: Optional mapping of location ID -> ``(latitude,
            longitude)``. Defaults to the notebook-level ``ID_COORDS_DICT``.
        box_coords: Optional ``((lat_min, lon_min), (lat_max, lon_max))``.
            Defaults to the notebook-level ``NEW_YORK_BOX_COORDS``.

    Returns:
        The cleaned frame, or ``None`` if processing failed (the error is
        printed), matching what callers already check for.
    """
    try:
        print("Cleaning the sample dataframe...")

        if id_coords is None:
            id_coords = ID_COORDS_DICT
        if box_coords is None:
            box_coords = NEW_YORK_BOX_COORDS

        # Look up the latitude and longitude for each location ID. Mapping
        # through two flat dicts is vectorised, unlike expanding tuples row by
        # row, and unknown IDs simply become NaN.
        lat_by_id = {loc_id: coords[0] for loc_id, coords in id_coords.items()}
        lon_by_id = {loc_id: coords[1] for loc_id, coords in id_coords.items()}
        coord_columns = ['latitude_pickup', 'longitude_pickup', 'latitude_dropoff', 'longitude_dropoff']

        # assign() returns a new frame, so the caller's data is not mutated.
        cleaned = dataframe.assign(
            latitude_pickup=dataframe["PULocationID"].map(lat_by_id),
            longitude_pickup=dataframe["PULocationID"].map(lon_by_id),
            latitude_dropoff=dataframe["DOLocationID"].map(lat_by_id),
            longitude_dropoff=dataframe["DOLocationID"].map(lon_by_id),
        )

        # Remove trips whose location IDs are not valid or whose distance is 0.
        cleaned = cleaned.dropna(subset=coord_columns)
        cleaned = cleaned[cleaned["trip_distance"] != 0]

        # The fee column appears with different capitalisation in the data
        # ("airport_fee" / "Airport_fee"); merge every variant into a single
        # lowercase column, or fill with NaN when the column is absent.
        fee_variants = [col for col in cleaned.columns if str(col).lower() == "airport_fee"]
        if fee_variants:
            airport_fee = cleaned[fee_variants[0]]
            for variant in fee_variants[1:]:
                airport_fee = airport_fee.combine_first(cleaned[variant])
        else:
            airport_fee = float("nan")
        cleaned = cleaned.drop(columns=fee_variants).assign(airport_fee=airport_fee)

        # Remove unnecessary columns (adjust this list as needed).
        columns_to_keep = [
            'tpep_pickup_datetime', 'tpep_dropoff_datetime',
            'trip_distance',
            'latitude_pickup', 'longitude_pickup', 'latitude_dropoff', 'longitude_dropoff',
            'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
            'improvement_surcharge', 'congestion_surcharge', 'airport_fee',
            'total_amount'
        ]
        # copy() detaches the selection so the column edits below act on an
        # independent frame instead of a view of the filtered data.
        cleaned = cleaned[columns_to_keep].copy()

        # Normalize column names.
        cleaned.columns = [col.lower().replace(' ', '_') for col in cleaned.columns]

        # Use appropriate column types for the respective data.
        cleaned = cleaned.assign(
            tpep_pickup_datetime=pd.to_datetime(cleaned['tpep_pickup_datetime']),
            tpep_dropoff_datetime=pd.to_datetime(cleaned['tpep_dropoff_datetime']),
            trip_distance=cleaned['trip_distance'].astype(float),
        )

        # Remove trips that start and/or end outside of the bounding box.
        (lat_min, lon_min), (lat_max, lon_max) = box_coords
        inside_box = (
            cleaned['latitude_pickup'].between(lat_min, lat_max)
            & cleaned['longitude_pickup'].between(lon_min, lon_max)
            & cleaned['latitude_dropoff'].between(lat_min, lat_max)
            & cleaned['longitude_dropoff'].between(lon_min, lon_max)
        )
        return cleaned[inside_box]

    except Exception as e:
        print(f"Error processing the dataframe: {e}")
        return None

In [31]:
def get_and_clean_taxi_data(directory):
    """Clean every monthly parquet file in a directory and combine the results.

    Each file is read from ``directory`` and passed to
    ``get_and_clean_month``; months for which that returns ``None`` are
    skipped.

    NOTE(review): every month is loaded in full before cleaning, which may be
    memory-heavy — confirm whether the sampled data should be used instead.

    Args:
        directory: Folder (string or path-like) containing ``.parquet`` files.

    Returns:
        The concatenated cleaned frame with a fresh index, or ``None`` when no
        month could be cleaned.

    Raises:
        ValueError: If ``directory`` is not a string or path-like object.
    """
    if not isinstance(directory, (str, os.PathLike)):
        raise ValueError("Expected a directory path as a string.")

    # Always read the files from the requested directory (a same-named file
    # lying around in the working directory must not take precedence), in
    # sorted order so the result is reproducible.
    parquet_paths = sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith('.parquet')
    )

    all_dataframes = []
    for parquet_path in parquet_paths:
        cleaned_dataframe = get_and_clean_month(pd.read_parquet(parquet_path))
        if cleaned_dataframe is not None:
            all_dataframes.append(cleaned_dataframe)

    if not all_dataframes:
        return None
    return pd.concat(all_dataframes, ignore_index=True)

In [32]:
def get_taxi_data():
    """Return the cleaned Yellow Taxi data built from the downloaded files.

    The files in ``taxi_data_dir`` are the only input, so no web request is
    made here; the previous page scrape produced values that were never used
    and made this step fail without network access.

    Returns:
        Whatever ``get_and_clean_taxi_data(taxi_data_dir)`` returns: the
        combined cleaned frame, or ``None`` when nothing could be cleaned.
    """
    return get_and_clean_taxi_data(taxi_data_dir)

In [None]:
# Build the cleaned taxi dataset; the result is None if no month was cleaned.
taxi_data = get_taxi_data()

Cleaning the sample dataframe...


In [None]:
# Preview the first rows of the cleaned taxi data.
taxi_data.head()

In [None]:
# Column dtypes and non-null counts of the cleaned taxi data.
taxi_data.info()

In [None]:
# Summary statistics of the cleaned taxi data.
taxi_data.describe()