### Part 1: Data Preprocessing

In [5]:
import os
import re
import zipfile
import requests
import pyarrow
import pandas as pd
import geopandas as gpd
from datetime import datetime
from bs4 import BeautifulSoup


def fetch_yellow_taxi_links(base_url, timeout=30):
    """Return the ``<a>`` tags on *base_url* that link to yellow-taxi trip files.

    A link is kept when its ``href`` contains ``yellow_tripdata`` and also
    contains either a ``YYYY-MM.parquet`` suffix or ``.zip``.

    Args:
        base_url: URL of the HTML page listing the trip-record files.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled server.

    Returns:
        The BeautifulSoup result set of matching anchor tags; each element
        supports ``tag['href']``.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    resp = requests.get(base_url, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Two look-aheads: the href must mention the yellow-taxi dataset AND look
    # like a monthly parquet file or a zip archive.
    href_pattern = re.compile(r'^(?=.*yellow_tripdata)(?=.*(\d{4}-\d{2}\.parquet|\.zip)).*$')
    return soup.find_all('a', href=href_pattern)


def fetch_taxi_data(links, start_date, end_date, dest_dir=None, timeout=60):
    """Download the monthly trip files referenced by *links* into a directory.

    Each link's file name is expected to end in ``_YYYY-MM.<ext>``. Files whose
    month falls inside ``[start_date, end_date]`` (inclusive) are downloaded
    unless they are already present. ``.zip`` archives are extracted and then
    deleted; an archive is skipped entirely when its extracted CSV (same name
    with ``.csv``) already exists, so reruns do not download it again.

    Args:
        links: Iterable of objects supporting ``link['href']`` (for example
            the tags returned by ``fetch_yellow_taxi_links``).
        start_date: First ``datetime`` (inclusive) of the range to fetch.
        end_date: Last ``datetime`` (inclusive) of the range to fetch.
        dest_dir: Directory to store files in. When omitted, the module-level
            ``output_dir`` is used, which keeps existing callers working.
        timeout: Seconds to wait on the server for each download request.

    Raises:
        requests.HTTPError: If a download responds with an error status.
    """
    if dest_dir is None:
        # Backward compatibility: the original read this global directly.
        dest_dir = output_dir

    os.makedirs(dest_dir, exist_ok=True)

    for link in links:
        # Tolerate stray whitespace around the attribute value.
        url = link['href'].strip()
        file_name = url.split('/')[-1]
        date_str = file_name.split('_')[-1].split('.')[0]

        try:
            date_obj = datetime.strptime(date_str, '%Y-%m')
        except ValueError:
            # The link filter also admits any ".zip" href; names without a
            # YYYY-MM stamp cannot be placed in the range, so skip them
            # instead of aborting the whole run.
            continue

        if not start_date <= date_obj <= end_date:
            continue

        file_path = os.path.join(dest_dir, file_name)
        is_zip = file_name.endswith('.zip')

        if is_zip:
            csv_file_name = file_name.replace('.zip', '.csv')
            csv_file_path = os.path.join(dest_dir, csv_file_name)

            if os.path.exists(csv_file_path):
                # Already extracted on an earlier run: do not download the
                # archive again, just clear any leftover copy of it.
                if os.path.exists(file_path):
                    os.remove(file_path)
                continue

        if not os.path.exists(file_path):
            print(f'Downloading {file_name}...')
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()

            with open(file_path, 'wb') as file:
                file.write(response.content)

        if is_zip:
            print(f'Extracting {file_name}...')
            with zipfile.ZipFile(file_path, 'r') as zip_file:
                zip_file.extractall(dest_dir)

            # Only the extracted contents are kept on disk.
            os.remove(file_path)

    print('Data fetching complete.')


def clean_and_sample_data(data: pd.DataFrame, columns_to_keep: list, columns_to_rename: list,
                          down_threshold: float, up_threshold: float,
                          left_threshold: float, right_threshold: float, sample_size: int) -> pd.DataFrame:
    """Select, rename, bounding-box filter and randomly sample trip rows.

    Args:
        data: Raw trip records.
        columns_to_keep: Source column names to retain.
        columns_to_rename: New names, paired positionally with
            *columns_to_keep*. After renaming, the frame must contain
            ``Start_Lat``, ``End_Lat``, ``Start_Lon`` and ``End_Lon``.
        down_threshold: Minimum latitude (inclusive).
        up_threshold: Maximum latitude (inclusive).
        left_threshold: Minimum longitude (inclusive).
        right_threshold: Maximum longitude (inclusive).
        sample_size: Maximum number of rows to return.

    Returns:
        A reproducible random sample (``random_state=42``) of the rows whose
        start and end coordinates both lie inside the bounding box. If fewer
        than *sample_size* rows survive the filter, all of them are returned
        instead of raising.
    """
    name_map = dict(zip(columns_to_keep, columns_to_rename))
    data = data[columns_to_keep].rename(columns=name_map)

    # Series.between is inclusive on both ends, matching the >= / <= bounds.
    lat_ok = (data['Start_Lat'].between(down_threshold, up_threshold)
              & data['End_Lat'].between(down_threshold, up_threshold))
    lon_ok = (data['Start_Lon'].between(left_threshold, right_threshold)
              & data['End_Lon'].between(left_threshold, right_threshold))
    data = data[lat_ok & lon_ok]

    # DataFrame.sample raises when asked for more rows than exist (without
    # replacement), so cap the request at what is available.
    n_rows = min(sample_size, len(data))
    return data.sample(n_rows, random_state=42)

def _yellow_taxi_source_columns(year):
    """Return the raw column names to keep for *year*, in the unified output order."""
    if year == 2009:
        return ['Trip_Pickup_DateTime', 'Trip_Dropoff_DateTime', "Trip_Distance",
                "Start_Lon", "Start_Lat", "End_Lon", "End_Lat", "Fare_Amt", "Tip_Amt"]

    # 2010 and later years differ only in the two timestamp column names.
    if year == 2010:
        timestamps = ['pickup_datetime', 'dropoff_datetime']
    else:
        timestamps = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
    return timestamps + ["trip_distance", "pickup_longitude", "pickup_latitude",
                         "dropoff_longitude", "dropoff_latitude", "fare_amount", "tip_amount"]


def compile_and_clean_taxi_data(years=range(2009, 2010), data_dir=None) -> pd.DataFrame:
    """Build one cleaned, sampled DataFrame from the monthly parquet files.

    For every month (1-12) of every year in *years*, reads
    ``<data_dir>/yellow_tripdata_<YYYY>-<MM>.parquet``, keeps the trip columns,
    renames them to a common schema, drops rows outside a fixed latitude /
    longitude bounding box and samples up to 2500 rows.

    Args:
        years: Years to process. Defaults to 2009 only, matching the
            previously hard-coded range.
        data_dir: Directory holding the parquet files. When omitted, the
            module-level ``output_dir`` is used, which keeps existing callers
            working.

    Returns:
        The concatenated monthly samples with a fresh integer index, or an
        empty DataFrame when *years* is empty.
    """
    if data_dir is None:
        # Backward compatibility: the original read this global directly.
        data_dir = output_dir

    # Bounding box applied to both pickup and drop-off coordinates.
    down_threshold = 40.560445
    up_threshold = 40.908524
    left_threshold = -74.242330
    right_threshold = -73.717047
    sample_size = 2500

    # Target schema is identical for every year; only the source names vary.
    columns_to_rename = ['Pickup_Datetime', 'Dropoff_Datetime', "Trip_Distance",
                         "Start_Lon", "Start_Lat", "End_Lon", "End_Lat", "Fare_Amt", "Tip_Amt"]

    monthly_samples = []
    for year in years:
        columns_to_keep = _yellow_taxi_source_columns(year)
        for month in range(1, 13):
            print(f"Processing {year}-{month:02d}")

            data = pd.read_parquet(f"{data_dir}/yellow_tripdata_{year}-{month:02d}.parquet")

            monthly_samples.append(
                clean_and_sample_data(data, columns_to_keep, columns_to_rename,
                                      down_threshold, up_threshold, left_threshold, right_threshold,
                                      sample_size))

    if not monthly_samples:
        return pd.DataFrame()

    # DataFrame.append no longer exists in pandas 2.x, and appending in a loop
    # re-copies the accumulated frame each time; concatenate once instead.
    return pd.concat(monthly_samples, ignore_index=True)



# Script entry point: download the monthly yellow-taxi files, build the cleaned
# sample and write it to a CSV file.
if __name__ == "__main__":
    # Page that is scraped for links to the monthly trip files.
    main_url = 'https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page'
    # Inclusive date range; a file is fetched when the first day of its month
    # falls between these two values.
    start_date = datetime(2009, 1, 1)
    end_date = datetime(2015, 6, 30)
    # Bound at module level because the functions above read `output_dir`
    # from module scope for the download / parquet directory.
    output_dir = 'monthly_data'

    yellow_taxi_links = fetch_yellow_taxi_links(main_url)
    fetch_taxi_data(yellow_taxi_links, start_date, end_date) 
     
    # NOTE(review): called with no arguments, this compiles 2009 only, while
    # the output file name below says 2009_2015 — confirm which is intended.
    compiled_taxi_data = compile_and_clean_taxi_data()
 
    compiled_taxi_data.to_csv("cleaned_yellow_taxi_data_2009_2015.csv", index=False)




Data fetching complete.
Processing 2009-01
Processing 2009-02
Processing 2009-03
Processing 2009-04
Processing 2009-05
Processing 2009-06
Processing 2009-07
Processing 2009-08
Processing 2009-09
Processing 2009-10
Processing 2009-11
Processing 2009-12
