### Importing Utilities

In [1]:
import pandas as pd
import numpy as np
import copy
import requests
from bs4 import BeautifulSoup
import time
from tqdm.notebook import tqdm

pd.set_option('chained_assignment',None)

### Retrieving URLs (raw ridership data & taxi zone lookup table)

In [2]:
# Page that is scraped below for the monthly trip-record CSV links.
url = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

In [3]:
# Download the listing page and collect every <tbody>; the links are read from these tables in the next cell.
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
dom = BeautifulSoup(response.content, 'html.parser')
table = dom.find_all("tbody")

In [4]:
# One list of CSV links per service type, filled from the scraped tables.
yellow_url_list = []
green_url_list = []
fhv_url_list = []
high_vol_fhv_url_list = []

# Only the first six <tbody> elements are scanned.
# NOTE(review): presumably one table per year of data -- confirm against the page layout.
for year_table in table[:6]:
    for link in year_table.find_all("a"):
        # Read the href attribute directly rather than splitting the tag's
        # string form, which breaks if another attribute precedes href.
        href = link.get("href")
        if not href:
            continue
        if "yellow" in href:
            yellow_url_list.append(href)
        elif "green" in href:
            green_url_list.append(href)
        # "fhvhv" must be tested before the fallback below, which collects
        # every remaining link as a regular FHV file.
        elif "fhvhv" in href:
            high_vol_fhv_url_list.append(href)
        else:
            fhv_url_list.append(href)

In [5]:
# Sanity check: number of links collected for each service type.
len(yellow_url_list), len(green_url_list), len(fhv_url_list), len(high_vol_fhv_url_list)

(66, 66, 66, 17)

In [6]:
# fixing typo (2019-010 => 2019-10)
high_vol_fhv_url_list[-3] = 'https://nyc-tlc.s3.amazonaws.com/trip+data/fhvhv_tripdata_2019-10.csv'

In [7]:
# Peek at the first few yellow-taxi links.
yellow_url_list[:5]

['https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-01.csv',
 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-02.csv',
 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-03.csv',
 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-04.csv',
 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-05.csv']

In [8]:
# Lookup table indexed by LocationID; only the 'Borough' column is kept.
taxizone = (
    pd.read_csv("https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv", index_col='LocationID')
    .loc[:, ['Borough']]
)
taxizone.head(5)

Unnamed: 0_level_0,Borough
LocationID,Unnamed: 1_level_1
1,EWR
2,Queens
3,Bronx
4,Manhattan
5,Staten Island


### Downloading and Saving Original Data

In [9]:
# Disabled: one-off timing of a single monthly CSV download.
# start_time = time.time()
# data = pd.read_csv(yellow_url_list[0])
# print("--- %s seconds ---" % round(time.time() - start_time, 2))

In [10]:
def preprocess(df, data_type, zone_lookup=None):
    """
    Aggregate raw trip records into one row per (pickup date, borough).

    df: raw trip records containing the columns required for `data_type`.
    data_type: 'yellow' reads 'tpep_pickup_datetime', 'green' reads
        'lpep_pickup_datetime'; both also need 'passenger_count',
        'trip_distance' and 'PULocationID'. Any other value is handled as
        FHV data and needs 'pickup_datetime', 'dropoff_datetime' and
        'PULocationID'.
    zone_lookup: DataFrame indexed by LocationID with a 'Borough' column.
        Defaults to the notebook-level `taxizone` table.

    Returns a DataFrame indexed by (pickup date, Borough) with columns
        passenger_count: sum over trips (FHV: 1 per trip, i.e. the trip count)
        trip_distance:   mean over trips (FHV: mean trip duration in seconds)

    The input `df` is left unmodified.
    """
    if zone_lookup is None:
        # Fall back to the lookup table loaded earlier in the notebook.
        zone_lookup = taxizone

    if data_type == "yellow":
        pickup_col = 'tpep_pickup_datetime'
    elif data_type == "green":
        pickup_col = 'lpep_pickup_datetime'
    else:
        pickup_col = None

    if pickup_col is not None:
        # Keep only the needed columns under a common pickup column name.
        # rename/assign return new frames, so nothing is written into a slice of `df`.
        trips = (
            df[[pickup_col, 'passenger_count', 'trip_distance', 'PULocationID']]
            .rename(columns={pickup_col: 'pickup_datetime'})
            .assign(pickup_datetime=lambda t: pd.to_datetime(t['pickup_datetime']))
        )
    else:
        # Explicit copy so the column assignments below never touch a view of `df`.
        trips = df[['pickup_datetime', 'dropoff_datetime', 'PULocationID']].copy()
        # Parse each timestamp column once and reuse the parsed pickup times.
        trips['pickup_datetime'] = pd.to_datetime(trips['pickup_datetime'])
        dropoff = pd.to_datetime(trips['dropoff_datetime'])
        # placeholder
        trips['passenger_count'] = 1
        # trip_duration in place of trip_distance (but keep the same col_name for easier preprocessing)
        trips['trip_distance'] = (dropoff - trips['pickup_datetime']).dt.total_seconds()

    # Replace LocationID with actual borough names
    trips = trips.join(zone_lookup, on='PULocationID')
    # Drop unnecessary columns & rearrange the columns order
    trips = trips[['pickup_datetime', 'passenger_count', 'trip_distance', 'Borough']]
    daily = trips.groupby(by=[trips['pickup_datetime'].dt.date, 'Borough']).agg(
        {'passenger_count': 'sum', 'trip_distance': 'mean'}
    )
    return daily

def load_concat_process(url_list, data_type):
    """
    Download every CSV in `url_list`, stack them, and aggregate by day and borough.

    url_list: iterable of CSV URLs (or paths) readable by pd.read_csv
    data_type: 'yellow', 'green', 'fhv', 'hvfhv'

    Returns the DataFrame produced by preprocess().
    Raises ValueError if `url_list` is empty.

    Only the columns that preprocess() reads are loaded from each file, so the
    unused columns are never held in memory. The concatenated frame can still
    be large for long URL lists.
    """
    url_list = list(url_list)
    if not url_list:
        # pd.concat would raise a less descriptive ValueError further down.
        raise ValueError("url_list is empty: nothing to download")

    # Columns preprocess() selects for each data type.
    if data_type == "yellow":
        needed = {'tpep_pickup_datetime', 'passenger_count', 'trip_distance', 'PULocationID'}
    elif data_type == "green":
        needed = {'lpep_pickup_datetime', 'passenger_count', 'trip_distance', 'PULocationID'}
    else:
        needed = {'pickup_datetime', 'dropoff_datetime', 'PULocationID'}

    def keep_column(name):
        # A callable filter does not raise when a file lacks one of the
        # columns; concat then fills that column with NaN for that file.
        return name in needed

    print(">>> Start Downloading...")
    df_list = []
    for url in tqdm(url_list):
        df_list.append(pd.read_csv(url, usecols=keep_column))
    print(">>> Concatenating...")
    full_df = pd.concat(df_list, ignore_index=True, sort=False)

    print(">>> Start Grouping by Day and Borough")
    cleaned = preprocess(full_df, data_type)
    return cleaned

### Testing with the first ten CSV files - Yellow Taxi


In [None]:
# Aggregate the first ten yellow-taxi files and save the result to CSV.
full_yellow_10 = load_concat_process(yellow_url_list[:10], data_type="yellow")
full_yellow_10.to_csv("full_yellow_10.csv")

>>> Start Downloading...


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

  if (await self.run_code(code, result,  async_=asy)):


In [None]:
# Preview the aggregated yellow-taxi result.
full_yellow_10.head(10)

### Testing with the first ten CSV files - Green Taxi

In [12]:
# Aggregate the first ten green-taxi files and save the result to CSV.
# NOTE(review): the recorded output below (progress max=66, MemoryError) does
# not match this 10-file call and appears to come from an earlier run over the
# full list -- re-run to refresh it.
full_green = load_concat_process(green_url_list[:10], data_type="green")
full_green.to_csv("full_green.csv")

>>> Start Downloading...


HBox(children=(FloatProgress(value=0.0, max=66.0), HTML(value='')))

  if (await self.run_code(code, result,  async_=asy)):





MemoryError: 

### Running on all CSV files - For-Hire Vehicle

In [None]:
# Aggregate the complete FHV link list (no slice) and save the result to CSV.
full_fhv = load_concat_process(fhv_url_list, data_type="fhv")
full_fhv.to_csv("full_fhv.csv")

### Running on all CSV files - High Volume For-Hire Vehicle


In [None]:
# Aggregate the complete high-volume FHV link list (no slice) and save the result to CSV.
full_hvfhv = load_concat_process(high_vol_fhv_url_list, data_type="hvfhv")
full_hvfhv.to_csv("full_hvfhv.csv")