### Importing Utilities

In [1]:
import pandas as pd
import numpy as np
import copy
import requests
from bs4 import BeautifulSoup
import time
from datetime import datetime
from tqdm.notebook import tqdm

pd.set_option('chained_assignment',None)

### Retrieving URL (raw ridership data & taxi-zone-lookup-table)

In [2]:
url = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

In [3]:
# Download the TLC trip-record listing page.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were
# the real listing.
response = requests.get(url, timeout=30)
response.raise_for_status()
dom = BeautifulSoup(response.content, 'html.parser')
# Every <tbody> on the page; the download links are read from these below.
table = dom.find_all("tbody")

In [4]:
# Download links grouped by service type, in the order they appear on the page.
yellow_url_list = []
green_url_list = []
fhv_url_list = []
high_vol_fhv_url_list = []

# Only the first six <tbody> elements are scanned.
for tbody in table[:6]:
    # `link`/`href` are used instead of `url` so the page URL defined earlier
    # is not overwritten (re-running the request cell would otherwise fetch
    # the last CSV link instead of the listing page).
    for link in tbody.find_all("a"):
        # Read the href attribute directly rather than splitting the tag's
        # string form on quotes, which depends on attribute order/escaping.
        href = link.get("href")
        if not href:
            continue
        if "yellow" in href:
            yellow_url_list.append(href)
        elif "green" in href:
            green_url_list.append(href)
        elif "fhvhv" in href:
            # Must be tested before the catch-all below.
            high_vol_fhv_url_list.append(href)
        else:
            # Everything else is treated as a (regular) FHV link.
            fhv_url_list.append(href)

In [5]:
len(yellow_url_list), len(green_url_list), len(fhv_url_list), len(high_vol_fhv_url_list)

(66, 66, 66, 17)

In [6]:
# fixing typo (2019-010 => 2019-10)
# The bad link is located by its file name rather than by position: a fixed
# index such as [-3] would overwrite a different month as soon as the page
# lists more (or fewer) files. This is a no-op when the typo is not present.
high_vol_fhv_url_list[:] = [
    'https://nyc-tlc.s3.amazonaws.com/trip+data/fhvhv_tripdata_2019-10.csv'
    if "fhvhv_tripdata_2019-010" in link else link
    for link in high_vol_fhv_url_list
]

In [7]:
yellow_url_list[:5]

['https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-01.csv',
 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-02.csv',
 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-03.csv',
 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-04.csv',
 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-05.csv']

In [8]:
# LocationID -> Borough lookup table (indexed by LocationID); only the
# Borough column is kept because that is all the preprocessing joins on.
taxizone = pd.read_csv(
    "https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv",
    index_col='LocationID',
).loc[:, ['Borough']]
taxizone.tail(5)

Unnamed: 0_level_0,Borough
LocationID,Unnamed: 1_level_1
261,Manhattan
262,Manhattan
263,Manhattan
264,Unknown
265,Unknown


### Downloading and Saving the Original Data

In [9]:
# start_time = time.time()
# data = pd.read_csv(yellow_url_list[0])
# print("--- %s seconds ---" % round(time.time() - start_time, 2))

In [10]:
def _select_metered_columns(df, pickup_col, legacy_cols):
    """Pick the yellow/green columns and give them the canonical names.

    The layout with ``PULocationID`` is tried first. Files without it fall
    back to ``legacy_cols`` (pickup time, passenger count, trip distance) and
    get the placeholder LocationID 265.
    """
    try:
        picked = df[[pickup_col, 'passenger_count', 'trip_distance', 'PULocationID']].copy()
    except KeyError:
        # .copy() so the placeholder column is added to our own frame and
        # never to (a view of) the caller's data.
        picked = df[legacy_cols].copy()
        picked['PULocationID'] = 265
    picked.columns = ['pickup_datetime', 'passenger_count', 'trip_distance', 'PULocationID']
    return picked


def _select_fhv_columns(df):
    """Pick the FHV columns under whichever header spelling the file uses."""
    # Tried in order; each maps the file's header names to canonical names.
    layouts = [
        {'pickup_datetime': 'pickup_datetime', 'dropoff_datetime': 'dropoff_datetime',
         'PULocationID': 'PULocationID'},
        {'Pickup_DateTime': 'pickup_datetime', 'DropOff_datetime': 'dropoff_datetime',
         'PUlocationID': 'PULocationID'},
        {'Pickup_date': 'pickup_datetime'},
    ]
    for layout in layouts:
        if set(layout).issubset(df.columns):
            return df[list(layout)].rename(columns=layout).copy()
    raise KeyError("no known FHV pickup columns found in {}".format(list(df.columns)))


def _trip_duration_seconds(pickup, dropoff):
    """Return dropoff - pickup in seconds for two Series, computed column-wise.

    A missing timestamp yields NaN (so it is skipped by the later mean), while
    a value that is present but cannot be parsed yields 0, matching what the
    earlier row-by-row implementation produced.
    """
    start = pd.to_datetime(pickup, errors='coerce')
    end = pd.to_datetime(dropoff, errors='coerce')
    seconds = (end - start).dt.total_seconds()
    # Present in the raw data but NaT after parsing => unparseable value.
    unparseable = (start.isna() & pickup.notna()) | (end.isna() & dropoff.notna())
    return seconds.mask(unparseable, 0)


def preprocess(df, data_type, zone_lookup=None):
    """Reduce one raw trip file to daily per-borough totals.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw trip records as read from one monthly CSV. It is not modified.
    data_type : str
        'yellow' or 'green' select the metered-taxi layouts; any other value
        (e.g. 'fhv', 'hvfhv') is handled as a for-hire-vehicle file.
    zone_lookup : pandas.DataFrame, optional
        Frame indexed by LocationID with a 'Borough' column. Defaults to the
        notebook-level ``taxizone`` table.

    Returns
    -------
    pandas.DataFrame
        Indexed by (pickup date, Borough) with 'passenger_count' (sum) and
        'trip_distance' (mean). For FHV data 'passenger_count' is 1 per trip
        and 'trip_distance' holds the trip duration in seconds (0 when the
        file has no drop-off column).

    Raises
    ------
    KeyError
        If none of the known column layouts for ``data_type`` is present.
    """
    # Leave only necessary columns
    if data_type == "yellow":
        df = _select_metered_columns(
            df, 'tpep_pickup_datetime',
            ['tpep_pickup_datetime', 'passenger_count', 'trip_distance'])
    elif data_type == "green":
        df = _select_metered_columns(
            df, 'lpep_pickup_datetime',
            ['lpep_pickup_datetime', 'Passenger_count', 'Trip_distance'])
    else:
        print("Selecting necessary columns")
        df = _select_fhv_columns(df)
        print("setting passenger_count as 1")
        # placeholder
        df['passenger_count'] = 1
        # trip_duration in place of trip_distance (but keep the same col_name for easier preprocessing)
        print("calculating trip_duration")
        if 'dropoff_datetime' in df.columns:
            df['trip_distance'] = _trip_duration_seconds(df['pickup_datetime'], df['dropoff_datetime'])
        else:
            df['trip_distance'] = 0
    print("pickup_time to datetime object")
    # Change to datetime object
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    print("joining Location ID")
    # Replace LocationID with actual borough names
    if 'PULocationID' in df.columns:
        if zone_lookup is None:
            zone_lookup = taxizone
        df = df.join(zone_lookup, on='PULocationID')
    else:
        # NOTE(review): this label is upper-case while the lookup table spells
        # it 'Unknown'; kept as-is because already-saved CSVs contain it.
        df['Borough'] = 'UNKNOWN'
    print("Rearranging & Dropping columns")
    # Drop unnecessary columns & rearrange the columns order
    df = df[['pickup_datetime', 'passenger_count', 'trip_distance', 'Borough']]
    # One row per (calendar day, borough).
    df = df.groupby(by=[df['pickup_datetime'].dt.date, 'Borough']).agg({'passenger_count': 'sum', 'trip_distance': 'mean'})
    return df

def load_concat_process(url_list, data_type):
    """Download each CSV in ``url_list``, aggregate it, and stack the results.

    Parameters
    ----------
    url_list : list of str
        CSV locations, read one at a time with ``pd.read_csv``.
    data_type : str
        'yellow', 'green', 'fhv', 'hvfhv' (passed through to ``preprocess``).

    Returns
    -------
    pandas.DataFrame
        The per-file results of ``preprocess`` concatenated in URL order,
        keeping each file's own index.

    Raises
    ------
    ValueError
        If no file could be preprocessed (there is nothing to concatenate).

    Notes
    -----
    A file that fails preprocessing is reported together with the error and
    skipped; the remaining URLs are still processed, so one bad month no
    longer silently drops every month after it. Download errors are not
    caught and propagate to the caller.
    """
    print(">>> Start Downloading...", datetime.now().time())
    df_list = []
    failed_urls = []
    for url in tqdm(url_list):
        df = pd.read_csv(url, index_col=False)
        print(">>> Preprocessing {} at {}".format(url, datetime.now().time()))
        try:
            df = preprocess(df, data_type)
        except Exception as err:
            # `Exception` (not a bare except) so Ctrl-C still interrupts the
            # run; the error itself is shown so the cause is not lost.
            print("error occured when preprocessing", url, "-", repr(err))
            failed_urls.append(url)
            continue
        df_list.append(df)
    if failed_urls:
        print(">>> Skipped {} file(s): {}".format(len(failed_urls), failed_urls))
    if not df_list:
        raise ValueError("no file could be preprocessed; nothing to concatenate")
    print(">>> Concatenating...")
    full_df = pd.concat(df_list, ignore_index=False, sort=False)
    return full_df

### Yellow Taxi


#### Some Yellow and Green files give the pickup location as longitude/latitude instead of a LocationID (those trips are assigned LocationID 265, i.e. Borough "Unknown", for now)

In [14]:
full_yellow = load_concat_process(yellow_url_list[60:], data_type="yellow")
print(">>> Saving As CSV...")
full_yellow.to_csv("full_yellow_60_end.csv")
print(">>> DONE! :)")

>>> Start Downloading...


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))


>>> Concatenating...
>>> Saving As CSV...
>>> DONE! :)


### Green Taxi

In [17]:
full_green = load_concat_process(green_url_list[60:], data_type="green")
print(">>> Saving As CSV...")
full_green.to_csv("full_green_60_end.csv")
print(">>> DONE! :)")

>>> Start Downloading...


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))


>>> Concatenating...
>>> Saving As CSV...
>>> DONE! :)


### For-Hire Vehicle

In [69]:
full_fhv = load_concat_process(fhv_url_list[:7], data_type="fhv")
print(">>> Saving As CSV...")
full_fhv.to_csv("full_fhv_0_7.csv")
print(">>> DONE! :)")

>>> Start Downloading... 13:58:33.451221


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))

>>> Preprocessing https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2020-01.csv at 13:58:46.156775
Selecting necessary columns
setting passenger_count as 1
calculating trip_duration
pickup_time to datetime object
joining Location ID
Rearranging & Dropping columns
>>> Preprocessing https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2020-02.csv at 14:08:48.865959
Selecting necessary columns
setting passenger_count as 1
calculating trip_duration
pickup_time to datetime object
joining Location ID
Rearranging & Dropping columns
>>> Preprocessing https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2020-03.csv at 14:17:07.891807
Selecting necessary columns
setting passenger_count as 1
calculating trip_duration
pickup_time to datetime object
joining Location ID
Rearranging & Dropping columns
>>> Preprocessing https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2020-04.csv at 14:23:57.812968
Selecting necessary columns
setting passenger_count as 1
calculating trip_duration
pi

### High Volume For-Hire Vehicle


In [None]:
full_hvfhv = load_concat_process(high_vol_fhv_url_list, data_type="hvfhv")
print(">>> Saving As CSV...")
full_hvfhv.to_csv("full_hvfhv.csv")
print(">>> DONE! :)")

### Concatenating into final

In [73]:
import os
files = [f for f in os.listdir('.') if os.path.isfile(f)]

In [74]:
def concat_to_full(data_type, file_names=None, min_year=2015, max_year=2020):
    """Merge the partial ``full_<data_type>_*.csv`` files into ``full_<data_type>.csv``.

    Parameters
    ----------
    data_type : str
        Service name used in the file names, e.g. 'yellow', 'green', 'fhv'.
    file_names : list of str, optional
        Candidate files. Defaults to the regular files currently in the
        working directory (listed at call time, so files created after an
        earlier directory listing are not missed).
    min_year, max_year : int
        Inclusive range of pickup years to keep; rows outside it are dropped.

    Raises
    ------
    ValueError
        If no partial file for ``data_type`` is found.

    Notes
    -----
    Files are selected by the exact ``full_<data_type>_`` prefix and ``.csv``
    suffix. A plain substring test would also pick up ``full_hvfhv.csv`` when
    asked for 'fhv', as well as this function's own output from a previous
    run, which would duplicate rows.
    """
    import os  # local import keeps the function self-contained

    if file_names is None:
        file_names = [f for f in os.listdir('.') if os.path.isfile(f)]
    prefix = "full_" + data_type + "_"
    # Sorted so the printed lengths line up with a predictable file order.
    type_csv = sorted(
        f for f in file_names
        if os.path.basename(f).startswith(prefix) and f.endswith(".csv")
    )
    if not type_csv:
        raise ValueError("no '{}*.csv' files found to concatenate".format(prefix))

    df_list = []
    df_lens = []
    for csv in type_csv:
        df = pd.read_csv(csv)
        # Parse the dates once and keep only rows inside the year range.
        years = pd.to_datetime(df['pickup_datetime']).dt.year
        df = df[years.between(min_year, max_year)]
        df_list.append(df)
        df_lens.append(len(df))
    print(data_type, "individual len:", df_lens, "total len:", sum(df_lens))
    full = pd.concat(df_list)
    full = full.sort_values(by=['pickup_datetime'])
    filename = "full_" + data_type + ".csv"
    full.to_csv(filename, index=False)

In [75]:
concat_to_full("yellow")
concat_to_full("green")
concat_to_full("fhv")

yellow individual len: [2141, 2262, 2233, 2133, 437, 91, 217, 1100, 150, 184] total len: 10948
green individual len: [1866, 1953, 1944, 2010, 428, 505, 976, 184] total len: 9866
fhv individual len: [1174, 2128, 2142, 2128, 671, 303, 184, 611] total len: 9341


In [76]:
files

['.gitignore',
 'full_fhv_0_7.csv',
 'full_fhv_10_20.csv',
 'full_fhv_20_30.csv',
 'full_fhv_30_40.csv',
 'full_fhv_40_50.csv',
 'full_fhv_50_60.csv',
 'full_fhv_60_end.csv',
 'full_fhv_7_10.csv',
 'full_green_0_10.csv',
 'full_green_10_20.csv',
 'full_green_20_30.csv',
 'full_green_30_40.csv',
 'full_green_40_45.csv',
 'full_green_45_50.csv',
 'full_green_50_60.csv',
 'full_green_60_end.csv',
 'full_yellow_0_10.csv',
 'full_yellow_10_20.csv',
 'full_yellow_20_30.csv',
 'full_yellow_30_40.csv',
 'full_yellow_40_45.csv',
 'full_yellow_45_48.csv',
 'full_yellow_48_49.csv',
 'full_yellow_49_55.csv',
 'full_yellow_55_60.csv',
 'full_yellow_60_end.csv',
 'README.md',
 'taxi_data_crawling.ipynb']

In [80]:
fhv_url_list[5]

'https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2020-06.csv'

In [81]:
fhv_202006=load_concat_process([fhv_url_list[5]], data_type="fhv")
print(">>> Saving As CSV...")
fhv_202006.to_csv("full_fhv_202006.csv")
print(">>> DONE! :)")

>>> Start Downloading... 17:01:17.578584


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

>>> Preprocessing https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2020-06.csv at 17:01:24.881807
Selecting necessary columns
setting passenger_count as 1
calculating trip_duration
pickup_time to datetime object
joining Location ID
Rearranging & Dropping columns

>>> Concatenating...
>>> Saving As CSV...
>>> DONE! :)


In [114]:
fhv_202006_csv = pd.read_csv("https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2020-06.csv")

In [115]:
col_needed = ['pickup_datetime', 'dropoff_datetime', 'PULocationID']
fhv_202006_csv = fhv_202006_csv[col_needed]

In [116]:
fhv_202006_csv['passenger_count'] = 1

In [117]:
# Trip duration in seconds, stored under 'trip_distance' so the FHV data has
# the same columns as the taxi data. Computed on whole columns instead of a
# per-row Python loop, which called pd.to_datetime twice for every trip.
if 'dropoff_datetime' in fhv_202006_csv.columns:
    pickup_raw = fhv_202006_csv['pickup_datetime']
    dropoff_raw = fhv_202006_csv['dropoff_datetime']
    pickup_ts = pd.to_datetime(pickup_raw, errors='coerce')
    dropoff_ts = pd.to_datetime(dropoff_raw, errors='coerce')
    # A value that is present but unparseable counts as 0 seconds (as the
    # row-wise version did); a missing value stays NaN.
    unparseable = (pickup_ts.isna() & pickup_raw.notna()) | (dropoff_ts.isna() & dropoff_raw.notna())
    trip_duration = (dropoff_ts - pickup_ts).dt.total_seconds().mask(unparseable, 0)
    fhv_202006_csv['trip_distance'] = trip_duration
else:
    fhv_202006_csv['trip_distance'] = 0

In [118]:
fhv_202006_csv['Borough'] = 'UNKNOWN'

In [119]:
fhv_202006_csv = fhv_202006_csv[['pickup_datetime', 'passenger_count', 'trip_distance', 'Borough']]
fhv_202006_csv['pickup_datetime'] = pd.to_datetime(fhv_202006_csv['pickup_datetime'])
fhv_202006_csv = fhv_202006_csv.groupby(by=[fhv_202006_csv['pickup_datetime'].dt.date, 'Borough']).agg({'passenger_count': 'sum', 'trip_distance': 'mean'})

In [120]:
fhv_202006_csv

Unnamed: 0_level_0,Unnamed: 1_level_0,passenger_count,trip_distance
pickup_datetime,Borough,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-06-01,UNKNOWN,29834,1242.284474
2020-06-02,UNKNOWN,26118,1192.747684
2020-06-03,UNKNOWN,27505,1195.516197
2020-06-04,UNKNOWN,27979,1176.211158
2020-06-05,UNKNOWN,28727,1227.63094
2020-06-06,UNKNOWN,21305,1054.220136
2020-06-07,UNKNOWN,18722,973.265089
2020-06-08,UNKNOWN,31724,1166.309009
2020-06-09,UNKNOWN,32110,1167.407194
2020-06-10,UNKNOWN,33479,1172.072105
