### Importing Utilities

In [175]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time

### Retrieving URLs (raw ridership data & taxi zone lookup table)

In [2]:
# TLC landing page whose tables link to the monthly trip-record files scraped below.
url = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

In [3]:
# Download the page held in `url` and collect every <tbody> element; the
# download links are read out of `table` afterwards.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# turns an HTTP error response into an immediate, readable failure instead of
# a short or empty `table` that only breaks later when it is indexed.
response = requests.get(url, timeout=30)
response.raise_for_status()
dom = BeautifulSoup(response.content, 'html.parser')
table = dom.find_all("tbody")

In [4]:
# Sort every link found in the first six <tbody> sections into one list per
# service type, based on a substring of the link target.
yellow_url_list = []
green_url_list = []
fhv_url_list = []
high_vol_fhv_url_list = []

# Slicing (instead of indexing with range(6)) keeps the six-section limit but
# does not raise IndexError when the page carries fewer sections.
for section in table[:6]:
    # href=True skips anchors that carry no link target at all.
    for anchor in section.find_all("a", href=True):
        # Read the attribute directly rather than splitting the tag's HTML on
        # double quotes, which only worked while href was the first attribute.
        # A dedicated name is used so the page address stored in `url` is not
        # overwritten by the loop.
        href = anchor["href"].strip()
        if "yellow" in href:
            yellow_url_list.append(href)
        elif "green" in href:
            green_url_list.append(href)
        elif "fhvhv" in href:
            # Tested before the catch-all so high-volume links get their own list.
            high_vol_fhv_url_list.append(href)
        else:
            # Every link not matched above is treated as a regular FHV link.
            fhv_url_list.append(href)

In [5]:
# Number of links collected per list: yellow, green, FHV, high-volume FHV.
tuple(map(len, (yellow_url_list, green_url_list, fhv_url_list, high_vol_fhv_url_list)))

(66, 66, 66, 17)

In [6]:
# Show the first five collected yellow links as a quick sanity check.
yellow_url_list[:5]

['https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-01.csv',
 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-02.csv',
 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-03.csv',
 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-04.csv',
 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-05.csv']

### Downloading and Saving the Original Data

In [None]:
# start_time = time.time()
# data = pd.read_csv(yellow_url_list[0])
# print("--- %s seconds ---" % round(time.time() - start_time, 2))

In [136]:
def load_process_concat(url_list, columns=None):
    """Read every CSV in ``url_list`` and stack the selected columns into one frame.

    Parameters
    ----------
    url_list : iterable of str
        Locations handed one by one to ``pd.read_csv``.
    columns : list of str, optional
        Columns to keep from each file, in output order. When omitted, the
        notebook-level ``columns_needed`` list is used, as before.

    Returns
    -------
    pandas.DataFrame
        Rows of all files in ``url_list`` order, with a fresh 0..n-1 index and
        exactly ``columns`` as its columns. An empty frame with those columns
        is returned when ``url_list`` is empty.

    Raises
    ------
    KeyError
        If a file does not contain one of the requested columns.
    """
    if columns is None:
        columns = columns_needed

    # Collect the per-file frames and concatenate once: growing a frame with
    # DataFrame.append inside the loop re-copies all previous rows on every
    # iteration, and DataFrame.append no longer exists in pandas 2.x.
    frames = [pd.read_csv(url)[columns] for url in url_list]
    if not frames:
        # pd.concat rejects an empty list, so build the empty result directly.
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

In [None]:
############ Thanks Cora ♥
# full_yellow = load_process_concat(yellow_url_list)
# full_green = load_process_concat(green_url_list)

# full_yellow.to_csv("full_yellow.csv", index=False)
# full_green.to_csv("full_green.csv", index=False)

In [None]:
############ Thanks Emmy ♥
# full_fhv = load_process_concat(fhv_url_list)
# full_hvfhv = load_process_concat(high_vol_fhv_url_list)

# full_fhv.to_csv("full_fhv.csv", index=False)
# full_hvfhv.to_csv("full_hvfhv.csv", index=False)

### Grouping by Days

In [161]:
# Load one month of yellow-cab trips from the local copy, keep only the
# columns listed in `columns_needed`, and display the resulting frame.
test = pd.read_csv("yellow_tripdata_2019-01.csv")[columns_needed]
test

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,PULocationID,DOLocationID
0,2019-01-01 00:46:40,2019-01-01 00:53:20,1,1.50,151,239
1,2019-01-01 00:59:47,2019-01-01 01:18:59,1,2.60,239,246
2,2018-12-21 13:48:30,2018-12-21 13:52:40,3,0.00,236,236
3,2018-11-28 15:52:25,2018-11-28 15:55:45,5,0.00,193,193
4,2018-11-28 15:56:57,2018-11-28 15:58:33,5,0.00,193,193
...,...,...,...,...,...,...
7667787,2019-01-31 23:57:36,2019-02-01 00:18:39,1,4.79,263,4
7667788,2019-01-31 23:32:03,2019-01-31 23:33:11,1,0.00,193,193
7667789,2019-01-31 23:36:36,2019-01-31 23:36:40,1,0.00,264,264
7667790,2019-01-31 23:14:53,2019-01-31 23:15:20,1,0.00,264,7


In [162]:
# Convert both timestamp columns to datetimes so they can be subtracted
# and grouped by date.
for stamp_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    test[stamp_col] = pd.to_datetime(test[stamp_col])

In [172]:
# Trip length as a timedelta: drop-off time minus pick-up time.
test['trip_duration'] = test['tpep_dropoff_datetime'].sub(test['tpep_pickup_datetime'])

In [168]:
# Reorder the columns so the derived trip_duration sits right after
# passenger_count, then display the frame.
column_order = [
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
    'passenger_count',
    'trip_duration',
    'trip_distance',
    'PULocationID',
    'DOLocationID',
]
test = test[column_order]
test

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_duration,trip_distance,PULocationID,DOLocationID
0,2019-01-01 00:46:40,2019-01-01 00:53:20,1,0 days 00:06:40,1.50,151,239
1,2019-01-01 00:59:47,2019-01-01 01:18:59,1,0 days 00:19:12,2.60,239,246
2,2018-12-21 13:48:30,2018-12-21 13:52:40,3,0 days 00:04:10,0.00,236,236
3,2018-11-28 15:52:25,2018-11-28 15:55:45,5,0 days 00:03:20,0.00,193,193
4,2018-11-28 15:56:57,2018-11-28 15:58:33,5,0 days 00:01:36,0.00,193,193
...,...,...,...,...,...,...,...
7667787,2019-01-31 23:57:36,2019-02-01 00:18:39,1,0 days 00:21:03,4.79,263,4
7667788,2019-01-31 23:32:03,2019-01-31 23:33:11,1,0 days 00:01:08,0.00,193,193
7667789,2019-01-31 23:36:36,2019-01-31 23:36:40,1,0 days 00:00:04,0.00,264,264
7667790,2019-01-31 23:14:53,2019-01-31 23:15:20,1,0 days 00:00:27,0.00,264,7


In [182]:
# Group by pick-up calendar date and pick-up zone, keep the last row of each
# group, and write the result to check.csv.
(
    test
    .groupby([test['tpep_pickup_datetime'].dt.date, 'PULocationID'])
    .last()
    .to_csv("check.csv")
)

In [36]:
# Load the zone lookup table and derive two LocationID -> Borough tables, one
# keyed for pick-up IDs and one for drop-off IDs.
taxizone = pd.read_csv("https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv")

# rename() returns a new frame with ordinary (flat) column labels. Assigning
# a list wrapped in another list to `.columns` builds a one-level MultiIndex
# instead, and it did so on a slice of `taxizone`.
taxizone_pu = taxizone[['LocationID', 'Borough']].rename(columns={'LocationID': 'PULocationID'})
taxizone_do = taxizone[['LocationID', 'Borough']].rename(columns={'LocationID': 'DOLocationID'})

display(taxizone_pu.head(5))
display(taxizone_do.head(5))

Unnamed: 0,PULocationID,Borough
0,1,EWR
1,2,Queens
2,3,Bronx
3,4,Manhattan
4,5,Staten Island


Unnamed: 0,DOLocationID,Borough
0,1,EWR
1,2,Queens
2,3,Bronx
3,4,Manhattan
4,5,Staten Island
