<a href="https://colab.research.google.com/github/sophiewagner7/its-too-nice-out-to-take-a-cab/blob/main/notebooks/Process_taxi_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Process Taxi Data

Moacir P. de Sá Pereira

This notebook reduces the size of our yellow taxi and high-volume for-hire vehicle (FHVHV; Uber, Lyft) data, downloaded from the [Taxi and Limousine Commission](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page). We drop the columns that will not be important to us and rename the remaining ones so that the yellow cab and FHVHV data share the same column names. The one exception is `license`, which exists only in the FHVHV data and indicates which company operates the trip in question (yellow cab trips are given the `license` value `yellow` when the yearly files are built). The codes convert to companies like this:

Code | Company
----|----
HV0002|Juno
HV0003|Uber
HV0004|Via
HV0005|Lyft

Additionally, we use the taxi zone lookup table to reduce our trip data only to trips that originate and terminate in Manhattan. This has a substantial effect on the FHVHV data, but little effect on the yellow cab data.

The files are then saved as parquet files in a `processed_files` folder.

Next, the processed files are concatenated to make yearly files, saved in `concatenated_files`.

In [None]:
from pathlib import Path

import pandas as pd
from google.colab import drive
from tqdm import tqdm

# Mount Google Drive at /content/drive/ so the taxi data stored there can be read.
# NOTE(review): root_path in the next cell is relative ("./drive/..."), so it only
# resolves to this mount point while the working directory is /content.
drive.mount('/content/drive/')

In [None]:
# Folder holding the raw monthly trip files and the taxi zone lookup table.
root_path = "./drive/MyDrive/taxi-data"

# LocationIDs of every zone whose Borough is Manhattan; used to filter trips.
zones_df = pd.read_csv(f"{root_path}/taxi_zone_lookup.csv")
manhattan_zones = list(
    zones_df.loc[zones_df["Borough"] == "Manhattan", "LocationID"]
)

# Yellow cab columns to keep, mapped from the raw column name (key) to the
# unified name (value) shared with the FHVHV data. Columns not listed here are
# dropped by process_file; key order determines the column order of the output.
yellow_columns_to_keep = {
    "tpep_pickup_datetime": "pickup_datetime",
    "tpep_dropoff_datetime": "dropoff_datetime",
    "trip_distance": "trip_distance",
    "PULocationID": "pickup_zone",
    "DOLocationID": "dropoff_zone",
    "fare_amount": "fare_amount",
    "tip_amount": "tip_amount"
}

# FHVHV columns to keep, mapped from the raw column name (key) to the unified
# name (value) shared with the yellow cab data. Columns not listed here are
# dropped by process_file; key order determines the column order of the output.
fhvhv_columns_to_keep = {
    "pickup_datetime": "pickup_datetime",
    "dropoff_datetime": "dropoff_datetime",
    "trip_miles": "trip_distance",
    "PULocationID": "pickup_zone",
    "DOLocationID": "dropoff_zone",
    "base_passenger_fare": "fare_amount",
    "tips": "tip_amount",
    # Company code (e.g. HV0003 = Uber, HV0005 = Lyft); has no yellow cab counterpart.
    "hvfhs_license_num": "license",
}

In [None]:
def process_file(trip = "fhvhv", file_name = "fhvhv_tripdata_2021-05.parquet"):
  """Reduce one monthly trip file to the kept columns and Manhattan-only trips.

  The raw file is read from `root_path`, its columns are renamed to the unified
  names, trips whose pickup or dropoff zone is outside Manhattan are removed,
  and the result is written under the same file name in
  `root_path`/processed_files.

  Args:
    trip: "yellow" selects the yellow cab column mapping; any other value
      selects the FHVHV mapping.
    file_name: Name of the raw parquet file inside `root_path`.

  Returns:
    The filtered DataFrame that was written to disk.
  """
  columns = yellow_columns_to_keep if trip == "yellow" else fhvhv_columns_to_keep
  wanted = list(columns)

  # Ask parquet for only the columns we keep instead of loading the whole
  # file and discarding most of it afterwards.
  df = pd.read_parquet(f"{root_path}/{file_name}", columns=wanted)
  # Re-select so the column order always follows the mapping's key order.
  df = df[wanted].rename(columns=columns)

  # Keep a trip only when both of its endpoints are Manhattan zones.
  within_manhattan = (
      df.pickup_zone.isin(manhattan_zones) & df.dropoff_zone.isin(manhattan_zones)
  )
  df = df[within_manhattan]

  # Create the output folder on first use so the write cannot fail on a
  # missing directory after the expensive read.
  output_dir = Path(root_path) / "processed_files"
  output_dir.mkdir(parents=True, exist_ok=True)
  df.to_parquet(output_dir / file_name)
  return df

In [None]:
def concat_files(trip="yellow", year=2019):
  """Combine a year's processed monthly files into one yearly parquet file.

  Monthly files are read from `root_path`/processed_files and the result is
  written to `root_path`/concatenated_files/{trip}_{year}.parquet.

  Args:
    trip: Prefix of the monthly file names ("yellow" or "fhvhv"). Yellow cab
      data gets a constant `license` column set to "yellow".
    year: Year to concatenate. FHVHV 2019 starts in February, and 2024 stops
      after August.
  """
  starting_month = 2 if (year == 2019 and trip == "fhvhv") else 1
  ending_month = 8 if year == 2024 else 12

  # Collect the monthly frames and concatenate once; concatenating inside the
  # loop re-copies every previously loaded row on each iteration.
  fragments = []
  for month in tqdm(range(starting_month, ending_month + 1)):
    path = f"{root_path}/processed_files/{trip}_tripdata_{year}-{str(month).zfill(2)}.parquet"
    fragments.append(pd.read_parquet(path))
  df = pd.concat(fragments, ignore_index=True)

  if trip == "yellow":
    # Yellow cab files have no company code, so label every row explicitly.
    df["license"] = "yellow"

  output_path = f"{root_path}/concatenated_files/{trip}_{year}.parquet"
  # Create the output folder on first use so the write cannot fail on a
  # missing directory.
  Path(output_path).parent.mkdir(parents=True, exist_ok=True)
  df.to_parquet(output_path)
  print(f"Wrote {output_path}")

In [None]:
# Process every monthly file for both trip types.
for trip in ["fhvhv", "yellow"]:
  for year in range(2019, 2025):
    # No file for FHVHV for January 2019.
    first_month = 2 if (trip == "fhvhv" and year == 2019) else 1
    # 2024 stops after August, the same cutoff concat_files uses. Requesting
    # later months would raise a missing-file error and end the run before
    # any yellow cab file is processed.
    last_month = 8 if year == 2024 else 12
    for month in range(first_month, last_month + 1):
      file_name = f"{trip}_tripdata_{year}-{month:02d}.parquet"
      print(f"Working on {file_name}")
      process_file(trip, file_name)


In [None]:
# Build one yearly file per trip type for every processed year. The range
# starts at 2019 so that it matches the years handled by the processing loop;
# starting at 2020 left the 2019 monthly files without a yearly file.
for trip in ["fhvhv", "yellow"]:
  for year in range(2019, 2025):
    concat_files(trip=trip, year=year)