In [11]:
import numpy as np
import pandas as pd

# --------------------
# CONSTANTS
# --------------------
# Raw columns every incoming chunk must provide (features plus the total_amount label)
EXPECTED_INPUT_COLUMNS = [
    "tpep_pickup_datetime", "tpep_dropoff_datetime",
    "passenger_count", "trip_distance",
    "RatecodeID", "PULocationID", "DOLocationID",
    "payment_type", "extra", "total_amount",
]

# Dummy-column names produced by one-hot encoding RatecodeID and payment_type
EXPECTED_RATECODE_COLUMNS = [f"RatecodeID_{code}" for code in (1, 2, 3, 4, 5, 6, 99)]
EXPECTED_PAYMENT_COLUMNS = [f"payment_type_{code}" for code in (1, 2, 3, 4, 5)]


# Derived datetime parts: all pickup parts first, then all dropoff parts
EXPECTED_DATETIME_FEATURES = [
    f"{column}_{unit}"
    for column in ("tpep_pickup_datetime", "tpep_dropoff_datetime")
    for unit in ("day", "month", "year", "hour", "minute", "second")
]

# Full, ordered feature list (the label total_amount is not part of it)
EXPECTED_SCHEMA = [
    "passenger_count", "trip_distance", "extra", "PULocationID", "DOLocationID",
    *EXPECTED_RATECODE_COLUMNS,
    *EXPECTED_PAYMENT_COLUMNS,
    *EXPECTED_DATETIME_FEATURES,
    "trip_duration",
]

# --------------------
# FUNCTIONS
# --------------------

def preprocess_chunk(df):
    """
    Preprocess one chunk of the taxi dataset.

    Steps:
    - narrow down to EXPECTED_INPUT_COLUMNS (the columns mentioned in the problem statement)
    - drop rows with NAs, negative ``extra``, non-positive ``total_amount`` or
      datetimes that cannot be parsed
    - derive datetime parts and ``trip_duration`` (minutes), then drop the raw datetimes
    - one-hot encode RatecodeID / payment_type, frequency-encode PULocationID / DOLocationID
    - reorder to EXPECTED_SCHEMA + ["total_amount"] and cast everything to float64

    Parameters
    ----------
    df : pandas.DataFrame
        Raw chunk containing at least EXPECTED_INPUT_COLUMNS. The datetime columns
        may be datetime64 already or strings; RatecodeID / payment_type must hold
        integer-valued codes (int or float dtype). The input frame is not modified.

    Returns
    -------
    (df, feature_columns, skip_normalization_columns)
        df : float64 DataFrame with a fresh RangeIndex, columns EXPECTED_SCHEMA
            followed by the label ``total_amount``.
        feature_columns : list of all column names except ``total_amount``.
        skip_normalization_columns : list of the one-hot, frequency-encoded and
            derived datetime columns (callers use it to skip normalization).

    Raises
    ------
    KeyError
        If any of EXPECTED_INPUT_COLUMNS is missing from ``df``.
    """
    datetime_columns = ["tpep_pickup_datetime", "tpep_dropoff_datetime"]
    one_hot_columns = ["RatecodeID", "payment_type"]

    # Drop rows with any NA values
    df = df[EXPECTED_INPUT_COLUMNS].dropna()

    # Clean extra and total_amount values to be non-negative and positive respectively
    # based on the attribute descriptions from Kaggle: https://www.kaggle.com/datasets/diishasiing/revenue-for-cab-drivers/data
    is_valid = (df["extra"] >= 0) & (df["total_amount"] > 0)
    df = df.loc[is_valid].copy()

    # Convert datetime columns (vectorized). Plain column assignment replaces the
    # column (and its dtype); `df.loc[:, col] = ...` writes into the existing column
    # in place on pandas >= 2.0, which leaves string columns as object dtype and
    # breaks the `.dt` accessor used below.
    for col in datetime_columns:
        df[col] = pd.to_datetime(df[col], errors="coerce")

    # errors="coerce" turns unparseable values into NaT *after* the dropna above,
    # so drop those rows too instead of leaking NaN features downstream.
    df = df.dropna(subset=datetime_columns)

    # Derive datetime features
    for col in datetime_columns:
        df = get_datetime_features(df, col)

    # Trip duration in minutes
    df["trip_duration"] = (df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]).dt.total_seconds() / 60

    # Drop original datetime cols
    df = df.drop(columns=datetime_columns)

    # Since there are 6 unique values for RatecodeID, 263 for PULocationID, 262 for DOLocationID and 5 for payment_type
    # taking into account the volume of data, using one hot encoding for ratecodeId and payment_type,
    # using frequency encoding for PULocationID and DOLocationID

    # Category codes are float when the raw column contained NaNs; get_dummies would
    # then name the columns "RatecodeID_1.0" etc., which the reindex below would
    # silently replace with all-zero "RatecodeID_1" columns. Cast to int first.
    df = df.astype({col: "int64" for col in one_hot_columns})
    df = pd.get_dummies(df, columns=one_hot_columns, prefix=one_hot_columns)

    # NOTE: frequencies are computed from the rows of this chunk only.
    for col in ["PULocationID", "DOLocationID"]:
        df[col] = df[col].map(df[col].value_counts(normalize=True))

    # Some chunks may not contain every category, which gave an inconsistent number of
    # columns across chunks ("ValueError: all the input array dimensions except for the
    # concatenation axis must match exactly ..."). Reindexing against the hardcoded
    # schema (values come from the exploratory data analysis notebook) adds missing
    # dummy columns as 0, drops unexpected ones and fixes the column order.
    # TODO explore refactoring this to avoid hardcoding
    df = df.reindex(columns=EXPECTED_SCHEMA + ["total_amount"], fill_value=0)

    # Tracking feature_columns and skip_normalization_columns to skip normalization of the attributes
    # that are the derived date-time attributes, were one-hot encoded or frequency encoded above
    # TODO see if there is an alternative to manually specifying the column names
    skip_prefixes = (
        "RatecodeID_",
        "payment_type_",
        "tpep_pickup_datetime_",
        "tpep_dropoff_datetime_",
    )
    skip_normalization_columns = [
        col for col in df.columns
        if col.startswith(skip_prefixes) or col in ("PULocationID", "DOLocationID")
    ]
    feature_columns = [c for c in df.columns if c != "total_amount"]

    # ensuring every column is float64 as object type arrays cause errors with MPI Allreduce;
    # the index is reset so the result has a clean RangeIndex
    df = df.astype(np.float64).reset_index(drop=True)
    return df, feature_columns, skip_normalization_columns

def get_datetime_features(df, col_name):
    '''
    Append day/month/year/hour/minute/second columns derived from a datetime column.

    The new columns are named ``<col_name>_<unit>`` and are concatenated to the
    right of a copy of ``df``; the input frame itself is left untouched.
    '''
    accessor = df[col_name].dt
    units = ("day", "month", "year", "hour", "minute", "second")
    derived = pd.DataFrame(
        {f"{col_name}_{unit}": getattr(accessor, unit) for unit in units},
        index=df.index,
    )
    return pd.concat([df, derived], axis=1)


In [12]:
# Load the raw subset; both trip timestamp columns are parsed to datetimes at read time.
df = pd.read_csv(
    "../../data/nytaxi2022_subset.csv",
    header=0,
    parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"],
)

  df = pd.read_csv("../../data/nytaxi2022_subset.csv", header=0, parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"])
  df = pd.read_csv("../../data/nytaxi2022_subset.csv", header=0, parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"])


In [14]:
# NOTE: this rebinds `df` from the raw frame to the processed frame, so the cell is not
# idempotent: preprocess_chunk selects the raw input columns, and running this cell a
# second time without re-running the load cell raises a KeyError.
df, feature_columns, skip_normalization_columns = preprocess_chunk(df)

In [15]:
# Preview the first rows of the processed frame (features followed by total_amount).
df.head()

Unnamed: 0,passenger_count,trip_distance,extra,PULocationID,DOLocationID,RatecodeID_1,RatecodeID_2,RatecodeID_3,RatecodeID_4,RatecodeID_5,...,tpep_pickup_datetime_minute,tpep_pickup_datetime_second,tpep_dropoff_datetime_day,tpep_dropoff_datetime_month,tpep_dropoff_datetime_year,tpep_dropoff_datetime_hour,tpep_dropoff_datetime_minute,tpep_dropoff_datetime_second,trip_duration,total_amount
0,2.0,3.8,3.0,0.03224,0.05092,1.0,0.0,0.0,0.0,0.0,...,35.0,40.0,1.0,1.0,2022.0,0.0,53.0,29.0,17.816667,21.95
1,1.0,2.1,0.5,0.050109,0.005151,1.0,0.0,0.0,0.0,0.0,...,33.0,43.0,1.0,1.0,2022.0,0.0,42.0,7.0,8.4,13.3
2,1.0,0.97,0.5,0.005037,0.007366,1.0,0.0,0.0,0.0,0.0,...,53.0,21.0,1.0,1.0,2022.0,1.0,2.0,19.0,8.966667,10.56
3,1.0,1.09,0.5,0.010724,0.022356,1.0,0.0,0.0,0.0,0.0,...,25.0,21.0,1.0,1.0,2022.0,0.0,35.0,23.0,10.033333,11.8
4,1.0,4.3,0.5,0.023937,0.022062,1.0,0.0,0.0,0.0,0.0,...,36.0,48.0,1.0,1.0,2022.0,1.0,14.0,20.0,37.533333,30.3


In [17]:
# Persist the processed frame as CSV, without writing the index column.
df.to_csv("../../data/processed/nytaxi2022_preprocessed_subset.csv", index=False)

In [18]:
# Show all feature names, then the subset that is excluded from normalization (one list per line).
print(feature_columns, skip_normalization_columns, sep="\n")

['passenger_count', 'trip_distance', 'extra', 'PULocationID', 'DOLocationID', 'RatecodeID_1', 'RatecodeID_2', 'RatecodeID_3', 'RatecodeID_4', 'RatecodeID_5', 'RatecodeID_6', 'RatecodeID_99', 'payment_type_1', 'payment_type_2', 'payment_type_3', 'payment_type_4', 'payment_type_5', 'tpep_pickup_datetime_day', 'tpep_pickup_datetime_month', 'tpep_pickup_datetime_year', 'tpep_pickup_datetime_hour', 'tpep_pickup_datetime_minute', 'tpep_pickup_datetime_second', 'tpep_dropoff_datetime_day', 'tpep_dropoff_datetime_month', 'tpep_dropoff_datetime_year', 'tpep_dropoff_datetime_hour', 'tpep_dropoff_datetime_minute', 'tpep_dropoff_datetime_second', 'trip_duration']
['PULocationID', 'DOLocationID', 'RatecodeID_1', 'RatecodeID_2', 'RatecodeID_3', 'RatecodeID_4', 'RatecodeID_5', 'RatecodeID_6', 'RatecodeID_99', 'payment_type_1', 'payment_type_2', 'payment_type_3', 'payment_type_4', 'payment_type_5', 'tpep_pickup_datetime_day', 'tpep_pickup_datetime_month', 'tpep_pickup_datetime_year', 'tpep_pickup_date