In [1]:
import pandas as pd
import polars as pl
import os

In [2]:
from datetime import datetime, timedelta

# Read the raw daily-rental CSV into a polars DataFrame.
# The path is relative to the notebook's working directory.
daily_rental = "./data/daily_rental_raw.csv"

# Pin dtypes instead of relying on inference: station names are read as
# strings, every coordinate column as Float64.
schema_overrides = {
    **dict.fromkeys(("start_station_name", "end_station_name"), pl.Utf8),
    **dict.fromkeys(("start_lat", "start_lng", "end_lat", "end_lng"), pl.Float64),
}

# Cells holding exactly this string are read as null.
# NOTE(review): presumably a placeholder / non-public station code — confirm.
null_values = ["MTL-ECO5-03"]

# ignore_errors=True makes the reader keep going when a value cannot be parsed
# into its column dtype instead of aborting the whole load.
daily_rental_df = pl.read_csv(
    source=daily_rental,
    schema_overrides=schema_overrides,
    null_values=null_values,
    ignore_errors=True,
)

Initial Condition Checking

In [3]:
def condition(prefix: str) -> "pl.Expr":
    """Build the row-validity predicate for one end of a trip.

    A row passes when both ``{prefix}lat`` and ``{prefix}lng`` are non-null.
    Rows without coordinates are treated as belonging to a disabled/inactive
    station and are filtered out.

    Station name and station id are deliberately not inspected. This predicate
    used to be written as ``(name | id | (lat & lng)) & (lat & lng)``, which by
    boolean absorption is exactly ``lat & lng``: the name/id checks could never
    change the outcome and only obscured what the filter really does.

    Args:
        prefix: Column-name prefix including the trailing underscore,
            e.g. ``"start_"`` or ``"end_"``.

    Returns:
        A polars boolean expression suitable for ``DataFrame.filter``.
    """
    has_lat = pl.col(f"{prefix}lat").is_not_null()
    has_lng = pl.col(f"{prefix}lng").is_not_null()
    return has_lat & has_lng


In [4]:
# A trip is kept only when both of its timestamps are present and both of its
# stations pass `condition`.
has_timestamps = (
    pl.col("started_at").is_not_null()
    & pl.col("ended_at").is_not_null()
)
has_valid_stations = condition("start_") & condition("end_")
filter_condition = has_timestamps & has_valid_stations

# Keep the matching rows, then persist them as CSV; the relative path resolves
# against the current working directory.
filtered_df = daily_rental_df.filter(filter_condition)
filtered_df.write_csv("daily_rental_filtered.csv")