In [1]:
# import pandas as pd
import polars as pl
import os

In [2]:
from datetime import datetime, timedelta

# Path of the raw daily-rental CSV, relative to the notebook's working directory.
daily_rental = "./data/daily_rental_raw.csv"

# Explicit dtypes for the station columns: names as strings, coordinates as
# 64-bit floats, station ids as 64-bit integers.
schema_overrides = dict(
    start_station_name=pl.Utf8,
    end_station_name=pl.Utf8,
    start_lat=pl.Float64,
    start_lng=pl.Float64,
    end_lat=pl.Float64,
    end_lng=pl.Float64,
    start_station_id=pl.Int64,
    end_station_id=pl.Int64,
)

# Literal cell values that are read as null.
# NOTE(review): presumably a non-numeric station id that would not fit the
# Int64 id columns above -- confirm against the raw data.
null_values = ["MTL-ECO5-03"]

# NOTE(review): with ignore_errors=True, values that do not fit the schema are
# presumably read as null instead of raising, so other unexpected ids may be
# nulled silently -- confirm this is intended.
daily_rental_df = pl.read_csv(
    daily_rental,
    null_values=null_values,
    ignore_errors=True,
    schema_overrides=schema_overrides,
)

Initial condition checking: keep only rides that have both timestamps and both start/end station coordinates

In [3]:
def condition(prefix: str) -> "pl.Expr":
    """Build the row filter for one end of a ride: both coordinates must be present.

    Parameters
    ----------
    prefix : str
        Column-name prefix including the trailing underscore, e.g. ``"start_"``
        or ``"end_"``. It is prepended to ``lat`` and ``lng``.

    Returns
    -------
    pl.Expr
        Boolean expression that is true for rows where both ``{prefix}lat`` and
        ``{prefix}lng`` are non-null.

    Notes
    -----
    A station name or id is not required here. An expression of the form
    ``(name | id | (lat & lng)) & (lat & lng)`` is logically equal to
    ``lat & lng`` alone (absorption), so name/id checks combined that way
    cannot change which rows are kept and are therefore left out.
    NOTE(review): if a station name or id should also be mandatory, it has to
    be AND-ed with the coordinate check rather than OR-ed -- confirm intent.
    """
    # Rows missing either coordinate are dropped (disabled/inactive stations).
    return (
        pl.col(f"{prefix}lat").is_not_null()
        & pl.col(f"{prefix}lng").is_not_null()
    )

# A ride is kept only when both timestamps are present and each end of the
# trip satisfies condition().
timestamps_present = (
    pl.col("started_at").is_not_null() & pl.col("ended_at").is_not_null()
)
filter_condition = timestamps_present & condition("start_") & condition("end_")

# Apply the combined predicate to the raw frame.
filtered_df = daily_rental_df.filter(filter_condition)

In [None]:
def count_nulls(df: pl.DataFrame, columns: list) -> pl.DataFrame:
    """Summarise how many nulls each requested column contains.

    Parameters
    ----------
    df : pl.DataFrame
        Frame to inspect.
    columns : list
        Names of the columns whose nulls are counted.

    Returns
    -------
    pl.DataFrame
        Single-row frame with one column per requested name, holding that
        column's null count.
    """
    counts_by_column = {}
    for name in columns:
        counts_by_column[name] = df[name].null_count()
    # Wrapping the dict in a list yields exactly one row.
    return pl.DataFrame([counts_by_column])

# Station columns whose remaining nulls are inspected after filtering.
start_columns = ["start_station_name", "start_station_id", "start_lat", "start_lng"]
end_columns = ["end_station_name", "end_station_id", "end_lat", "end_lng"]
columns_to_check = start_columns + end_columns

# One-row summary of null counts, computed on the filtered rides (`filtered_df`).
nulls_df = count_nulls(filtered_df, columns_to_check)

print(nulls_df)

In [10]:
# Parse the ride timestamps from text and express them in US Eastern time.
TIMESTAMP_FORMAT = "%d-%m-%Y %H:%M:%S"
LOCAL_TIME_ZONE = "America/New_York"
TIMESTAMP_COLUMNS = ("started_at", "ended_at")


def _parse_timestamp(column: str) -> "pl.Expr":
    """Return an expression that parses text column `column` into a tz-aware datetime.

    The result keeps the same column name, so it replaces the text column.
    """
    return (
        pl.col(column)
        # strict=False: values that do not match the format become null
        # instead of raising.
        # NOTE(review): this can reintroduce null timestamps after the
        # earlier not-null filter -- confirm that is acceptable.
        .str.to_datetime(TIMESTAMP_FORMAT, strict=False)
        # The parsed values carry no time zone; convert_time_zone then treats
        # them as UTC and shifts them to Eastern time.
        # NOTE(review): if the raw values are already local wall-clock times,
        # dt.replace_time_zone would be the right call -- confirm with the source.
        .dt.convert_time_zone(LOCAL_TIME_ZONE)
        .alias(column)
    )


# Only parse columns that are still text, so re-running this cell is a no-op
# instead of failing on already-converted Datetime columns.
pending_columns = [
    name for name in TIMESTAMP_COLUMNS if filtered_df.schema[name] == pl.Utf8
]
if pending_columns:
    filtered_df = filtered_df.with_columns(
        [_parse_timestamp(name) for name in pending_columns]
    )

print("daily_rental schema:",filtered_df.schema)
print(f"Daily rental Shape: {filtered_df.shape}")

daily_rental schema: Schema([('ride_id', String), ('rideable_type', String), ('started_at', Datetime(time_unit='us', time_zone='America/New_York')), ('ended_at', Datetime(time_unit='us', time_zone='America/New_York')), ('start_station_name', String), ('start_station_id', Int64), ('end_station_name', String), ('end_station_id', Int64), ('start_lat', Float64), ('start_lng', Float64), ('end_lat', Float64), ('end_lng', Float64), ('member_casual', String)])
Daily rental Shape: (15238380, 13)


In [11]:
# Persist the filtered rides as CSV; the relative path resolves against the
# notebook's current working directory.
filtered_output_path = "daily_rental_filtered.csv"
filtered_df.write_csv(filtered_output_path)