In [1]:
import polars as pl
import os

In [2]:
from datetime import datetime, timedelta

schema_overrides = {
    "start_station_name": pl.Utf8,
    "end_station_name": pl.Utf8,
    "start_lat": pl.Float64,
    "start_lng": pl.Float64,
    "end_lat": pl.Float64,
    "end_lng": pl.Float64,
}

null_values = ["MTL-ECO5-03"]

daily_rental_raw = pl.read_csv(
    "./data/daily_rental_raw.csv",
    schema_overrides=schema_overrides,
    ignore_errors=True,
    null_values=null_values
)

print("Schema:",daily_rental_raw.schema)
print(f"Daily rental Shape: {daily_rental_raw.shape}")


Schema: Schema([('ride_id', String), ('rideable_type', String), ('started_at', String), ('ended_at', String), ('start_station_name', String), ('start_station_id', Float64), ('end_station_name', String), ('end_station_id', Float64), ('start_lat', Float64), ('start_lng', Float64), ('end_lat', Float64), ('end_lng', Float64), ('member_casual', String)])
Daily rental Shape: (18007255, 13)


In [3]:
print(daily_rental_raw.select(["started_at", "ended_at","start_station_id","end_station_id"]).head())

shape: (5, 4)
┌─────────────────────┬─────────────────────┬──────────────────┬────────────────┐
│ started_at          ┆ ended_at            ┆ start_station_id ┆ end_station_id │
│ ---                 ┆ ---                 ┆ ---              ┆ ---            │
│ str                 ┆ str                 ┆ f64              ┆ f64            │
╞═════════════════════╪═════════════════════╪══════════════════╪════════════════╡
│ 2020-12-02 09:10:36 ┆ 2020-12-02 09:24:12 ┆ 31114.0          ┆ 31242.0        │
│ 2020-12-31 12:46:29 ┆ 2020-12-31 14:01:07 ┆ 31606.0          ┆ 31290.0        │
│ 2020-12-31 12:47:03 ┆ 2020-12-31 14:01:04 ┆ 31606.0          ┆ 31290.0        │
│ 2020-12-29 13:50:51 ┆ 2020-12-29 14:12:22 ┆ 31054.0          ┆ 31010.0        │
│ 2020-12-27 12:30:28 ┆ 2020-12-27 12:36:12 ┆ 31920.0          ┆ 31024.0        │
└─────────────────────┴─────────────────────┴──────────────────┴────────────────┘


In [3]:
def cast_data(daily_rental_raw):
    return daily_rental_raw.with_columns([
        pl.col("started_at")
          .str.to_datetime("%Y-%m-%d %H:%M:%S", strict=False)
          .dt.convert_time_zone("America/New_York")
          .alias("started_at"),
        pl.col("ended_at")
          .str.to_datetime("%Y-%m-%d %H:%M:%S", strict=False)
          .dt.convert_time_zone("America/New_York")
          .alias("ended_at"),
        pl.col("start_station_id").cast(pl.Int64, strict=False).alias("start_station_id"),
        pl.col("end_station_id").cast(pl.Int64, strict=False).alias("end_station_id"),
    ])


In [4]:
daily_rental_casted = cast_data(daily_rental_raw)

print(daily_rental_casted.select(["started_at", "ended_at","start_station_id","end_station_id"]).head())

shape: (5, 4)
┌─────────────────────────┬────────────────────────────────┬──────────────────┬────────────────┐
│ started_at              ┆ ended_at                       ┆ start_station_id ┆ end_station_id │
│ ---                     ┆ ---                            ┆ ---              ┆ ---            │
│ datetime[μs,            ┆ datetime[μs, America/New_York] ┆ i64              ┆ i64            │
│ America/New_York]       ┆                                ┆                  ┆                │
╞═════════════════════════╪════════════════════════════════╪══════════════════╪════════════════╡
│ 2020-12-02 04:10:36 EST ┆ 2020-12-02 04:24:12 EST        ┆ 31114            ┆ 31242          │
│ 2020-12-31 07:46:29 EST ┆ 2020-12-31 09:01:07 EST        ┆ 31606            ┆ 31290          │
│ 2020-12-31 07:47:03 EST ┆ 2020-12-31 09:01:04 EST        ┆ 31606            ┆ 31290          │
│ 2020-12-29 08:50:51 EST ┆ 2020-12-29 09:12:22 EST        ┆ 31054            ┆ 31010          │
│ 2020-12-27 07:

In [46]:
def null_condition(prefix):
    return (
        (
            (pl.col(f"{prefix}station_name").is_not_null()) &
            (pl.col(f"{prefix}station_id").is_not_null()) 
        ) & (
            (pl.col(f"{prefix}lat").is_not_null()) &
            (pl.col(f"{prefix}lng").is_not_null())
        )
    )

filter_condition = (
    (pl.col("started_at").is_not_null()) &
    (pl.col("ended_at").is_not_null()) &
    null_condition("start_") &
    null_condition("end_")
)

daily_rental_filtered = daily_rental_casted.filter(filter_condition)

In [47]:
def count_nulls(df: pl.DataFrame, columns: list) -> pl.DataFrame:
    null_counts = {
        col: df[col].null_count() for col in columns
    }
    return pl.DataFrame([null_counts])

columns_to_check = [
    "start_station_name", "start_station_id", "start_lat", "start_lng",
    "end_station_name", "end_station_id", "end_lat", "end_lng"
]

nulls_df = count_nulls(daily_rental_filtered, columns_to_check)

print(nulls_df.select("start_station_name", "start_station_id", "start_lat", "start_lng",
    "end_station_name", "end_station_id", "end_lat", "end_lng"))

shape: (1, 8)
┌─────────────┬─────────────┬───────────┬───────────┬─────────────┬────────────┬─────────┬─────────┐
│ start_stati ┆ start_stati ┆ start_lat ┆ start_lng ┆ end_station ┆ end_statio ┆ end_lat ┆ end_lng │
│ on_name     ┆ on_id       ┆ ---       ┆ ---       ┆ _name       ┆ n_id       ┆ ---     ┆ ---     │
│ ---         ┆ ---         ┆ i64       ┆ i64       ┆ ---         ┆ ---        ┆ i64     ┆ i64     │
│ i64         ┆ i64         ┆           ┆           ┆ i64         ┆ i64        ┆         ┆         │
╞═════════════╪═════════════╪═══════════╪═══════════╪═════════════╪════════════╪═════════╪═════════╡
│ 0           ┆ 0           ┆ 0         ┆ 0         ┆ 0           ┆ 0          ┆ 0       ┆ 0       │
└─────────────┴─────────────┴───────────┴───────────┴─────────────┴────────────┴─────────┴─────────┘


In [7]:
daily_rental = pl.read_csv("./data/daily_rental.csv")
daily_rental_df = pl.DataFrame(daily_rental)
print("daily_rental schema:",daily_rental_df.schema)
print(f"Daily rental Shape: {daily_rental_df.shape}")

daily_rental schema: Schema([('ride_id', String), ('rideable_type', String), ('started_at', String), ('ended_at', String), ('start_station_name', String), ('start_station_id', Int64), ('end_station_name', String), ('end_station_id', Int64), ('start_lat', Float64), ('start_lng', Float64), ('end_lat', Float64), ('end_lng', Float64), ('member_casual', String)])
Daily rental Shape: (12605921, 13)


In [50]:
daily_rental_filtered.write_csv("./data/daily_rental_filtered.csv")

In [None]:
region = pl.read_csv("data/regions.csv")

print("Region schema:",region.schema)
print(f"Region Shape: {region.shape}")


In [7]:
station_information= pl.read_csv("data/station_infor.csv").filter(
    (pl.col("lat").is_not_null()) & (pl.col("lon").is_not_null())
)

print("station_information schema:",station_information.schema)
print(f"station_information Shape: {station_information.shape}")

station_information.write_csv("./data/station_info.csv")


station_information schema: Schema([('short_name', Int64), ('capacity', Int64), ('region_id', Int64), ('station_id', String), ('lon', Float64), ('name', String), ('lat', Float64)])
station_information Shape: (785, 7)
