In [1]:
import polars as pl

In [2]:
station_info_df = pl.read_csv("./data/station_information.csv")
daily_rental = pl.read_csv("./data/daily_rental_filtered.csv")

In [3]:
station_df = pl.DataFrame(station_info_df)
print("station_info schema:",station_df.schema)

daily_rental_df = pl.DataFrame(daily_rental)
print("daily_rental schema:",daily_rental_df.schema)

station_info schema: Schema([('short_name', Int64), ('capacity', Int64), ('region_id', Int64), ('station_id', String), ('lon', Float64), ('name', String), ('lat', Float64)])
daily_rental schema: Schema([('ride_id', String), ('rideable_type', String), ('started_at', String), ('ended_at', String), ('start_station_name', String), ('start_station_id', Float64), ('end_station_name', String), ('end_station_id', Float64), ('start_lat', Float64), ('start_lng', Float64), ('end_lat', Float64), ('end_lng', Float64), ('member_casual', String)])


Inspect the Null Values in the DataFrames

In [4]:
def count_nulls(df: pl.DataFrame, columns: list) -> pl.DataFrame:
    null_counts = {
        col: df[col].null_count() for col in columns
    }
    return pl.DataFrame([null_counts])

# Example usage
columns_to_check = [
    "start_station_name", "start_station_id", "start_lat", "start_lng",
    "end_station_name", "end_station_id", "end_lat", "end_lng"
]

# Assuming your DataFrame is named `daily_rental`
nulls_df = count_nulls(daily_rental, columns_to_check)

print(nulls_df)

shape: (1, 8)
┌─────────────┬─────────────┬───────────┬───────────┬─────────────┬────────────┬─────────┬─────────┐
│ start_stati ┆ start_stati ┆ start_lat ┆ start_lng ┆ end_station ┆ end_statio ┆ end_lat ┆ end_lng │
│ on_name     ┆ on_id       ┆ ---       ┆ ---       ┆ _name       ┆ n_id       ┆ ---     ┆ ---     │
│ ---         ┆ ---         ┆ i64       ┆ i64       ┆ ---         ┆ ---        ┆ i64     ┆ i64     │
│ i64         ┆ i64         ┆           ┆           ┆ i64         ┆ i64        ┆         ┆         │
╞═════════════╪═════════════╪═══════════╪═══════════╪═════════════╪════════════╪═════════╪═════════╡
│ 1899774     ┆ 1899774     ┆ 0         ┆ 0         ┆ 1995563     ┆ 1996618    ┆ 0       ┆ 0       │
└─────────────┴─────────────┴───────────┴───────────┴─────────────┴────────────┴─────────┴─────────┘


In [7]:
# Filter rows with null values in end_station_name or end_station_id
null_rows = daily_rental.filter(
    pl.col("end_station_name").is_null() | pl.col("end_station_id").is_null()
)

# Print the rows with null values
print(null_rows.select('end_station_name','end_station_id','end_lat','end_lng'))

shape: (1_996_618, 4)
┌──────────────────┬────────────────┬─────────┬─────────┐
│ end_station_name ┆ end_station_id ┆ end_lat ┆ end_lng │
│ ---              ┆ ---            ┆ ---     ┆ ---     │
│ str              ┆ f64            ┆ f64     ┆ f64     │
╞══════════════════╪════════════════╪═════════╪═════════╡
│ null             ┆ null           ┆ 38.92   ┆ -77.01  │
│ null             ┆ null           ┆ 38.89   ┆ -77.1   │
│ null             ┆ null           ┆ 38.95   ┆ -77.08  │
│ null             ┆ null           ┆ 38.9    ┆ -76.99  │
│ null             ┆ null           ┆ 38.9    ┆ -77.0   │
│ …                ┆ …              ┆ …       ┆ …       │
│ null             ┆ null           ┆ 38.89   ┆ -76.98  │
│ null             ┆ null           ┆ 38.93   ┆ -77.04  │
│ null             ┆ null           ┆ 38.91   ┆ -77.05  │
│ null             ┆ null           ┆ 38.91   ┆ -77.03  │
│ null             ┆ null           ┆ 38.92   ┆ -77.01  │
└──────────────────┴────────────────┴─────────┴───

Fill in the missing values

In [1]:
# Round the latitude and longitude to 4 decimal places
daily_rental = daily_rental.with_columns([
    pl.col("end_lat").round(2).alias("end_lat"),
    pl.col("end_lng").round(2).alias("end_lng")
])

station_info = station_df.with_columns([
    pl.col("lat").round(2).alias("station_lat"),
    pl.col("lon").round(2).alias("station_lng")
])


NameError: name 'daily_rental' is not defined

In [26]:
# Perform the join on lat/lng coordinates with suffixes to avoid renaming conflicts
daily_rental = daily_rental.join(
    station_info,
    left_on=["end_lat", "end_lng"],
    right_on=["lat", "lon"],
    how="left",
    suffix="_station_info"  # Custom suffix for the columns from station_info
)

# Now, correctly use the column names from `station_info` (e.g., 'name' instead of 'station_name')
daily_rental = daily_rental.with_columns([
    pl.when(pl.col("end_station_name").is_null())
      .then(pl.col("name_station_info"))  # Updated to the renamed column
      .otherwise(pl.col("end_station_name"))
      .alias("end_station_name"),
    
    pl.when(pl.col("end_station_id").is_null())
      .then(pl.col("station_id_station_info"))  # Updated to the renamed column
      .otherwise(pl.col("end_station_id"))
      .alias("end_station_id")
])

# Drop unnecessary columns after the join (if any)
daily_rental = daily_rental.drop([col for col in daily_rental.columns if "_station_info" in col])

# Check the result
print(daily_rental)


shape: (18_007_247, 25)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ ride_id   ┆ rideable_ ┆ started_a ┆ ended_at  ┆ … ┆ station_i ┆ name_righ ┆ station_l ┆ station_ │
│ ---       ┆ type      ┆ t         ┆ ---       ┆   ┆ d_right   ┆ t         ┆ at        ┆ lng      │
│ str       ┆ ---       ┆ ---       ┆ str       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│           ┆ str       ┆ str       ┆           ┆   ┆ str       ┆ str       ┆ f64       ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ C04E20007 ┆ docked_bi ┆ 2020-12-0 ┆ 2020-12-0 ┆ … ┆ 08250172- ┆ 18th St & ┆ null      ┆ null     │
│ D039277   ┆ ke        ┆ 2         ┆ 2         ┆   ┆ 1f3f-11e7 ┆ Pennsylva ┆           ┆          │
│           ┆           ┆ 09:10:36  ┆ 09:24:12  ┆   ┆ -bf6b-386 ┆ nia Ave   ┆           ┆          │
│           ┆           ┆           ┆           ┆   ┆ 3bb…      ┆ N

In [None]:
daily_rental_check = daily_rental.with_columns(
    pl.when(
        ((pl.col("start_lat").is_null())&(pl.col("start_lat").is_null()))|
        ((pl.col("end_lat").is_null())&(pl.col("end_lat").is_null()))
        )
    .then(
        daily_rental.with_columns(
            pl.when(
                ((pl.col("start_lat").is_null())&(pl.col("start_lat").is_null()))|
                ((pl.col("end_lat").is_null())&(pl.col("end_lat").is_null()))
                )
        )
    )
    .otherwise(0)
    .alias('check')
)

print(daily_rental_check.filter(pl.col("check")==1).select('start_station_name','start_station_id','start_lat','start_lng','end_station_name','end_station_id','end_lat','end_lng'))

shape: (8, 8)
┌────────────┬────────────┬───────────┬───────────┬────────────┬───────────┬───────────┬───────────┐
│ start_stat ┆ start_stat ┆ start_lat ┆ start_lng ┆ end_statio ┆ end_stati ┆ end_lat   ┆ end_lng   │
│ ion_name   ┆ ion_id     ┆ ---       ┆ ---       ┆ n_name     ┆ on_id     ┆ ---       ┆ ---       │
│ ---        ┆ ---        ┆ f64       ┆ f64       ┆ ---        ┆ ---       ┆ f64       ┆ f64       │
│ str        ┆ str        ┆           ┆           ┆ str        ┆ str       ┆           ┆           │
╞════════════╪════════════╪═══════════╪═══════════╪════════════╪═══════════╪═══════════╪═══════════╡
│ Motivate   ┆ 32902      ┆ null      ┆ null      ┆ Motivate   ┆ 32900     ┆ 38.964406 ┆ -77.01075 │
│ Tech       ┆            ┆           ┆           ┆ BX Tech    ┆           ┆           ┆ 9         │
│ Office     ┆            ┆           ┆           ┆ office     ┆           ┆           ┆           │
│ Motivate   ┆ 32902      ┆ null      ┆ null      ┆ Motivate   ┆ 32902     ┆ 

In [10]:
daily_rental = daily_rental.with_columns([
    pl.col("start_station_id").cast(pl.Int64),  # Cast to string
    pl.col("end_station_id").cast(pl.Int64)      # Cast to string
])

daily_rental = daily_rental.with_columns([
    pl.col("start_station_id").cast(pl.Utf8).alias("start_station_id"),
    pl.col("end_station_id").cast(pl.Utf8).alias("end_station_id")
])

station_info = station_info_df.select([
    pl.col("short_name"),
    pl.col("name").alias("station_name"),
    pl.col("lat").alias("station_lat"),
    pl.col("lon").alias("station_lng")
])

print("daily_rental columns:", daily_rental.columns)
print("station_info columns:", station_info_df.columns)

daily_rental.describe()
station_info_df.describe()

daily_rental columns: ['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'member_casual']
station_info columns: ['short_name', 'capacity', 'region_id', 'station_id', 'lon', 'name', 'lat']


KeyboardInterrupt: 

In [8]:
def fill_station_id_by_lat_lng(df, station_info_df):
    """
    Fill missing station_id using latitude and longitude.
    
    Args:
        df (pl.DataFrame): DataFrame containing rental data.
        station_info_df (pl.DataFrame): DataFrame containing station information.
    
    Returns:
        pl.DataFrame: Updated DataFrame with missing station_id filled.
    """
    # Join by lat and lon and update the start_station_id and end_station_id
    df = df.join(station_info_df, left_on=["start_lat", "start_lng"], right_on=["lat", "lon"], how="left") \
        .with_columns([
            pl.when(pl.col("start_station_id").is_null())
              .then(pl.col("short_name"))
              .otherwise(pl.col("start_station_id"))
              .alias("start_station_id"),

            pl.when(pl.col("end_station_id").is_null())
              .then(pl.col("short_name"))
              .otherwise(pl.col("end_station_id"))
              .alias("end_station_id")
        ]) \
        .select([col for col in df.columns if col not in ["lat", "lon", "short_name"]])  # Drop unnecessary columns

    return df


In [9]:
def fill_station_id_by_name(df, station_name_dict):
    """
    Fill missing station_id using station_name.
    
    Args:
        df (pl.DataFrame): DataFrame containing rental data.
        station_name_dict (dict): Dictionary mapping station_name -> station_id.
    
    Returns:
        pl.DataFrame: Updated DataFrame with missing station_id filled.
    """
    # Use the map function to fill missing values based on station_name
    df = df.with_columns([
        pl.when(pl.col("start_station_id").is_null())
          .then(pl.col("start_station_name").map(lambda x: station_name_dict.get(x, None), return_dtype=pl.Int64))
          .otherwise(pl.col("start_station_id"))
          .alias("start_station_id"),
        
        pl.when(pl.col("end_station_id").is_null())
          .then(pl.col("end_station_name").map(lambda x: station_name_dict.get(x, None), return_dtype=pl.Int64))
          .otherwise(pl.col("end_station_id"))
          .alias("end_station_id")
    ])
    return df


In [10]:
def fill_station_name_by_id(df, station_id_dict):
    """
    Fill missing station_name using station_id.
    
    Args:
        df (pl.DataFrame): DataFrame containing rental data.
        station_id_dict (dict): Dictionary mapping station_id -> station_name.
    
    Returns:
        pl.DataFrame: Updated DataFrame with missing station_name filled.
    """
    # Use the map function to fill missing values based on station_id
    df = df.with_columns([
        pl.when(pl.col("start_station_name").is_null())
          .then(pl.col("start_station_id").map(lambda x: station_id_dict.get(x, None), return_dtype=pl.Utf8))
          .otherwise(pl.col("start_station_name"))
          .alias("start_station_name"),
        
        pl.when(pl.col("end_station_name").is_null())
          .then(pl.col("end_station_id").map(lambda x: station_id_dict.get(x, None), return_dtype=pl.Utf8))
          .otherwise(pl.col("end_station_name"))
          .alias("end_station_name")
    ])
    return df


In [11]:
# Assuming `daily_rental` is your Polars DataFrame and the dictionaries are defined
daily_rental = fill_station_id_by_lat_lng(daily_rental, station_info_df)
daily_rental = fill_station_id_by_name(daily_rental, station_name_dict)
daily_rental = fill_station_name_by_id(daily_rental, station_id_dict)

# Save the updated DataFrame to a CSV file
daily_rental.write_csv('daily_rental_updated.csv')


AttributeError: 'Expr' object has no attribute 'map'

In [None]:
# # Save the updated DataFrame to a CSV file
# daily_rental.write_csv("daily_rental_updated.csv")

# # OR Save to a Parquet file (more efficient for large datasets)
# daily_rental.write_parquet("daily_rental_updated.parquet")
