In [3]:
from pathlib import Path
from dotenv import load_dotenv

# Call load_dotenv() only when a ".env" file is present in the current
# working directory.
dotenv_file = Path(".env")
if dotenv_file.exists():
    load_dotenv()

In [5]:
import os
import polars as pl

from urllib.parse import urlsplit

# Connection string for the database, read from the DB_URI environment
# variable (raises KeyError when the variable is not set).
db_uri = os.environ["DB_URI"]

# Do not echo the raw URI: it embeds the password, and notebook outputs are
# saved alongside the code. Show only the non-secret parts as a sanity check.
_db_parts = urlsplit(db_uri)
print(
    f"db host={_db_parts.hostname!r} port={_db_parts.port!r} "
    f"database={_db_parts.path.lstrip('/')!r}"
)

# Smoke test: fetch a handful of rows to confirm the connection works.
pl.read_database_uri(
    query="SELECT * FROM vessel_history_clean LIMIT 5;", uri=db_uri, engine="adbc"
)

db_uri='postgresql://captain_py:********@demo01-staging-ferryland-20240702155133310700000001.cjiq6iow80rt.us-east-2.rds.amazonaws.com:5432/pyferries?options=-csearch_path%3Dproduction'
db_uri='postgresql://captain_py:********@demo01-staging-ferryland-20240702155133310700000001.cjiq6iow80rt.us-east-2.rds.amazonaws.com:5432/pyferries?options=-csearch_path%3Dproduction'


OperationalError: IO: [libpq] Failed to connect: connection to server at "demo01-staging-ferryland-20240702155133310700000001.cjiq6iow80rt.us-east-2.rds.amazonaws.com" (10.37.43.247), port 5432 failed: Operation timed out
	Is the server running on that host and accepting TCP/IP connections?


In [None]:
import os
import pins

# Posit Connect settings come from the environment (KeyError if unset).
# connect_url and connect_api_key are reused by later cells in this notebook,
# so this cell must actually run rather than stay commented out.
connect_url = os.environ["CONNECT_SERVER"]
connect_api_key = os.environ["CONNECT_API_KEY"]

board = pins.board_connect(server_url=connect_url, api_key=connect_api_key)

# Download the three cleaned datasets; the resulting *_path variables are
# passed to pl.read_parquet in a later cell.
vessel_history_path = board.pin_download("sam.edwardes/vessel_history_clean")
vessel_verbose_path = board.pin_download("sam.edwardes/vessel_verbose_clean")
weather_path = board.pin_download("sam.edwardes/terminal_weather_clean")

In [None]:
from posit.connect import Client

# Ask the Connect server for the current user's name; `username` is used
# further down to build the model pin name.
with Client(
    url=connect_url,
    api_key=connect_api_key,
) as client:
    current_user = client.me
    username = current_user.username
print(username)

In [None]:
import polars as pl

# Read the three downloaded parquet files, in the same order as before:
# vessel history, vessel details, terminal weather.
vessel_history, vessel_verbose, weather = (
    pl.read_parquet(parquet_path)
    for parquet_path in (vessel_history_path, vessel_verbose_path, weather_path)
)

In [None]:
# Inspect the weather frame with glimpse() before using it in the joins below.
weather.glimpse()

In [None]:
# Inspect the vessel history frame with glimpse(); it is the source of the
# ferry_trips table built in the next cell.
vessel_history.glimpse()

In [None]:
# Reusable expressions for the trip table.
trip_date = pl.col("Date")
# Departure delay in whole seconds: actual minus scheduled departure.
delay_seconds = (
    pl.col("ActualDepart") - pl.col("ScheduledDepart")
).dt.total_seconds()

# One row per sailing: identifiers, the delay, and calendar parts of "Date".
ferry_trips = vessel_history.select(
    pl.col("Vessel", "Departing", "Arriving"),
    delay_seconds.alias("Delay"),
    trip_date,
    trip_date.dt.year().alias("Year"),
    trip_date.dt.month().alias("Month"),
    trip_date.dt.weekday().alias("Weekday"),
    trip_date.dt.hour().alias("Hour"),
)
ferry_trips

In [None]:
# Histogram of "Delay" (seconds), restricted to -1800..7200 s,
# i.e. 30 minutes early to 2 hours late.
delay_bin_range = (-1800, 7200)
ferry_trips.plot.hist("Delay", bins=30, bin_range=delay_bin_range)

In [None]:
# Distribution of the log-transformed delay.
# Delays below 1 second (including negative values, i.e. early departures)
# are clamped to 1 so the logarithm is defined and never negative.
# A native clip() replaces the per-element Python lambda: same values,
# but evaluated inside polars instead of calling Python once per row.
ferry_trips.select(
    pl.col("Delay")
    .clip(lower_bound=1)
    .cast(pl.Float64)
    .log()
    .alias("LogDelay")
).plot.hist("LogDelay")

In [None]:
# Replace "Delay" with its log transform, "LogDelay" (the modelling target).
# Delays below 1 second (including negative values, i.e. early departures)
# are clamped to 1 so the logarithm is defined and never negative.
# A native clip() replaces the per-element Python lambda: same values,
# but evaluated inside polars instead of calling Python once per row.
# NOTE: this cell overwrites ferry_trips and drops "Delay", so it cannot be
# re-run without first re-running the cell that creates "Delay".
ferry_trips = ferry_trips.select(
    pl.exclude("Delay"),
    pl.col("Delay")
    .clip(lower_bound=1)
    .cast(pl.Float64)
    .log()
    .alias("LogDelay"),
)
ferry_trips

In [None]:
# Inspect the vessel details frame with glimpse() to see which columns are
# available for the ferry_info selection below.
vessel_verbose.glimpse()

In [None]:
# Display the vessel details frame itself (rich table output).
vessel_verbose

In [None]:
# Vessel attributes that are carried over unchanged, in output order.
vessel_spec_columns = [
    "ClassName",
    "SpeedInKnots",
    "EngineCount",
    "Horsepower",
    "MaxPassengerCount",
    "PassengerOnly",
    "FastFerry",
    "PropulsionInfo",
]

# Per-vessel lookup table: lower-cased name (the join key), the attributes
# above, and the build / rebuild dates reduced to their year.
ferry_info = vessel_verbose.select(
    pl.col("VesselName").str.to_lowercase(),
    pl.col(vessel_spec_columns),
    pl.col("YearBuilt", "YearRebuilt").dt.year(),
)

# Attach the vessel attributes to every trip; trips without a matching
# vessel are kept (left join).
# NOTE(review): the match relies on "Vessel" already being lower case in
# ferry_trips - confirm against the source data.
ferry_trips = ferry_trips.join(
    ferry_info,
    how="left",
    left_on="Vessel",
    right_on="VesselName",
    coalesce=True,
)

In [None]:
import polars.selectors as cs

# Two copies of the weather table, one per end of the trip, with every
# column prefixed so the two joins do not collide.
departing_weather = weather.rename(lambda col_name: f"departing_{col_name}")
arriving_weather = weather.rename(lambda col_name: f"arriving_{col_name}")

# Weather columns (matched by suffix) that are dropped after the joins.
dropped_weather_suffixes = (
    "latitude",
    "longitude",
    "generationtime_ms",
    "utc_offset_seconds",
    "timezone",
    "timezone_abbreviation",
    "elevation",
    "hourly_units",
)

# Round each trip's "Date" to the hour into a temporary "time" key, join the
# weather at the departing and arriving terminals on (terminal, time), then
# drop the unwanted weather columns and the temporary key.
ferry_trips = (
    ferry_trips.with_columns(pl.col("Date").dt.round("1h").alias("time"))
    .join(
        departing_weather,
        left_on=["Departing", "time"],
        right_on=["departing_terminal_name", "departing_time"],
        how="left",
        coalesce=True,
    )
    .join(
        arriving_weather,
        left_on=["Arriving", "time"],
        right_on=["arriving_terminal_name", "arriving_time"],
        how="left",
        coalesce=True,
    )
    .select(~cs.ends_with(*dropped_weather_suffixes))
    .drop("time")
)
ferry_trips

In [None]:
import polars.selectors as cs

# Show the string-typed columns, leaving out "Vessel".
string_columns = ferry_trips.select(cs.string())
string_columns.drop("Vessel")

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns passed to the model unchanged.
numeric_features = [
    "Month",
    "Weekday",
    "Hour",
    "SpeedInKnots",
    "EngineCount",
    "Horsepower",
    "MaxPassengerCount",
    "PassengerOnly",
    "FastFerry",
    "YearBuilt",
    "YearRebuilt",
    "departing_temperature_2m",
    "departing_precipitation",
    "departing_cloud_cover",
    "departing_wind_speed_10m",
    "departing_wind_direction_10m",
    "departing_wind_gusts_10m",
    "arriving_temperature_2m",
    "arriving_precipitation",
    "arriving_cloud_cover",
    "arriving_wind_speed_10m",
    "arriving_wind_direction_10m",
    "arriving_wind_gusts_10m",
]
# Columns that are one-hot encoded.
categorical_features = [
    "Departing",
    "Arriving",
    "ClassName",
    "PropulsionInfo",
    "departing_weather_code",
    "arriving_weather_code",
]


# Column-wise preprocessing: numeric columns pass through, categorical
# columns are one-hot encoded.
# handle_unknown="ignore" makes a category that was not present when fitting
# (e.g. a rare weather code that only lands in the test split, or a new value
# at serving time) encode as all zeros instead of raising an error.
preprocessor = ColumnTransformer(
    [
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random-forest regressor used as the final pipeline step; the fixed
# random_state keeps repeated runs comparable.
rf = RandomForestRegressor(
    n_jobs=-1,
    random_state=2,
    verbose=2,
)

In [None]:
from sklearn.pipeline import Pipeline

# Full model: preprocessing followed by the random forest.
pipeline_steps = [
    ("preprocess", preprocessor),
    ("random-forest", rf),
]
model = Pipeline(pipeline_steps)

In [None]:
from sklearn.model_selection import train_test_split

# Keep only rows without any null, and derive features and target from that
# same frame.
# NOTE(review): drop_nulls() looks at every column, so a row with a null in
# any column (e.g. "YearRebuilt") is removed - confirm this is intended.
complete_trips = ferry_trips.drop_nulls()

# Features: everything except identifiers, the raw date parts not used by the
# model, and the target itself.
X = complete_trips.drop("Vessel", "Date", "Year", "LogDelay")
y = complete_trips["LogDelay"]

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=2, test_size=0.2
)
print(f"Nrows training data: {X_train.shape[0]}")
print(f"Nrows testing data:  {X_test.shape[0]}")

In [None]:
from urllib.parse import quote

# Assemble the PostgreSQL URI from its parts (KeyError if any variable is
# unset). User and password are percent-encoded so characters such as "@",
# ":" or "/" in them cannot corrupt the URI. The f-string avoids nesting the
# same quote type, which is a SyntaxError on Python versions before 3.12.
db_user = quote(os.environ["DB_USER"], safe="")
db_password = quote(os.environ["DB_PASSWORD"], safe="")
uri = (
    f"postgresql://{db_user}:{db_password}"
    f"@{os.environ['DB_HOST']}:{os.environ['DB_PORT']}/{os.environ['DB_NAME']}"
)

# Persist the held-out features together with their target as "test_data",
# replacing the table if it already exists.
X_test.with_columns(y_test).write_database(
    table_name="test_data", connection=uri, engine="adbc", if_table_exists="replace"
)

In [None]:
# Fit on the training split, then report the pipeline's score on the
# held-out split. Both feature frames are converted to pandas so that fitting
# and scoring go through the same column-name based preprocessing path
# (previously only the training frame was converted).
model.fit(X_train.to_pandas(), y_train)
model.score(X_test.to_pandas(), y_test)

In [None]:
from vetiver import VetiverModel

# Wrap the fitted pipeline for deployment, named after the current user and
# described by the (pandas) feature frame as prototype data.
model_pin_name = f"{username}/ferry_delay"
prototype_frame = X.to_pandas()
v = VetiverModel(
    model,
    model_name=model_pin_name,
    prototype_data=prototype_frame,
)

In [None]:
import pins
import vetiver

# Board on the Connect server that stores the model pin.
# NOTE(review): allow_pickle_read=True opts in to reading pickled pins; only
# appropriate when the pins on this board are trusted.
model_board = pins.board_connect(
    api_key=connect_api_key,
    server_url=connect_url,
    allow_pickle_read=True,
)

# Write the vetiver model to the board.
vetiver.vetiver_pin_write(model_board, model=v)

In [None]:
from rsconnect.api import RSConnectServer

# Deploy the pinned model to the Connect server, referencing the pin by the
# same "<username>/ferry_delay" name it was created with.
connect_server = RSConnectServer(api_key=connect_api_key, url=connect_url)
deployed_pin_name = f"{username}/ferry_delay"
vetiver.deploy_rsconnect(
    board=model_board,
    connect_server=connect_server,
    pin_name=deployed_pin_name,
)

In [None]:
# Call vetiver's model-card template helper as the final step.
vetiver.templates.model_card()