In [1]:
import polars as pl
import numpy as np
import random
from datetime import datetime, timedelta

pl.Config.set_tbl_cols(-1)
pl.Config.set_tbl_rows(20)

polars.config.Config

# Descripcion

Los datos son notificaciones de dispositivos GPS en Mexico. En promedio generan notificaciones automatizadas cada 5 minutos si el carro esta encendido, y 30 si esta apagado.  

Cada notificacion esta acompannada de un evento de lo que esta ocurriendo, y trae la latitud y longitud.  

El objetico es predecir si un vehiculo esta siendo robado de acuerdo a sus notificaciones, por lo que el primer paso seria limpiar datos y hacer ingenieria de variables.

Trata de hacerlo **lazy** si puedes.

In [8]:
def generate_dummy_data(num_cars, start_time, end_time, working_hours_interval, non_working_hours_interval):
    data = []
    min_lat, max_lat = 14.5388, 32.7186
    min_lon, max_lon = -118.4662, -86.7104

    for car_id in range(num_cars):
        current_time = start_time
        latitude = random.uniform(min_lat, max_lat)
        longitude = random.uniform(min_lon, max_lon)

        while current_time < end_time:
            interval = working_hours_interval if (current_time.weekday() < 5 and 9 <= current_time.hour < 17) else non_working_hours_interval

            if random.random() < 0.99:
                notification = random.choice(["low_fuel", "tire_pressure", "engine_check", None])
                data.append((f"car_{car_id}", current_time.isoformat(), latitude, longitude, notification))

            while True:
                additional_time = current_time + timedelta(minutes=random.expovariate(1 / (interval / 2)))
                if additional_time >= current_time + timedelta(minutes=interval):
                    break
                notification = random.choice(["low_fuel", "tire_pressure", "engine_check", None])
                data.append((f"car_{car_id}", additional_time.isoformat(), latitude, longitude, notification))

            latitude += random.uniform(-0.01, 0.01)
            longitude += random.uniform(-0.01, 0.01)

            if random.random() < 0.01:
                burst_start = current_time + timedelta(minutes=random.uniform(0, interval))
                burst_end = burst_start + timedelta(minutes=5)
                while current_time < burst_end:
                    notification = random.choice(["low_fuel", "tire_pressure", "engine_check", None])
                    data.append((f"car_{car_id}", current_time.isoformat(), latitude, longitude, notification))
                    current_time += timedelta(seconds=random.uniform(1, 10))

            current_time += timedelta(minutes=interval)

    return pl.DataFrame(
        {
            "car_id": [record[0] for record in data],
            "timestamp": [record[1] for record in data],
            "latitude": [record[2] for record in data],
            "longitude": [record[3] for record in data],
            "notification": [record[4] for record in data],
        }
    ).lazy()

In [10]:
# Parameters
num_cars = 1000
start_time = datetime(2023, 1, 1, 0, 0, 0)
end_time = start_time + timedelta(weeks=1)
working_hours_interval = 5  # 5-minute intervals during working hours
non_working_hours_interval = 30  # 30-minute intervals during non-working hours

# Generate dummy data
data = generate_dummy_data(
    num_cars, start_time, end_time, working_hours_interval, non_working_hours_interval
)

# Display generated data
print(data.head())
print(data.collect().limit(3))

naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

SLICE[offset: 0, len: 5]
  DF ["car_id", "timestamp", "latitude", "longitude"]; PROJECT */5 COLUMNS; SELECTION: None
shape: (3, 5)
┌────────┬────────────────────────────┬───────────┬─────────────┬───────────────┐
│ car_id ┆ timestamp                  ┆ latitude  ┆ longitude   ┆ notification  │
│ ---    ┆ ---                        ┆ ---       ┆ ---         ┆ ---           │
│ str    ┆ str                        ┆ f64       ┆ f64         ┆ str           │
╞════════╪════════════════════════════╪═══════════╪═════════════╪═══════════════╡
│ car_0  ┆ 2023-01-01T00:00:00        ┆ 23.844921 ┆ -117.766906 ┆ low_fuel      │
│ car_0  ┆ 2023-01-01T00:09:24.774848 ┆ 23.844921 ┆ -117.766906 ┆ low_fuel      │
│ car_0  ┆ 2023-01-01T00:06:09.968995 ┆ 23.844921 ┆ -117.766906 ┆ tire_pressure │
└────────┴────────────────────────────┴───────────┴─────────────┴───────────────┘


## Limpieza de datos

### Timestamp

Convierte el `timestamp` que actualmente es string a formato de tiempo en polars

In [11]:
# Ensure the `timestamp` column is in a uniform format for parsing
data = data.with_columns(
    pl.when(pl.col("timestamp").str.contains(r"\."))
    .then(pl.col("timestamp"))
    .otherwise(pl.col("timestamp") + ".000000")
    .alias("timestamp")
)

# Parse the `timestamp` column into a Polars Datetime type
data = data.with_columns(
    pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S.%f")
)

# Collect and print the processed data
print(data.collect())

  pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S.%f")


shape: (6_297_257, 5)
┌─────────┬────────────────────────────┬───────────┬─────────────┬───────────────┐
│ car_id  ┆ timestamp                  ┆ latitude  ┆ longitude   ┆ notification  │
│ ---     ┆ ---                        ┆ ---       ┆ ---         ┆ ---           │
│ str     ┆ datetime[μs]               ┆ f64       ┆ f64         ┆ str           │
╞═════════╪════════════════════════════╪═══════════╪═════════════╪═══════════════╡
│ car_0   ┆ 2023-01-01 00:00:00        ┆ 23.844921 ┆ -117.766906 ┆ low_fuel      │
│ car_0   ┆ 2023-01-01 00:09:24.000774 ┆ 23.844921 ┆ -117.766906 ┆ low_fuel      │
│ car_0   ┆ 2023-01-01 00:06:09.000968 ┆ 23.844921 ┆ -117.766906 ┆ tire_pressure │
│ car_0   ┆ 2023-01-01 00:08:07.000513 ┆ 23.844921 ┆ -117.766906 ┆ low_fuel      │
│ car_0   ┆ 2023-01-01 00:08:26.000620 ┆ 23.844921 ┆ -117.766906 ┆ low_fuel      │
│ car_0   ┆ 2023-01-01 00:09:17.000891 ┆ 23.844921 ┆ -117.766906 ┆ engine_check  │
│ car_0   ┆ 2023-01-01 00:03:06.000677 ┆ 23.844921 ┆ -117.766906 

### Ingenieria de variables

Dado que va a entrar a un modelo de machine learning es encesario que todas las variables sean numericas, y esten en formnato tidy. Cada observacion en una fila, y cada variable en una columna. Por lo tanto se decidio crear estadisticos y agregar los datos a intervalos uniformes de `x` minutos.  

Por ejemplo, colapsar toda la informacion que ocurrion en el intervalo, como el numero de notificaciones en esos 5 minutos, el promedio entre notificaciones, y el tipo de notificaciones.

Existen varias formas de hacer esto, puedes hacerlo con `group_by` primero para crear las nuevas variables, o `group_by` (`rolling`, `dynamic`) usando operaciones sobre listas. Utiliza claude o chat_gpt

1. Crea una nueva variable que compute la diferencia de tiempo entre notificaciones del mismo vehiculo. Piensa como lo vas a hacer. Llama a esta variable `notification_time`
   


In [12]:
# Add a new column `notification_time`
data_diff = data.clone()

data = data.sort(["car_id", "timestamp"])
data_diff = (
    data_diff
    .sort(["car_id", "timestamp"])
    .group_by("car_id", maintain_order=True)
    .agg([pl.col("timestamp").diff().alias("notification_time")])
    .explode("notification_time")
    .drop("car_id")
)

data = pl.concat([data, data_diff], how="horizontal")

# Collect and print the processed data
print(data.collect())

shape: (6_297_257, 6)
┌─────────┬─────────────────────┬───────────┬─────────────┬───────────────┬───────────────────┐
│ car_id  ┆ timestamp           ┆ latitude  ┆ longitude   ┆ notification  ┆ notification_time │
│ ---     ┆ ---                 ┆ ---       ┆ ---         ┆ ---           ┆ ---               │
│ str     ┆ datetime[μs]        ┆ f64       ┆ f64         ┆ str           ┆ duration[μs]      │
╞═════════╪═════════════════════╪═══════════╪═════════════╪═══════════════╪═══════════════════╡
│ car_0   ┆ 2023-01-01 00:00:00 ┆ 23.844921 ┆ -117.766906 ┆ low_fuel      ┆ null              │
│ car_0   ┆ 2023-01-01          ┆ 23.844921 ┆ -117.766906 ┆ null          ┆ 1m 7s 282µs       │
│         ┆ 00:01:07.000282     ┆           ┆             ┆               ┆                   │
│ car_0   ┆ 2023-01-01          ┆ 23.844921 ┆ -117.766906 ┆ engine_check  ┆ 1m 59s 395µs      │
│         ┆ 00:03:06.000677     ┆           ┆             ┆               ┆                   │
│ car_0   ┆ 2023-0

2. Crea una nueva variable que compute la distancia que viajo el vehiculo desde la ultima notificacion. Llamala `distance`

In [13]:
import numpy as np

def haversine(lat1, lon1, lat2, lon2):
    R = 6371
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

data = (
    data
    .sort(["car_id", "timestamp"])
    .with_columns(
        pl.col("latitude").shift(1).over("car_id").alias("prev_latitude"),
        pl.col("longitude").shift(1).over("car_id").alias("prev_longitude")
    )
    .with_columns(
        pl.struct(["latitude", "longitude", "prev_latitude", "prev_longitude"]).map_elements(
            lambda row: haversine(
                row["latitude"], row["longitude"], row["prev_latitude"], row["prev_longitude"]
            ) if row["prev_latitude"] is not None else None,
            return_dtype=pl.Float64
        ).alias("distance")
    )
    .drop(["prev_latitude", "prev_longitude"])
)

print(data.collect().limit(5))


shape: (5, 7)
┌────────┬──────────────────┬───────────┬─────────────┬───────────────┬─────────────────┬──────────┐
│ car_id ┆ timestamp        ┆ latitude  ┆ longitude   ┆ notification  ┆ notification_ti ┆ distance │
│ ---    ┆ ---              ┆ ---       ┆ ---         ┆ ---           ┆ me              ┆ ---      │
│ str    ┆ datetime[μs]     ┆ f64       ┆ f64         ┆ str           ┆ ---             ┆ f64      │
│        ┆                  ┆           ┆             ┆               ┆ duration[μs]    ┆          │
╞════════╪══════════════════╪═══════════╪═════════════╪═══════════════╪═════════════════╪══════════╡
│ car_0  ┆ 2023-01-01       ┆ 23.844921 ┆ -117.766906 ┆ low_fuel      ┆ null            ┆ null     │
│        ┆ 00:00:00         ┆           ┆             ┆               ┆                 ┆          │
│ car_0  ┆ 2023-01-01       ┆ 23.844921 ┆ -117.766906 ┆ null          ┆ 1m 7s 282µs     ┆ 0.0      │
│        ┆ 00:01:07.000282  ┆           ┆             ┆               ┆      

3. Crea intervalos de `x` minutos por carro. Como el numero de notificaciones en esos intervalos no es uniforme tienes que buscar funciones de polars especificas, pero ademas tienen que ser por vehiculo, pues tienen que ser del mismo. Revisa las funciones de `group_by` `dynamic` y `rolling`.
   1. Computa la media, mediana, varianza, max y min de `notification_time` los intervalos de `x` minutos
   2. Computa la media, mediana, varianza, max y min de `distance`


In [14]:
x = 55

df_intervals = data.clone()
df_intervals = (
    df_intervals
    .sort(["car_id", "timestamp"])
    .group_by_dynamic(
        index_column="timestamp",
        every=f"{x}m",
        by="car_id",
        include_boundaries=True
    )
    .agg([
        pl.col("notification_time").mean().alias("notification_time_mean"),
        pl.col("notification_time").median().alias("notification_time_median"),
        pl.col("notification_time").var().alias("notification_time_variance"),
        pl.col("notification_time").max().alias("notification_time_max"),
        pl.col("notification_time").min().alias("notification_time_min"),
        pl.col("distance").mean().alias("distance_mean"),
        pl.col("distance").median().alias("distance_median"),
        pl.col("distance").var().alias("distance_variance"),
        pl.col("distance").max().alias("distance_max"),
        pl.col("distance").min().alias("distance_min"),
    ])
)

print(df_intervals.collect())


  .group_by_dynamic(


shape: (184_198, 14)
┌─────┬─────┬─────┬─────┬─────┬─────┬─────┬────────┬───────┬───────┬───────┬───────┬───────┬───────┐
│ car ┆ _lo ┆ _up ┆ tim ┆ not ┆ not ┆ not ┆ notifi ┆ notif ┆ dista ┆ dista ┆ dista ┆ dista ┆ dista │
│ _id ┆ wer ┆ per ┆ est ┆ ifi ┆ ifi ┆ ifi ┆ cation ┆ icati ┆ nce_m ┆ nce_m ┆ nce_v ┆ nce_m ┆ nce_m │
│ --- ┆ _bo ┆ _bo ┆ amp ┆ cat ┆ cat ┆ cat ┆ _time_ ┆ on_ti ┆ ean   ┆ edian ┆ arian ┆ ax    ┆ in    │
│ str ┆ und ┆ und ┆ --- ┆ ion ┆ ion ┆ ion ┆ max    ┆ me_mi ┆ ---   ┆ ---   ┆ ce    ┆ ---   ┆ ---   │
│     ┆ ary ┆ ary ┆ dat ┆ _ti ┆ _ti ┆ _ti ┆ ---    ┆ n     ┆ f64   ┆ f64   ┆ ---   ┆ f64   ┆ f64   │
│     ┆ --- ┆ --- ┆ eti ┆ me_ ┆ me_ ┆ me_ ┆ durati ┆ ---   ┆       ┆       ┆ f64   ┆       ┆       │
│     ┆ dat ┆ dat ┆ me[ ┆ mea ┆ med ┆ var ┆ on[μs] ┆ durat ┆       ┆       ┆       ┆       ┆       │
│     ┆ eti ┆ eti ┆ μs] ┆ n   ┆ ian ┆ ian ┆        ┆ ion[μ ┆       ┆       ┆       ┆       ┆       │
│     ┆ me[ ┆ me[ ┆     ┆ --- ┆ --- ┆ ce  ┆        ┆ s]    ┆       ┆  