In [135]:
! pip install numpy



In [136]:
import polars as pl
import numpy as np
import random
from datetime import datetime, timedelta

# Descripcion

Los datos son notificaciones de dispositivos GPS en Mexico. En promedio generan notificaciones automatizadas cada 5 minutos si el carro esta encendido, y 30 si esta apagado.  

Cada notificacion esta acompannada de un evento de lo que esta ocurriendo, y trae la latitud y longitud.  

El objetico es predecir si un vehiculo esta siendo robado de acuerdo a sus notificaciones, por lo que el primer paso seria limpiar datos y hacer ingenieria de variables.

Trata de hacerlo **lazy** si puedes.

In [137]:
def generate_dummy_data(num_cars, start_time, end_time, working_hours_interval, non_working_hours_interval):
    data = []

    # Define the latitude and longitude ranges for Mexico
    min_latitude, max_latitude = 14.5388, 32.7186
    min_longitude, max_longitude = -118.4662, -86.7104

    for car_id in range(num_cars):
        current_time = start_time

        # Generate random initial latitude and longitude for each car
        latitude = random.uniform(min_latitude, max_latitude)
        longitude = random.uniform(min_longitude, max_longitude)

        while current_time < end_time:
            if current_time.weekday() < 5 and 9 <= current_time.hour < 17:
                # Working hours (Monday to Friday, 9 AM to 5 PM)
                interval = working_hours_interval
            else:
                # Non-working hours
                interval = non_working_hours_interval

            # Generate notification with 99% probability
            if random.random() < 0.99:
                notification = random.choice(["low_fuel", "tire_pressure", "engine_check", None])
                data.append((f"car_{car_id}", current_time.isoformat(), latitude, longitude, notification))

            # Generate additional notifications between intervals
            while True:
                additional_interval = random.expovariate(1 / (interval / 2))
                additional_time = current_time + timedelta(minutes=additional_interval)
                if additional_time >= current_time + timedelta(minutes=interval):
                    break
                notification = random.choice(["low_fuel", "tire_pressure", "engine_check", None])
                data.append((f"car_{car_id}", additional_time.isoformat(), latitude, longitude, notification))

            # Update latitude and longitude for car movement
            latitude += random.uniform(-0.01, 0.01)
            longitude += random.uniform(-0.01, 0.01)

            # Check if the car is among the 1% that can have 100 notifications within 5 minutes
            if random.random() < 0.01:
                burst_start_time = current_time + timedelta(minutes=random.uniform(0, interval))
                burst_end_time = burst_start_time + timedelta(minutes=5)
                while current_time < burst_end_time:
                    notification = random.choice(["low_fuel", "tire_pressure", "engine_check", None])
                    data.append((f"car_{car_id}", current_time.isoformat(), latitude, longitude, notification))
                    current_time += timedelta(seconds=random.uniform(1, 10))

            current_time += timedelta(minutes=interval)

    # Create a Polars DataFrame from the generated data
    df = pl.DataFrame(
        {
            "car_id": [record[0] for record in data],
            "timestamp": [record[1] for record in data],
            "latitude": [record[2] for record in data],
            "longitude": [record[3] for record in data],
            "notification": [record[4] for record in data],
        }
    )

    return df.lazy()

In [138]:
num_cars = 1000
start_time = datetime(2023, 1, 1, 0, 0, 0)  # Start of the week
end_time = start_time + timedelta(weeks=1)  # End of the week
working_hours_interval = 5  # Interval of 5 minutes during working hours
non_working_hours_interval = 30  # Interval of 30 minutes during non-working hours

# Generate the dummy data
data = generate_dummy_data(num_cars, start_time, end_time, working_hours_interval, non_working_hours_interval)

# Print the first few rows of the generated data
print(data.head())

naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

SLICE[offset: 0, len: 5]
  DF ["car_id", "timestamp", "latitude", "longitude"]; PROJECT */5 COLUMNS; SELECTION: None


In [139]:
data.collect()

car_id,timestamp,latitude,longitude,notification
str,str,f64,f64,str
"""car_0""","""2023-01-01T00:00:00""",23.942326,-106.280889,"""engine_check"""
"""car_0""","""2023-01-01T00:10:10.408727""",23.942326,-106.280889,"""tire_pressure"""
"""car_0""","""2023-01-01T00:30:00""",23.934415,-106.289279,
"""car_0""","""2023-01-01T00:49:30.390179""",23.934415,-106.289279,"""tire_pressure"""
"""car_0""","""2023-01-01T00:58:38.675105""",23.934415,-106.289279,"""tire_pressure"""
…,…,…,…,…
"""car_999""","""2023-01-07T22:38:41.574654""",26.825027,-88.032093,"""engine_check"""
"""car_999""","""2023-01-07T23:04:39.630977""",26.825385,-88.033536,"""engine_check"""
"""car_999""","""2023-01-07T23:34:39.630977""",26.825386,-88.03489,
"""car_999""","""2023-01-07T23:43:46.323022""",26.825386,-88.03489,


## Limpieza de datos

### Timestamp

Convierte el `timestamp` que actualmente es string a formato de tiempo en polars

In [140]:
data = data.with_columns(
    pl.col("timestamp")
    .str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S.%f", strict=False)
    .alias("timestamp")
)


  .str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S.%f", strict=False)


In [141]:
data.collect()

car_id,timestamp,latitude,longitude,notification
str,datetime[μs],f64,f64,str
"""car_0""",,23.942326,-106.280889,"""engine_check"""
"""car_0""",2023-01-01 00:10:10.000408,23.942326,-106.280889,"""tire_pressure"""
"""car_0""",,23.934415,-106.289279,
"""car_0""",2023-01-01 00:49:30.000390,23.934415,-106.289279,"""tire_pressure"""
"""car_0""",2023-01-01 00:58:38.000675,23.934415,-106.289279,"""tire_pressure"""
…,…,…,…,…
"""car_999""",2023-01-07 22:38:41.000574,26.825027,-88.032093,"""engine_check"""
"""car_999""",2023-01-07 23:04:39.000630,26.825385,-88.033536,"""engine_check"""
"""car_999""",2023-01-07 23:34:39.000630,26.825386,-88.03489,
"""car_999""",2023-01-07 23:43:46.000323,26.825386,-88.03489,


### Ingenieria de variables

Dado que va a entrar a un modelo de machine learning es encesario que todas las variables sean numericas, y esten en formnato tidy. Cada observacion en una fila, y cada variable en una columna. Por lo tanto se decidio crear estadisticos y agregar los datos a intervalos uniformes de `x` minutos.  

Por ejemplo, colapsar toda la informacion que ocurrion en el intervalo, como el numero de notificaciones en esos 5 minutos, el promedio entre notificaciones, y el tipo de notificaciones.

Existen varias formas de hacer esto, puedes hacerlo con `group_by` primero para crear las nuevas variables, o `group_by` (`rolling`, `dynamic`) usando operaciones sobre listas. Utiliza claude o chat_gpt

1. Crea una nueva variable que compute la diferencia de tiempo entre notificaciones del mismo vehiculo. Piensa como lo vas a hacer. Llama a esta variable `notification_time`
   


In [142]:
#dynamic fija el mundo en 5 minutos, el rolling se para en cada observación y 
#se fija 5 minutos atrás
# Compute notification_time and distance
data = data.with_columns(
    pl.col("timestamp").diff().over("car_id").alias("notification_time")
 )



In [143]:
data.collect()

car_id,timestamp,latitude,longitude,notification,notification_time
str,datetime[μs],f64,f64,str,duration[μs]
"""car_0""",,23.942326,-106.280889,"""engine_check""",
"""car_0""",2023-01-01 00:10:10.000408,23.942326,-106.280889,"""tire_pressure""",
"""car_0""",,23.934415,-106.289279,,
"""car_0""",2023-01-01 00:49:30.000390,23.934415,-106.289279,"""tire_pressure""",
"""car_0""",2023-01-01 00:58:38.000675,23.934415,-106.289279,"""tire_pressure""",9m 8s 285µs
…,…,…,…,…,…
"""car_999""",2023-01-07 22:38:41.000574,26.825027,-88.032093,"""engine_check""",-20m -32s -41µs
"""car_999""",2023-01-07 23:04:39.000630,26.825385,-88.033536,"""engine_check""",25m 58s 56µs
"""car_999""",2023-01-07 23:34:39.000630,26.825386,-88.03489,,30m
"""car_999""",2023-01-07 23:43:46.000323,26.825386,-88.03489,,9m 6s 999693µs


2. Crea una nueva variable que compute la distancia que viajo el vehiculo desde la ultima notificacion. Llamala `distance`

In [144]:
# Radius of Earth in kilometers
R = 6371.0

# Convert latitude and longitude columns to radians
data = data.with_columns([
    (pl.col("latitude") * np.pi / 180).alias("lat_rad"),
    (pl.col("longitude") * np.pi / 180).alias("lon_rad"),
])

# Calculate Haversine distance with correct usage of sin, cos, and arcsin as chained methods
data = data.with_columns([
    (2 * R * (
        (pl.col("lat_rad").shift(-1) - pl.col("lat_rad")).sin().pow(2) +
        (pl.col("lat_rad").cos() * pl.col("lat_rad").shift(-1).cos() *
         (pl.col("lon_rad").shift(-1) - pl.col("lon_rad")).sin().pow(2))
    ).sqrt().arcsin()).over("car_id").alias("distance")
])

# Drop the intermediate columns if no longer needed
data = data.drop(["lat_rad", "lon_rad"])

# Collect and display the results
data_df = data.collect()
data_df


car_id,timestamp,latitude,longitude,notification,notification_time,distance
str,datetime[μs],f64,f64,str,duration[μs],f64
"""car_0""",,23.942326,-106.280889,"""engine_check""",,0.0
"""car_0""",2023-01-01 00:10:10.000408,23.942326,-106.280889,"""tire_pressure""",,2.450245
"""car_0""",,23.934415,-106.289279,,,0.0
"""car_0""",2023-01-01 00:49:30.000390,23.934415,-106.289279,"""tire_pressure""",,0.0
"""car_0""",2023-01-01 00:58:38.000675,23.934415,-106.289279,"""tire_pressure""",9m 8s 285µs,0.0
…,…,…,…,…,…,…
"""car_999""",2023-01-07 22:38:41.000574,26.825027,-88.032093,"""engine_check""",-20m -32s -41µs,0.297365
"""car_999""",2023-01-07 23:04:39.000630,26.825385,-88.033536,"""engine_check""",25m 58s 56µs,0.268635
"""car_999""",2023-01-07 23:34:39.000630,26.825386,-88.03489,,30m,0.0
"""car_999""",2023-01-07 23:43:46.000323,26.825386,-88.03489,,9m 6s 999693µs,0.0


In [145]:
#checo que no hice una estupidez
data.filter(pl.col("distance") > 0).collect()

car_id,timestamp,latitude,longitude,notification,notification_time,distance
str,datetime[μs],f64,f64,str,duration[μs],f64
"""car_0""",2023-01-01 00:10:10.000408,23.942326,-106.280889,"""tire_pressure""",,2.450245
"""car_0""",2023-01-01 00:34:33.000875,23.934415,-106.289279,"""low_fuel""",-19m -7s -999589µs,2.286143
"""car_0""",2023-01-01 01:02:19.000377,23.943221,-106.283476,"""low_fuel""",-1m -28s -999905µs,1.845762
"""car_0""",2023-01-01 01:51:42.000530,23.950506,-106.279124,"""low_fuel""",-6m -51s -999688µs,1.420448
"""car_0""",2023-01-01 02:09:01.000802,23.955025,-106.274186,"""engine_check""",6m 22s,1.876723
…,…,…,…,…,…,…
"""car_999""",2023-01-07 20:46:10.000650,26.821184,-88.049384,"""engine_check""",11m 31s 20µs,1.234179
"""car_999""",2023-01-07 21:08:00.000734,26.820049,-88.043297,,3m 21s 104µs,1.450274
"""car_999""",2023-01-07 22:13:03.000694,26.821351,-88.036137,,3m 44s 999986µs,1.145487
"""car_999""",2023-01-07 22:38:41.000574,26.825027,-88.032093,"""engine_check""",-20m -32s -41µs,0.297365


3. Crea intervalos de `x` minutos por carro. Como el numero de notificaciones en esos intervalos no es uniforme tienes que buscar funciones de polars especificas, pero ademas tienen que ser por vehiculo, pues tienen que ser del mismo. Revisa las funciones de `group_by` `dynamic` y `rolling`.
   1. Computa la media, mediana, varianza, max y min de `notification_time` los intervalos de `x` minutos
   2. Computa la media, mediana, varianza, max y min de `distance`


In [146]:
# Step 1: Fill nulls, ensuring `notification_time` stays as `duration[μs]`
data = data.with_columns([
    pl.col("timestamp").fill_null(strategy="forward").cast(pl.Datetime)
])

# Check if there are any remaining null values in 'timestamp'
null_timestamps = data.filter(pl.col("timestamp").is_null()).collect()
print("Rows with null timestamps:", null_timestamps.shape[0])

Rows with null timestamps: 1


In [147]:
null_timestamps

car_id,timestamp,latitude,longitude,notification,notification_time,distance
str,datetime[μs],f64,f64,str,duration[μs],f64
"""car_0""",,23.942326,-106.280889,"""engine_check""",,0.0


In [148]:
#el unico que nos queda lo rellenamos backqrds aunque en producción sea perdida de tiempo
data = data.with_columns([
    pl.col("timestamp").fill_null(strategy="backward").cast(pl.Datetime)
])

In [149]:

# Check if there are any remaining null values in 'timestamp'
null_timestamps = data.filter(pl.col("timestamp").is_null()).collect()
print("Rows with null timestamps:", null_timestamps.shape[0])
null_timestamps

Rows with null timestamps: 0


car_id,timestamp,latitude,longitude,notification,notification_time,distance
str,datetime[μs],f64,f64,str,duration[μs],f64


In [150]:
data.filter(pl.col("car_id").is_null()).collect()

car_id,timestamp,latitude,longitude,notification,notification_time,distance
str,datetime[μs],f64,f64,str,duration[μs],f64


In [151]:
data = data.with_columns([
    pl.col("notification_time").fill_null(pl.duration(microseconds=0)),
    pl.col("distance").fill_null(0.0)
])
data.collect()

car_id,timestamp,latitude,longitude,notification,notification_time,distance
str,datetime[μs],f64,f64,str,duration[μs],f64
"""car_0""",2023-01-01 00:10:10.000408,23.942326,-106.280889,"""engine_check""",0µs,0.0
"""car_0""",2023-01-01 00:10:10.000408,23.942326,-106.280889,"""tire_pressure""",0µs,2.450245
"""car_0""",2023-01-01 00:10:10.000408,23.934415,-106.289279,,0µs,0.0
"""car_0""",2023-01-01 00:49:30.000390,23.934415,-106.289279,"""tire_pressure""",0µs,0.0
"""car_0""",2023-01-01 00:58:38.000675,23.934415,-106.289279,"""tire_pressure""",9m 8s 285µs,0.0
…,…,…,…,…,…,…
"""car_999""",2023-01-07 22:38:41.000574,26.825027,-88.032093,"""engine_check""",-20m -32s -41µs,0.297365
"""car_999""",2023-01-07 23:04:39.000630,26.825385,-88.033536,"""engine_check""",25m 58s 56µs,0.268635
"""car_999""",2023-01-07 23:34:39.000630,26.825386,-88.03489,,30m,0.0
"""car_999""",2023-01-07 23:43:46.000323,26.825386,-88.03489,,9m 6s 999693µs,0.0


In [152]:
data.filter(pl.col("notification_time").is_null()).collect()

car_id,timestamp,latitude,longitude,notification,notification_time,distance
str,datetime[μs],f64,f64,str,duration[μs],f64


In [153]:
data.filter(pl.col("distance").is_null()).collect()

car_id,timestamp,latitude,longitude,notification,notification_time,distance
str,datetime[μs],f64,f64,str,duration[μs],f64


In [154]:
data.sort(["car_id", "timestamp"])
data.collect()

car_id,timestamp,latitude,longitude,notification,notification_time,distance
str,datetime[μs],f64,f64,str,duration[μs],f64
"""car_0""",2023-01-01 00:10:10.000408,23.942326,-106.280889,"""engine_check""",0µs,0.0
"""car_0""",2023-01-01 00:10:10.000408,23.942326,-106.280889,"""tire_pressure""",0µs,2.450245
"""car_0""",2023-01-01 00:10:10.000408,23.934415,-106.289279,,0µs,0.0
"""car_0""",2023-01-01 00:49:30.000390,23.934415,-106.289279,"""tire_pressure""",0µs,0.0
"""car_0""",2023-01-01 00:58:38.000675,23.934415,-106.289279,"""tire_pressure""",9m 8s 285µs,0.0
…,…,…,…,…,…,…
"""car_999""",2023-01-07 22:38:41.000574,26.825027,-88.032093,"""engine_check""",-20m -32s -41µs,0.297365
"""car_999""",2023-01-07 23:04:39.000630,26.825385,-88.033536,"""engine_check""",25m 58s 56µs,0.268635
"""car_999""",2023-01-07 23:34:39.000630,26.825386,-88.03489,,30m,0.0
"""car_999""",2023-01-07 23:43:46.000323,26.825386,-88.03489,,9m 6s 999693µs,0.0


In [155]:
data = data.with_columns([
    pl.col("timestamp").fill_null(strategy="forward").cast(pl.Datetime),
    pl.col("notification_time").fill_null(pl.duration(microseconds=0)),
    pl.col("distance").fill_null(0.0).cast(pl.Float64)
]).sort(["car_id", "timestamp"]).collect() 

In [156]:
# Step 2: Convert to LazyFrame after sorting and handle aggregation
data = data.lazy()

In [157]:
# Step 3: Perform dynamic grouping and aggregation on the now sorted LazyFrame
x_minutes = 5
data_aggregated = (data
    .group_by_dynamic(
        index_column="timestamp",
        every=f"{x_minutes}m",
        by="car_id"
    )
    .agg([
        pl.col("notification_time").mean().alias("mean_notification_time"),
        pl.col("notification_time").median().alias("median_notification_time"),
        pl.col("notification_time").var().alias("var_notification_time"),
        pl.col("notification_time").max().alias("max_notification_time"),
        pl.col("notification_time").min().alias("min_notification_time"),
        pl.col("distance").mean().alias("mean_distance"),
        pl.col("distance").median().alias("median_distance"),
        pl.col("distance").var().alias("var_distance"),
        pl.col("distance").max().alias("max_distance"),
        pl.col("distance").min().alias("min_distance"),
    ])
)

# Collect and display results
result = data_aggregated.collect()
result

  .group_by_dynamic(


car_id,timestamp,mean_notification_time,median_notification_time,var_notification_time,max_notification_time,min_notification_time,mean_distance,median_distance,var_distance,max_distance,min_distance
str,datetime[μs],duration[μs],duration[μs],duration[μs],duration[μs],duration[μs],f64,f64,f64,f64,f64
"""car_0""",2023-01-01 00:10:00,0µs,0µs,0µs,0µs,0µs,0.816748,0.0,2.001233,2.450245,0.0
"""car_0""",2023-01-01 00:30:00,-11m -4s -333388µs,-14m -5s -575µs,4096715d 17h 20m 29s 912ms,0µs,-19m -7s -999589µs,0.762048,0.0,1.74215,2.286143,0.0
"""car_0""",2023-01-01 00:35:00,3m 36s 999839µs,3m 36s 999839µs,,3m 36s 999839µs,3m 36s 999839µs,0.0,0.0,,0.0,0.0
"""car_0""",2023-01-01 00:45:00,-5m -56s -499853µs,-5m -56s -499853µs,2941947d 19h 19m 38s 43232µs,0µs,-11m -52s -999706µs,0.0,0.0,0.0,0.0,0.0
"""car_0""",2023-01-01 00:50:00,17m 24s 231µs,17m 24s 231µs,,17m 24s 231µs,17m 24s 231µs,0.0,0.0,,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…,…
"""car_999""",2023-01-07 22:55:00,15m 52s 250145µs,19m 54s 499824µs,4078504d 22h 37m 43s 172864µs,22m 28s 586µs,1m 12s 348µs,0.0,0.0,0.0,0.0,0.0
"""car_999""",2023-01-07 23:00:00,25m 58s 56µs,25m 58s 56µs,,25m 58s 56µs,25m 58s 56µs,0.268635,0.268635,,0.268635,0.268635
"""car_999""",2023-01-07 23:30:00,30m,30m,,30m,30m,0.0,0.0,,0.0,0.0
"""car_999""",2023-01-07 23:35:00,0µs,0µs,,0µs,0µs,0.0,0.0,,0.0,0.0
