In [36]:
import polars as pl
import numpy as np
import random as random
from datetime import datetime, timedelta
from math import radians, sin, cos, sqrt, asin

# Descripcion

Los datos son notificaciones de dispositivos GPS en Mexico. En promedio generan notificaciones automatizadas cada 5 minutos si el carro esta encendido, y 30 si esta apagado.  

Cada notificacion esta acompannada de un evento de lo que esta ocurriendo, y trae la latitud y longitud.  

El objetico es predecir si un vehiculo esta siendo robado de acuerdo a sus notificaciones, por lo que el primer paso seria limpiar datos y hacer ingenieria de variables.

Trata de hacerlo **lazy** si puedes.

In [37]:
def generate_dummy_data(num_cars, start_time, end_time, working_hours_interval, non_working_hours_interval):
    data = []

    # Define the latitude and longitude ranges for Mexico
    min_latitude, max_latitude = 14.5388, 32.7186
    min_longitude, max_longitude = -118.4662, -86.7104

    for car_id in range(num_cars):
        current_time = start_time

        # Generate random initial latitude and longitude for each car
        latitude = random.uniform(min_latitude, max_latitude)
        longitude = random.uniform(min_longitude, max_longitude)

        while current_time < end_time:
            if current_time.weekday() < 5 and 9 <= current_time.hour < 17:
                # Working hours (Monday to Friday, 9 AM to 5 PM)
                interval = working_hours_interval
            else:
                # Non-working hours
                interval = non_working_hours_interval

            # Generate notification with 99% probability
            if random.random() < 0.99:
                notification = random.choice(["low_fuel", "tire_pressure", "engine_check", None])
                data.append((f"car_{car_id}", current_time.isoformat(), latitude, longitude, notification))

            # Generate additional notifications between intervals
            while True:
                additional_interval = random.expovariate(1 / (interval / 2))
                additional_time = current_time + timedelta(minutes=additional_interval)
                if additional_time >= current_time + timedelta(minutes=interval):
                    break
                notification = random.choice(["low_fuel", "tire_pressure", "engine_check", None])
                data.append((f"car_{car_id}", additional_time.isoformat(), latitude, longitude, notification))

            # Update latitude and longitude for car movement
            latitude += random.uniform(-0.01, 0.01)
            longitude += random.uniform(-0.01, 0.01)

            # Check if the car is among the 1% that can have 100 notifications within 5 minutes
            if random.random() < 0.01:
                burst_start_time = current_time + timedelta(minutes=random.uniform(0, interval))
                burst_end_time = burst_start_time + timedelta(minutes=5)
                while current_time < burst_end_time:
                    notification = random.choice(["low_fuel", "tire_pressure", "engine_check", None])
                    data.append((f"car_{car_id}", current_time.isoformat(), latitude, longitude, notification))
                    current_time += timedelta(seconds=random.uniform(1, 10))

            current_time += timedelta(minutes=interval)

    # Create a Polars DataFrame from the generated data
    df = pl.DataFrame(
        {
            "car_id": [record[0] for record in data],
            "timestamp": [record[1] for record in data],
            "latitude": [record[2] for record in data],
            "longitude": [record[3] for record in data],
            "notification": [record[4] for record in data],
        }
    )

    return df.lazy()

In [38]:
num_cars = 1000
start_time = datetime(2023, 1, 1, 0, 0, 0)  # Start of the week
end_time = start_time + timedelta(weeks=1)  # End of the week
working_hours_interval = 5  # Interval of 5 minutes during working hours
non_working_hours_interval = 30  # Interval of 30 minutes during non-working hours

# Generate the dummy data
data = generate_dummy_data(num_cars, start_time, end_time, working_hours_interval, non_working_hours_interval)

# Print the first few rows of the generated data
print(data.collect())

shape: (6_283_268, 5)
┌─────────┬────────────────────────────┬───────────┬─────────────┬───────────────┐
│ car_id  ┆ timestamp                  ┆ latitude  ┆ longitude   ┆ notification  │
│ ---     ┆ ---                        ┆ ---       ┆ ---         ┆ ---           │
│ str     ┆ str                        ┆ f64       ┆ f64         ┆ str           │
╞═════════╪════════════════════════════╪═══════════╪═════════════╪═══════════════╡
│ car_0   ┆ 2023-01-01T00:00:00        ┆ 16.431368 ┆ -108.499227 ┆ null          │
│ car_0   ┆ 2023-01-01T00:30:00        ┆ 16.431572 ┆ -108.497224 ┆ low_fuel      │
│ car_0   ┆ 2023-01-01T00:34:18.487067 ┆ 16.431572 ┆ -108.497224 ┆ tire_pressure │
│ car_0   ┆ 2023-01-01T01:00:00        ┆ 16.435203 ┆ -108.50384  ┆ null          │
│ car_0   ┆ 2023-01-01T01:02:45.993100 ┆ 16.435203 ┆ -108.50384  ┆ null          │
│ …       ┆ …                          ┆ …         ┆ …           ┆ …             │
│ car_999 ┆ 2023-01-07T23:44:09.595171 ┆ 28.836285 ┆ -106.604029 

## Limpieza de datos

### Timestamp

Convierte el `timestamp` que actualmente es string a formato de tiempo en polars

In [39]:
data = data.with_columns(pl.col("timestamp").cast(pl.Datetime))

In [40]:
data.collect()

car_id,timestamp,latitude,longitude,notification
str,datetime[μs],f64,f64,str
"""car_0""",2023-01-01 00:00:00,16.431368,-108.499227,
"""car_0""",2023-01-01 00:30:00,16.431572,-108.497224,"""low_fuel"""
"""car_0""",2023-01-01 00:34:18.487067,16.431572,-108.497224,"""tire_pressure"""
"""car_0""",2023-01-01 01:00:00,16.435203,-108.50384,
"""car_0""",2023-01-01 01:02:45.993100,16.435203,-108.50384,
…,…,…,…,…
"""car_999""",2023-01-07 23:44:09.595171,28.836285,-106.604029,
"""car_999""",2023-01-07 23:47:45.696463,28.836285,-106.604029,
"""car_999""",2023-01-07 23:36:50.168622,28.836285,-106.604029,"""engine_check"""
"""car_999""",2023-01-07 23:34:33.332953,28.836285,-106.604029,


### Ingenieria de variables

Dado que va a entrar a un modelo de machine learning es encesario que todas las variables sean numericas, y esten en formnato tidy. Cada observacion en una fila, y cada variable en una columna. Por lo tanto se decidio crear estadisticos y agregar los datos a intervalos uniformes de `x` minutos.  

Por ejemplo, colapsar toda la informacion que ocurrion en el intervalo, como el numero de notificaciones en esos 5 minutos, el promedio entre notificaciones, y el tipo de notificaciones.

Existen varias formas de hacer esto, puedes hacerlo con `group_by` primero para crear las nuevas variables, o `group_by` (`rolling`, `dynamic`) usando operaciones sobre listas. Utiliza claude o chat_gpt

1. Crea una nueva variable que compute la diferencia de tiempo entre notificaciones del mismo vehiculo. Piensa como lo vas a hacer. Llama a esta variable `notification_time`
   


In [41]:
data = data.sort(["car_id", "timestamp"])
data=data.with_columns(pl.col("timestamp").shift(-1).alias("lagged_timestamp"))
data=data.with_columns((pl.col("lagged_timestamp")-pl.col("timestamp")).alias("notification_time"))


In [42]:
data.collect()

car_id,timestamp,latitude,longitude,notification,lagged_timestamp,notification_time
str,datetime[μs],f64,f64,str,datetime[μs],duration[μs]
"""car_0""",2023-01-01 00:00:00,16.431368,-108.499227,,2023-01-01 00:30:00,30m
"""car_0""",2023-01-01 00:30:00,16.431572,-108.497224,"""low_fuel""",2023-01-01 00:34:18.487067,4m 18s 487067µs
"""car_0""",2023-01-01 00:34:18.487067,16.431572,-108.497224,"""tire_pressure""",2023-01-01 01:00:00,25m 41s 512933µs
"""car_0""",2023-01-01 01:00:00,16.435203,-108.50384,,2023-01-01 01:02:45.993100,2m 45s 993100µs
"""car_0""",2023-01-01 01:02:45.993100,16.435203,-108.50384,,2023-01-01 01:30:00,27m 14s 6900µs
…,…,…,…,…,…,…
"""car_999""",2023-01-07 23:51:41.911628,28.836285,-106.604029,"""engine_check""",2023-01-07 23:53:10.488422,1m 28s 576794µs
"""car_999""",2023-01-07 23:53:10.488422,28.836285,-106.604029,"""engine_check""",2023-01-07 23:53:52.320353,41s 831931µs
"""car_999""",2023-01-07 23:53:52.320353,28.836285,-106.604029,"""engine_check""",2023-01-07 23:57:55.336558,4m 3s 16205µs
"""car_999""",2023-01-07 23:57:55.336558,28.836285,-106.604029,"""low_fuel""",2023-01-07 23:58:36.620036,41s 283478µs


2. Crea una nueva variable que compute la distancia que viajo el vehiculo desde la ultima notificacion. Llamala `distance`

In [43]:
# Fórmula para calcular distancias a partir de coordenadas geográficas
def haversine_distance(lat1, lon1, lat2, lon2):
    if None in (lat1, lon1, lat2, lon2):
        return None
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * asin(sqrt(a))
    R = 6371.0  # Radio de la Tierra en km
    return R * c

In [44]:
df_calc_distancia = data.group_by("car_id", maintain_order=True).agg(pl.all(), pl.col("latitude").shift(-1).alias("lagged_latitude"), pl.col("longitude").shift(-1).alias("lagged_longitude"))
data_columnas= [col for col in df_calc_distancia.columns if col != 'car_id']
df_calc_distancia = df_calc_distancia.explode(data_columnas)
df_calc_distancia.collect()

car_id,timestamp,latitude,longitude,notification,lagged_timestamp,notification_time,lagged_latitude,lagged_longitude
str,datetime[μs],f64,f64,str,datetime[μs],duration[μs],f64,f64
"""car_0""",2023-01-01 00:00:00,16.431368,-108.499227,,2023-01-01 00:30:00,30m,16.431572,-108.497224
"""car_0""",2023-01-01 00:30:00,16.431572,-108.497224,"""low_fuel""",2023-01-01 00:34:18.487067,4m 18s 487067µs,16.431572,-108.497224
"""car_0""",2023-01-01 00:34:18.487067,16.431572,-108.497224,"""tire_pressure""",2023-01-01 01:00:00,25m 41s 512933µs,16.435203,-108.50384
"""car_0""",2023-01-01 01:00:00,16.435203,-108.50384,,2023-01-01 01:02:45.993100,2m 45s 993100µs,16.435203,-108.50384
"""car_0""",2023-01-01 01:02:45.993100,16.435203,-108.50384,,2023-01-01 01:30:00,27m 14s 6900µs,16.434738,-108.498603
…,…,…,…,…,…,…,…,…
"""car_999""",2023-01-07 23:51:41.911628,28.836285,-106.604029,"""engine_check""",2023-01-07 23:53:10.488422,1m 28s 576794µs,28.836285,-106.604029
"""car_999""",2023-01-07 23:53:10.488422,28.836285,-106.604029,"""engine_check""",2023-01-07 23:53:52.320353,41s 831931µs,28.836285,-106.604029
"""car_999""",2023-01-07 23:53:52.320353,28.836285,-106.604029,"""engine_check""",2023-01-07 23:57:55.336558,4m 3s 16205µs,28.836285,-106.604029
"""car_999""",2023-01-07 23:57:55.336558,28.836285,-106.604029,"""low_fuel""",2023-01-07 23:58:36.620036,41s 283478µs,28.836285,-106.604029


In [45]:
funcion_distancia = pl.when(
    #Sólo calculamos cuando no son null, porque sino truena la función
    pl.col("latitude").is_not_null() &
    pl.col("longitude").is_not_null() &
    pl.col("lagged_latitude").is_not_null() &
    pl.col("lagged_longitude").is_not_null()
    ).then(pl.struct(["latitude", "longitude", "lagged_latitude", "lagged_longitude"]).map_elements(
        lambda row: haversine_distance(row["latitude"],row["longitude"], row["lagged_latitude"], row["lagged_longitude"]))
       ).otherwise(pl.lit(None)).alias("distance") #Si es el último dato de un coche, no se puede calcular la distancia, se pone null
data_distancia = df_calc_distancia.with_columns(funcion_distancia)

In [46]:
data_distancia.collect()



car_id,timestamp,latitude,longitude,notification,lagged_timestamp,notification_time,lagged_latitude,lagged_longitude,distance
str,datetime[μs],f64,f64,str,datetime[μs],duration[μs],f64,f64,f64
"""car_0""",2023-01-01 00:00:00,16.431368,-108.499227,,2023-01-01 00:30:00,30m,16.431572,-108.497224,0.2148
"""car_0""",2023-01-01 00:30:00,16.431572,-108.497224,"""low_fuel""",2023-01-01 00:34:18.487067,4m 18s 487067µs,16.431572,-108.497224,0.0
"""car_0""",2023-01-01 00:34:18.487067,16.431572,-108.497224,"""tire_pressure""",2023-01-01 01:00:00,25m 41s 512933µs,16.435203,-108.50384,0.812951
"""car_0""",2023-01-01 01:00:00,16.435203,-108.50384,,2023-01-01 01:02:45.993100,2m 45s 993100µs,16.435203,-108.50384,0.0
"""car_0""",2023-01-01 01:02:45.993100,16.435203,-108.50384,,2023-01-01 01:30:00,27m 14s 6900µs,16.434738,-108.498603,0.56094
…,…,…,…,…,…,…,…,…,…
"""car_999""",2023-01-07 23:51:41.911628,28.836285,-106.604029,"""engine_check""",2023-01-07 23:53:10.488422,1m 28s 576794µs,28.836285,-106.604029,0.0
"""car_999""",2023-01-07 23:53:10.488422,28.836285,-106.604029,"""engine_check""",2023-01-07 23:53:52.320353,41s 831931µs,28.836285,-106.604029,0.0
"""car_999""",2023-01-07 23:53:52.320353,28.836285,-106.604029,"""engine_check""",2023-01-07 23:57:55.336558,4m 3s 16205µs,28.836285,-106.604029,0.0
"""car_999""",2023-01-07 23:57:55.336558,28.836285,-106.604029,"""low_fuel""",2023-01-07 23:58:36.620036,41s 283478µs,28.836285,-106.604029,0.0


3. Crea intervalos de `x` minutos por carro. Como el numero de notificaciones en esos intervalos no es uniforme tienes que buscar funciones de polars especificas, pero ademas tienen que ser por vehiculo, pues tienen que ser del mismo. Revisa las funciones de `group_by` `dynamic` y `rolling`.
   1. Computa la media, mediana, varianza, max y min de `notification_time` los intervalos de `x` minutos
   2. Computa la media, mediana, varianza, max y min de `distance`


In [47]:
#Probemos cada 5 minutos
x = 5

# Usar groupby_dynamic para crear intervalos dinámicos y calcular estadísticas
data = data.group_by_dynamic(
    index_column="timestamp",  # Columna de tiempo sobre la cual agrupar
    group_by="car_id",               # Agrupar por car_id para calcular por cada vehículo
    every=f"{x}m",             # Tamaño de cada intervalo en minutos
    period=f"{x}m",            # Periodo entre los intervalos
    closed="both",
    include_boundaries=True
).agg([
    pl.col("notification_time").mean().alias(f'media_notification_time_{x}m'),
    pl.col("notification_time").median().alias(f'mediana_notification_time_{x}m'),
    pl.col("notification_time").var().alias(f'varianza_notification_time_{x}m'),
    pl.col("notification_time").max().alias(f'max_notification_time_{x}m'),
    pl.col("notification_time").min().alias(f'min_notification_time_{x}m')
])

In [48]:
data.collect()

car_id,_lower_boundary,_upper_boundary,timestamp,mean_notification_time_5m,median_notification_time_5m,variance_notification_time_5m,max_notification_time_5m,min_notification_time_5m
str,datetime[μs],datetime[μs],datetime[μs],duration[μs],duration[μs],duration[μs],duration[μs],duration[μs]
"""car_0""",2022-12-31 23:55:00,2023-01-01 00:00:00,2022-12-31 23:55:00,30m,30m,,30m,30m
"""car_0""",2023-01-01 00:00:00,2023-01-01 00:05:00,2023-01-01 00:00:00,30m,30m,,30m,30m
"""car_0""",2023-01-01 00:25:00,2023-01-01 00:30:00,2023-01-01 00:25:00,4m 18s 487067µs,4m 18s 487067µs,,4m 18s 487067µs,4m 18s 487067µs
"""car_0""",2023-01-01 00:30:00,2023-01-01 00:35:00,2023-01-01 00:30:00,15m,15m,9526362d 2h 40m 12s 524928µs,25m 41s 512933µs,4m 18s 487067µs
"""car_0""",2023-01-01 00:55:00,2023-01-01 01:00:00,2023-01-01 00:55:00,2m 45s 993100µs,2m 45s 993100µs,,2m 45s 993100µs,2m 45s 993100µs
…,…,…,…,…,…,…,…,…
"""car_999""",2023-01-07 23:35:00,2023-01-07 23:40:00,2023-01-07 23:35:00,2m 53s 278351µs,2m 53s 278351µs,303157d 21h 8m 12s 11296µs,4m 47s 717944µs,58s 838758µs
"""car_999""",2023-01-07 23:40:00,2023-01-07 23:45:00,2023-01-07 23:40:00,1m 42s 990379µs,1m 26s 247339µs,129404d 22h 41m 34s 603904µs,3m 36s 101292µs,6s 622508µs
"""car_999""",2023-01-07 23:45:00,2023-01-07 23:50:00,2023-01-07 23:45:00,1m 58s 107582µs,1m 58s 107582µs,12117d 6h 20m 46s 540ms,2m 20s 986958µs,1m 35s 228207µs
"""car_999""",2023-01-07 23:50:00,2023-01-07 23:55:00,2023-01-07 23:50:00,2m 4s 474976µs,1m 28s 576794µs,128301d 22h 34m 25s 303794µs,4m 3s 16205µs,41s 831931µs


In [49]:
t=5
data_distancia = data_distancia.group_by_dynamic(
    index_column="timestamp",  # Columna de tiempo sobre la cual agrupar
    group_by="car_id",               # Agrupar por car_id para calcular por cada vehículo
    every=f"{t}m",             # Tamaño de cada intervalo en minutos
    period=f"{t}m",            # Periodo entre los intervalos
    closed="both",
    include_boundaries=True
).agg([
    pl.col("distance").mean().alias(f'media_distance_{t}m'),
    pl.col("distance").median().alias(f'mediana_distance_{t}m'),
    pl.col("distance").var().alias(f'varianza_distance_{t}m'),
    pl.col("distance").max().alias(f'max_distance_{t}m'),
    pl.col("distance").min().alias(f'min_distance_{t}m')
])

In [50]:
data_distancia.collect()



car_id,_lower_boundary,_upper_boundary,timestamp,media_distance_5m,mediana_distance_5m,varianza_distance_5m,max_distance_5m,min_distance_5m
str,datetime[μs],datetime[μs],datetime[μs],f64,f64,f64,f64,f64
"""car_0""",2022-12-31 23:55:00,2023-01-01 00:00:00,2022-12-31 23:55:00,0.2148,0.2148,,0.2148,0.2148
"""car_0""",2023-01-01 00:00:00,2023-01-01 00:05:00,2023-01-01 00:00:00,0.2148,0.2148,,0.2148,0.2148
"""car_0""",2023-01-01 00:25:00,2023-01-01 00:30:00,2023-01-01 00:25:00,0.0,0.0,,0.0,0.0
"""car_0""",2023-01-01 00:30:00,2023-01-01 00:35:00,2023-01-01 00:30:00,0.406475,0.406475,0.330444,0.812951,0.0
"""car_0""",2023-01-01 00:55:00,2023-01-01 01:00:00,2023-01-01 00:55:00,0.0,0.0,,0.0,0.0
…,…,…,…,…,…,…,…,…
"""car_999""",2023-01-07 23:35:00,2023-01-07 23:40:00,2023-01-07 23:35:00,0.0,0.0,0.0,0.0,0.0
"""car_999""",2023-01-07 23:40:00,2023-01-07 23:45:00,2023-01-07 23:40:00,0.0,0.0,0.0,0.0,0.0
"""car_999""",2023-01-07 23:45:00,2023-01-07 23:50:00,2023-01-07 23:45:00,0.0,0.0,0.0,0.0,0.0
"""car_999""",2023-01-07 23:50:00,2023-01-07 23:55:00,2023-01-07 23:50:00,0.0,0.0,0.0,0.0,0.0
