In [1]:
import polars as pl
import numpy as np
import random
from datetime import datetime, timedelta

# Descripcion

Los datos son notificaciones de dispositivos GPS en Mexico. En promedio generan notificaciones automatizadas cada 5 minutos si el carro esta encendido, y 30 si esta apagado.  

Cada notificacion esta acompannada de un evento de lo que esta ocurriendo, y trae la latitud y longitud.  

El objetico es predecir si un vehiculo esta siendo robado de acuerdo a sus notificaciones, por lo que el primer paso seria limpiar datos y hacer ingenieria de variables.

Trata de hacerlo **lazy** si puedes.

In [3]:
def generate_dummy_data(num_cars, start_time, end_time, working_hours_interval, non_working_hours_interval):
    data = []

    # Define the latitude and longitude ranges for Mexico
    min_latitude, max_latitude = 14.5388, 32.7186
    min_longitude, max_longitude = -118.4662, -86.7104

    for car_id in range(num_cars):
        current_time = start_time

        # Generate random initial latitude and longitude for each car
        latitude = random.uniform(min_latitude, max_latitude)
        longitude = random.uniform(min_longitude, max_longitude)

        while current_time < end_time:
            if current_time.weekday() < 5 and 9 <= current_time.hour < 17:
                # Working hours (Monday to Friday, 9 AM to 5 PM)
                interval = working_hours_interval
            else:
                # Non-working hours
                interval = non_working_hours_interval

            # Generate notification with 99% probability
            if random.random() < 0.99:
                notification = random.choice(["low_fuel", "tire_pressure", "engine_check", None])
                data.append((f"car_{car_id}", current_time.isoformat(), latitude, longitude, notification))

            # Generate additional notifications between intervals
            while True:
                additional_interval = random.expovariate(1 / (interval / 2))
                additional_time = current_time + timedelta(minutes=additional_interval)
                if additional_time >= current_time + timedelta(minutes=interval):
                    break
                notification = random.choice(["low_fuel", "tire_pressure", "engine_check", None])
                data.append((f"car_{car_id}", additional_time.isoformat(), latitude, longitude, notification))

            # Update latitude and longitude for car movement
            latitude += random.uniform(-0.01, 0.01)
            longitude += random.uniform(-0.01, 0.01)

            # Check if the car is among the 1% that can have 100 notifications within 5 minutes
            if random.random() < 0.01:
                burst_start_time = current_time + timedelta(minutes=random.uniform(0, interval))
                burst_end_time = burst_start_time + timedelta(minutes=5)
                while current_time < burst_end_time:
                    notification = random.choice(["low_fuel", "tire_pressure", "engine_check", None])
                    data.append((f"car_{car_id}", current_time.isoformat(), latitude, longitude, notification))
                    current_time += timedelta(seconds=random.uniform(1, 10))

            current_time += timedelta(minutes=interval)

    # Create a Polars DataFrame from the generated data
    df = pl.DataFrame(
        {
            "car_id": [record[0] for record in data],
            "timestamp": [record[1] for record in data],
            "latitude": [record[2] for record in data],
            "longitude": [record[3] for record in data],
            "notification": [record[4] for record in data],
        }
    )

    return df.lazy()

In [4]:
num_cars = 1000
start_time = datetime(2023, 1, 1, 0, 0, 0)  # Start of the week
end_time = start_time + timedelta(weeks=1)  # End of the week
working_hours_interval = 5  # Interval of 5 minutes during working hours
non_working_hours_interval = 30  # Interval of 30 minutes during non-working hours

# Generate the dummy data
data = generate_dummy_data(num_cars, start_time, end_time, working_hours_interval, non_working_hours_interval)

# Print the first few rows of the generated data
print(data.head())

naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

SLICE[offset: 0, len: 5]
  DF ["car_id", "timestamp", "latitude", "longitude"]; PROJECT */5 COLUMNS; SELECTION: None


In [5]:
data.collect()

car_id,timestamp,latitude,longitude,notification
str,str,f64,f64,str
"""car_0""","""2023-01-01T00:00:00""",24.692708,-100.687216,
"""car_0""","""2023-01-01T00:02:53.545944""",24.692708,-100.687216,"""tire_pressure"""
"""car_0""","""2023-01-01T00:21:58.165712""",24.692708,-100.687216,
"""car_0""","""2023-01-01T00:05:46.836696""",24.692708,-100.687216,"""engine_check"""
"""car_0""","""2023-01-01T00:04:53.455242""",24.692708,-100.687216,
…,…,…,…,…
"""car_999""","""2023-01-08T00:09:14.174353""",30.362038,-90.961233,"""engine_check"""
"""car_999""","""2023-01-07T23:54:27.573992""",30.362038,-90.961233,"""low_fuel"""
"""car_999""","""2023-01-08T00:16:52.634620""",30.362038,-90.961233,
"""car_999""","""2023-01-07T23:52:36.826973""",30.362038,-90.961233,"""low_fuel"""


## Limpieza de datos

### Timestamp

Convierte el `timestamp` que actualmente es string a formato de tiempo en polars

In [6]:
'''
Al incio, se lo puse a ChatGPT y me dio esto:
# Generación de los datos
num_cars = 1000
start_time = datetime(2023, 1, 1, 0, 0, 0)  # Inicio de la semana
end_time = start_time + timedelta(weeks=1)  # Fin de la semana
working_hours_interval = 5  # Intervalo de 5 minutos durante horas laborales
non_working_hours_interval = 30  # Intervalo de 30 minutos durante horas no laborales

# Generar los datos falsos
data = generate_dummy_data(num_cars, start_time, end_time, working_hours_interval, non_working_hours_interval)

# Convertir la columna 'timestamp' de string a tipo datetime
data_cleaned = data.with_columns(
    pl.col("timestamp").str.strptime(pl.Datetime, fmt="%Y-%m-%dT%H:%M:%S").alias("timestamp")
)

# Mostrar las primeras filas para verificar
print(data_cleaned.head())

No cumple con los requisitos. En la documentación venía que los datos en formato Y-M-D T H-M-S se pueden cambiar con %F T %T. El %.f es para los casos que tenga nanoseconds de la forma .X
'''
data = data.with_columns(pl.col("timestamp").str.strptime(pl.Datetime, "%FT%T%.f"))

In [7]:
data.collect()

car_id,timestamp,latitude,longitude,notification
str,datetime[μs],f64,f64,str
"""car_0""",2023-01-01 00:00:00,24.692708,-100.687216,
"""car_0""",2023-01-01 00:02:53.545944,24.692708,-100.687216,"""tire_pressure"""
"""car_0""",2023-01-01 00:21:58.165712,24.692708,-100.687216,
"""car_0""",2023-01-01 00:05:46.836696,24.692708,-100.687216,"""engine_check"""
"""car_0""",2023-01-01 00:04:53.455242,24.692708,-100.687216,
…,…,…,…,…
"""car_999""",2023-01-08 00:09:14.174353,30.362038,-90.961233,"""engine_check"""
"""car_999""",2023-01-07 23:54:27.573992,30.362038,-90.961233,"""low_fuel"""
"""car_999""",2023-01-08 00:16:52.634620,30.362038,-90.961233,
"""car_999""",2023-01-07 23:52:36.826973,30.362038,-90.961233,"""low_fuel"""


### Ingenieria de variables

Dado que va a entrar a un modelo de machine learning es encesario que todas las variables sean numericas, y esten en formnato tidy. Cada observacion en una fila, y cada variable en una columna. Por lo tanto se decidio crear estadisticos y agregar los datos a intervalos uniformes de `x` minutos.  

Por ejemplo, colapsar toda la informacion que ocurrion en el intervalo, como el numero de notificaciones en esos 5 minutos, el promedio entre notificaciones, y el tipo de notificaciones.

Existen varias formas de hacer esto, puedes hacerlo con `group_by` primero para crear las nuevas variables, o `group_by` (`rolling`, `dynamic`) usando operaciones sobre listas. Utiliza claude o chat_gpt

1. Crea una nueva variable que compute la diferencia de tiempo entre notificaciones del mismo vehiculo. Piensa como lo vas a hacer. Llama a esta variable `notification_time`
   


In [11]:
import polars as pl

def add_notification_time(df: pl.DataFrame):
    """
    Añade una columna 'notification_time' que contiene la diferencia de tiempo
    en minutos entre notificaciones consecutivas del mismo vehículo.
    
    Args:
        df: DataFrame de Polars con columnas 'car_id', 'timestamp'
        
    Returns:
        DataFrame con la nueva columna 'notification_time'
    """
    return (
        df.sort(["car_id", "timestamp"])  # Ordenar por vehículo y tiempo
        .with_columns([
            pl.col("timestamp")  # Seleccionar columna 'timestamp'
            .diff()  # Calcular la diferencia con la notificación anterior
            .over("car_id")  # Reiniciar la diferencia entre vehículos
            .alias("notification_time")  # Crear y nombrar la nueva columna
            .cast(pl.Int64)  # Convertir a tipo numérico (en segundos)
            / 60  # Convertir a minutos
        ])
    )

data = add_notification_time(data)


In [12]:
# Mostrar las primeras filas para verificar
print(data.head())

data.collect()

naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

SLICE[offset: 0, len: 5]
   WITH_COLUMNS:
   [[(col("timestamp").diff().over([col("car_id")]).strict_cast(Int64)) / (60)].alias("notification_time")] 
    SORT BY [col("car_id"), col("timestamp")]
       WITH_COLUMNS:
       [[(col("timestamp").diff().over([col("car_id")]).strict_cast(Int64)) / (60)].alias("notification_time")] 
        SORT BY [col("car_id"), col("timestamp")]
           WITH_COLUMNS:
           [[(col("timestamp").diff().over([col("car_id")]).strict_cast(Int64)) / (60)].alias("notification_time")] 
            SORT BY [col("car_id"), col("timestamp")]
               WITH_COLUMNS:
               [col("timestamp").str.strptime([String(raise)])] 
                DF ["car_id", "timestamp", "latitude", "longitude"]; PROJECT */5 COLUMNS; SELECTION: None


car_id,timestamp,latitude,longitude,notification,notification_time
str,datetime[μs],f64,f64,str,f64
"""car_0""",2023-01-01 00:00:00,24.692708,-100.687216,,
"""car_0""",2023-01-01 00:02:53.545944,24.692708,-100.687216,"""tire_pressure""",2892432.4
"""car_0""",2023-01-01 00:04:53.455242,24.692708,-100.687216,,1998488.3
"""car_0""",2023-01-01 00:05:37.011111,24.692708,-100.687216,"""tire_pressure""",725931.15
"""car_0""",2023-01-01 00:05:46.836696,24.692708,-100.687216,"""engine_check""",163759.75
…,…,…,…,…,…
"""car_999""",2023-01-07 23:54:27.573992,30.362038,-90.961233,"""low_fuel""",1.8458e6
"""car_999""",2023-01-08 00:07:27.212819,30.362038,-90.961233,"""engine_check""",1.2994e7
"""car_999""",2023-01-08 00:09:14.174353,30.362038,-90.961233,"""engine_check""",1.7827e6
"""car_999""",2023-01-08 00:12:24.072577,30.362038,-90.961233,,3164970.4


2. Crea una nueva variable que compute la distancia que viajo el vehiculo desde la ultima notificacion. Llamala `distance`

In [13]:
def haversine_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """
    Calcula la distancia en kilómetros entre dos puntos usando la fórmula de Haversine.
    """
    R = 6371  # Radio de la Tierra en kilómetros
    
    # Convertir grados a radianes
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    # Diferencias en coordenadas
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    
    # Fórmula de Haversine
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    
    return R * c  # Retornar la distancia en kilómetros

def add_distance(df: pl.DataFrame) -> pl.DataFrame:
    """
    Añade una columna 'distance' que contiene la distancia recorrida en kilómetros
    desde la última notificación para cada vehículo.
    
    Args:
        df: DataFrame de Polars con columnas 'car_id', 'latitude', 'longitude'
        
    Returns:
        DataFrame con la nueva columna 'distance'
    """
    return (
        df.sort(["car_id", "timestamp"])  # Ordenar por 'car_id' y 'timestamp'
        .with_columns([
            pl.struct(["latitude", "longitude"])  # Crear una estructura con las coordenadas actuales
            .shift()  # Desplazar para obtener las coordenadas anteriores
            .over("car_id")  # Aplicar el desplazamiento por vehículo
            .alias("prev_coords")
        ])
        .with_columns([
            pl.col("prev_coords").struct.field("latitude").alias("prev_lat"),
            pl.col("prev_coords").struct.field("longitude").alias("prev_lon")
        ])
        .with_columns([
            # Calcular la distancia utilizando la función Haversine
            pl.struct(["latitude", "longitude", "prev_lat", "prev_lon"])
            .map_elements(lambda x: haversine_distance(
                x["latitude"], 
                x["longitude"], 
                x["prev_lat"] if x["prev_lat"] is not None else x["latitude"], 
                x["prev_lon"] if x["prev_lon"] is not None else x["longitude"]
            ))
            .alias("distance")
        ])
        .drop(["prev_coords", "prev_lat", "prev_lon"])  # Eliminar columnas intermedias
    )

data = add_distance(data)

In [14]:
# Mostrar las primeras filas para verificar
print(data.head())

data.collect()

naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

SLICE[offset: 0, len: 5]
  simple π 7/10 ["car_id", "timestamp", ... 5 other columns]
     WITH_COLUMNS:
     [col("latitude").as_struct([col("longitude"), col("prev_lat"), col("prev_lon")]).map_list().alias("distance")] 
       WITH_COLUMNS:
       [col("prev_coords").struct.field_by_name(latitude)().alias("prev_lat"), col("prev_coords").struct.field_by_name(longitude)().alias("prev_lon")] 
         WITH_COLUMNS:
         [col("latitude").as_struct([col("longitude")]).shift([dyn int: 1]).over([col("car_id")]).alias("prev_coords")] 
          SORT BY [col("car_id"), col("timestamp")]
             WITH_COLUMNS:
             [[(col("timestamp").diff().over([col("car_id")]).strict_cast(Int64)) / (60)].alias("notification_time")] 
              SORT BY [col("car_id"), col("timestamp")]
                 WITH_COLUMNS:
                 [[(col("timestamp").diff().over([col("car_id")]).strict_cast(Int64)) / (60)].ali

  data.collect()


car_id,timestamp,latitude,longitude,notification,notification_time,distance
str,datetime[μs],f64,f64,str,f64,f64
"""car_0""",2023-01-01 00:00:00,24.692708,-100.687216,,,0.0
"""car_0""",2023-01-01 00:02:53.545944,24.692708,-100.687216,"""tire_pressure""",2892432.4,0.0
"""car_0""",2023-01-01 00:04:53.455242,24.692708,-100.687216,,1998488.3,0.0
"""car_0""",2023-01-01 00:05:37.011111,24.692708,-100.687216,"""tire_pressure""",725931.15,0.0
"""car_0""",2023-01-01 00:05:46.836696,24.692708,-100.687216,"""engine_check""",163759.75,0.0
…,…,…,…,…,…,…
"""car_999""",2023-01-07 23:54:27.573992,30.362038,-90.961233,"""low_fuel""",1.8458e6,0.0
"""car_999""",2023-01-08 00:07:27.212819,30.362038,-90.961233,"""engine_check""",1.2994e7,0.0
"""car_999""",2023-01-08 00:09:14.174353,30.362038,-90.961233,"""engine_check""",1.7827e6,0.0
"""car_999""",2023-01-08 00:12:24.072577,30.362038,-90.961233,,3164970.4,0.0


3. Crea intervalos de `x` minutos por carro. Como el numero de notificaciones en esos intervalos no es uniforme tienes que buscar funciones de polars especificas, pero ademas tienen que ser por vehiculo, pues tienen que ser del mismo. Revisa las funciones de `group_by` `dynamic` y `rolling`.
   1. Computa la media, mediana, varianza, max y min de `notification_time` los intervalos de `x` minutos
   2. Computa la media, mediana, varianza, max y min de `distance`


In [None]:
# Función para calcular la diferencia de tiempo entre notificaciones consecutivas
def add_notification_time(df: pl.DataFrame) -> pl.DataFrame:
    return (
        df.sort(["car_id", "timestamp"])  # Ordenar por vehículo y tiempo
        .with_columns([
            pl.col("timestamp")
            .diff()  # Calcular la diferencia con la anterior
            .over("car_id")  # Reiniciar el cálculo para cada vehículo
            .alias("notification_time")
        ])
    )

# Función para calcular la distancia usando la fórmula de Haversine
def haversine_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    R = 6371  # Radio de la Tierra en kilómetros
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

# Función para calcular la distancia recorrida desde la última notificación
def add_distance(df: pl.DataFrame) -> pl.DataFrame:
    return (
        df.sort(["car_id", "timestamp"])
        .with_columns([
            pl.struct(["latitude", "longitude"])
            .shift()
            .over("car_id")
            .alias("prev_coords")
        ])
        .with_columns([
            pl.col("prev_coords").struct.field("latitude").alias("prev_lat"),
            pl.col("prev_coords").struct.field("longitude").alias("prev_lon")
        ])
        .with_columns([
            pl.struct(["latitude", "longitude", "prev_lat", "prev_lon"])
            .map_elements(lambda x: haversine_distance(
                x["latitude"], 
                x["longitude"], 
                x["prev_lat"] if x["prev_lat"] is not None else x["latitude"], 
                x["prev_lon"] if x["prev_lon"] is not None else x["longitude"]
            ))
            .alias("distance")
        ])
        .drop(["prev_coords", "prev_lat", "prev_lon"])
    )

# Aplicar las transformaciones
data = add_notification_time(data)
data = add_distance(data)

# Ahora calcular las estadísticas de intervalos
def compute_interval_statistics(df: pl.DataFrame, interval_minutes: int = 5) -> pl.DataFrame:
    return (
        df.group_by(
            "car_id"  # Agrupar por vehículo primero
        ).agg([
            # Agregar columnas para intervalos de tiempo
            pl.col("timestamp")
            .floor("1m")  # Redondear al minuto con el formato correcto
            .alias("interval_timestamp"),
            
            pl.col("notification_time").mean().alias("notification_time_mean_minutes"),
            pl.col("notification_time").median().alias("notification_time_median_minutes"),
            pl.col("notification_time").dt.total_minutes().var().alias("notification_time_variance_minutes"),
            pl.col("notification_time").max().alias("notification_time_max_minutes"),
            pl.col("notification_time").min().alias("notification_time_min_minutes"),
            
            # Estadísticas para distancia
            pl.col("distance").mean().alias("distance_mean"),
            pl.col("distance").median().alias("distance_median"),
            pl.col("distance").var().alias("distance_variance"),
            pl.col("distance").max().alias("distance_max"),
            pl.col("distance").min().alias("distance_min"),
            
            # Opcional: contar el número de notificaciones en el intervalo
            pl.len().alias("notifications_count")  # Usar len en lugar de count
        ])
    )

# Calcular las estadísticas
data = compute_interval_statistics(data, interval_minutes=5)

# Mostrar el resultado
data.collect().head()