In [1]:
import polars as pl
import numpy as np
import random
from datetime import datetime, timedelta


The following required CPU features were not detected:
    fma
Continuing to use this version of Polars on this processor will likely result in a crash.
Install the `polars-lts-cpu` package instead of `polars` to run Polars with better compatibility.

Hint: If you are on an Apple ARM machine (e.g. M1) this is likely due to running Python under Rosetta.
It is recommended to install a native version of Python that does not run under Rosetta x86-64 emulation.




# Descripcion

Los datos son notificaciones de dispositivos GPS en Mexico. En promedio generan notificaciones automatizadas cada 5 minutos si el carro esta encendido, y 30 si esta apagado.  

Cada notificacion esta acompannada de un evento de lo que esta ocurriendo, y trae la latitud y longitud.  

El objetivo es predecir si un vehiculo esta siendo robado de acuerdo a sus notificaciones, por lo que el primer paso seria limpiar datos y hacer ingenieria de variables.

Trata de hacerlo **lazy** si puedes.

In [2]:
def generate_dummy_data(num_cars, start_time, end_time, working_hours_interval, non_working_hours_interval):
    data = []

    # Define the latitude and longitude ranges for Mexico
    min_latitude, max_latitude = 14.5388, 32.7186
    min_longitude, max_longitude = -118.4662, -86.7104

    for car_id in range(num_cars):
        current_time = start_time

        # Generate random initial latitude and longitude for each car
        latitude = random.uniform(min_latitude, max_latitude)
        longitude = random.uniform(min_longitude, max_longitude)

        while current_time < end_time:
            if current_time.weekday() < 5 and 9 <= current_time.hour < 17:
                # Working hours (Monday to Friday, 9 AM to 5 PM)
                interval = working_hours_interval
            else:
                # Non-working hours
                interval = non_working_hours_interval

            # Generate notification with 99% probability
            if random.random() < 0.99:
                notification = random.choice(["low_fuel", "tire_pressure", "engine_check", None])
                data.append((f"car_{car_id}", current_time.isoformat(), latitude, longitude, notification))

            # Generate additional notifications between intervals
            while True:
                additional_interval = random.expovariate(1 / (interval / 2))
                additional_time = current_time + timedelta(minutes=additional_interval)
                if additional_time >= current_time + timedelta(minutes=interval):
                    break
                notification = random.choice(["low_fuel", "tire_pressure", "engine_check", None])
                data.append((f"car_{car_id}", additional_time.isoformat(), latitude, longitude, notification))

            # Update latitude and longitude for car movement
            latitude += random.uniform(-0.01, 0.01)
            longitude += random.uniform(-0.01, 0.01)

            # Check if the car is among the 1% that can have 100 notifications within 5 minutes
            if random.random() < 0.01:
                burst_start_time = current_time + timedelta(minutes=random.uniform(0, interval))
                burst_end_time = burst_start_time + timedelta(minutes=5)
                while current_time < burst_end_time:
                    notification = random.choice(["low_fuel", "tire_pressure", "engine_check", None])
                    data.append((f"car_{car_id}", current_time.isoformat(), latitude, longitude, notification))
                    current_time += timedelta(seconds=random.uniform(1, 10))

            current_time += timedelta(minutes=interval)

    # Create a Polars DataFrame from the generated data
    df = pl.DataFrame(
        {
            "car_id": [record[0] for record in data],
            "timestamp": [record[1] for record in data],
            "latitude": [record[2] for record in data],
            "longitude": [record[3] for record in data],
            "notification": [record[4] for record in data],
        }
    )

    return df.lazy()

In [8]:
num_cars = 1000
start_time = datetime(2023, 1, 1, 0, 0, 0)  # Start of the week
end_time = start_time + timedelta(weeks=1)  # End of the week
working_hours_interval = 5  # Interval of 5 minutes during working hours
non_working_hours_interval = 30  # Interval of 30 minutes during non-working hours

# Generate the dummy data
data = generate_dummy_data(num_cars, start_time, end_time, working_hours_interval, non_working_hours_interval)

# Print the first few rows of the generated data
print(data.head())

naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

SLICE[offset: 0, len: 5]
  DF ["car_id", "timestamp", "latitude", "longitude"]; PROJECT */5 COLUMNS; SELECTION: "None"


## Limpieza de datos

### Timestamp

Convierte el `timestamp` que actualmente es string a formato de tiempo en polars

In [22]:
data_ = data.with_columns(pl.col('timestamp').str.strptime(pl.Datetime))
#data.head()
data_.collect()

car_id,timestamp,latitude,longitude,notification
str,datetime[μs],f64,f64,str
"""car_0""",2023-01-01 00:00:00,29.525967,-116.100893,
"""car_0""",2023-01-01 00:28:30.482659,29.525967,-116.100893,"""engine_check"""
"""car_0""",2023-01-01 00:03:23.570334,29.525967,-116.100893,"""low_fuel"""
"""car_0""",2023-01-01 00:12:51.046725,29.525967,-116.100893,"""low_fuel"""
"""car_0""",2023-01-01 00:02:03.451558,29.525967,-116.100893,"""low_fuel"""
…,…,…,…,…
"""car_999""",2023-01-07 23:06:25.754419,14.886387,-103.366665,"""engine_check"""
"""car_999""",2023-01-07 23:09:01.221859,14.886387,-103.366665,
"""car_999""",2023-01-07 23:33:18.709749,14.876938,-103.371203,"""low_fuel"""
"""car_999""",2023-01-07 23:49:20.279183,14.876938,-103.371203,"""tire_pressure"""


### Ingenieria de variables

Dado que va a entrar a un modelo de machine learning es encesario que todas las variables sean numericas, y esten en formnato tidy. Cada observacion en una fila, y cada variable en una columna. Por lo tanto se decidio crear estadisticos y agregar los datos a intervalos uniformes de `x` minutos.  

Por ejemplo, colapsar toda la informacion que ocurrion en el intervalo, como el numero de notificaciones en esos 5 minutos, el promedio entre notificaciones, y el tipo de notificaciones.

Existen varias formas de hacer esto, puedes hacerlo con `group_by` primero para crear las nuevas variables, o `group_by` (`rolling`, `dynamic`) usando operaciones sobre listas. Utiliza claude o chat_gpt

1. Crea una nueva variable que compute la diferencia de tiempo entre notificaciones del mismo vehiculo. Piensa como lo vas a hacer. Llama a esta variable `notification_time`
   


In [32]:
data__ = (data_
    .sort(['car_id', 'timestamp'])
    .with_columns([
        pl.col('timestamp').diff().over('car_id').fill_null(pl.duration(nanoseconds=0)).alias('notification_time')
    ])
)
data__.collect()

car_id,timestamp,latitude,longitude,notification,notification_time
str,datetime[μs],f64,f64,str,duration[μs]
"""car_0""",2023-01-01 00:00:00,29.525967,-116.100893,,0µs
"""car_0""",2023-01-01 00:01:33.313478,29.525967,-116.100893,"""low_fuel""",1m 33s 313478µs
"""car_0""",2023-01-01 00:02:03.451558,29.525967,-116.100893,"""low_fuel""",30s 138080µs
"""car_0""",2023-01-01 00:02:59.145638,29.525967,-116.100893,"""low_fuel""",55s 694080µs
"""car_0""",2023-01-01 00:03:23.570334,29.525967,-116.100893,"""low_fuel""",24s 424696µs
…,…,…,…,…,…
"""car_999""",2023-01-07 23:09:01.221859,14.886387,-103.366665,,2m 35s 467440µs
"""car_999""",2023-01-07 23:10:04.135211,14.886387,-103.366665,"""low_fuel""",1m 2s 913352µs
"""car_999""",2023-01-07 23:33:18.709749,14.876938,-103.371203,"""low_fuel""",23m 14s 574538µs
"""car_999""",2023-01-07 23:35:22.920610,14.876938,-103.371203,"""low_fuel""",2m 4s 210861µs


2. Crea una nueva variable que compute la distancia que viajo el vehiculo desde la ultima notificacion. Llamala `distance`

In [47]:
import numpy as np

def haversine(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude from degrees to radians
    lat1_rad = np.radians(lat1)
    lon1_rad = np.radians(lon1)
    lat2_rad = np.radians(lat2)
    lon2_rad = np.radians(lon2)
    
    # Haversine formula
    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad
    a = np.sin(dlat/2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    r = 6371  # Radius of the Earth in kilometers
    distance = r * c
    return distance


In [50]:
data3 = (data__
    .with_columns([
        pl.col('latitude').diff().over('car_id').fill_null(0).alias('latitude2')
    ])
).with_columns([pl.col('longitude').diff().over('car_id').fill_null(0).alias('longitude2')])

data3 = data3.with_columns(distance_traveled=haversine(pl.col('latitude'), pl.col('longitude'), pl.col('latitude')+pl.col('latitude2'), pl.col('longitude')+pl.col('longitude2')))
data3 = data3.drop('latitude2').drop('longitude2')
data3.collect()

car_id,timestamp,latitude,longitude,notification,notification_time,distance_traveled
str,datetime[μs],f64,f64,str,duration[μs],f64
"""car_0""",2023-01-01 00:00:00,29.525967,-116.100893,,0µs,0.0
"""car_0""",2023-01-01 00:01:33.313478,29.525967,-116.100893,"""low_fuel""",1m 33s 313478µs,0.0
"""car_0""",2023-01-01 00:02:03.451558,29.525967,-116.100893,"""low_fuel""",30s 138080µs,0.0
"""car_0""",2023-01-01 00:02:59.145638,29.525967,-116.100893,"""low_fuel""",55s 694080µs,0.0
"""car_0""",2023-01-01 00:03:23.570334,29.525967,-116.100893,"""low_fuel""",24s 424696µs,0.0
…,…,…,…,…,…,…
"""car_999""",2023-01-07 23:09:01.221859,14.886387,-103.366665,,2m 35s 467440µs,0.0
"""car_999""",2023-01-07 23:10:04.135211,14.886387,-103.366665,"""low_fuel""",1m 2s 913352µs,0.0
"""car_999""",2023-01-07 23:33:18.709749,14.876938,-103.371203,"""low_fuel""",23m 14s 574538µs,1.158328
"""car_999""",2023-01-07 23:35:22.920610,14.876938,-103.371203,"""low_fuel""",2m 4s 210861µs,0.0


3. Crea intervalos de `x` minutos por carro. Como el numero de notificaciones en esos intervalos no es uniforme tienes que buscar funciones de polars especificas, pero ademas tienen que ser por vehiculo, pues tienen que ser del mismo. Revisa las funciones de `group_by` `dynamic` y `rolling`.
   1. Computa la media, mediana, varianza, max y min de `notification_time` los intervalos de `x` minutos
   2. Computa la media, mediana, varianza, max y min de `distance`


In [58]:
agregaciones_time = data3.group_by("car_id").agg([
    pl.col("notification_time").mean().alias("media"),
    pl.col("notification_time").median().alias("mediana"),
    pl.col("notification_time").var().alias("varianza"),
    pl.col("notification_time").max().alias("max"),
    pl.col("notification_time").min().alias("min")
])

agregaciones_time.select('car_id', 'media', 'mediana', 'varianza', 'max', 'min').collect()

car_id,media,mediana,varianza,max,min
str,duration[μs],duration[μs],duration[μs],duration[μs],duration[μs]
"""car_354""",1m 36s 171345µs,20s 594727µs,621539d 14h 22m 21s 258224µs,49m 24s 995611µs,0µs
"""car_984""",1m 50s 421194µs,27s 341949µs,658196d 14h 15m 392880µs,30m,0µs
"""car_925""",1m 40s 321734µs,21s 494835µs,640553d 5h 2m 45s 85864µs,30m 12s 134542µs,0µs
"""car_842""",1m 23s 342335µs,12s 513278µs,539997d 19h 35m 58s 438984µs,30m 9s 209196µs,0µs
"""car_530""",1m 35s 389493µs,19s 361959µs,609094d 8h 18m 48s 49616µs,30m 9s 517998µs,0µs
…,…,…,…,…,…
"""car_836""",1m 30s 16986µs,15s 646712µs,581559d 14h 23m 30s 546408µs,30m,0µs
"""car_678""",1m 45s 837694µs,24s 600ms,663786d 5h 56m 15s 806808µs,34m 5s 863304µs,0µs
"""car_869""",1m 30s 503356µs,19s 509980µs,548957d 18h 37m 23s 853752µs,33m 18s 371824µs,0µs
"""car_983""",1m 44s 664402µs,24s 607289µs,669225d 45m 39s 705168µs,30m 43s 71301µs,0µs


In [57]:
agregaciones_distance = data3.group_by("car_id").agg([
    pl.col("distance_traveled").mean().alias("media"),
    pl.col("distance_traveled").median().alias("mediana"),
    pl.col("distance_traveled").var().alias("varianza"),
    pl.col("distance_traveled").max().alias("max"),
    pl.col("distance_traveled").min().alias("min")
])

agregaciones_distance.select('car_id', 'media', 'mediana', 'varianza', 'max', 'min').collect()

car_id,media,mediana,varianza,max,min
str,f64,f64,f64,f64,f64
"""car_289""",0.114402,0.0,0.094169,1.499893,0.0
"""car_584""",0.109557,0.0,0.087686,1.435974,0.0
"""car_189""",0.105199,0.0,0.086434,1.506046,0.0
"""car_579""",0.111239,0.0,0.094135,1.511978,0.0
"""car_586""",0.098517,0.0,0.082451,1.519779,0.0
…,…,…,…,…,…
"""car_435""",0.103422,0.0,0.085683,1.46791,0.0
"""car_315""",0.102957,0.0,0.086429,1.483347,0.0
"""car_923""",0.106957,0.0,0.089423,1.486751,0.0
"""car_29""",0.113535,0.0,0.09497,1.49767,0.0
