In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from sklearn.cluster import DBSCAN
import pandas as pd
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import IntegerType
import os

# Crear la sesión de Spark
spark = SparkSession.builder \
    .appName("DBSCANClustering") \
    .getOrCreate()

# Ruta de la carpeta de datos
data_directory = "/home/jovyan/work/data"

# Listar todos los directorios que comienzan con 'workplace_uuid='
workplace_uuids = [
    folder_name.split("=")[1] for folder_name in os.listdir(data_directory)
    if folder_name.startswith("workplace_uuid=")
]

# Mostrar el array con los UUIDs de los workplaces
print("Workplaces disponibles:", workplace_uuids)



Workplaces disponibles: ['7fd46663-e762-4b42-952d-759b81a50fad', 'accb4441-9875-4c20-9189-90dc3526b8e8', 'a5ebe1be-f1f0-4fa0-9086-90d44f6aec6a']


In [8]:
# Especifica el UUID que deseas cargar
workplace_uuid = workplace_uuids[0]  # Reemplaza con el UUID exacto

# Cargar los datos de ese workplace en particular
data_path = f"/home/jovyan/work/data/workplace_uuid={workplace_uuid}"
df_single_workplace = spark.read.parquet(data_path)

# Muestra el esquema y algunos datos para confirmar la carga
df_single_workplace.printSchema()
df_single_workplace.show(10)

root
 |-- uuid: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- sensor_id: string (nullable = true)
 |-- temperature: double (nullable = true)
 |-- light: double (nullable = true)
 |-- pressure: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- co2: double (nullable = true)
 |-- s0: double (nullable = true)
 |-- no2: double (nullable = true)
 |-- o3: double (nullable = true)
 |-- position_x: double (nullable = true)
 |-- position_y: double (nullable = true)
 |-- position_z: double (nullable = true)
 |-- fecha: date (nullable = true)

+--------------------+-------------------+---------+-----------+------+--------+--------+------+-----+-----+-----+------------------+------------------+----------+----------+
|                uuid|          timestamp|sensor_id|temperature| light|pressure|humidity|   co2|   s0|  no2|   o3|        position_x|        position_y|position_z|     fecha|
+--------------------+-------------------+---------+-----------+

In [None]:
# Seleccionar las columnas de interés para clustering (debes ajustar los nombres de columnas)
selected_columns = ["columna1", "columna2", "columna3"]  # Reemplaza con tus columnas

# Vectorizar las características
assembler = VectorAssembler(inputCols=selected_columns, outputCol="features")
df_vectorized = assembler.transform(df_parquet)

# Función de clustering con DBSCAN
@pandas_udf(IntegerType())
def dbscan_cluster(features_series: pd.Series) -> pd.Series:
    X = pd.DataFrame(features_series.tolist(), columns=selected_columns)
    clustering = DBSCAN(eps=0.5, min_samples=5).fit(X)
    return pd.Series(clustering.labels_)

# Aplica DBSCAN en el DataFrame vectorizado
df_clusters = df_vectorized.withColumn("cluster", dbscan_cluster("features"))

# Muestra los resultados de clustering
df_clusters.select(selected_columns + ["cluster"]).show()


In [10]:
# Especifica el UUID que deseas cargar
workplace_uuid = workplace_uuids[0]  # Reemplaza con el UUID específico si deseas otro

# Ruta de los datos del workplace específico
data_path = f"/home/jovyan/work/data/workplace_uuid={workplace_uuid}"
df_single_workplace = spark.read.parquet(data_path)

# Seleccionar solo las columnas deseadas
columns_to_select = [
    "sensor_id", "temperature", "light", "pressure", "humidity", "co2",
    "s0", "no2", "o3", "position_x", "position_y", "position_z"
]

# Crear un DataFrame con las columnas seleccionadas
df_selected = df_single_workplace.select(columns_to_select)

# Mostrar el esquema y algunos datos para confirmar la selección
df_selected.printSchema()
df_selected.show(10)

# Obtener el número total de registros
total_registros = df_selected.count()

# Mostrar el número total de registros
print(f"Número total de registros: {total_registros}")


root
 |-- sensor_id: string (nullable = true)
 |-- temperature: double (nullable = true)
 |-- light: double (nullable = true)
 |-- pressure: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- co2: double (nullable = true)
 |-- s0: double (nullable = true)
 |-- no2: double (nullable = true)
 |-- o3: double (nullable = true)
 |-- position_x: double (nullable = true)
 |-- position_y: double (nullable = true)
 |-- position_z: double (nullable = true)

+---------+-----------+------+--------+--------+------+-----+-----+-----+------------------+------------------+----------+
|sensor_id|temperature| light|pressure|humidity|   co2|   s0|  no2|   o3|        position_x|        position_y|position_z|
+---------+-----------+------+--------+--------+------+-----+-----+-----+------------------+------------------+----------+
|sensor_11|      23.22|346.75|  972.51|   66.86|765.08|0.071|0.035|0.013| 41.25834167436907|1.1823468133413673|       0.0|
|sensor_12|      23.17| 976.8| 1007.1