In [1]:
import pandas as pd

from importnb import imports
with imports("ipynb"):
    from utils import df_from_csv, df_to_csv

PM2.5: 35.9, AQI: 102
PM2.5: 35.9, Measure Level: MeasureLevels.UNHEALTHY_FOR_SENSITIVE_GROUPS, Range Values: Min: 35.5, Max: 55.4
AQI: 102, Measure Level: MeasureLevels.UNHEALTHY_FOR_SENSITIVE_GROUPS, Range Values: Min: 101, Max: 150


## Analisis Dia sin Carro y sin Moto

In [2]:
# Load the input datasets; every file is read from the same data folder.
DATA_FOLDER = 'dscysm/2023'

# Sensor catalogue (dtindex=False presumably skips the datetime index; confirm in utils).
df_tangaras = df_from_csv('tangaras.csv', datafolder=DATA_FOLDER, dtindex=False)

# Measurement series: temperature, humidity, PM2.5 and AQI.
df_temp = df_from_csv('temp_raw.csv', datafolder=DATA_FOLDER)
df_hum = df_from_csv('hum_raw.csv', datafolder=DATA_FOLDER)
df_pm25 = df_from_csv('pm25_clean.csv', datafolder=DATA_FOLDER)
df_aqi = df_from_csv('aqi.csv', datafolder=DATA_FOLDER)

## Zonas Geograficas

Ubicar cada sensor de Tangara en la correspondiente zona geografica de Cali, segun la distribucion de zonas geograficas establecida por la Infraestructura de Datos Espaciales de Santiago de Cali (IDESC). Mas informacion: https://www.cali.gov.co/planeacion/publicaciones/169423/zonas_geograficas_idesc/

In [3]:
# Geographic zone lookup
def get_zona_geografica(id_tangara: str) -> str:
    """Return the geographic zone of Cali assigned to a Tangara sensor.

    Args:
        id_tangara: Sensor identifier, e.g. ``'TANGARA_48C6'``.

    Returns:
        The zone name (for example ``'Zona Sur'``), or ``'N.A'`` when the
        identifier is not listed in any zone.
    """
    # TODO: Configure the geographic zone on each Tangara sensor at installation time
    zonas_geograficas = {
        'Zona Sur': ('TANGARA_1712', 'TANGARA_307A', 'TANGARA_2BBA', 'TANGARA_06BE', 'TANGARA_2FF6'),
        'Zona Norte': ('TANGARA_260A', 'TANGARA_14D6', 'TANGARA_F1AE'),
        'Zona Centro Historico Comercial': (),
        # Single-sensor zones need the trailing comma: ('X') is just the string 'X',
        # and `in` on a string does substring matching instead of membership.
        'Zona Centro Geografico': ('TANGARA_48C6',),
        'Zona Occidente': ('TANGARA_2B42', 'TANGARA_2E9A', 'TANGARA_532E'),
        'Zona Oriente': ('TANGARA_4B1A',),
    }
    for zona, sensores in zonas_geograficas.items():
        if id_tangara in sensores:
            return zona
    return 'N.A'

In [4]:
# Tag every sensor in the catalogue with its geographic zone
df_tangaras['ZONA_GEOGRAFICA'] = df_tangaras['ID'].apply(get_zona_geografica)
df_tangaras.head()

Unnamed: 0,ID,GEOHASH,MAC,GEOLOCATION,LATITUDE,LONGITUDE,ZONA_GEOGRAFICA
0,TANGARA_06BE,d29e6de,D29TTGOTD906BE,3.3982086181640625 -76.52595520019531,3.398209,-76.525955,Zona Sur
1,TANGARA_14D6,d29eg4k,D29ESP32DED14D6,3.4847259521484375 -76.50260925292969,3.484726,-76.502609,Zona Norte
2,TANGARA_1712,d29dfx4,D29ESP32DEE1712,3.3350372314453125 -76.52732849121094,3.335037,-76.527328,Zona Sur
3,TANGARA_260A,d29edyj,D29ESP32DE1260A,3.4613800048828125 -76.51222229003906,3.46138,-76.512222,Zona Norte
4,TANGARA_2B42,d29e6pg,D29ESP32DED2B42,3.4270477294921875 -76.54792785644531,3.427048,-76.547928,Zona Occidente


## Promedios Horarios

Calcular los promedios horarios de Temperatura, Humedad y PM2.5. El valor del AQI fue calculado previamente como el promedio horario movil de 24H.

In [5]:
# Downsample temperature, humidity and PM2.5 to hourly means (bins taken from each frame's index)
df_temp, df_hum, df_pm25 = (
    frame.resample('H').mean() for frame in (df_temp, df_hum, df_pm25)
)

In [6]:
# Temperature: preview the first three rows
df_temp.head(3)

Unnamed: 0_level_0,TANGARA_260A,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_1712,TANGARA_48C6,TANGARA_532E,...,TANGARA_06BE,TANGARA_2BBA,TANGARA_2BDE,TANGARA_39D6,TANGARA_3B7E,TANGARA_3BEA,TANGARA_421A,TANGARA_5D62,TANGARA_5636,TANGARA_D282
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-10-28 00:00:00-05:00,26.131217,25.187815,25.6525,25.585234,29.459412,25.507731,25.026639,23.70819,24.576316,24.215,...,25.980756,,,,,,,,,
2023-10-28 01:00:00-05:00,25.773565,24.535833,25.197632,25.450167,29.070667,24.870667,24.636917,23.404831,23.967544,23.673333,...,25.564083,,,,,,,,,
2023-10-28 02:00:00-05:00,25.696121,24.234083,24.72708,25.245333,28.712917,24.44042,24.514538,23.233833,23.535088,23.2775,...,25.3505,,,,,,,,,


In [7]:
# Humidity: preview the first three rows
df_hum.head(3)

Unnamed: 0_level_0,TANGARA_260A,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_1712,TANGARA_48C6,TANGARA_532E,...,TANGARA_06BE,TANGARA_2BBA,TANGARA_2BDE,TANGARA_39D6,TANGARA_3B7E,TANGARA_3BEA,TANGARA_421A,TANGARA_5D62,TANGARA_5636,TANGARA_D282
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-10-28 00:00:00-05:00,70.719565,76.764958,76.123017,71.466729,62.797395,79.906387,73.625966,79.034914,99.9,99.9,...,72.384202,,,,,,,,,
2023-10-28 01:00:00-05:00,72.308957,79.167333,77.903421,72.57625,64.527167,82.27,74.453667,79.645847,99.9,99.9,...,73.876583,,,,,,,,,
2023-10-28 02:00:00-05:00,72.140948,80.075583,78.666283,73.389583,65.782333,83.700672,74.011513,79.33275,99.9,99.9,...,74.396583,,,,,,,,,


In [8]:
# PM2.5: preview the first three rows
df_pm25.head(3)

Unnamed: 0_level_0,TANGARA_260A,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_1712,TANGARA_48C6,TANGARA_532E,...,TANGARA_06BE,TANGARA_2BBA,TANGARA_2BDE,TANGARA_39D6,TANGARA_3B7E,TANGARA_3BEA,TANGARA_421A,TANGARA_5D62,TANGARA_5636,TANGARA_D282
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-10-28 00:00:00-05:00,7.347826,13.714286,10.163793,5.457944,5.092437,4.87395,5.193277,4.318966,4.842105,5.716667,...,6.033613,,,,,,,,,
2023-10-28 01:00:00-05:00,5.895652,7.216667,9.385965,5.1,4.908333,5.257143,6.966667,5.779661,5.04386,6.475,...,7.041667,,,,,,,,,
2023-10-28 02:00:00-05:00,6.827586,11.5,8.39823,5.983333,5.541667,6.831933,4.386555,3.516667,5.95614,8.216667,...,8.558333,,,,,,,,,


In [9]:
# AQI: preview the first three rows
df_aqi.head(3)

Unnamed: 0_level_0,TANGARA_260A,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_1712,TANGARA_48C6,TANGARA_532E,...,TANGARA_06BE,TANGARA_2BBA,TANGARA_2BDE,TANGARA_39D6,TANGARA_3B7E,TANGARA_3BEA,TANGARA_421A,TANGARA_5D62,TANGARA_5636,TANGARA_D282
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-10-29 00:00:00-05:00,55.0,64.0,63.0,53.0,33.0,55.0,42.0,32.0,44.0,32.0,...,61.0,,,,,,,,,
2023-10-29 01:00:00-05:00,54.0,65.0,62.0,53.0,33.0,55.0,42.0,31.0,43.0,32.0,...,61.0,,,,,,,,,
2023-10-29 02:00:00-05:00,54.0,65.0,63.0,52.0,33.0,54.0,42.0,31.0,43.0,33.0,...,61.0,,,,,,,,,


In [10]:
import warnings
# NOTE(review): blanket suppression is kept so later cells behave as before, but it also hides
# deprecation warnings; consider narrowing it to a category or removing it.
warnings.filterwarnings('ignore')

# Columns of the consolidated dataset, exported as a single file so it can be imported
# into Apache Superset to build the analysis dashboard.
columnas_dscysm = ['DATETIME','PM25','TEMP','HUM','AQI','ID_TANGARA','LATITUDE','LONGITUDE','ZONA_GEOGRAFICA']

# Minimum percentage of AQI readings a sensor must have reported to be kept.
# Defined before the loop so the summary message below also works when there are no sensors.
# The percentage is rounded and compared with `<`, so a sensor at exactly the threshold is kept.
umbral_minimo = 70

# Merge all variables on DATETIME. pandas only adds suffixes to column names that overlap
# between the two frames being merged: PM2.5/temperature collide in the first merge, the
# humidity columns stay unsuffixed after the second one and receive '_HUM' only when they
# collide with the AQI columns in the third. All four frames must therefore share the same
# sensor columns.
df_merged = (
    df_pm25
    .merge(df_temp, on='DATETIME', suffixes=['_PM25', '_TEMP'])
    .merge(df_hum, on='DATETIME')
    .merge(df_aqi, on='DATETIME', suffixes=['_HUM', '_AQI'])
)
total_registros = len(df_merged)

# One frame per accepted sensor; they are concatenated once after the loop instead of
# growing the result with pd.concat on every iteration.
frames_por_sensor = []
sensores = df_tangaras[['ID', 'LATITUDE', 'LONGITUDE', 'ZONA_GEOGRAFICA']].itertuples(index=False, name=None)
for id_tangara, latitude, longitude, zona_geografica in sensores:
    serie_aqi = df_merged[f'{id_tangara}_AQI']

    # Work only with sensors that reported enough data, measured on the AQI variable.
    # An empty merge result counts as 0% instead of failing on a division by zero.
    percent_datos = round(serie_aqi.count() * 100 / total_registros) if total_registros else 0
    if percent_datos < umbral_minimo:
        print(f"No hay suficientes datos reportados por {id_tangara}: solo ha reportado el {percent_datos}%, se descarta para el analisis")
        continue

    # Long-format rows for this sensor: one row per timestamp, sensor metadata broadcast to every row.
    frames_por_sensor.append(pd.DataFrame({
        'DATETIME': df_merged.index,
        'PM25': df_merged[f'{id_tangara}_PM25'].to_numpy(),
        'TEMP': df_merged[f'{id_tangara}_TEMP'].to_numpy(),
        'HUM': df_merged[f'{id_tangara}_HUM'].to_numpy(),
        'AQI': serie_aqi.to_numpy(),
        'ID_TANGARA': id_tangara,
        'LATITUDE': latitude,
        'LONGITUDE': longitude,
        'ZONA_GEOGRAFICA': zona_geografica,
    }, columns=columnas_dscysm))

# The row index restarts at 0 for every sensor, as in the previous implementation.
# pd.concat raises on an empty list, so fall back to an empty frame with the expected columns.
if frames_por_sensor:
    df_dscysm = pd.concat(frames_por_sensor)
else:
    df_dscysm = pd.DataFrame(columns=columnas_dscysm)

print(f"Solo se consideran sensores de Tangara que han reportado mas del {umbral_minimo}% de datos")

df_dscysm.tail()

No hay suficientes datos reportados por TANGARA_1712: solo ha reportado el 22%, se descarta para el analisis
No hay suficientes datos reportados por TANGARA_2BBA: solo ha reportado el 3%, se descarta para el analisis
No hay suficientes datos reportados por TANGARA_2BDE: solo ha reportado el 8%, se descarta para el analisis
No hay suficientes datos reportados por TANGARA_39D6: solo ha reportado el 4%, se descarta para el analisis
No hay suficientes datos reportados por TANGARA_3B7E: solo ha reportado el 8%, se descarta para el analisis
No hay suficientes datos reportados por TANGARA_3BEA: solo ha reportado el 8%, se descarta para el analisis
No hay suficientes datos reportados por TANGARA_421A: solo ha reportado el 8%, se descarta para el analisis
No hay suficientes datos reportados por TANGARA_532E: solo ha reportado el 23%, se descarta para el analisis
No hay suficientes datos reportados por TANGARA_5636: solo ha reportado el 8%, se descarta para el analisis
No hay suficientes datos r

Unnamed: 0,DATETIME,PM25,TEMP,HUM,AQI,ID_TANGARA,LATITUDE,LONGITUDE,ZONA_GEOGRAFICA
499,2023-11-18 19:00:00-05:00,4.908333,28.162333,61.019333,21.0,TANGARA_F1AE,3.486099,-76.495743,Zona Norte
500,2023-11-18 20:00:00-05:00,4.35,28.456417,55.375917,21.0,TANGARA_F1AE,3.486099,-76.495743,Zona Norte
501,2023-11-18 21:00:00-05:00,6.458333,27.955667,54.391,23.0,TANGARA_F1AE,3.486099,-76.495743,Zona Norte
502,2023-11-18 22:00:00-05:00,5.8,27.0925,59.82225,20.0,TANGARA_F1AE,3.486099,-76.495743,Zona Norte
503,2023-11-18 23:00:00-05:00,3.566667,26.169917,63.023333,16.0,TANGARA_F1AE,3.486099,-76.495743,Zona Norte


## Agregar dimension Dia de la Semana

In [11]:
# Weekday dimension: Spanish day names indexed by dayofweek (Monday=0 ... Sunday=6)
dias_semana = ('Lunes','Martes','Miercoles','Jueves','Viernes','Sabado','Domingo')
df_dscysm['DIA_SEMANA'] = df_dscysm['DATETIME'].apply(lambda marca: dias_semana[marca.dayofweek])
df_dscysm.head()

Unnamed: 0,DATETIME,PM25,TEMP,HUM,AQI,ID_TANGARA,LATITUDE,LONGITUDE,ZONA_GEOGRAFICA,DIA_SEMANA
0,2023-10-29 00:00:00-05:00,16.312236,25.778403,73.982353,61.0,TANGARA_06BE,3.398209,-76.525955,Zona Sur,Domingo
1,2023-10-29 01:00:00-05:00,14.791667,25.753917,74.124083,61.0,TANGARA_06BE,3.398209,-76.525955,Zona Sur,Domingo
2,2023-10-29 02:00:00-05:00,19.7,25.505917,74.522333,61.0,TANGARA_06BE,3.398209,-76.525955,Zona Sur,Domingo
3,2023-10-29 03:00:00-05:00,18.133333,25.165667,74.405333,61.0,TANGARA_06BE,3.398209,-76.525955,Zona Sur,Domingo
4,2023-10-29 04:00:00-05:00,16.737288,24.799237,74.484068,60.0,TANGARA_06BE,3.398209,-76.525955,Zona Sur,Domingo


## Dataset para Analisis Completado

Para hacer un analisis de forma visual, se usara el software de Apache Superset, donde se va a construir un dashboard con varias graficas para soportar las conclusiones del analisis a partir de los datos.

In [12]:
# Save the consolidated dataset (all variables, one row per sensor and timestamp) as dscysm.csv
df_to_csv(df_dscysm, "dscysm.csv", datafolder='dscysm/2023')