In [1]:
import math
import os
from pathlib import Path

from dotenv import load_dotenv
from importnb import imports

# Load .env variables from "<grandparent of the working directory>/standalone/.env".
# The path is assembled with pathlib's "/" operator instead of string formatting.
# The result is bound to "_" so the return value is not echoed as cell output.
_ = load_dotenv(dotenv_path=Path().resolve().parents[1] / "standalone" / ".env")

# Import the helpers from "utils" through importnb (new API).
# This runs after the .env file has been loaded, as in the original cell order.
with imports("ipynb"):
    from utils import to_timestamp, df_data_sensors, to_be_checked, is_corr_ok, df_to_csv, df_from_csv

PM2.5: 35.9, AQI: 102
PM2.5: 35.9, Measure Level: MeasureLevels.UNHEALTHY_FOR_SENSITIVE_GROUPS, Range Values: Min: 35.5, Max: 55.4
AQI: 102, Measure Level: MeasureLevels.UNHEALTHY_FOR_SENSITIVE_GROUPS, Range Values: Min: 101, Max: 150


## Humidity Raw Data

In [2]:
# Read the Tangara sensors catalogue from its CSV file
# (dtindex=False is passed through to the df_from_csv helper)
df_tangaras = df_from_csv(
    "tangaras.csv",
    dtindex=False,
)

# Report how many sensors were loaded
total_tangaras = len(df_tangaras)
print(f"Total Tangara Sensors: {total_tangaras}")

# Preview the first rows
df_tangaras.head()

Total Tangara Sensors: 18


Unnamed: 0,ID,GEOHASH,MAC,GEOLOCATION,LATITUDE,LONGITUDE
0,TANGARA_260A,d29edyj,D29ESP32DE1260A,3.4613800048828125 -76.51222229003906,3.46138,-76.512222
1,TANGARA_2BDE,d29dbmw,D29ESP32DE52BDE,3.3267974853515625 -76.62071228027344,3.326797,-76.620712
2,TANGARA_39D6,d29dbmw,D29ESP32DE539D6,3.3267974853515625 -76.62071228027344,3.326797,-76.620712
3,TANGARA_3B7E,d29dbmw,D29ESP32DE53B7E,3.3267974853515625 -76.62071228027344,3.326797,-76.620712
4,TANGARA_3BEA,d29dbmw,D29ESP32DE53BEA,3.3267974853515625 -76.62071228027344,3.326797,-76.620712


In [3]:
# Analysis window and grouping interval, all read from environment variables.
# Each value is None when its variable is not set.

# Start Date Time ISO 8601 Format, TZ='America/Bogota' -05:00
START_ISO8601_DATETIME = os.getenv("START_ISO8601_DATETIME")
# End Date Time ISO 8601 Format, TZ='America/Bogota' -05:00
END_ISO8601_DATETIME = os.getenv("END_ISO8601_DATETIME")
# GROUP BY TIME
GROUP_BY_TIME = os.getenv("GROUP_BY_TIME")

# Convert both bounds with the same helper, from the values read above, so the
# printed ISO strings and the timestamps always come from a single read.
# NOTE(review): to_timestamp presumably returns epoch milliseconds - confirm in utils.
start_timestamp = to_timestamp(START_ISO8601_DATETIME)
end_timestamp = to_timestamp(END_ISO8601_DATETIME)

print(f'Since: {START_ISO8601_DATETIME} -> {start_timestamp}, Until: {END_ISO8601_DATETIME} -> {end_timestamp}')
print(f"Group by Time: {GROUP_BY_TIME}")

Since: 2023-11-17T00:00:00-05:00 -> 1700197200000, Until: 2023-11-18T23:59:59-05:00 -> 1700369999000
Group by Time: 30s


In [4]:
# Build the humidity ('hum') data frame from the Tangara sensors, between
# start_timestamp and end_timestamp, using the GROUP_BY_TIME interval
df_hum_raw = df_data_sensors(
    df_tangaras,
    start_timestamp,
    end_timestamp,
    'hum',
    GROUP_BY_TIME,
)

# Preview the first rows
df_hum_raw.head()

Unnamed: 0_level_0,TANGARA_260A,TANGARA_2BDE,TANGARA_39D6,TANGARA_3B7E,TANGARA_3BEA,TANGARA_421A,TANGARA_5D62,TANGARA_5636,TANGARA_D282,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2023-11-17 00:00:00-05:00,,,,,,,,,,,,65.2,,,,,60.49,
2023-11-17 00:00:30-05:00,59.97,,,,,,,,,71.14,64.94,65.34,59.34,71.81,68.62,99.9,60.41,65.77
2023-11-17 00:01:00-05:00,59.83,,,,,,,,,71.22,65.15,65.42,59.39,71.98,68.72,99.9,60.4,65.84
2023-11-17 00:01:30-05:00,59.73,,,,,,,,,71.23,65.21,65.4,59.46,72.18,68.74,99.9,60.4,65.81
2023-11-17 00:02:00-05:00,59.67,,,,,,,,,71.24,65.22,65.5,59.57,72.36,68.69,99.9,60.43,65.76


## Descriptive Statistics

In [5]:
# Describe Data: summary statistics (count, mean, std, min, quartiles, max)
# for each column of the raw humidity data frame
df_hum_raw.describe()

Unnamed: 0,TANGARA_260A,TANGARA_2BDE,TANGARA_39D6,TANGARA_3B7E,TANGARA_3BEA,TANGARA_421A,TANGARA_5D62,TANGARA_5636,TANGARA_D282,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
count,5727.0,1359.0,1.0,1292.0,1376.0,1425.0,2175.0,1304.0,1435.0,5726.0,5665.0,4978.0,4810.0,5718.0,5711.0,5733.0,5739.0,5715.0
mean,60.77283,55.216152,83.65,45.112074,46.802776,48.15887,0.0,55.182331,48.363164,67.856343,63.148258,63.096131,58.485152,69.205203,62.446959,99.9,57.892729,62.661039
std,7.253669,4.690169,,6.512747,5.624303,7.402838,0.0,3.793066,4.457632,10.176782,9.080769,4.718506,4.870566,8.683055,8.867404,1.421209e-14,7.447153,7.161146
min,47.29,49.21,83.65,38.05,40.25,41.11,0.0,48.28,41.98,48.96,45.5,51.09,43.86,51.01,40.89,99.9,45.0,46.87
25%,54.495,53.055,83.65,42.5975,43.8175,45.05,0.0,53.0575,46.305,57.9925,54.91,59.05,54.8225,61.25,53.33,99.9,50.7,55.91
50%,60.91,54.21,83.65,43.825,45.93,45.61,0.0,55.0,47.74,70.59,64.67,62.86,59.57,70.535,64.77,99.9,59.0,63.65
75%,65.69,54.915,83.65,45.2,46.7,47.11,0.0,55.59,48.42,75.13,69.82,67.5175,61.7975,76.8,70.25,99.9,63.85,68.915
max,76.41,82.24,83.65,83.94,82.08,80.38,0.0,78.11,79.43,85.61,80.29,73.7,68.88,83.9,76.34,99.9,72.3,76.09


## Missing Data

In [6]:
import warnings

# Silence every warning from this point on; the filter is process-wide, so it
# also stays active for the cells executed after this one
warnings.filterwarnings('ignore')

# Missing Data Threshold, printed as a percentage
threshold = 90
print(f'Threshold: {threshold}%')

# Check the sensors one column at a time and report only the ones that fail
for id_tangara_sensor in df_hum_raw.columns:
    # Single-column data frame holding just this sensor
    df_hum_sensor = df_hum_raw[[id_tangara_sensor]]
    # To be checked
    is_ok, data_percent, missing_data_percent = to_be_checked(df_hum_sensor, threshold)
    if is_ok:
        continue
    print(
        f"Tangara Sensor: {id_tangara_sensor}, "
        f"Data: {data_percent}%, "
        f"Missing: {missing_data_percent}%, "
        f"To be checked"
    )

Threshold: 90%
Tangara Sensor: TANGARA_2BDE, Data: 24%, Missing: 76%, To be checked
Tangara Sensor: TANGARA_39D6, Data: 0%, Missing: 100%, To be checked
Tangara Sensor: TANGARA_3B7E, Data: 22%, Missing: 78%, To be checked
Tangara Sensor: TANGARA_3BEA, Data: 24%, Missing: 76%, To be checked
Tangara Sensor: TANGARA_421A, Data: 25%, Missing: 75%, To be checked
Tangara Sensor: TANGARA_5D62, Data: 38%, Missing: 62%, To be checked
Tangara Sensor: TANGARA_5636, Data: 23%, Missing: 77%, To be checked
Tangara Sensor: TANGARA_D282, Data: 25%, Missing: 75%, To be checked
Tangara Sensor: TANGARA_2B42, Data: 86%, Missing: 14%, To be checked
Tangara Sensor: TANGARA_2E9A, Data: 84%, Missing: 16%, To be checked


## Data Correlation

In [7]:
# Data Correlation Threshold.
# This is a correlation coefficient, not a percentage, so it is printed without "%".
threshold = 0.9
print(f'Threshold: {threshold}')

ID_REFE_TANGARA_SENSOR = 'TANGARA_06BE'
ID_TARG_TANGARA_SENSOR = 'TANGARA_2FF6'
# Reference Tangara Sensor
df_reference_sensor = df_hum_raw[ID_REFE_TANGARA_SENSOR]
# Target Tangara Sensor
df_target_sensor = df_hum_raw[ID_TARG_TANGARA_SENSOR]

# To be checked
# std() is NaN when the reference column has too few readings (presumably fewer
# than two non-null values); report that case instead of skipping it silently.
if math.isnan(df_reference_sensor.std()):
    print(f"Reference Tangara Sensor: {ID_REFE_TANGARA_SENSOR}, Standard deviation is NaN, Correlation check skipped")
else:
    is_ok, corr = is_corr_ok(df_reference_sensor, df_target_sensor, threshold)
    if not is_ok:
        print(f"Reference Tangara Sensor: {ID_REFE_TANGARA_SENSOR}, Target Tangara Sensor: {ID_TARG_TANGARA_SENSOR}, Correlation: {corr}, To be checked")

# To screen every sensor against the reference, iterate over df_hum_raw.columns
# and repeat the check above with each column as the target.


Threshold: 0.9%
Reference Tangara Sensor: TANGARA_06BE, Target Tangara Sensor: TANGARA_2FF6, Correlation: 0.0, To be checked


In [8]:
# Save Humidity Data Frame Sensors into CSV file ("hum_raw.csv")
# NOTE(review): the destination directory is decided inside df_to_csv - confirm in utils
df_to_csv(df_hum_raw, "hum_raw.csv")