In [1]:
import os
import math
from dotenv import load_dotenv

# Explicitly providing path to '.env'
from pathlib import Path  # used to build the '.env' path below
# Load .env variables from "<grandparent of the current working directory>/src/.env".
# NOTE(review): the path is derived from the kernel's working directory, so the
# notebook must be started from the expected folder for the file to be found.
# NOTE(review): keep this call before the `utils` import below in case `utils`
# reads environment variables at import time — TODO confirm.
_ = load_dotenv(dotenv_path=f"{Path().resolve().parents[1]}/src/.env")

# importnb's `imports("ipynb")` context manager is active while `utils` is imported
# (presumably `utils` is a notebook, utils.ipynb, rather than a .py module — TODO confirm).
from importnb import imports
with imports("ipynb"):
    from utils import to_timestamp, df_data_sensors, missing_data, correlation_data, df_to_csv, df_from_csv

PM2.5: 35.9, AQI: 102
PM2.5: 35.9, Measure Level: MeasureLevels.UNHEALTHY_FOR_SENSITIVE_GROUPS, Range Values: Min: 35.5, Max: 55.4
AQI: 102, Measure Level: MeasureLevels.UNHEALTHY_FOR_SENSITIVE_GROUPS, Range Values: Min: 101, Max: 150


## Humidity Raw Data

In [2]:
# Load Tangaras
# Read the sensor catalogue from "tangaras.csv" through the project helper.
# dtindex=False presumably means the frame is not given a datetime index — TODO confirm in utils.
df_tangaras = df_from_csv("tangaras.csv", dtindex=False)

# One row per sensor, so the row count is the number of sensors in the catalogue.
print(f"Total Tangara Sensors: {len(df_tangaras)}")

# Preview the first rows as the cell output.
df_tangaras.head()

[32m2023-12-15 15:35:34.197[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mdf_from_csv[0m:[36m340[0m - [34m[1mLoad DataFrame: /home/sebaxtian/Workspaces/Tangara/tangara-evaluation/src/data/0_raw/tangaras.csv[0m


Total Tangara Sensors: 22


Unnamed: 0,ID,GEOHASH,MAC,GEOLOCATION,LATITUDE,LONGITUDE
0,TANGARA_25CE,d29e4r0,D29ESP32DE125CE,3.3789825439453125 -76.54106140136719,3.378983,-76.541061
1,TANGARA_260A,d29edyj,D29ESP32DE1260A,3.4613800048828125 -76.51222229003906,3.46138,-76.512222
2,TANGARA_2BDE,d29e6de,D29ESP32DE52BDE,3.3982086181640625 -76.52595520019531,3.398209,-76.525955
3,TANGARA_39D6,d29e6de,D29ESP32DE539D6,3.3982086181640625 -76.52595520019531,3.398209,-76.525955
4,TANGARA_3B7E,d29e6de,D29ESP32DE53B7E,3.3982086181640625 -76.52595520019531,3.398209,-76.525955


In [3]:
# Filter by ID_TANGARA_REFERENCE and IDS_TANGARA_TARGETS
# Keeps only the catalogue rows whose 'ID' is the reference sensor or one of the targets.

# Tangara Sensor Reference (a single sensor ID, read from the environment)
ID_TANGARA_REFERENCE = os.getenv("ID_TANGARA_REFERENCE", None)
# Tangara Sensors Target (comma-separated sensor IDs, read from the environment)
IDS_TANGARA_TARGETS = os.getenv("IDS_TANGARA_TARGETS", None)

print(f"Tangara Sensor Reference: {ID_TANGARA_REFERENCE}")
print(f"Tangara Sensors Target: {IDS_TANGARA_TARGETS}")

# Tolerate blanks around the commas and empty entries (e.g. "A, B,,C"), which
# would otherwise never match an 'ID' value.
ids = [id_.strip() for id_ in IDS_TANGARA_TARGETS.split(',') if id_.strip()] if IDS_TANGARA_TARGETS else []
# Add the reference only when it is configured and not already listed, so the
# filter list never contains None or a duplicated ID.
if ID_TANGARA_REFERENCE and ID_TANGARA_REFERENCE not in ids:
    ids.append(ID_TANGARA_REFERENCE)

# Re-running this cell is safe: filtering an already filtered frame is a no-op.
df_tangaras = df_tangaras[df_tangaras['ID'].isin(ids)]

print(f"Total Tangara Sensors: {len(df_tangaras)}")

# Preview the first rows as the cell output.
df_tangaras.head()

Tangara Sensor Reference: TANGARA_5636
Tangara Sensors Target: TANGARA_39D6,TANGARA_2BDE,TANGARA_3B7E,TANGARA_3BEA,TANGARA_421A,TANGARA_422A,TANGARA_5636,TANGARA_D282
Total Tangara Sensors: 8


Unnamed: 0,ID,GEOHASH,MAC,GEOLOCATION,LATITUDE,LONGITUDE
2,TANGARA_2BDE,d29e6de,D29ESP32DE52BDE,3.3982086181640625 -76.52595520019531,3.398209,-76.525955
3,TANGARA_39D6,d29e6de,D29ESP32DE539D6,3.3982086181640625 -76.52595520019531,3.398209,-76.525955
4,TANGARA_3B7E,d29e6de,D29ESP32DE53B7E,3.3982086181640625 -76.52595520019531,3.398209,-76.525955
5,TANGARA_3BEA,d29e6de,D29ESP32DE53BEA,3.3982086181640625 -76.52595520019531,3.398209,-76.525955
6,TANGARA_421A,d29e6de,D29ESP32DE5421A,3.3982086181640625 -76.52595520019531,3.398209,-76.525955


In [4]:
# Analysis window and aggregation interval, all read from the environment.

# Start Date Time ISO 8601 Format, TZ='America/Bogota' -05:00
START_ISO8601_DATETIME = os.getenv("START_ISO8601_DATETIME", None)
start_timestamp = to_timestamp(START_ISO8601_DATETIME)
# End Date Time ISO 8601 Format, TZ='America/Bogota' -05:00
END_ISO8601_DATETIME = os.getenv("END_ISO8601_DATETIME", None)
# Convert the value that was just read (same pattern as the start bound) instead
# of looking the variable up a second time, so the printed ISO string and the
# converted timestamp always come from the same read.
end_timestamp = to_timestamp(END_ISO8601_DATETIME)

# GROUP BY TIME (aggregation interval handed to the sensor query, e.g. "30s")
GROUP_BY_TIME = os.getenv("GROUP_BY_TIME", None)

print(f'Since: {START_ISO8601_DATETIME} -> {start_timestamp}, Until: {END_ISO8601_DATETIME} -> {end_timestamp}')
print(f"Group by Time: {GROUP_BY_TIME}")

[32m2023-12-15 15:35:34.245[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mto_timestamp[0m:[36m99[0m - [34m[1mdatetime_iso8601: 2023-11-26T00:00:00-05:00, Timestamp: 1700974800000[0m
[32m2023-12-15 15:35:34.246[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mto_timestamp[0m:[36m99[0m - [34m[1mdatetime_iso8601: 2023-12-10T23:59:59-05:00, Timestamp: 1702270799000[0m


Since: 2023-11-26T00:00:00-05:00 -> 1700974800000, Until: 2023-12-10T23:59:59-05:00 -> 1702270799000
Group by Time: 30s


In [5]:
# Humidity Data Frame Sensors
# Fetch the 'hum' measure for the filtered sensors between start_timestamp and
# end_timestamp, passing GROUP_BY_TIME as the grouping interval.
df_hum_raw = df_data_sensors(df_tangaras, start_timestamp, end_timestamp, 'hum', GROUP_BY_TIME)
# Preview the first rows as the cell output.
df_hum_raw.head()

[32m2023-12-15 15:35:34.255[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mquery_measure[0m:[36m192[0m - [34m[1msql_query: SELECT last("hum") FROM "fixed_stations_01" WHERE ("name" = 'D29ESP32DE52BDE') AND time >= 1700974800000ms AND time <= 1702270799000ms GROUP BY time(30s) fill(null); SELECT last("hum") FROM "fixed_stations_01" WHERE ("name" = 'D29ESP32DE539D6') AND time >= 1700974800000ms AND time <= 1702270799000ms GROUP BY time(30s) fill(null); SELECT last("hum") FROM "fixed_stations_01" WHERE ("name" = 'D29ESP32DE53B7E') AND time >= 1700974800000ms AND time <= 1702270799000ms GROUP BY time(30s) fill(null); SELECT last("hum") FROM "fixed_stations_01" WHERE ("name" = 'D29ESP32DE53BEA') AND time >= 1700974800000ms AND time <= 1702270799000ms GROUP BY time(30s) fill(null); SELECT last("hum") FROM "fixed_stations_01" WHERE ("name" = 'D29ESP32DE5421A') AND time >= 1700974800000ms AND time <= 1702270799000ms GROUP BY time(30s) fill(null); SELECT last("hum") FROM "fixed_station

Unnamed: 0_level_0,TANGARA_2BDE,TANGARA_39D6,TANGARA_3B7E,TANGARA_3BEA,TANGARA_421A,TANGARA_422A,TANGARA_5636,TANGARA_D282
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-11-26 00:00:00-05:00,,,,,,,,
2023-11-26 00:00:30-05:00,55.85,54.69,51.6,49.27,48.37,,,
2023-11-26 00:01:00-05:00,56.08,55.08,51.95,49.83,48.79,,,
2023-11-26 00:01:30-05:00,56.27,55.2,52.1,49.99,48.92,47.22,51.62,52.63
2023-11-26 00:02:00-05:00,56.42,55.43,52.33,50.2,49.15,47.53,51.86,52.84


## Descriptive Statistics

In [6]:
# Describe Data
# Per-sensor summary statistics of the raw humidity readings, computed before any
# sensor is dropped by the missing-data and correlation filters further down.
# NOTE(review): in the saved output TANGARA_421A has a minimum of 0.0 and a maximum
# of 98.78, far from the other sensors — possibly spurious readings; worth checking.
df_hum_raw.describe()

Unnamed: 0,TANGARA_2BDE,TANGARA_39D6,TANGARA_3B7E,TANGARA_3BEA,TANGARA_421A,TANGARA_422A,TANGARA_5636,TANGARA_D282
count,28045.0,25549.0,39888.0,39358.0,39186.0,38837.0,38993.0,39058.0
mean,50.113499,46.594498,46.447443,45.220562,44.543531,42.7872,45.947322,47.337607
std,8.270983,8.17941,7.904166,7.922073,8.862698,7.288838,7.654836,8.233174
min,33.97,30.49,30.62,29.56,0.0,29.08,31.01,31.47
25%,43.13,39.71,39.81,38.9,38.02,36.81,39.69,40.68
50%,50.68,46.65,46.81,45.31,44.425,42.66,46.14,47.43
75%,56.47,53.43,52.28,50.8075,49.37,47.92,51.65,53.23
max,78.17,88.71,68.34,71.89,98.78,63.25,69.28,71.48


## Missing Data

In [7]:
import warnings
# NOTE(review): this silences every warning for the rest of the kernel session,
# not only the ones raised by this cell — consider narrowing it to a category.
warnings.filterwarnings('ignore')

# Missing Data Threshold, configured as a fraction (e.g. 0.8) and shown as a percentage
threshold_data = float(os.getenv("THRESHOLD_MISSING_DATA", None))
# round() rather than int(): int() truncates float artefacts such as
# 0.57 * 100 == 56.99999999999999 down to 56.
threshold_percent = round(threshold_data * 100)
print(f'Threshold Missing Data: {threshold_percent}%')

# Sensors flagged by missing_data(); collected first and dropped in one step below
sensors_to_drop = []

# For each Tangara sensor
for id_tangara_sensor in df_hum_raw.columns:
    df_hum_sensor = df_hum_raw[[id_tangara_sensor]]
    # To be checked
    to_be_checked, data_percent, missing_data_percent = missing_data(df_hum_sensor, threshold_data)
    if to_be_checked:
        # Report the threshold as a percentage, matching the other figures in the
        # message (the raw fraction used to be printed with a '%' sign: "0.8%").
        print(f"Tangara Sensor: {id_tangara_sensor}, To be checked: {to_be_checked}, Data: {data_percent}%, Missing Data: {missing_data_percent}%, Threshold: {threshold_percent}%")
        sensors_to_drop.append(id_tangara_sensor)

# Drop the flagged Tangara sensors from the original dataframe.
# NOTE: this rebinds df_hum_raw, so the earlier head()/describe() outputs still
# show the sensors that are removed here.
df_hum_raw = df_hum_raw.drop(columns=sensors_to_drop)

[32m2023-12-15 15:35:55.555[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mmissing_data[0m:[36m230[0m - [34m[1mTangara Sensor: TANGARA_2BDE, To be checked: True, Data: 65%, Missing Data: 35%, Threshold: 80%[0m
[32m2023-12-15 15:35:55.558[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mmissing_data[0m:[36m230[0m - [34m[1mTangara Sensor: TANGARA_39D6, To be checked: True, Data: 59%, Missing Data: 41%, Threshold: 80%[0m
[32m2023-12-15 15:35:55.561[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mmissing_data[0m:[36m230[0m - [34m[1mTangara Sensor: TANGARA_3B7E, To be checked: False, Data: 92%, Missing Data: 8%, Threshold: 80%[0m
[32m2023-12-15 15:35:55.563[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mmissing_data[0m:[36m230[0m - [34m[1mTangara Sensor: TANGARA_3BEA, To be checked: False, Data: 91%, Missing Data: 9%, Threshold: 80%[0m
[32m2023-12-15 15:35:55.565[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mmissing_data[0m:[36m230[0m - [34m[1m

Threshold Missing Data: 80%
Tangara Sensor: TANGARA_2BDE, To be checked: True, Data: 65%, Missing Data: 35%, Threshold: 0.8%
Tangara Sensor: TANGARA_39D6, To be checked: True, Data: 59%, Missing Data: 41%, Threshold: 0.8%


## Data Correlation

In [8]:
# Data Correlation Threshold, configured as a fraction (e.g. 0.7) and shown as a percentage
threshold_correlation = float(os.getenv("THRESHOLD_CORRELATION_DATA", None))
# round() rather than int(): int() truncates float artefacts such as
# 0.57 * 100 == 56.99999999999999 down to 56.
print(f'Threshold Correlation Data: {round(threshold_correlation * 100)}%')

# Tangara Sensor Reference
ID_TANGARA_REFERENCE = os.getenv("ID_TANGARA_REFERENCE", None)

# Tangara Sensors Target (comma-separated in the environment, turned into a list here)
IDS_TANGARA_TARGETS = os.getenv("IDS_TANGARA_TARGETS", None)
# The fallback must be a list as well: a bare string would make the loop below
# iterate over the characters of the reference ID. Blanks around the commas and
# empty entries are discarded so they cannot hide a sensor column.
IDS_TANGARA_TARGETS = (
    [id_.strip() for id_ in IDS_TANGARA_TARGETS.split(',') if id_.strip()]
    if IDS_TANGARA_TARGETS
    else [ID_TANGARA_REFERENCE]
)

# The reference column may be missing (variable not set, or the sensor was dropped
# by the missing-data filter); fail with a message that says so.
if ID_TANGARA_REFERENCE not in df_hum_raw.columns:
    raise KeyError(
        f"Reference sensor {ID_TANGARA_REFERENCE!r} is not a column of df_hum_raw "
        "(not configured, or dropped by an earlier filter)"
    )

# Reference Tangara Sensor
df_tangara_reference = df_hum_raw[[ID_TANGARA_REFERENCE]]

# Standard deviation of the reference column as a scalar. DataFrame.std() returns
# a Series, and math.isnan() on it relies on the deprecated single-element
# Series -> float conversion. It is also loop-invariant, so compute it once.
reference_std = df_tangara_reference[ID_TANGARA_REFERENCE].std()

# For each Tangara Sensor Target
for id_tangara in IDS_TANGARA_TARGETS:
    # Skip targets that are not (or no longer) present in the dataframe
    if id_tangara not in df_hum_raw.columns:
        continue

    # Target Tangara Sensor
    df_tangara_target = df_hum_raw[[id_tangara]]

    # To be checked: only when the reference has a defined standard deviation
    if math.isnan(reference_std):
        continue

    has_corr, corr = correlation_data(df_tangara_reference, df_tangara_target, threshold_correlation)
    if not has_corr:
        # Round outside the f-string: reusing the same quote character inside an
        # f-string expression is a SyntaxError before Python 3.12.
        corr_rounded = float("{:.2f}".format(corr))
        print(f"Tangara Sensor Reference: {ID_TANGARA_REFERENCE}, Tangara Sensors Target: {id_tangara}, There is correlation: {has_corr}, Correlation: {corr_rounded}, Threshold: {threshold_correlation}")

        # Drop Tangara Sensor from original dataframe
        # (df_tangara_reference was sliced beforehand, so it is unaffected)
        df_hum_raw = df_hum_raw.drop(id_tangara, axis=1)

[32m2023-12-15 15:35:55.583[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mcorrelation_data[0m:[36m287[0m - [34m[1mTangara Sensor Reference: TANGARA_5636, Tangara Sensors Target: TANGARA_3B7E, There is correlation: True, Correlation: 1.0, Threshold: 0.7[0m
[32m2023-12-15 15:35:55.593[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mcorrelation_data[0m:[36m287[0m - [34m[1mTangara Sensor Reference: TANGARA_5636, Tangara Sensors Target: TANGARA_3BEA, There is correlation: True, Correlation: 0.99, Threshold: 0.7[0m
[32m2023-12-15 15:35:55.622[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mcorrelation_data[0m:[36m287[0m - [34m[1mTangara Sensor Reference: TANGARA_5636, Tangara Sensors Target: TANGARA_421A, There is correlation: True, Correlation: 0.92, Threshold: 0.7[0m
[32m2023-12-15 15:35:55.635[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mcorrelation_data[0m:[36m287[0m - [34m[1mTangara Sensor Reference: TANGARA_5636, Tangara Sensors Target: TANGARA_4

Threshold Correlation Data: 70%


In [9]:
# Save Humidity Data Frame Sensors into CSV file
# At this point df_hum_raw no longer contains the sensors dropped by the
# missing-data and correlation cells. The destination directory is chosen by
# df_to_csv (the saved debug log shows .../src/data/0_raw/hum_raw.csv).
df_to_csv(df_hum_raw, "hum_raw.csv")

[32m2023-12-15 15:35:56.245[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mdf_to_csv[0m:[36m311[0m - [34m[1mSave DataFrame: /home/sebaxtian/Workspaces/Tangara/tangara-evaluation/src/data/0_raw/hum_raw.csv[0m
