In [1]:
import os
import requests
import pandas as pd
import math

from pandas import DataFrame
from scipy.stats import pearsonr
from scipy.stats._result_classes import PearsonRResult
from requests import Response
from datetime import datetime
from dotenv import load_dotenv

# Explicitly providing path to '.env' (built with pathlib, hence the import here)
from pathlib import Path  # Python 3.6+ only
# Load .env variables.
# `Path().resolve()` is the absolute current working directory and `parents[1]`
# is its grandparent, so the file read is '<cwd>/../../standalone/.env'.
# NOTE(review): this depends on the directory the kernel was started from — confirm.
# The return value is bound to `_`, presumably to keep it out of the cell output.
_ = load_dotenv(dotenv_path=f"{Path().resolve().parents[1]}/standalone/.env")

## Utils

In [2]:
def to_timestamp(datetime_iso8601: str) -> int:
    """
    Convert an ISO 8601 datetime string into a Unix timestamp in milliseconds.

    The examples in this notebook use TZ='America/Bogota' (-05:00); the offset
    is taken from the string itself.

    :params
    :datetime_iso8601: str, Datetime ISO 8601 Format

    :return: int, Timestamp in milliseconds

    :example
        - to_timestamp('2023-03-17T00:00:00-05:00')
            return: 1679029200000
    """
    parsed = datetime.fromisoformat(datetime_iso8601)
    epoch_seconds = parsed.timestamp()
    # Scale seconds to milliseconds and truncate the fractional part
    return int(epoch_seconds * 1000)

In [3]:
def request_influxdb(sql_query: str) -> Response:
    """
    Send a GET request with the given query to the InfluxDB REST API.

    The endpoint URL and the database name are read from the environment
    variables URL_INFLUXDB_QUERY_ENDPOINT and DB_NAME_INFLUXDB.

    :params
    :sql_query: str, InfluxDB SQL query

    :return: Response, InfluxDB response as CSV text
    """
    # Ask the server for a CSV body
    csv_headers = {'Accept': 'application/csv'}
    query_params = dict(
        db=os.getenv("DB_NAME_INFLUXDB", None),
        q=sql_query,
        epoch='ms',
    )
    url = os.getenv("URL_INFLUXDB_QUERY_ENDPOINT", None)
    # GET Request
    return requests.get(url, params=query_params, headers=csv_headers)

In [4]:
def query_tangaras(start_timestamp: int, end_timestamp: int) -> str:
    """
    Build the InfluxDB SQL query that lists the Tangara sensors which reported data within a time window.

    The query selects the distinct "geo" values (aliased as "geohash") from
    "fixed_stations_01" where "geo3" = 'd29', grouped by "name".

    :params:
    :start_timestamp: int, timestamp datetime value, ms
    :end_timestamp: int, timestamp datetime value, ms

    :return: str, InfluxDB SQL Query
    """
    # Time window condition (both bounds inclusive)
    time_window = f"time >= {start_timestamp}ms AND time <= {end_timestamp}ms"
    # Adjacent literals are concatenated into a single query string
    return (
        'SELECT DISTINCT(geo) AS "geohash" '
        'FROM "fixed_stations_01" WHERE '
        "(\"geo3\" = 'd29') AND "
        f"{time_window} "
        'GROUP BY "name";'
    )

In [5]:
def query_measure(mac_tangaras: [str], start_timestamp: int, end_timestamp: int, datatype: str='pm25') -> str:
    """
    Build the InfluxDB SQL query for one measure (datatype), with one SELECT statement per Tangara sensor (identified by MAC address) within a time window.

    The statements are separated by "; " and the last one carries no trailing
    separator. An empty list of sensors yields an empty string.

    :params:
    :mac_tangaras: [str], Tangara sensor MAC address
    :start_timestamp: int, timestamp datetime value, ms
    :end_timestamp: int, timestamp datetime value, ms
    :datatype: str, choice ['pm25', 'tmp', 'hum']

    :return: str, InfluxDB SQL Query
    """
    # Time window condition shared by every per-sensor statement
    time_window = f"time >= {start_timestamp}ms AND time <= {end_timestamp}ms"

    def _single_sensor_query(mac_address) -> str:
        # Last value of the measure in each 30s bucket for one sensor
        return (
            f'SELECT "name", last("{datatype}") '
            'FROM "fixed_stations_01" WHERE '
            f"(\"name\" = '{mac_address}') AND "
            f"{time_window} "
            "GROUP BY time(30s) fill(none)"
        )

    return "; ".join(_single_sensor_query(mac) for mac in mac_tangaras)

In [6]:
def to_be_checked(df_sensor: DataFrame, threshold_data_percent: int=80) -> [bool, int, int]:
    """
    Tell whether a sensor reported enough data, looking at the first column of its DataFrame.

    Return [bool, int, int]: [{Is the data enough?}, {Data percent}, {Missing data percent}]

    NOTE: despite the function name, the flag is True when the sensor is OK
    (data percent >= threshold) and False when the sensor has to be checked.

    An empty DataFrame (no rows or no columns) is reported as 0% data and
    100% missing data instead of raising.

    :params:
    :df_sensor: DataFrame, Data reported by Tangara sensor
    :threshold_data_percent: int, Threshold to check enough data reported

    :return: [bool, int, int], [False if it does not report enough data, data percent, missing data percent]
    """
    if df_sensor.empty:
        # Nothing to count; also avoids dividing by zero rows
        data_percent, missing_data_percent = 0, 100
    else:
        total = df_sensor.shape[0]
        # `.iloc[0]` takes the first column by position; plain `[0]` on the
        # column-labelled Series is deprecated positional access in pandas 2.x
        missing_count = int(df_sensor.isna().sum().iloc[0])
        reported_count = int(df_sensor.count().iloc[0])
        missing_data_percent = round(missing_count * 100 / total)
        data_percent = round(reported_count * 100 / total)
    # Threshold
    if data_percent < threshold_data_percent:
        # to be checked
        return [False, data_percent, missing_data_percent]
    # OK
    return [True, data_percent, missing_data_percent]

In [7]:
def is_corr_ok(df_reference_sensor: DataFrame, df_target_sensor: DataFrame, threshold_corr: float=0.9) -> [bool, float]:
    """
    Check whether the target sensor is positively correlated with the reference sensor.
    Return [bool, float]: [{Is correlation ok?}, {Pearson correlation coefficient, 2 decimals}]

    corr = 0, No correlation
    corr = [-1, 0), Negative correlation
    corr = (0, 1], Positive correlation

    The coefficient is reported as 0 (no correlation) when it cannot be computed:
    either input has NaNs, the lengths differ, there are fewer than 2 samples,
    the target is constant (std == 0) or the Pearson result is NaN.

    NOTE: both arguments are used through `.hasnans` and `.std()` as single
    series of values (e.g. one column per sensor), despite the DataFrame hints.

    :params:
    :df_reference_sensor: DataFrame, Reference Tangara sensor
    :df_target_sensor: DataFrame, Target Tangara sensor
    :threshold_corr: float, Threshold for the positive correlation coefficient between both Tangara sensors

    :return: [bool, float], [True if corr >= threshold, corr]
    """
    # Pearson Correlation Coefficient
    corr = 0
    sample_count = df_target_sensor.shape[0]
    comparable = (
        not df_reference_sensor.hasnans
        and not df_target_sensor.hasnans
        and df_reference_sensor.shape[0] == sample_count
        # pearsonr raises ValueError with fewer than 2 samples
        and sample_count >= 2
    )
    # A constant target has no defined correlation: keep corr = 0 without
    # building a result object from SciPy's private `_result_classes` module
    if comparable and df_target_sensor.std() != 0:
        corr, _ = pearsonr(df_reference_sensor, df_target_sensor)
        # pearsonr yields NaN e.g. when the reference is constant
        if math.isnan(corr):
            corr = 0
    corr_rounded = float("{:.2f}".format(corr))
    # Threshold
    if corr < threshold_corr:
        # There is not correlation
        return [False, corr_rounded]
    # There is correlation
    return [True, corr_rounded]

In [8]:
def df_to_csv(df: DataFrame, filename: str, datafolder: str='0_raw') -> None:
    """
    Write a DataFrame as a CSV file inside one of the project data folders.

    The destination is '<cwd>/../../standalone/data/<datafolder>/<filename>',
    where <cwd> is the resolved current working directory.

    :params:
    :df: DataFrame, pandas DataFrame
    :filename: str, CSV file name with extension .csv
    :datafolder: str, choice ['0_raw', '1_clean', '2_features', 'backup']
    """
    # Grandparent of the current working directory
    project_root = Path().resolve().parents[1]
    destination = f"{project_root}/standalone/data/{datafolder}/{filename}"
    # Save DataFrame into CSV file
    df.to_csv(destination)

In [9]:
def df_from_csv(filename: str, datafolder: str='0_raw') -> DataFrame:
    """
    Read a CSV file located in one of the project data folders into a DataFrame.

    The file is read from '<cwd>/../../standalone/data/<datafolder>/<filename>',
    where <cwd> is the resolved current working directory.

    :params:
    :filename: str, CSV file name with extension .csv
    :datafolder: str, choice ['0_raw', '1_clean', '2_features', 'backup']

    :return: df: DataFrame, pandas DataFrame
    """
    # Grandparent of the current working directory
    project_root = Path().resolve().parents[1]
    folder = f"{project_root}/standalone/data/{datafolder}"
    # Load DataFrame from CSV file
    loaded = pd.read_csv(f"{folder}/{filename}")
    return loaded