In [1]:
from datetime import datetime
import requests
import matplotlib.pyplot as plt
import seaborn as sns
# Global seaborn theme
sns.set_theme(style="whitegrid", palette="pastel")
import math


# Evaluation Utils

In [2]:
# Datetime ISO 8601 Format to Timestamp, TZ='America/Bogota' -05:00
def to_timestamp(datetime_iso8601):
    """Convert an ISO 8601 datetime string to a Unix timestamp in milliseconds.

    Args:
        datetime_iso8601: String accepted by ``datetime.fromisoformat``,
            e.g. ``'2023-03-17T00:00:00-05:00'``. A string without a UTC
            offset is interpreted in the system's local time zone.

    Returns:
        int: Milliseconds elapsed since 1970-01-01T00:00:00Z, rounded down
        to a whole millisecond.

    Raises:
        ValueError: If the string is not a valid ISO 8601 datetime.
    """
    # Function-scope import: the notebook's import cell only brings in `datetime`.
    from datetime import timedelta, timezone

    moment = datetime.fromisoformat(datetime_iso8601)
    if moment.tzinfo is None:
        # Naive value: attach the local zone, which is what `timestamp()` assumes.
        moment = moment.astimezone()
    epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
    # Exact integer arithmetic. `int(moment.timestamp() * 1000)` can come out
    # one millisecond short because the float product is truncated
    # (1.001 * 1000 evaluates to 1000.9999999999999).
    return (moment - epoch) // timedelta(milliseconds=1)


In [3]:
# Datetime ISO 8601 Format to Timestamp
#to_timestamp('2023-03-17T00:00:00-05:00')

In [4]:
# Request to InfluxDB API REST
def request_influxdb(sql_query, timeout=60):
    """Send a query to the InfluxDB HTTP endpoint and return the raw response.

    Args:
        sql_query: Query text, sent as the ``q`` parameter.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Pass ``None`` to wait indefinitely (the behaviour before this
            parameter existed).

    Returns:
        requests.Response: The unmodified response. The HTTP status is not
        checked here, so callers must inspect it themselves.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    endpoint = "http://influxdb.canair.io:8086/query"
    database = "canairio"
    parameters = {
        'db': database,
        'q': sql_query,
        # InfluxDB `epoch` parameter: timestamps as epoch milliseconds.
        'epoch': 'ms'
    }
    # Ask for the result as CSV text.
    headers = {'Accept': 'application/csv'}
    # Without a timeout a stalled server would block the notebook forever.
    return requests.get(endpoint, params=parameters, headers=headers, timeout=timeout)


In [5]:
# Get Query Tangaras
def query_tangaras(start_timestamp, end_timestamp):
    """Build the query that selects the distinct ``geo`` values per ``name``.

    The statement reads from ``fixed_stations_01``, keeps rows whose ``geo3``
    tag equals ``'d29'`` and whose time lies in the closed interval
    ``[start_timestamp, end_timestamp]``, and groups the result by ``name``.

    Args:
        start_timestamp: Lower time bound; rendered with an ``ms`` suffix.
        end_timestamp: Upper time bound; rendered with an ``ms`` suffix.

    Returns:
        str: The query text. Nothing is sent to the server here.
    """
    time_window = f"time >= {start_timestamp}ms AND time <= {end_timestamp}ms"
    # Each entry is one clause; they are glued together with single spaces.
    clauses = (
        'SELECT DISTINCT(geo) AS "geohash"',
        'FROM "fixed_stations_01" WHERE',
        "(\"geo3\" = 'd29') AND",
        time_window,
        'GROUP BY "name";',
    )
    return " ".join(clauses)


In [6]:
# Get Query Data Sensors
def query_data(tangaras, start_timestamp, end_timestamp, datatype='pm25'):
    """Build one query per sensor and return them as a single string.

    For every entry of ``tangaras['MAC']`` a statement is generated that
    selects ``last(<datatype>)`` from ``fixed_stations_01`` for that ``name``
    within the time interval, grouped in 30-second buckets with ``fill(none)``.

    Args:
        tangaras: Object whose ``['MAC']`` item offers ``to_list()`` (e.g. a
            pandas DataFrame with a ``MAC`` column).
        start_timestamp: Lower time bound; rendered with an ``ms`` suffix.
        end_timestamp: Upper time bound; rendered with an ``ms`` suffix.
        datatype: Field to read, e.g. ``'pm25'``, ``'tmp'`` or ``'hum'``.

    Returns:
        str: The statements separated by ``"; "`` with no trailing separator;
        an empty string when there are no sensors.
    """
    time_window = f"time >= {start_timestamp}ms AND time <= {end_timestamp}ms"
    statements = [
        " ".join((
            f'SELECT "name", last("{datatype}")',
            'FROM "fixed_stations_01" WHERE',
            f"(\"name\" = '{mac}') AND",
            time_window,
            "GROUP BY time(30s) fill(none)",
        ))
        for mac in tangaras['MAC'].to_list()
    ]
    return "; ".join(statements)


In [11]:
# Plot Histograms
def histplots(data_sensors):
    """Draw one histogram (with KDE curve) per sensor column on a two-column grid.

    The number of panels is the number of columns reported by
    ``data_sensors.describe()``. The columns plotted are those at positions
    ``1..size``, so column 0 is skipped (presumably the timestamp column —
    TODO confirm against the caller). Missing values are replaced by 0
    before plotting.

    Args:
        data_sensors: pandas DataFrame holding the sensor readings.

    Returns:
        None. The figure is created through ``plt.subplots``.
    """
    size = len(data_sensors.describe().columns)
    n_rows = math.ceil(size / 2)
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    # With the default, one or two sensors yield a 1-D array and indexing it
    # as `axes[i, j]` raises IndexError.
    fig, axes = plt.subplots(
        n_rows, 2, figsize=(20, 40), constrained_layout=True, squeeze=False
    )
    fig.suptitle('Histograms - PM25', fontsize=20)
    # Row-major walk over the grid; an odd count leaves the last panel empty.
    # Slicing (instead of indexing `columns[k]`) stays in range even when
    # describe() also counts column 0.
    for ax, column in zip(axes.flat, data_sensors.columns[1:size + 1]):
        sns.histplot(ax=ax, data=data_sensors[column].fillna(0), kde=True)


In [12]:
# Plot Boxplots
def boxplots(data_sensors):
    """Draw one horizontal boxplot per sensor column on a two-column grid.

    The number of panels is the number of columns reported by
    ``data_sensors.describe()``. The columns plotted are those at positions
    ``1..size``, so column 0 is skipped (presumably the timestamp column —
    TODO confirm against the caller).

    Args:
        data_sensors: pandas DataFrame holding the sensor readings.

    Returns:
        None. The figure is created through ``plt.subplots``.
    """
    size = len(data_sensors.describe().columns)
    n_rows = math.ceil(size / 2)
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    # With the default, one or two sensors yield a 1-D array and indexing it
    # as `axes[i, j]` raises IndexError.
    fig, axes = plt.subplots(
        n_rows, 2, figsize=(20, 40), constrained_layout=True, squeeze=False
    )
    fig.suptitle('Boxplots - PM25', fontsize=20)
    # Row-major walk over the grid; an odd count leaves the last panel empty.
    # Slicing (instead of indexing `columns[k]`) stays in range even when
    # describe() also counts column 0.
    for ax, column in zip(axes.flat, data_sensors.columns[1:size + 1]):
        readings = data_sensors[column]
        # NOTE(review): `data` gets the NaN-filled series while `x` gets the
        # raw one; confirm which of the two is meant to be drawn.
        sns.boxplot(ax=ax, data=readings.fillna(0), orient="h", x=readings)


In [13]:
# Plot Lineplots
def lineplots(data_sensors):
    """Draw one line plot per sensor column on a two-column grid.

    The number of panels is the number of columns reported by
    ``data_sensors.describe()``. The columns plotted are those at positions
    ``1..size``, so column 0 is skipped (presumably the timestamp column —
    TODO confirm against the caller). Missing values are replaced by 0
    before plotting.

    Args:
        data_sensors: pandas DataFrame holding the sensor readings.

    Returns:
        None. The figure is created through ``plt.subplots``.
    """
    size = len(data_sensors.describe().columns)
    n_rows = math.ceil(size / 2)
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    # With the default, one or two sensors yield a 1-D array and indexing it
    # as `axes[i, j]` raises IndexError.
    fig, axes = plt.subplots(
        n_rows, 2, figsize=(20, 40), constrained_layout=True, squeeze=False
    )
    fig.suptitle('Timeline - PM25', fontsize=20)
    # Row-major walk over the grid; an odd count leaves the last panel empty.
    # Slicing (instead of indexing `columns[k]`) stays in range even when
    # describe() also counts column 0.
    for ax, column in zip(axes.flat, data_sensors.columns[1:size + 1]):
        sns.lineplot(ax=ax, data=data_sensors[column].fillna(0))


In [1]:
def missing_data(data_sensors, threshold_data=80):
    """List the sensors whose share of non-missing samples is below a threshold.

    Column 0 of ``data_sensors`` is skipped; every other column is treated as
    one sensor.

    Args:
        data_sensors: pandas DataFrame holding the sensor readings.
        threshold_data: Minimum acceptable percentage of non-missing rows.
            The comparison uses the percentage rounded to an integer.

    Returns:
        list[dict]: One entry per sensor below the threshold, with keys
        ``'ID'`` (column label), ``'Data'`` and ``'Missing'`` (rounded
        percentages formatted like ``'75%'``). A frame with no rows reports
        every sensor as ``'0%'`` data / ``'100%'`` missing.
    """
    to_be_checked = []
    total = data_sensors.shape[0]

    for sensor in data_sensors.columns[1:]:
        readings = data_sensors[sensor]
        if total == 0:
            # No rows at all: the ratios would be 0 / 0, so report the sensor
            # as fully missing instead of failing inside round().
            data_pct, missing_pct = 0, 100
        else:
            # int() keeps the formatted value free of a trailing ".0" whatever
            # scalar type round() hands back.
            data_pct = int(round(readings.count() * 100 / total))
            missing_pct = int(round(readings.isna().sum() * 100 / total))
        if data_pct < threshold_data:
            to_be_checked.append({'ID': sensor, 'Data': f'{data_pct}%', 'Missing': f'{missing_pct}%'})

    return to_be_checked


In [3]:
def correlation(data_sensors, reference_sensor=None, threshold_corr=0.8):
    """Placeholder for a sensor correlation check; not implemented yet.

    All arguments are currently ignored and the function returns ``None``.
    """
    # TODO:
    # Pearson's or Spearman's correlation coefficient
    # to measure the correlation between the two series
    return None
