In [10]:
from datetime import datetime, timezone, timedelta
import pandas as pd
from io import StringIO
import import_ipynb
from evaluation import to_timestamp, request_influxdb, query_data, histplots, boxplots, lineplots, missing_data, correlation


In [11]:
# Load Tangaras
# Sensor registry: its 'MAC' and 'ID' columns are read by df_sensors further down.
tangaras = pd.read_csv('../data/tangaras.csv')
# Show (rows, columns) as a quick check that the file loaded
tangaras.shape

(6, 6)

In [12]:
# Evaluation window, given as ISO 8601 date-times with TZ='America/Bogota' -05:00.
# Start values used previously: 2023-03-17T00:00:00-05:00, 2023-04-03T00:00:00-05:00
# End values used previously:   2023-03-31T00:00:00-05:00, 2023-04-04T00:00:00-05:00
start_timestamp, end_timestamp = (
    to_timestamp(iso_datetime)
    for iso_datetime in ('2023-06-12T00:00:00-05:00', '2023-06-18T23:59:59-05:00')
)

# Echo the converted bounds so the queried window is visible in the output
print(f'Since: {start_timestamp} Until: {end_timestamp}')


Since: 1686546000000 Until: 1687150799000


In [13]:
# Get Data Frame Sensors
def df_sensors(tangaras, start_timestamp, end_timestamp):
    """Build a wide humidity DataFrame with one column per Tangara sensor.

    Parameters
    ----------
    tangaras : pandas.DataFrame
        Sensor registry. The 'MAC' column is matched against the InfluxDB
        response and the 'ID' column provides the output column names.
    start_timestamp, end_timestamp
        Bounds of the query window, passed unchanged to ``query_data``.

    Returns
    -------
    pandas.DataFrame
        float64 readings, one column per sensor that has data, indexed by a
        timezone-aware 'DATETIME' index (UTC-05:00) on a regular 30-second
        grid. Slots without a reading are NaN.

    Raises
    ------
    ValueError
        If the response holds no rows for any of the requested sensors.
    """
    # SQL Query Data Sensors
    query = query_data(tangaras, start_timestamp, end_timestamp, datatype='hum')
    # InfluxDB API REST Request
    influxdb_request = request_influxdb(query)

    # Keep only the needed columns; rename returns a new frame, which avoids
    # the SettingWithCopyWarning an in-place rename on a slice would trigger.
    readings = pd.read_csv(StringIO(influxdb_request.text), sep=",", low_memory=False)
    readings = readings[['time', 'name.1', 'last']].rename(
        columns={'time': 'DATETIME', 'name.1': 'MAC', 'last': 'HUM'}
    )

    # One single-column frame per sensor, indexed by the raw timestamp
    frames = []
    for _, row in tangaras.iterrows():
        sensor = readings.loc[readings['MAC'] == row['MAC'], ['DATETIME', 'HUM']]
        if not sensor.empty:
            frames.append(sensor.set_index('DATETIME').rename(columns={'HUM': row['ID']}))

    if not frames:
        raise ValueError('InfluxDB response contains no data for the requested sensors')

    # Outer join: keep every timestamp seen by ANY sensor. A left join on the
    # first sensor would silently drop the other sensors' readings at
    # timestamps where the first sensor has none.
    merged = pd.concat(frames, axis=1, join='outer').sort_index()

    # Epoch milliseconds -> timezone-aware datetimes, TZ='America/Bogota' -05:00
    tz = timezone(timedelta(hours=-5))
    merged.index = pd.to_datetime(merged.index.astype('int64'), unit='ms', utc=True).tz_convert(tz)
    merged.index.name = 'DATETIME'

    # Regular 30-second grid (gaps become NaN rows) with numeric columns
    return merged.asfreq(freq='30s').astype('float64')


In [14]:
# Build the humidity frame for the selected window and preview its first rows
hum_raw = df_sensors(
    tangaras=tangaras,
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
)
hum_raw.head()

Unnamed: 0_level_0,TANGARA_260A,TANGARA_1282,TANGARA_2492,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-06-12 00:00:30-05:00,57.53,64.66,55.17,99.9,59.03,66.14
2023-06-12 00:01:00-05:00,57.49,64.93,55.18,99.9,59.05,66.01
2023-06-12 00:01:30-05:00,57.51,65.02,55.34,99.9,59.09,65.97
2023-06-12 00:02:00-05:00,57.59,65.09,55.42,99.9,59.07,65.91
2023-06-12 00:02:30-05:00,57.61,65.19,55.68,99.9,59.04,65.94


# Descriptive Statistics

In [15]:
# Describe Data
# Per-sensor summary statistics; `count` excludes NaN slots, so it also
# shows how complete each sensor's series is.
hum_raw.describe()

Unnamed: 0,TANGARA_260A,TANGARA_1282,TANGARA_2492,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
count,18995.0,18101.0,18992.0,18095.0,17271.0,18655.0
mean,51.232401,53.201862,50.137662,96.483653,51.245485,58.048789
std,8.834032,9.632215,6.037965,5.324764,9.297152,9.174962
min,29.61,28.18,36.94,71.7,29.36,39.32
25%,45.35,46.85,45.44,94.8,43.88,50.96
50%,51.0,53.9,50.625,99.9,51.16,57.6
75%,58.74,61.07,55.14,99.9,59.38,65.07
max,68.59,72.52,62.32,99.9,68.33,82.42


In [16]:
# Missing Data Sensors
# Threshold is a percentage; presumably missing_data returns the sensors that
# fail it -- confirm against the evaluation module.
threshold = 85
to_be_checked = missing_data(hum_raw, threshold)

print(
    f'Threshold: {threshold}%',
    f'Total Sensors: {len(hum_raw.columns)}',
    f'To Be Checked: {len(to_be_checked)}',
    sep='\n',
)
to_be_checked

Threshold: 85%
Total Sensors: 6
To Be Checked: 0


[]

In [17]:
# Subset of sensors compared through the Pearson correlation coefficient
sensors_to_evaluate = hum_raw.loc[:, ['TANGARA_1282', 'TANGARA_260A', 'TANGARA_06BE']]

# Correlation threshold; TANGARA_1282 is passed as the named sensor
# (presumably the reference the others are compared with -- confirm in evaluation).
threshold = 0.85
to_be_checked = correlation(sensors_to_evaluate, 'TANGARA_1282', threshold)

# Correlation Sensors
print(
    f'Threshold: {threshold}',
    f'Total Sensors: {len(sensors_to_evaluate.columns)}',
    f'To Be Checked: {len(to_be_checked)}',
    sep='\n',
)
to_be_checked


Threshold: 0.85
Total Sensors: 3
To Be Checked: 0


[]

In [18]:
# Save Humidity raw data into CSV file
# The DATETIME index is written as the first column (to_csv keeps the index by default).
hum_raw.to_csv('../data/hum_raw.csv')