In [1]:
from datetime import datetime, timezone, timedelta
import pandas as pd
from io import StringIO
import import_ipynb
from evaluation import to_timestamp, request_influxdb, query_data, histplots, boxplots, lineplots, missing_data, correlation


importing Jupyter notebook from evaluation.ipynb


In [2]:
# Load the Tangara sensor catalogue from CSV and display its (rows, columns) shape.
# The 'ID' and 'MAC' columns of this frame are consumed later by df_sensors().
tangaras = pd.read_csv('../data/tangaras.csv')
tangaras.shape

(6, 6)

In [3]:
# Evaluation window, ISO 8601 format, TZ='America/Bogota' -05:00.
# to_timestamp() returns the values printed below (13-digit epoch timestamps).
# Windows used in earlier runs, kept for reference:
#   2023-03-17T00:00:00-05:00 -> 2023-03-31T00:00:00-05:00
#   2023-04-03T00:00:00-05:00 -> 2023-04-04T00:00:00-05:00

# Start of the window (inclusive bound passed to the query)
start_timestamp = to_timestamp('2023-06-12T00:00:00-05:00')

# End of the window (last second of the final day)
end_timestamp = to_timestamp('2023-06-18T23:59:59-05:00')

print(f'Since: {start_timestamp} Until: {end_timestamp}')


Since: 1686546000000 Until: 1687150799000


In [4]:
# Get Data Frame Sensors
def df_sensors(tangaras, start_timestamp, end_timestamp):
    """Build a wide temperature frame with one column per sensor.

    Queries InfluxDB for the temperature ('tmp') series of the sensors listed
    in ``tangaras``, parses the CSV response and arranges the readings on a
    regular 30-second grid indexed by timestamp (fixed offset UTC-05:00).

    Parameters
    ----------
    tangaras : pandas.DataFrame
        Sensor catalogue. The 'MAC' column is matched against the response
        and the 'ID' column provides the output column names.
    start_timestamp, end_timestamp
        Bounds of the query window, forwarded unchanged to ``query_data``.

    Returns
    -------
    pandas.DataFrame
        float64 readings indexed by a tz-aware 'DATETIME' index sampled every
        30 seconds; gaps are NaN. Sensors without any reading are omitted.
        If no sensor has data, an empty frame with a 'DATETIME' index is
        returned.
    """
    # SQL query for the temperature series, sent through the InfluxDB REST API
    query = query_data(tangaras, start_timestamp, end_timestamp, datatype='tmp')
    influxdb_request = request_influxdb(query)

    # Parse the CSV response and keep only the columns that are needed.
    # rename() returns a new frame, so no SettingWithCopyWarning is raised.
    readings = pd.read_csv(StringIO(influxdb_request.text), sep=",", low_memory=False)
    readings = readings[['time', 'name.1', 'last']].rename(
        columns={'time': 'DATETIME', 'name.1': 'MAC', 'last': 'TEMP'}
    )

    # Date Time ISO 8601 Format, TZ='America/Bogota' -05:00
    tz = timezone(timedelta(hours=-5))

    # One single-column frame per sensor that has data, in catalogue order
    frames = []
    for _, row in tangaras.iterrows():
        sensor = readings.loc[readings['MAC'] == row['MAC'], ['DATETIME', 'TEMP']]
        if not sensor.empty:
            frames.append(sensor.set_index('DATETIME').rename(columns={'TEMP': row['ID']}))

    # Nothing matched: return an empty frame instead of failing on frames[0]
    if not frames:
        return pd.DataFrame(index=pd.DatetimeIndex([], tz=tz, name='DATETIME'))

    # Align on the union of timestamps (outer join) so that a gap in one
    # sensor never discards the readings of the others.
    wide = pd.concat(frames, axis=1)

    # The 'time' values are epoch milliseconds: convert them in one
    # vectorised step and express them in the local fixed offset.
    wide.index = (
        pd.to_datetime(wide.index.astype('int64'), unit='ms', utc=True)
        .tz_convert(tz)
        .rename('DATETIME')
    )

    # Regular 30-second grid; an offset object avoids the deprecated '30S' alias
    wide = wide.sort_index().asfreq(pd.offsets.Second(30))

    return wide.astype('float64')


In [5]:
# Fetch the raw temperature readings for the evaluation window
# (one column per sensor, DATETIME index) and preview the first rows.
temp_raw = df_sensors(tangaras, start_timestamp, end_timestamp)
temp_raw.head()

Unnamed: 0_level_0,TANGARA_260A,TANGARA_1282,TANGARA_2492,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-06-12 00:00:30-05:00,29.47,27.5,31.05,27.6,28.87,28.35
2023-06-12 00:01:00-05:00,29.49,27.47,31.0,27.6,28.9,28.41
2023-06-12 00:01:30-05:00,29.46,27.41,30.97,27.6,28.9,28.4
2023-06-12 00:02:00-05:00,29.47,27.44,30.97,27.5,28.87,28.44
2023-06-12 00:02:30-05:00,29.46,27.4,30.93,27.5,28.9,28.45


# Descriptive Statistics

In [6]:
# Per-sensor summary statistics (count, mean, std, min, quartiles, max).
# 'count' excludes NaN, so it also hints at how complete each series is.
temp_raw.describe()

Unnamed: 0,TANGARA_260A,TANGARA_1282,TANGARA_2492,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
count,18995.0,18101.0,18992.0,18095.0,17271.0,18655.0
mean,30.872162,30.612199,32.431582,30.348196,32.029464,30.446971
std,2.695987,3.372886,2.416032,3.542196,3.597832,2.875892
min,25.18,25.29,27.56,24.5,25.58,24.92
25%,28.61,27.82,30.59,27.4,28.95,28.11
50%,30.55,29.79,32.06,29.4,31.66,30.22
75%,32.89,32.72,34.1,33.4,34.66,32.78
max,37.57,40.58,39.23,38.9,42.07,37.43


In [7]:
# Missing-data check: list the sensors flagged by missing_data() for the
# given threshold (a percentage, as printed below).
# NOTE(review): the exact flagging rule lives in evaluation.missing_data — confirm there.
threshold = 85
to_be_checked = missing_data(temp_raw, threshold)

print(
    f'Threshold: {threshold}%',
    f'Total Sensors: {len(temp_raw.columns)}',
    f'To Be Checked: {len(to_be_checked)}',
    sep='\n',
)
to_be_checked

Threshold: 85%
Total Sensors: 6
To Be Checked: 0


[]

In [8]:
# Subset of sensors compared through the Pearson correlation coefficient
sensors_to_evaluate = temp_raw[
    ['TANGARA_1282', 'TANGARA_260A', 'TANGARA_06BE']
]

# Correlation check against 'TANGARA_1282' with a 0.85 threshold.
# NOTE(review): presumably 'TANGARA_1282' is the reference sensor and the
# result lists sensors below the threshold — confirm in evaluation.correlation.
threshold = 0.85
to_be_checked = correlation(sensors_to_evaluate, 'TANGARA_1282', threshold)

# Report
print(
    f'Threshold: {threshold}',
    f'Total Sensors: {len(sensors_to_evaluate.columns)}',
    f'To Be Checked: {len(to_be_checked)}',
    sep='\n',
)
to_be_checked


Threshold: 0.85
Total Sensors: 3
To Be Checked: 0


[]

In [9]:
# Persist the raw temperature frame to CSV; the DATETIME index is written
# as the first column (pandas' default index=True).
temp_raw.to_csv('../data/temp_raw.csv')