In [1]:
import pandas as pd
from influxdb_client import InfluxDBClient, Point
from kedro.config import ConfigLoader
from kedro.framework.project import settings

In [2]:
# Load Parameters
parameters = catalog.load('parameters')

# Load Credentials
conf_path = str(context.project_path / settings.CONF_SOURCE)
conf_loader = ConfigLoader(conf_source=conf_path, env='local')
credentials = conf_loader.get('credentials*', 'credentials*/**')

print('Parameters:', parameters)
print('Credentials:', credentials)

2022-09-27 12:06:27,826 - kedro.io.data_catalog - INFO - Loading data from `parameters` (MemoryDataSet)...
Parameters: {'influxdb_version': '2.x', 'pm25_raw_measurement_name': 'PM25_RAW', 'pm25_clean_measurement_name': 'PM25_CLEAN', 'pm25_last_hour_measurement_name': 'PM25_LAST_HOUR', 'pm25_last_8h_measurement_name': 'PM25_LAST_8H', 'pm25_last_12h_measurement_name': 'PM25_LAST_12H', 'pm25_last_24h_measurement_name': 'PM25_LAST_24H', 'aqi_instant_measurement_name': 'AQI_INSTANT', 'aqi_last_hour_measurement_name': 'AQI_LAST_HOUR', 'aqi_last_8h_measurement_name': 'AQI_LAST_8H', 'aqi_last_12h_measurement_name': 'AQI_LAST_12H', 'aqi_last_24h_measurement_name': 'AQI_LAST_24H', 'nowcast_datetime': '2022-09-15T23:59:59', 'start_datetime': None}
Credentials: {'influxdb': {'url': 'http://localhost:8086', 'token': 'C5nfbXbV7sM0eVbpmpeO6sojZ8YNuETeZFzl9xFkq5FQCBtV1sbLpGCHGFuuu6JQQJwx-fHvHCUNccmSbXrUyA==', 'org': 'Tangara', 'bucket': 'Tangara', 'username': 'tangara', 'password': 'sebaxtian', 'datab

In [3]:
# Kedro Catalog
pm25_clean = catalog.load('pm25_clean')
temp_raw = catalog.load('temp_raw')
hum_raw = catalog.load('hum_raw')
co2_raw = catalog.load('co2_raw')
aqi_instant = catalog.load('aqi_instant')
tangara_stations = catalog.load('tangara_stations')

2022-09-27 12:06:27,923 - kedro.io.data_catalog - INFO - Loading data from `pm25_clean` (CSVDataSet)...
2022-09-27 12:06:27,941 - kedro.io.data_catalog - INFO - Loading data from `temp_raw` (CSVDataSet)...
2022-09-27 12:06:27,956 - kedro.io.data_catalog - INFO - Loading data from `hum_raw` (CSVDataSet)...
2022-09-27 12:06:27,969 - kedro.io.data_catalog - INFO - Loading data from `co2_raw` (CSVDataSet)...
2022-09-27 12:06:27,982 - kedro.io.data_catalog - INFO - Loading data from `aqi_instant` (CSVDataSet)...
2022-09-27 12:06:27,994 - kedro.io.data_catalog - INFO - Loading data from `tangara_stations` (CSVDataSet)...


In [4]:
tangara_stations.head()

Unnamed: 0,DATETIME,ID,MAC,GEOHASH,GEOLOCATION,LATITUDE,LONGITUDE
0,2022-09-27T10:42:29.517559-05:00,TANGARA_2BBA,D29ESP32DE02BBA,d29e6b4,3.38447571 -76.51634216,3.384476,-76.516342
1,2022-09-27T10:42:29.517559-05:00,TANGARA_14D6,D29ESP32DED14D6,d29dfx4,3.33503723 -76.52732849,3.335037,-76.527328
2,2022-09-27T10:42:29.517559-05:00,TANGARA_1CE2,D29ESP32DED1CE2,d29e4cv,3.35014343 -76.51222229,3.350143,-76.512222
3,2022-09-27T10:42:29.517559-05:00,TANGARA_1FCA,D29ESP32DED1FCA,d29e48s,3.34327698 -76.52458191,3.343277,-76.524582
4,2022-09-27T10:42:29.517559-05:00,TANGARA_2492,D29ESP32DED2492,d29e64g,3.39958191 -76.54792786,3.399582,-76.547928


In [5]:
pm25_clean.head()

Unnamed: 0,DATETIME,TANGARA_2BBA,TANGARA_14D6,TANGARA_1CE2,TANGARA_1FCA,TANGARA_2492,TANGARA_2FF6,TANGARA_48C6,TANGARA_4D7A,TANGARA_532E,TANGARA_EA06,TANGARA_F1AE,TANGARA_FAC6,TANGARA_06BE
0,2022-09-15T00:00:00-05:00,5.0,,8.0,10.0,,,,1.0,5.0,11.0,5.0,8.0,
1,2022-09-15T00:00:30-05:00,6.0,23.0,7.0,11.0,6.0,6.0,0.0,2.0,4.0,10.0,5.0,7.0,15.0
2,2022-09-15T00:01:00-05:00,5.0,18.0,9.0,11.0,7.0,7.0,0.0,2.0,6.0,13.0,5.0,7.0,8.0
3,2022-09-15T00:01:30-05:00,6.0,19.0,10.0,10.0,8.0,6.0,8.0,2.0,7.0,11.0,5.0,8.0,7.0
4,2022-09-15T00:02:00-05:00,6.0,25.0,9.0,12.0,14.0,7.0,7.0,1.0,6.0,11.0,5.0,8.0,8.0


In [6]:
# Get Tangara Stations Measurements Dictionary
def get_stations_measurements(stations, pm25, aqi, temp, hum, co2):
    # Tangara Stations Measurements Dictionary
    stations_measurements = {}
    # For each Tangara Station
    for station in stations.itertuples():
        #print(station._fields)
        #print(getattr(station, 'ID'))
        station_id = getattr(station, 'ID')
        #print('station_id:', station_id)

        # Set Index
        station_pm25 = pm25[['DATETIME', station_id]].set_index('DATETIME')
        station_aqi = aqi[['DATETIME', station_id]].set_index('DATETIME')
        station_temp = temp[['DATETIME', station_id]].set_index('DATETIME')
        station_hum = hum[['DATETIME', station_id]].set_index('DATETIME')
        station_co2 = co2[['DATETIME', station_id]].set_index('DATETIME')
        
        # Join Measurements
        pm25_aqi = station_pm25.join(station_aqi, lsuffix='_PM25', rsuffix='_AQI')
        temp_hum = station_temp.join(station_hum, lsuffix='_TEMP', rsuffix='_HUM')
        station_measurements = pm25_aqi.join([temp_hum, station_co2])
        
        # Rename Columns
        station_measurements.rename(
            columns={
                f'{station_id}_PM25': 'PM25',
                f'{station_id}_AQI': 'AQI',
                f'{station_id}_TEMP': 'TEMP',
                f'{station_id}_HUM': 'HUM',
                f'{station_id}': 'CO2'
            },
            inplace=True
        )
        # Reset Index
        station_measurements = station_measurements.reset_index()

        #print('station_measurements:', station_measurements.columns.to_list())
        
        # Add ID, MAC and GEOHASH columns
        station_measurements['STATION_ID'] = station_id
        station_measurements['MAC'] = getattr(station, 'MAC')
        station_measurements['GEOHASH'] = getattr(station, 'GEOHASH')

        # Set Tangara Station Measurements
        stations_measurements[station_id] = station_measurements
    
    return stations_measurements

In [7]:
stations_measurements = get_stations_measurements(tangara_stations, pm25_clean, aqi_instant, temp_raw, hum_raw, co2_raw)
stations_measurements['TANGARA_F1AE'].head()

Unnamed: 0,DATETIME,PM25,AQI,TEMP,HUM,CO2,STATION_ID,MAC,GEOHASH
0,2022-09-15T00:00:00-05:00,5.0,21.0,27.44,58.76,422.0,TANGARA_F1AE,D29TTGOTD8F1AE,d29eg66
1,2022-09-15T00:00:30-05:00,5.0,21.0,27.42,58.76,423.0,TANGARA_F1AE,D29TTGOTD8F1AE,d29eg66
2,2022-09-15T00:01:00-05:00,5.0,21.0,27.42,58.83,424.0,TANGARA_F1AE,D29TTGOTD8F1AE,d29eg66
3,2022-09-15T00:01:30-05:00,5.0,21.0,27.46,58.81,425.0,TANGARA_F1AE,D29TTGOTD8F1AE,d29eg66
4,2022-09-15T00:02:00-05:00,5.0,21.0,27.45,58.81,425.0,TANGARA_F1AE,D29TTGOTD8F1AE,d29eg66


In [8]:
# For each Tangara Stations Measurements
for station_measurements in stations_measurements.values():
    #print(station_measurement.columns.to_list())
    # For each station_measurement tuple
    for row in station_measurements.itertuples():
        #print(row._fields)
        measurement = {
            'STATION_ID': getattr(row, 'STATION_ID'),
            'MAC': getattr(row, 'MAC'),
            'GEOHASH': getattr(row, 'GEOHASH'),
            'PM25': getattr(row, 'PM25'),
            'AQI': getattr(row, 'AQI'),
            'TEMP': getattr(row, 'TEMP'),
            'HUM': getattr(row, 'HUM'),
            'CO2': getattr(row, 'CO2'),
            'DATETIME': getattr(row, 'DATETIME'),
        }
        point = Point.from_dict(
            measurement,
            record_measurement_key="STATION_ID",
            record_time_key="DATETIME",
            record_tag_keys=["MAC", "GEOHASH"],
            record_field_keys=["PM25", "AQI", "TEMP", "HUM", "CO2"]
        )
        #print('Point:', point)

In [9]:
# How to use RxPY to prepare batches for asyncio client
# https://github.com/influxdata/influxdb-client-python/blob/master/examples/asynchronous_batching.py
#
# InfluxDBClientAsync
#
import asyncio
from csv import DictReader

import reactivex as rx
from reactivex import operators as ops
from reactivex.scheduler.eventloop import AsyncIOScheduler

from influxdb_client import Point
from influxdb_client.client.influxdb_client_async import InfluxDBClientAsync



def station_measurements_to_generator(station_measurements):
    """
    Parse your stations_measurements Data Frame into generator
    """
    # For each station_measurements tuple
    for row in station_measurements.itertuples():
        #print(row._fields)
        measurement = {
            'MEASUREMENT_NAME': 'TANGARA_STATIONS',
            'STATION_ID': getattr(row, 'STATION_ID'),
            'MAC': getattr(row, 'MAC'),
            'GEOHASH': getattr(row, 'GEOHASH'),
            'PM25': getattr(row, 'PM25'),
            'AQI': getattr(row, 'AQI'),
            'TEMP': getattr(row, 'TEMP'),
            'HUM': getattr(row, 'HUM'),
            'CO2': getattr(row, 'CO2'),
            'DATETIME': getattr(row, 'DATETIME'),
        }
        point = Point.from_dict(
            measurement,
            record_measurement_key="MEASUREMENT_NAME",
            record_time_key="DATETIME",
            record_tag_keys=["MAC", "GEOHASH"],
            record_field_keys=["PM25", "AQI", "TEMP", "HUM", "CO2"]
        )
        #print('Point:', point)
        yield point


async def async_ingesting_stations_measurements(station_measurements):
    # Check InfluxDB Version
    if parameters['influxdb_version'] == '2.x':
        # Secrets
        # You can generate an API token from the "API Tokens Tab" in the UI
        url = credentials['influxdb']['url']
        token = credentials['influxdb']['token']
        org = credentials['influxdb']['org']
        bucket = credentials['influxdb']['bucket']
    elif parameters['influxdb_version'] == '1.8':
        # Secrets
        url = credentials['influxdb']['url']
        username = credentials['influxdb']['username']
        password = credentials['influxdb']['password']
        token = f'{username}:{password}'
        database = credentials['influxdb']['database']
        retention_policy = 'autogen'
        bucket = f'{database}/{retention_policy}'
        org = credentials['influxdb']['org']

    # Async write batches
    async with InfluxDBClientAsync(url=url, token=token, org=org) as client:
        write_api = client.write_api()

        """
        Async write
        """

        async def async_write(batch):
            """
            Prepare async task
            """
            await write_api.write(bucket=bucket, record=batch)
            return batch

        """
        Prepare batches from generator
        """
        batches = rx \
            .from_iterable(station_measurements_to_generator(station_measurements)) \
            .pipe(ops.buffer_with_count(500)) \
            .pipe(ops.map(lambda batch: rx.from_future(asyncio.ensure_future(async_write(batch)))), ops.merge_all())

        done = asyncio.Future()

        """
        Write batches by subscribing to Rx generator
        """
        batches.subscribe(on_next=lambda batch: print(f'Written batch... {len(batch)}'),
                        on_error=lambda ex: print(f'Unexpected error: {ex}'),
                        on_completed=lambda: done.set_result(0),
                        scheduler=AsyncIOScheduler(asyncio.get_event_loop()))
        """
        Wait to finish all writes
        """
        await done


In [10]:
#async_ingesting_stations_measurements(stations_measurements['TANGARA_FAC6'])

In [11]:
# How to use RxPY to prepare batches for synchronous write into InfluxDB
# https://github.com/influxdata/influxdb-client-python/blob/master/examples/import_data_set_sync_batching.py
#
# InfluxDBClientSync
#
from csv import DictReader

import reactivex as rx
from reactivex import operators as ops

from influxdb_client import InfluxDBClient, Point
from influxdb_client.client.write.retry import WritesRetry
from influxdb_client.client.write_api import SYNCHRONOUS



def station_measurements_to_generator(station_measurements):
    """
    Parse your stations_measurements Data Frame into generator
    """
    # For each station_measurements tuple
    for row in station_measurements.itertuples():
        #print(row._fields)
        measurement = {
            'MEASUREMENT_NAME': 'TANGARA_STATIONS',
            'STATION_ID': getattr(row, 'STATION_ID'),
            'MAC': getattr(row, 'MAC'),
            'GEOHASH': getattr(row, 'GEOHASH'),
            'PM25': getattr(row, 'PM25'),
            'AQI': getattr(row, 'AQI'),
            'TEMP': getattr(row, 'TEMP'),
            'HUM': getattr(row, 'HUM'),
            'CO2': getattr(row, 'CO2'),
            'DATETIME': getattr(row, 'DATETIME'),
        }
        point = Point.from_dict(
            measurement,
            record_measurement_key="MEASUREMENT_NAME",
            record_time_key="DATETIME",
            record_tag_keys=["STATION_ID", "MAC", "GEOHASH"],
            record_field_keys=["PM25", "AQI", "TEMP", "HUM", "CO2"]
        )
        #print('Point:', point)
        yield point


def sync_ingesting_stations_measurements(station_measurements):
    # Check InfluxDB Version
    if parameters['influxdb_version'] == '2.x':
        # Secrets
        # You can generate an API token from the "API Tokens Tab" in the UI
        url = credentials['influxdb']['url']
        token = credentials['influxdb']['token']
        org = credentials['influxdb']['org']
        bucket = credentials['influxdb']['bucket']
    elif parameters['influxdb_version'] == '1.8':
        # Secrets
        url = credentials['influxdb']['url']
        username = credentials['influxdb']['username']
        password = credentials['influxdb']['password']
        token = f'{username}:{password}'
        database = credentials['influxdb']['database']
        retention_policy = 'autogen'
        bucket = f'{database}/{retention_policy}'
        org = credentials['influxdb']['org']

    """
    Define Retry strategy - 3 attempts => 2, 4, 8
    """
    retries = WritesRetry(total=3, retry_interval=1, exponential_base=2)
    with InfluxDBClient(url=url, token=token, org=org, retries=retries) as client:

        """
        Use synchronous version of WriteApi to strongly depends on result of write
        """
        write_api = client.write_api(write_options=SYNCHRONOUS)

        """
        Prepare batches from generator
        """
        batches = rx \
            .from_iterable(station_measurements_to_generator(station_measurements)) \
            .pipe(ops.buffer_with_count(500))


        def write_batch(batch):
            """
            Synchronous write
            """
            print(f'Writing... {len(batch)}')
            write_api.write(bucket=bucket, record=batch)


        """
        Write batches
        """
        batches.subscribe(on_next=lambda batch: write_batch(batch),
                        on_error=lambda ex: print(f'Unexpected error: {ex}'),
                        on_completed=lambda: print('Import finished!'))


In [12]:
sync_ingesting_stations_measurements(stations_measurements['TANGARA_F1AE'])

Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 338
Import finished!


In [13]:
sync_ingesting_stations_measurements(stations_measurements['TANGARA_EA06'])

Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 338
Import finished!


In [14]:
sync_ingesting_stations_measurements(stations_measurements['TANGARA_FAC6'])

Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 338
Import finished!


In [15]:
for station_id, station_measurements in stations_measurements.items():
    print(station_id)
    sync_ingesting_stations_measurements(station_measurements)

TANGARA_2BBA
Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 338
Import finished!
TANGARA_14D6
Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 338
Import finished!
TANGARA_1CE2
Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 338
Import finished!
TANGARA_1FCA
Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 338
Import finished!
TANGARA_2492
Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 338
Import finished!
TANGARA_2FF6
Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 338
Import finished!
TANGARA_48C6
Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 338
Import finished!
TANGARA_4D7A
Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 500
Writing... 338
Import finished!
TANGARA_532E
Writing... 500
Writing... 5

In [16]:
"""
import json
def get_json(entrada):
    for station_id, station_measurements in entrada.items():
        print(station_id)
        entrada[station_id] = station_measurements.to_json()
    return entrada

salida = get_json(stations_measurements.copy())
print(type(salida))
print(type(json.dumps(salida, indent = 3)))
#print('salida', salida)
#json.dumps(salida)
"""
# Save stations_measurements into Catalog
catalog.save('stations_measurements', stations_measurements)

2022-09-27 12:06:41,905 - kedro.io.data_catalog - INFO - Saving data to `stations_measurements` (PartitionedDataSet)...
