# Process the CSV data from Pepperwood

Authors: Rohan and Daniel

In [None]:
# Mount Google Drive to import data
# Do this first since it will prompt authentication
# '/content/drive' is the mount point that FOLDERPATH starts with.

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import matplotlib.pyplot as plt

import pathlib # Nicer IO than the os library
from tqdm import tqdm  # Progress bar

In [None]:
# Column layouts for the Pepperwood CSV logs. The pepperwood code version does
# not write a header row, so the column names are declared here instead.
# (These could equally live in a config.yml or similar format.)

# Columns that start every log row.
BASE_HEADERS = ['datetime', 'from_node']
# Network columns that follow the sensor columns in every log row.
NETWORK_HEADERS = ['rxSnr', 'hopLimit', 'rxRssi', 'hopStart']

# Measurement columns, keyed by sensor name.
SENSOR_HEADERS = {
    "device_metrics": [
        'batteryLevel', 'voltage', 'channelUtilization', 'airUtilTx',
    ],
    "bme688": [
        'temperature', 'relativeHumidity', 'barometricPressure',
        'gasResistance', 'iaq',
    ],
    "ina260": [
        'ch3Voltage', 'ch3Current',
    ],
    "pmsa003i": [
        'pm10Standard', 'pm25Standard', 'pm100Standard',
        'pm10Environmental', 'pm25Environmental', 'pm100Environmental',
    ],
}
SENSOR_NAMES = SENSOR_HEADERS.keys()

# Full ordered column list per sensor: base + sensor + network columns.
FULL_DATA_HEADERS = {
    name: [*BASE_HEADERS, *columns, *NETWORK_HEADERS]
    for name, columns in SENSOR_HEADERS.items()
}

# Shared-drive folder that holds the campaign CSV files.
FOLDERPATH = '/content/drive/Shareddrives/SMesh: Sustainability Radio Sensor Networks/smesh_field_data/pepperwood_campaign_2024-11-07_to_2024-11-17/'


def fix_incomplete_csv(csv_filename: pathlib.Path, sensor: str) -> pathlib.Path:
    """
    Write a copy of a logger CSV in which every row has the full column count.

    The data was logged by a mix of read_aqi.py and network_test.py, so the
    rows do not all have the same number of columns. Pandas cannot handle
    this, so short rows are padded with trailing commas (empty fields) to
    compensate.

    This function would not be needed if the data was logged consistently :)

    Inputs:
        csv_filename: pathlib.Path - Path to the original CSV (a plain string
                                     is accepted and converted).
        sensor: str                - Key into FULL_DATA_HEADERS; it selects
                                     the expected number of columns.

    Outputs:
        pathlib.Path - Path of the padded copy, written next to the original
                       as '<stem>_modified<suffix>'. The original file is
                       only read, never changed.

    Raises:
        ValueError        - A row contains more commas than the header list
                            for `sensor` allows.
        FileNotFoundError - The padded copy cannot be found after writing.
    """
    expected_comma_number = len(FULL_DATA_HEADERS[sensor]) - 1

    # Since python type hints are suggestions
    csv_filename = pathlib.Path(csv_filename)
    csv_filename_to_modify = csv_filename.with_name(
        csv_filename.stem + '_modified' + csv_filename.suffix)

    # Now add the commas
    with open(csv_filename, 'r') as infile, \
            open(csv_filename_to_modify, 'w') as outfile:

        for line_number, curr_line in enumerate(
                tqdm(infile, desc="writing modified csv"), start=1):
            # Copy blank lines unchanged. Padding one would turn it into a
            # row of empty fields that is read back as an all-NaN record,
            # whereas pandas skips blank lines by default.
            if not curr_line.strip():
                outfile.write(curr_line)
                continue

            # count the number of commas
            num_commas = curr_line.count(',')
            num_missing_commas = expected_comma_number - num_commas

            # Raise explicitly: an assert would be stripped under `python -O`
            # and a too-wide row would then be copied through silently.
            if num_missing_commas < 0:
                raise ValueError(
                    f"Line {line_number} of {csv_filename}: "
                    f"Expected {expected_comma_number} commas, "
                    f"but found {num_commas}")

            # add the commas at the end (fortunately since the network values
            # are at the end, this is simpler)
            modified_line = curr_line.rstrip('\n') + \
                                "," * num_missing_commas + '\n'
            outfile.write(modified_line)

    # Check that the new file exists
    if not csv_filename_to_modify.exists():
        raise FileNotFoundError(
            f"Could not find copied file at {csv_filename_to_modify}")

    return csv_filename_to_modify



def read_csv_data_from_logger(logger: str, folderpath: str, sensor_list: list,
                              extension: str = ".csv") -> dict:
    """
    Find the relevant data read from a specified logger node and form a pandas
    dataframe for each of the datasets.

    For every sensor, the file '<logger>_<sensor><extension>' inside
    `folderpath` is padded by fix_incomplete_csv (which writes a '_modified'
    copy next to it), read without a header row, and labelled with the
    columns from FULL_DATA_HEADERS. Sensors with no file are reported and
    skipped.

    Inputs:
        logger: str        - Logger identifier used as the filename prefix
                             (e.g. "62e4").
        folderpath: str    - The folder holding the CSVs. In colab, it likely
                             starts with '/content/drive/Shareddrives/'. A
                             trailing slash is optional.
        sensor_list: list  - Sensor names to load; each must be a key of
                             FULL_DATA_HEADERS.
        extension: str     - File extension including the leading dot.

    Outputs:
        data_dfs : dict    - A dictionary of pandas dataframes keyed by sensor
                             name. Each dataframe gets an extra
                             'from_short_name' column holding the last four
                             characters of 'from_node'.
    """
    data_dfs = {}

    # Join with pathlib rather than string concatenation so the path is
    # correct whether or not `folderpath` ends with a slash.
    folder = pathlib.Path(folderpath)

    for sensor in sensor_list:
        print("Trying to open data for", sensor)
        data_path = folder / (logger + "_" + sensor + extension)
        if not data_path.exists():
            print(f"No data for {sensor}. Used the following path: {data_path}")
            continue

        # FOR PEPPERWOOD
        mod_data_path = fix_incomplete_csv(data_path, sensor)
        print("\nFixed CSV")

        # The "header=None" means that the file does NOT have headers
        # The "parse_dates=[0]" means that the zeroth column includes datetime
        # objects.
        sensor_df = pd.read_csv(mod_data_path, header=None, parse_dates=[0])

        # Since we do not have the headers, we need to add them
        sensor_df.columns = FULL_DATA_HEADERS[sensor]
        # Include the short name out of convenience
        sensor_df['from_short_name'] = sensor_df['from_node'].str[-4:]

        data_dfs[sensor] = sensor_df
        print(sensor, "completed!\n")

    return data_dfs

In [None]:
# Load the logs of every known sensor from logger node "62e4" into a dict of
# dataframes keyed by sensor name (sensors without a file are skipped).
pepperwood_data_dfs = read_csv_data_from_logger("62e4", FOLDERPATH, SENSOR_NAMES)

In [None]:
def plot_all_sensor_variables(data_dict: dict, sensor: str):
    """
    Scatter-plot every variable of one sensor against time, one row each.

    The sensor's dataframe is grouped by 'from_short_name', so each node is
    drawn as its own labelled series in every row. All rows share the x axis.

    Inputs:
        data_dict: dict - Dataframes keyed by sensor name. The dataframe for
                          `sensor` must contain 'datetime', 'from_short_name'
                          and the columns listed in SENSOR_HEADERS[sensor].
        sensor: str     - Key into both SENSOR_HEADERS and `data_dict`.

    Outputs:
        fig  - The matplotlib figure.
        axes - 1-D array of axes, one per variable, ordered top to bottom.
    """
    col_id = 'from_short_name'
    sensor_vars = SENSOR_HEADERS[sensor]
    num_vars = len(sensor_vars)

    # squeeze=False always yields a 2-D array of axes, so a sensor with a
    # single variable works too; keep the only column as a 1-D array.
    fig, axes = plt.subplots(nrows=num_vars, ncols=1,
                             figsize=(12, 3 * num_vars), sharex=True,
                             squeeze=False)
    axes = axes[:, 0]

    for node_name, node_data in data_dict[sensor].groupby(col_id):
        for ax, sensed_var in zip(axes, sensor_vars):
            ax.scatter(x='datetime', y=sensed_var,
                       data=node_data, label=node_name, s=1)

    for ax, sensed_var in zip(axes, sensor_vars):
        ax.grid(True)
        ax.set_ylabel(sensed_var)
        # Place the legend outside the plot area, with enlarged markers
        # because the scatter points themselves are tiny (s=1).
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left',
                  markerscale=3)

    # The x axis is shared, so the bottom row's limits apply to every row.
    # (pyplot has no `xlims` function; read the limits from the axes.)
    bottom_ax = axes[-1]
    curr_xlims = bottom_ax.get_xlim()
    print(f"Current xlims are {curr_xlims}")
    bottom_ax.set_xlabel('Date and Time')

    return fig, axes

In [None]:
# Plot the bme688 variables (temperature, relativeHumidity,
# barometricPressure, gasResistance, iaq), one series per node.
fig, axes = plot_all_sensor_variables(pepperwood_data_dfs, sensor='bme688')

In [None]:
# Plot the device_metrics variables (batteryLevel, voltage,
# channelUtilization, airUtilTx), one series per node.
fig, axes = plot_all_sensor_variables(pepperwood_data_dfs, sensor='device_metrics')

In [None]:
# Plot the ina260 variables (ch3Voltage, ch3Current), one series per node.
fig, axes = plot_all_sensor_variables(pepperwood_data_dfs, sensor='ina260')

In [None]:
# Plot the pmsa003i variables (the pm10/pm25/pm100 Standard and Environmental
# columns), one series per node.
fig, axes = plot_all_sensor_variables(pepperwood_data_dfs, sensor='pmsa003i')

Things left to do:


*   Night time
*   Events dictionary (e.g., Fire start, Fire end, Rain start)
*   Date time bounds to isolate days
*   Dew point calculation from temperature and relative humidity
*   Network data plots
*   Move to GitHub
*   Moving average (correctly in time)
*   PMSA semilogy plots
*   PMSA correlation plots
*   Histogram of the number of packets
*   Packet frequency plots (e.g., how regular and how many dropped packets)
*   PurpleAir data for comparison