# Quality assessment of the data


This notebook shows how avoca can be applied to the data.


We have different possible flags that we assign


For the input data we use some test timeseries.



In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules are
# reloaded before each cell executes (handy while editing the package itself).
%load_ext autoreload
%autoreload 2

In [None]:
from __future__ import annotations
import logging
# Different import required
from datetime import datetime
from os import PathLike
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import the @voc@ module 
import avoca
from avoca.utils import compounds_from_df

# Use matplotlib's 'default' style sheet for the figures of this notebook
plt.style.use('default')

In [None]:
# Configure the root logger: show messages from INFO level upwards,
# prefixed with the timestamp, the logger name and the level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

## Data format

The data is expected to be in a pandas DataFrame with two-level columns:

The first level contains the compound names,
the second the different variables.

Some columns report measurement metadata and are shared among the compounds, so their first level is `-`.

Reserved names for compounds columns :

* `area`: the chromatogram area 
* `C`: the concentration 
* `flag`: the flag assigned to the data point

For shared columns (starting with `-`)

* `datetime`: the date and time of the measurement
* `type`: the type of measurement (calibration=`std`, sample=`air`, blank=`blank`, ...)

In [None]:
# File available in the repository: it lives in the "data" directory located
# next to the avoca package directory.
data_file = Path(*avoca.__path__).parent / "data" / "voc_jan2jun_2023.csv"
# The CSV has two header rows (read as two-level columns); the ('-', 'datetime')
# column is parsed as dates and then used as the index of the dataframe.
df = pd.read_csv(
    data_file,
    header=[0, 1],
    parse_dates=[("-", "datetime")],
).set_index([("-", "datetime")])

In our case, the data already contains some compounds, their concentration values and some flags.

In [None]:
# Summarise the loaded dataset: number of rows, then the compounds found in it.
print(f"Length: {len(df)} measurements")
# `compounds` is reused by later cells when the assigners are created.
compounds = compounds_from_df(df)
print(f"Compounds: {compounds}")
# Last expression of the cell: rich display of the first rows
df.head()

### Creating an assigner of the data

avoca works with assigner objects

These objects are first trained on some 'valid' data and then used to assign flags to new data

Note that you can also generate these using a config file. 
See example: `config.yaml`

In [None]:
from avoca.qa_class.concs import ExtremeConcentrations
from avoca.qa_class.zscore import XY_Correlations, Multiple_XY_Correlations


# Create the assigners, in the order in which they will be trained and applied.
# Two of them are also kept under their own name because later cells use them.
exteme_concs = ExtremeConcentrations(compounds=compounds)
assigners = [exteme_concs]

assigners.append(
    XY_Correlations(
        compounds=["ethane", "propane", "n-butane"],
        variable="C",
        threshold=4.0,
    )
)

# Alternative kept for reference (single regression for benzene/toluene):
# xy_benzene_toluene := XY_Correlations(compounds=["benzene", "toluene"], variable="C"),
multiple_assigner = Multiple_XY_Correlations(
    number_of_regression=3,
    compounds=["benzene", "toluene"],
    variable="C",
)
assigners.append(multiple_assigner)

In [None]:
from avoca.manager import AssignerManager

# Assume the data has not been checked since the first of April
limit_date = datetime(2023, 4, 1)

# Split the data chronologically around limit_date.
# Label-based slicing with .loc includes both end points, so slicing both
# halves at limit_date would put a measurement stamped exactly at limit_date
# in the training set AND in the test set. The training set is therefore
# selected strictly before the limit; the test set starts at the limit.
df_train = df.loc[df.index < limit_date]
df_test = df.loc[limit_date:]

for assigner in assigners:
    # First we train the assigner on the training data
    AssignerManager.train(assigner, df_train)
    # Then we apply the assigner on the test data, this will add the flags to the dataframe
    AssignerManager.apply(assigner, df_test)


### Flags 

Each assigner will put flags on the data.

In [None]:
from avoca.flags import QA_Flag

# List, for every assigner, its class name and the flag stored on it
for assigner in assigners:
    assigner_name = type(assigner).__name__
    print(f"{assigner_name} \t {assigner.flag!s}")

In [None]:
# Plot ethane vs propane for the test data and mark the points flagged as uncorrelated
fig, ax = plt.subplots(figsize=(4, 4))

x_sub = 'ethane'
y_sub = 'propane'
unit = 'pmol/mol'

# Keep only the air measurements. The mask is built from df_test itself so that
# it carries exactly the index of the frame it filters (no implicit alignment
# against the full dataframe).
mask = df_test[("-", "type")] == "air"
this_df = df_test.loc[mask]

ax.scatter(
    this_df[(x_sub, "C")],
    this_df[(y_sub, "C")],
    label="Test Data",
    color="darkblue",
    s=2,
)
# Get flagged data: a point is selected when the UNCORRELATED bit is set
# in the flag column of either of the two compounds
mask_uncorrelated = (
    (this_df[(x_sub, "flag")] & QA_Flag.UNCORRELATED.value) != 0
) | ((this_df[(y_sub, "flag")] & QA_Flag.UNCORRELATED.value) != 0)
# Put crosses on uncorrelated data
ax.scatter(
    this_df.loc[mask_uncorrelated, (x_sub, "C")],
    this_df.loc[mask_uncorrelated, (y_sub, "C")],
    marker="x",
    label="Uncorrelated",
    s=20,
    color="red",
)
ax.legend()
ax.set_xlabel(f"{x_sub} [{unit}]")
ax.set_ylabel(f"{y_sub} [{unit}]")
# Make the background transparent
fig.patch.set_alpha(0)


In [None]:
# Plot the multiple correlation: benzene vs toluene, the fitted regression
# lines and the band around each line outside of which points get flagged
assigner = multiple_assigner
fig, ax = plt.subplots(figsize=(5, 3))
# Keep only the measurement types listed in the assigner's `runtypes`
mask = np.isin(df[("-", "type")], assigner.runtypes)

x_sub = 'benzene'
y_sub = 'toluene'
# One slope per linear layer of the fitted model; `stds` is iterated in
# parallel below (presumably one std per regression line — TODO confirm)
slopes = [linear.weight.item() for linear in assigner._models[(x_sub, y_sub)].linears]
stds = assigner._stds[(x_sub, y_sub)]

# NOTE(review): this cell plots the full `df` (not `df_test`) although the
# series is labelled "Test Data" — confirm this is intended.
this_df = df.loc[mask]
ax.scatter(
    this_df[(x_sub, "C")],
    this_df[(y_sub, "C")],
    label="Test Data",
    color="darkblue",
    s=2,
)
# Get flagged data: UNCORRELATED bit set for either of the two compounds
mask_uncorrelated = (
    (this_df[(x_sub, "flag")] & QA_Flag.UNCORRELATED.value) != 0
) | ((this_df[(y_sub, "flag")] & QA_Flag.UNCORRELATED.value) != 0)

ax.scatter(
    this_df.loc[mask_uncorrelated, (x_sub, "C")],
    this_df.loc[mask_uncorrelated, (y_sub, "C")],
    marker="x",
    label="Uncorrelated",
    s=20,
    color="red",
)
x = np.linspace(np.nanmin(this_df[(x_sub, "C")]), np.nanmax(this_df[(x_sub, "C")]), 100)
for slope, std in zip(slopes, stds):
    ax.plot(x, slope * x, color="black", linewidth=1, linestyle="--")
    # Fill the std area between the lines.
    # np.maximum clips the lower bound at 0 element-wise; np.max(arr, 0) would
    # instead reduce over axis 0 and collapse the whole bound to one scalar.
    ax.fill_between(
        x,
        np.maximum(slope * x - std * assigner.threshold, 0),
        slope * x + std * assigner.threshold,
        color="black",
        alpha=0.1,
        linewidth=0,
    )
# The scatter labels are only shown if a legend is drawn
ax.legend()
ax.set_xlabel(x_sub)
ax.set_ylabel(y_sub)

In [None]:
# Plot the n-butane time series of the test data and mark the extreme values
fig, ax = plt.subplots(figsize=(5, 3))

sub = 'n-butane'

# Keep only the air measurements. The mask is built from df_test itself so that
# it carries exactly the index of the frame it filters. df_test already starts
# at limit_date, so no further date slicing is needed.
mask = df_test[("-", "type")] == "air"
this_df = df_test.loc[mask]

ax.scatter(
    this_df.index,
    this_df[(sub, "C")],
    label="Data",
    color="darkblue",
    s=1,
)
# Get flagged data: points whose flag has the EXTREME_VALUE bit set
mask_extreme = (
    (this_df[(sub, "flag")] & QA_Flag.EXTREME_VALUE.value) != 0
)
# Put crosses on extreme values
ax.scatter(
    this_df.loc[mask_extreme].index,
    this_df.loc[mask_extreme, (sub, "C")],
    marker="x",
    label="Extreme value",
    s=10,
    color="red",
)
ax.legend()
ax.set_xlabel("Date")
ax.set_ylabel(sub)
# Rotate the xticks
ax.tick_params(axis='x', rotation=25)
# Make the background transparent
fig.patch.set_alpha(0)


In [None]:
# Call the plot method of the ExtremeConcentrations assigner created above
# (presumably visualises what the assigner learned during training — TODO confirm)
exteme_concs.plot()