# Predicting pressures by inverting the function of PID controllers

This notebook takes as input a submission file created by a neural network model and updates all pressures which can be determined by computing the inverse function of a P controller or a PI controller.

In addition to the submission file, the notebook saves all the controller parameters.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import pickle
from IPython.display import display

# Read training data and test data

In [None]:
# Training data: only the pressure column is needed, to learn which discrete
# values the pressure sensor can report
train_df = pd.read_csv('../input/ventilator-pressure-prediction/train.csv')
targets = train_df[['pressure']].to_numpy()

# The sensor reports a regular grid of values: keep the sorted grid,
# its smallest value and the distance between two neighbouring values
p_values = np.unique(targets) # np.unique returns the values sorted ascending
p_min = p_values[0]
p_step = p_values[1] - p_values[0]

# Test data: reshape the columns into 2d arrays with one breath (80 time steps) per row
test_df = pd.read_csv('../input/ventilator-pressure-prediction/test.csv')
relevant = (test_df[['u_out']] == 0).to_numpy() # True where u_out == 0
uu = test_df['u_in'].to_numpy().reshape(-1, 80)
rr = relevant.reshape(-1, 80)
t = test_df['time_step'].values.reshape(-1, 80)
dt_ = np.diff(t, axis=1) # Only 79 columns - there is no dt for the final step

# Order the candidate pressures so that the grid search tries likely values first:
# first the values seen at time step index 1 of the training breaths, most frequent first,
# then the remaining grid values <= 16 in ascending order
temp_df = (
    pd.DataFrame({'pressure': targets.reshape(-1, 80)[:, 1]})
    .groupby('pressure')
    .size()
    .sort_values(ascending=False)
)
p_values_by_frequency = list(temp_df.index) + sorted(set(p_values[p_values <= 16]).difference(temp_df.index))
len(p_values_by_frequency)


# Inverse functions of P controller and PI controller

In [None]:
%%time
# Find and optionally update all experiments which use a PI controller


def is_integer(discrete):
    """Return True where discrete is within 1e-10 of a whole number.

    Accepts a scalar (returns a single boolean) or an array
    (returns a boolean array of the same shape).
    """
    # The tolerance is deliberately tight: the function is called millions
    # of times and a loose tolerance would produce false positives
    tol = 1e-10
    nearest = np.round(discrete)
    distance = np.abs(discrete - nearest)
    return distance < tol
    
def find_pi_control(row, uu, rr, dt_, preds, pi_list, pp=None, update_preds=False):
    """Test if row has been generated by a perfect PI controller

    The controller model which is inverted here is
        u[i]   = p_coef * (p_star - p[i]) + i_coef * q[i]
        q[i+1] = (1-s) * q[i] + s * (p_star - p[i+1])   with s = dt / (dt + T)
    A breath is accepted if some (p_0, p_coef, i_coef, p_star) from the search grid
    yields pressures on the discrete sensor grid for three consecutive time steps.
    For an accepted breath, every pressure which can be reconstructed from u_in
    replaces the corresponding prediction; the other predictions are kept.

    The row is assumed to start with its u_out == 0 steps (inhalation first).

    Parameters
    ----------
    row          : The row to be processed
    uu           : 2d array of u_in
    rr           : 2d array of (u_out == 0)
    dt_          : 2d array of time differences
    preds        : 2d array of predictions; will be updated if update_preds is True
    pi_list      : list, the found parameters will be appended to this list as
                   (row, p_coef, i_coef, p_star, sum of absolute prediction changes)
    pp           : 2d array of true pressures for evaluation, optional
    update_preds : bool, default False, controls if preds is updated

    Global variables
    ----------------
    p_values, p_values_by_frequency, p_min, p_step : read, describe the sensor grid
    count, count_bad, ae_gain : updated with evaluation results if pp is not None
                                and update_preds is False
    updated                   : count of updated rows, incremented if update_preds is True

    Returns
    -------
    None. The function returns early, without touching pi_list or preds,
    if no PI controller is found.
    """
    # Verify parameters and copy a slice [start:end] of the selected row into u, oof, p and dt
    if uu.shape != preds.shape: raise ValueError(f"Shapes of uu and preds must be equal: {uu.shape} {preds.shape}")
    if rr.shape != preds.shape: raise ValueError(f"Shapes of rr and preds must be equal: {rr.shape} {preds.shape}")
    if dt_.shape[0] != preds.shape[0]: raise ValueError(f"First dimension of dt_ and preds must be equal: {dt_.shape} {preds.shape}")
    global count, count_bad, ae_gain, updated
    start, end = 1, rr[row].sum()
    p_values_to_try = p_values_by_frequency
    while start < end and (uu[row, start] == 0 or uu[row, start] == 100):
        # The frequency-ordered list only describes time step 1;
        # for a later start every sensor value must be tried
        p_values_to_try = p_values
        start += 1
    if start >= end: return # all u_in are 0 or 100 (or there is no inhalation step to process)
    u = uu[row, rr[row]][start:]
    oof = preds[row, rr[row]][start:]
    if pp is not None: p = pp[row, rr[row]][start:]
    dt = dt_[row, rr[row, 1:]][start:] # typically 1/30; dt[k] is the time between u[k] and u[k+1]
    T = 0.5

    def find_pi_coefficients(u, dt, p_values_to_try):
        """Grid-search p_0, p_coef, i_coef and p_star for the first usable window of u.

        u has at least three elements and dt is one element shorter than u.
        The window is moved forward until u[0], u[1] and u[2] are all unclipped
        (neither 0 nor 100); dt is moved along so that dt[0] always is the time
        between u[0] and u[1].
        A solution is accepted only if the resulting p_1 and p_2 are discrete p values.

        Returns (p_0, p_coef, i_coef, p_star, q_0, offset) where offset is the number
        of elements by which the window was moved, or (None, None, None, None, None, 0)
        if no solution exists.
        """
        no_solution = (None, None, None, None, None, 0)
        offset = 0
        while len(u) >= 3 and any(u[k] == 0 or u[k] == 100 for k in range(3)):
            u, dt = u[1:], dt[1:]
            offset += 1
        if len(u) < 3: return no_solution
        # After a shift, p_0 no longer belongs to the time step for which the
        # frequency-ordered list was built, so every sensor value must be tried
        if offset > 0: p_values_to_try = p_values
        p_stars = np.array([10, 15, 20, 25, 30, 35])
        s0 = dt[0] / (dt[0] + T)
        s1 = dt[1] / (dt[1] + T)
        # The possible p_0 are searched in the given order (descending frequency if available)
        for p_0 in p_values_to_try:
            for p_coef in [0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
                for i_coef in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
                    # u[0] = p_coef * (p_star - p_0) + i_coef * q_0
                    # q_1 = (1-s0) * q_0 + s0 * (p_star - p_1)
                    # u[1] = p_coef * (p_star - p_1) + i_coef * q_1
                    q_0 = (u[0] + p_coef * (p_0 - p_stars)) / i_coef # one q_0 per candidate p_star
                    pis = p_coef + i_coef * s0 # positive
                    p_1 = (pis * p_stars + i_coef * (1-s0) * q_0 - u[1]) / pis # array of several possible p_1
                    ii = is_integer((p_1 - p_min) / p_step)
                    if not ii.any(): continue
                    if ii.sum() > 1: print("More than one ***********")
                    k = ii.argmax() # index of the first p_star which gives a discrete p_1
                    p_star = p_stars[k]
                    p_1 = p_1[k] # scalar p_1
                    q_1 = (1-s0) * q_0[k] + s0 * (p_star - p_1) # scalar q_1
                    pis = p_coef + i_coef * s1 # positive
                    p_2 = (pis * p_star + i_coef * (1-s1) * q_1 - u[2]) / pis # scalar p_2
                    if not is_integer((p_2 - p_min) / p_step): continue
                    if np.abs(p_1 - p_2) < 1e-10:
                        print('p_1 == p_2', p_0, p_1, p_2, p_coef, i_coef, p_star)
                        return no_solution
                    return p_0, p_coef, i_coef, p_star, q_0[k], offset
        return no_solution

    # Try to determine the coefficients twice: once at the beginning of the inhalation phase and once towards the end
    p_0, p_coef, i_coef, p_star, q, offset = find_pi_coefficients(u, dt, p_values_to_try)
    q_is_valid = p_0 is not None
    if not q_is_valid:
        # Last nine elements of u; dt is one element shorter.
        # Only the coefficients of this solution are used, not its p_0 and q_0
        p_0, p_coef, i_coef, p_star, q, offset = find_pi_coefficients(u[-9:], dt[-8:], p_values)
        if p_0 is None: return

    # At this point we have found parameters p_0, p_coef, i_coef and p_star which give discrete
    # values for three consecutive time steps, and we may have the controller state q
    # for index `first` of the slice
    first = offset if q_is_valid else 0

    # Compute the new predictions
    update_list = [] # for plotting
    pred_new = oof.copy()
    if q_is_valid and p_coef != 0: # for p_coef == 0, the solution doesn't depend on p_0
        pred_new[first] = p_0
        update_list.append((start + first, p_0))
    for i in range(first + 1, len(pred_new)):
        # Invariant: pred_new[:i] has been computed
        # Invariant: q is the state of the PI controller at step i-1 or not q_is_valid
        # We want to determine pred_new[i]
        if u[i] == 0 or u[i] == 100:
            q_is_valid = False # u has been clipped; we cannot compute p here
            continue
        if q_is_valid:
            s = dt[i-1] / (dt[i-1] + T) # ca. 1/16
            pis = p_coef + i_coef * s # positive
            pni = (pis * p_star + i_coef * (1-s) * q - u[i]) / pis # candidate pred_new[i]
            if is_integer((pni - p_min) / p_step):
                pred_new[i] = pni
                update_list.append((start+i, pni))
                q = (u[i] + p_coef * (pred_new[i] - p_star)) / i_coef
            else:
                # Out of sync: the candidate is not a discrete sensor value
                q_is_valid = False
        else:
            # Try to resynchronize the controller state after a phase which didn't use the PI controller
            if i >= len(pred_new) - 2: break # we cannot resynchronize the last two
            if u[i+1] == 0 or u[i+1] == 100 or u[i+2] == 0 or u[i+2] == 100: continue
            s_i = dt[i] / (dt[i] + T)
            s_i1 = dt[i+1] / (dt[i+1] + T)
            # Both sums are positive. They are kept in separate variables so that
            # every candidate p_i is tested with the value belonging to its own step
            pis_i = p_coef + i_coef * s_i
            pis_i1 = p_coef + i_coef * s_i1
            for p_i in p_values:
                q_i = (u[i] + p_coef * (p_i - p_star)) / i_coef
                p_i1 = (pis_i * p_star + i_coef * (1-s_i) * q_i - u[i+1]) / pis_i
                if not is_integer((p_i1 - p_min) / p_step): continue
                q_i1 = (1-s_i) * q_i + s_i * (p_star - p_i1)
                p_i2 = (pis_i1 * p_star + i_coef * (1-s_i1) * q_i1 - u[i+2]) / pis_i1
                if not is_integer((p_i2 - p_min) / p_step): continue
                if p_coef != 0: # for p_coef == 0, q_i doesn't depend on p_i
                    pred_new[i] = p_i
                    update_list.append((start+i, p_i))
                q, q_is_valid = q_i, True
                break

    # Where u_in is clipped, keep the original prediction if it lies on the clipped side
    pred_new[(u < 1e-6) & (oof > pred_new)] = oof[(u < 1e-6) & (oof > pred_new)]
    pred_new[(u > 99.9999) & (oof < pred_new)] = oof[(u > 99.9999) & (oof < pred_new)]

    # For training data (where we know the true pressure): verify that the error is getting smaller
    if pp is not None and not update_preds:
        # Mean absolute errors are computed with numpy so that no further import is needed
        mae_oof = np.abs(p - oof).mean()
        mae_pred = np.abs(p - pred_new).mean()
        ae_gain_1 = np.abs(p - oof).sum() - np.abs(p - pred_new).sum() # should be nonnegative
        print(f'Row {row:2}: Gain {ae_gain_1:6.3f}')
        ae_gain += ae_gain_1
        if ae_gain_1 < 0:
            print(f"Row: {row}")
            print(f"MAE OOF:  {mae_oof:.3f}")
            print(f"MAE Pred: {mae_pred:.3f}")
            print(f"Start: {start}")
            plt.figure(figsize=(10, 4))
            plt.title(f"p_coef = {p_coef:.2f}, i_coef = {i_coef:.2f}, p_star = {p_star:.0f}")
            plt.plot(np.arange(start, end), u, label='u_in')
            plt.scatter(*zip(*update_list), marker='o', label='updated pressure')
            plt.scatter(np.arange(start, end)[u == 0], u[u == 0], marker='x') # clipped u_in which is useless for predictions
            plt.plot(np.arange(start, end), oof, label='pressure_pred_oof')
            plt.plot(np.arange(start, end), pp[row, rr[row]][start:end], label='pressure_true')
            plt.legend()
            plt.show()
            count_bad += 1
        else:
            count += 1

    # Keep the parameters for future reference
    pi_list.append((row, p_coef, i_coef, p_star, np.abs(oof - pred_new).sum()))

    # For test data: update the predictions
    if update_preds:
        # end is the number of u_out == 0 steps; unlike rr[row].argmin() it is
        # also correct for a row without any u_out == 1 step
        preds[row, start:end] = pred_new
        updated += 1
        
# Evaluate find_pi_control on a tenth of the training breaths.
# This only works if the true pressures pp and the out-of-fold predictions
# oof_pred are defined; otherwise the NameError is reported and the cell goes on
try:
    pi_list = []
    count = count_bad = 0
    ae_gain = 0
    for row in range(len(pp) // 10, len(pp) // 5):
        find_pi_control(row, uu, rr, dt_, oof_pred, pi_list, pp)
    if max(count, count_bad) > 0:
        print("Count:", count, count_bad)
        print("AE gain:", ae_gain)
    pi_df = pd.DataFrame(pi_list, columns=['row', 'p_coef', 'i_coef', 'p_star', 'difference'])
    print(f"Cumulated difference: {pi_df['difference'].sum():.3f}")
    display(pi_df)
except NameError as e:
    print("Warning: NameError caught", e)


In [None]:
# Find and update all experiments which use a P-only controller

def find_p_control(row, uu, rr, preds, p_list, pp=None, update_preds=False):
    """Test if row has been generated by a perfect P controller

    The controller model which is inverted here is
        u[i] = p_coef * (p_star - p[i])
    so that p[i] = p_star - u[i] / p_coef wherever u[i] is not clipped to 0 or 100.

    The row is assumed to start with its u_out == 0 steps (inhalation first).

    Parameters
    ----------
    row          : The row to be processed
    uu           : 2d array of u_in
    rr           : 2d array of (u_out == 0)
    preds        : 2d array of predictions; will be updated if update_preds is True
    p_list       : list, the found parameters will be appended to this list as
                   (row, p_coef, p_star, sum of absolute prediction changes)
    pp           : 2d array of true pressures for evaluation, optional
    update_preds : bool, default False, controls if preds is updated

    Global variables
    ----------------
    p_values, p_min, p_step   : read, describe the sensor grid
    row_set                   : set of row numbers with P controller (skipped if undefined)
    count, count_bad, ae_gain : updated with evaluation results if pp is not None
                                and update_preds is False
    updated                   : count of updated rows, incremented if update_preds is True

    Returns
    -------
    None. The function returns early, without touching p_list or preds,
    if the row is not recognized as a P controller.
    """
    # Verify parameters and copy the selected row into u, oof and p
    if uu.shape != preds.shape: raise ValueError(f"Shapes of uu and preds must be equal: {uu.shape} {preds.shape}")
    if rr.shape != preds.shape: raise ValueError(f"Shapes of rr and preds must be equal: {rr.shape} {preds.shape}")
    global row_set, count, count_bad, ae_gain, updated
    start, end = 1, rr[row].sum()
    u = uu[row, rr[row]][start:]
    oof = preds[row, rr[row]][start:]
    if pp is not None: p = pp[row, rr[row]][start:]
    if len(u) == 0: return # nothing to process; the sampling below would fail on an empty slice

    def find_p_coefficients(u):
        """Take up to four samples from the series and determine p_coef and p_star

        The first unclipped sample for which some grid combination gives a
        discrete pressure decides the result.

        Return (p_coef, p_star) if the breath is using a P controller
        Return (None, None) if it is not a P controller"""
        for i in [0, len(u) // 3, len(u) * 2 // 3, len(u) - 1]:
            if u[i] != 0 and u[i] != 100:
                for p_coef in [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
                    for p_star in [10, 15, 20, 25, 30, 35]:
                        predicted_p_int = (p_star - u[i] / p_coef - p_min) / p_step
                        if predicted_p_int >= 0 and predicted_p_int < len(p_values) and is_integer(predicted_p_int):
                            return p_coef, p_star
        return None, None

    p_coef, p_star = find_p_coefficients(u)
    if p_coef is None: return

    # Compute the new predictions
    # If anything is strange about them, keep the original predictions
    # If u_in is 0, the new prediction will be too low and not a discrete value
    # -> we may round the prediction up
    # -> we should keep the original prediction if it is higher
    # If u_in is 100, the new prediction will be too high and not a discrete value
    # -> we may round the prediction down
    # -> we should keep the original prediction if it is lower
    pred_new = p_star - u / p_coef
    pred_new_int = (pred_new - p_min) / p_step
    strange = ((pred_new_int < 0) | (pred_new_int >= len(p_values)) | (~is_integer(pred_new_int))) & (u != 0) & (u != 100)
    if strange.any():
        print('strange', row, strange)
        return
    pred_new[u == 0] = np.ceil(pred_new_int[u == 0]) * p_step + p_min
    pred_new[(u == 0) & (oof > pred_new)] = oof[(u == 0) & (oof > pred_new)]
    pred_new[u == 100] = np.floor(pred_new_int[u == 100]) * p_step + p_min
    pred_new[(u == 100) & (oof < pred_new)] = oof[(u == 100) & (oof < pred_new)]

    # For training data (where we know the true pressure): verify that the error is getting smaller
    if pp is not None and not update_preds:
        # Mean absolute errors are computed with numpy so that no further import is needed
        mae_oof = np.abs(p - oof).mean()
        mae_pred = np.abs(p - pred_new).mean()
        ae_gain_1 = np.abs(p - oof).sum() - np.abs(p - pred_new).sum() # absolute error improvement should be positive
        ae_gain += ae_gain_1
        if ae_gain_1 < 0:
            print(f"Row: {row}")
            print(f"Candidate p_coef: {p_coef:.6f}")
            print(f"Candidate p_star: {p_star:.6f}")
            print(f"MAE OOF:  {mae_oof:.3f}")
            print(f"MAE Pred: {mae_pred:.3f}")
            plt.figure(figsize=(10, 4))
            plt.title(f"p_coef = {p_coef:.2f}, p_star = {p_star:.0f}")
            plt.plot(np.arange(start, end), u, label='u_in')
            plt.plot(np.arange(start, end), pred_new, label='pressure_pred_new')
            plt.plot(np.arange(end), pp[row, rr[row]], label='pressure_true')
            plt.scatter(np.arange(start, end)[u < 1e-6], u[u < 1e-6]) # mark zeros of u_in
            plt.legend()
            if row in [1, 537, 634, 1098, 3193, 9847, 10398, 13828]: plt.savefig(f"p-control-{row}.png")
            plt.show()
            count_bad += 1
        else:
            count += 1

    # Keep the parameters for future reference
    p_list.append((row, p_coef, p_star, np.abs(oof - pred_new).sum()))

    # For test data: update the predictions
    if update_preds:
        # end is the number of u_out == 0 steps; unlike rr[row].argmin() it is
        # also correct for a row without any u_out == 1 step
        preds[row, start:end] = pred_new
        updated += 1

    # row_set is optional: if the global doesn't exist, the row is simply not recorded
    try:
        row_set.add(row)
    except NameError:
        pass

# Evaluate find_p_control on all training breaths.
# This only works if the true pressures pp and the out-of-fold predictions
# oof_pred are defined; otherwise the NameError is reported and the cell goes on
p_list = []
row_set = set()
count = count_bad = ae_gain = 0
try:
    for row in range(len(pp)):
        find_p_control(row, uu, rr, oof_pred, p_list, pp)
    if max(count, count_bad) > 0:
        print("Count:", count, count_bad)
        print("AE gain:", ae_gain)
        # The whole point of the update is a smaller error; stop if it isn't
        if ae_gain <= 0: raise ValueError("MAE gain is not positive")
    p_df = pd.DataFrame(p_list, columns=['row', 'p_coef', 'p_star', 'difference'])
    print(f"Cumulated difference: {p_df['difference'].sum():.3f}")
    display(p_df.head())
except NameError as e:
    print("Warning: NameError caught", e)

# Read the nn predictions, update them and write the final submission file

The full computation takes more than nine hours and cannot be run on Kaggle. I have run the exact same notebook locally and uploaded the result as a dataset. If the dataset is available, the notebook skips the full computation; if the dataset is unavailable, the notebook performs the full, lengthy computation.

In [None]:
# Prefer the cached results of an earlier full run; fall back to the raw
# neural network submission if any of the three cache files is missing
try:
    sub = pd.read_csv('../input/notebook-output-cache/submission_pi_20211101.csv')
    p_df = pd.read_csv('../input/notebook-output-cache/p_parameters.csv')
    pi_df = pd.read_csv('../input/notebook-output-cache/pi_parameters.csv')
except FileNotFoundError:
    sub = pd.read_csv('../input/vent-015a-pulp-fiction-inference/submission_better_than_median.csv')
    use_shortcut = False
    print("Doing the full computation (no shortcut)")
else:
    use_shortcut = True
    print("Using the shortcut to save CPU time")


In [None]:
%%time
def find_pi_control_slice(a, b):
    """Apply find_pi_control to the rows a:b and return the result.

    The global prediction array ss is copied first, so that the function
    can run as one of several parallel jobs without writing to ss itself.

    Returns a tuple: the rows a:b of the updated copy, and the list of
    controller parameters which find_pi_control has collected for these rows.
    """
    local_preds = ss.copy() # private, writable predictions of this job
    found_parameters = []
    for current_row in range(a, b):
        find_pi_control(current_row, uu, rr, dt_, local_preds, found_parameters,
                        pp=None, update_preds=True)
    return local_preds[a:b], found_parameters

# Work on an explicit copy of the predictions: the array behind sub.pressure
# may be a read-only view (pandas Copy-on-Write), and ss is modified in place below
ss = sub.pressure.values.reshape(-1, 80).copy()
ss_copy = ss.copy() # untouched reference for counting the modified rows
n_jobs = 8
# With the shortcut, the cached submission already contains the PI updates,
# so that only a few rows are processed
stop = 10 if use_shortcut else len(ss)
pi_list, updated = [], 0

# Split the rows 0:stop into n_jobs consecutive slices a:b; the last slice takes the remainder
a_list = [stop // n_jobs * i for i in range(n_jobs)]
b_list = a_list[1:] + [stop]
updated_slices = Parallel(n_jobs=n_jobs)(delayed(find_pi_control_slice)(a, b)
                                         for a, b in zip(a_list, b_list))
for (new_slice, slice_pi_list), a, b in zip(updated_slices, a_list, b_list):
    ss[a:b] = new_slice
    pi_list += slice_pi_list

print(f"Modified {(ss != ss_copy).any(axis=1).sum()} rows of {len(ss)} in parallel for the PI controllers.")
if not use_shortcut: pi_df = pd.DataFrame(pi_list, columns=['row', 'p_coef', 'i_coef', 'p_star', 'difference'])
print(f"Cumulated difference: {pi_df['difference'].sum():.3f}")
with open('pi_parameters.pickle', 'wb') as handle: pickle.dump(pi_df, handle)
pi_df.to_csv('pi_parameters.csv', index=False)
display(pi_df)
print()

# The P controllers are processed sequentially for all rows.
# The counter is reset so that the message below counts the P updates only,
# whether or not the parallel jobs above have incremented it in this process
p_list, updated = [], 0
for row in range(len(ss)):
    find_p_control(row, uu, rr, ss, p_list, update_preds=True)
print(f"Updated {updated} rows for the P controllers.")
if not use_shortcut: p_df = pd.DataFrame(p_list, columns=['row', 'p_coef', 'p_star', 'difference'])
print(f"Cumulated difference: {p_df['difference'].sum():.3f}")
with open('p_parameters.pickle', 'wb') as handle: pickle.dump(p_df, handle)
p_df.to_csv('p_parameters.csv', index=False)
display(p_df)
print()

# Write the updated predictions back and save the final submission
sub["pressure"] = ss.ravel()
sub.to_csv('submission_pi.csv', index=False)
sub.head(5)