<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer


In [2]:
def time_since_start(df, initial=200, max_gap=10000):
    """Build a per-row counter that accumulates time across consecutive 'start' rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'mode' column (compared against the strings 'start'
        and 'operation') and a numeric 'time_difference' column. In this
        notebook 'time_difference' holds the gap to the previous row in
        seconds.
    initial : float, default 200
        Value assigned to the first row.
    max_gap : float, default 10000
        Threshold on 'time_difference' that separates a continuous
        recording from a break.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(df)``. For every row after the first:

        * 'start' right after 'operation' with a gap below ``max_gap`` -> 0
        * 'start' right after 'start' with a gap below ``max_gap``
          -> previous value + gap
        * otherwise, a gap strictly above ``max_gap`` -> 0
        * otherwise, an 'operation' row -> previous value carried over
        * anything else (e.g. a gap exactly equal to ``max_gap`` on a
          'start' row) -> 0
    """
    n_rows = len(df)
    time_start = np.zeros(n_rows)
    if n_rows == 0:
        return time_start

    # Pull the columns out as plain arrays once. Indexing a Series with
    # ``series[i]`` is a *label* lookup: it raises KeyError on a non-default
    # integer index and only works on a datetime index through a deprecated
    # positional fallback. Array access is always positional (and far faster).
    modes = np.asarray(df['mode'])
    gaps = np.asarray(df['time_difference'])

    time_start[0] = initial
    for i in range(1, n_rows):
        mode, prev_mode, gap = modes[i], modes[i - 1], gaps[i]
        if mode == 'start' and prev_mode == 'operation' and gap < max_gap:
            time_start[i] = 0
        elif mode == 'start' and prev_mode == 'start' and gap < max_gap:
            time_start[i] = time_start[i - 1] + gap
        elif gap > max_gap:
            time_start[i] = 0
        elif mode == 'operation':
            time_start[i] = time_start[i - 1]
        # Every other combination keeps the zero from np.zeros.
    return time_start


def time_since_operation(df, initial=1, max_gap=10000):
    """Build a per-row counter that accumulates time across 'operation' rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'mode' column (compared against the strings 'start'
        and 'operation') and a numeric 'time_difference' column. In this
        notebook 'time_difference' holds the gap to the previous row in
        seconds.
    initial : float, default 1
        Value assigned to the first row.
    max_gap : float, default 10000
        Threshold on 'time_difference' that separates a continuous
        recording from a break.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(df)``. For every row after the first:

        * 'operation' with a gap below ``max_gap`` -> previous value + gap
        * 'operation' with a gap strictly above ``max_gap``
          -> previous value carried over (the break is not counted)
        * 'start' -> 0
        * anything else (e.g. an 'operation' row whose gap is exactly
          ``max_gap``) -> 0
    """
    n_rows = len(df)
    time_op = np.zeros(n_rows)
    if n_rows == 0:
        return time_op

    # Pull the columns out as plain arrays once. Indexing a Series with
    # ``series[i]`` is a *label* lookup: it raises KeyError on a non-default
    # integer index and only works on a datetime index through a deprecated
    # positional fallback. Array access is always positional (and far faster).
    modes = np.asarray(df['mode'])
    gaps = np.asarray(df['time_difference'])

    time_op[0] = initial
    for i in range(1, n_rows):
        mode, gap = modes[i], gaps[i]
        if mode == 'operation' and gap < max_gap:
            time_op[i] = time_op[i - 1] + gap
        elif mode == 'operation' and gap > max_gap:
            time_op[i] = time_op[i - 1]
        elif mode == 'start':
            time_op[i] = 0
        # Every other combination keeps the zero from np.zeros.
    return time_op

def timepoint_difference(df):
    """Return the gap, in seconds, between each row's index value and the previous one.

    The index of ``df`` is expected to hold datetime-like values. The first
    row has no predecessor, so its slot is filled with the constant 1.

    Returns
    -------
    numpy.ndarray
        One entry per index value (a single entry when the index is empty).
    """
    stamps = np.asarray(df.index)
    one_second = np.timedelta64(1, 's')
    gaps = np.diff(stamps) / one_second

    # Prepend the placeholder for the first row, cast to the dtype of the gaps.
    result = np.empty(gaps.shape[0] + 1, dtype=gaps.dtype)
    result[0] = 1
    result[1:] = gaps
    return result


def track_time(df):
    """Return, for every row, the seconds elapsed since the first index value.

    The index of ``df`` is expected to hold datetime-like values and to be
    non-empty (the first entry is used as the reference point).

    Returns
    -------
    numpy.ndarray
        One entry per row; the first entry is 0.
    """
    stamps = np.asarray(df.index)
    origin = stamps[0]
    elapsed = stamps - origin
    return elapsed / np.timedelta64(1, 's')

def binarize_mode(df):
    """Encode the 'mode' column as integers: 0 for 'start', 1 for everything else.

    Returns
    -------
    numpy.ndarray
        Integer array with one entry per row of ``df``.
    """
    not_start = df['mode'] != 'start'
    return np.where(not_start, 1, 0)

In [3]:

def add_features(path_to_data, output_path='prediction_input.npz',
                 time_2_offset=200793036310, track_time_offset=3201304):
    """Load a parquet file, derive the time features, impute gaps and save the matrix.

    Parameters
    ----------
    path_to_data : str or path-like
        Parquet file read with the fastparquet engine. Its index is used as
        the row timestamp, and it must contain the 'mode' column plus the
        sensor columns listed in ``feature_columns`` below.
    output_path : str or path-like, default 'prediction_input.npz'
        Destination of the ``.npz`` archive; the matrix is stored under the
        key ``'X'``.
    time_2_offset : float, default 200793036310
        Added to the running sum of 'time_op'. The default is the cumulative
        sum reached on the previous data, so the series continues from there.
    track_time_offset : float, default 3201304
        Added to the elapsed-seconds column. The default is the last tracked
        time of the previous data.

    Returns
    -------
    numpy.ndarray
        The imputed feature matrix that was written to ``output_path``.

    Raises
    ------
    ValueError
        If the input already contains one of the derived column names
        (``DataFrame.insert`` refuses duplicates).
    """
    features = pd.read_parquet(path_to_data, engine='fastparquet')

    # Order matters: the two counters read 'time_difference', and 'time_2'
    # reads 'time_op'.
    features.insert(0, 'time_difference', timepoint_difference(features))
    features.insert(0, 'time_start', time_since_start(features))
    features.insert(0, 'time_op', time_since_operation(features))
    features.insert(0, 'time_2', features['time_op'].cumsum() + time_2_offset)
    features.insert(0, 'track_time', track_time(features) + track_time_offset)
    features.insert(0, 'binary_mode', binarize_mode(features))

    feature_columns = [
        'time_start', 'time_op', 'track_time', 'time_2',
        'Unit_4_Power', 'Turbine_Guide Vane Opening',
        'Turbine_Pressure Drafttube',
        'Turbine_Pressure Spiral Casing', 'Turbine_Rotational Speed',
        'binary_mode',
    ]
    X = features[feature_columns].values

    # NOTE(review): by default KNNImputer discards features that are entirely
    # NaN at fit time, so X can come back narrower than `feature_columns` --
    # confirm downstream consumers can cope with that.
    imputer = KNNImputer(n_neighbors=2, weights="distance")
    X = imputer.fit_transform(X)

    np.savez(output_path, X=X)
    return X


In [4]:
# Build the feature matrix from the raw parquet file and persist it as .npz.
add_features('prediction_input.parquet')

# np.load on an .npz archive returns a lazily-read NpzFile that keeps the
# file open; the context manager closes it once the array has been read.
with np.load('prediction_input.npz') as archive:
    X = archive['X']

# Quick sanity checks: full matrix preview, shape, per-column mean, last row.
print(X)
print(X.shape)
print(np.mean(X, axis=0))
print(X[-1, :])

[[1.00000000e+00 1.00000000e+00 3.20130400e+06 ... 5.27987658e+03
  1.08057467e+02 1.00000000e+00]
 [1.00000000e+00 2.00000000e+00 3.20130500e+06 ... 5.27993084e+03
  1.08057460e+02 1.00000000e+00]
 [1.00000000e+00 3.00000000e+00 3.20130600e+06 ... 5.27998511e+03
  1.08057454e+02 1.00000000e+00]
 ...
 [2.11000000e+02 1.09046000e+05 4.01187600e+06 ... 5.41963399e+03
  1.06722986e+02 1.00000000e+00]
 [2.11000000e+02 1.09047000e+05 4.01187700e+06 ... 5.42009424e+03
  1.06711070e+02 1.00000000e+00]
 [2.11000000e+02 1.09048000e+05 4.01187800e+06 ... 5.42138596e+03
  1.06699154e+02 1.00000000e+00]]
(226364, 9)
[1.22305495e+02 4.71320010e+04 3.75830811e+06 2.94208361e+02
 9.12319994e+01 1.11857950e+02 5.27958223e+03 1.07965453e+02
 9.97477514e-01]
[2.11000000e+02 1.09048000e+05 4.01187800e+06 1.01421865e+02
 4.12043766e+01 1.00034405e+02 5.42138596e+03 1.06699154e+02
 1.00000000e+00]
