In [5]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
# Read the takeoff CSV (parsing `reportts` as datetimes), order it by
# `reportts`, and keep only the rows whose `pos` equals 1.
dataset = (
    pd.read_csv('./takeoff-merged-VQ-BGU-30s.csv', parse_dates=['reportts'])
    .sort_values('reportts')
    .loc[lambda frame: frame['pos'] == 1]
)

# Regression target: the `egtm` column.
y = dataset['egtm']

# Feature matrix: everything except the target and the listed non-feature
# columns, with missing values replaced by 0 and every column whose name
# contains 'stw' removed.
X = (
    dataset
    .drop(columns=[
        'acnum', 'pos', 'dep', 'arr',
        'egtm', 'fltdes', 'reportts',
        'dmusw', 'exswpn', 'reason',
    ])
    .fillna(0)
    .loc[:, lambda frame: ~frame.columns.str.contains('stw')]
)

In [4]:
def plot_predict(y, preds, name="EGTM true vs predicted on engine 1"):
    """Plot true vs. predicted values over time and report RMSE / MAE in the title.

    Parameters
    ----------
    y : array-like
        True target values.
    preds : array-like
        Predicted values, same length as ``y``.
    name : str, optional
        First line of the figure title.

    Notes
    -----
    The x-axis is taken from the module-level ``dataset['reportts']``, so
    ``y`` and ``preds`` must be aligned row-for-row with ``dataset``.
    """
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
    # For a single target, the square root of the MSE is the same value and
    # works on every scikit-learn version.
    rmse = np.sqrt(mean_squared_error(y, preds))
    mae = mean_absolute_error(y, preds)

    fig, ax = plt.subplots(1, 1, figsize=(14, 6))
    ax.plot(dataset['reportts'], y, '-', label='True')
    ax.plot(dataset['reportts'], preds, '-', label='Prediction')
    ax.set_xlabel('reportts')
    ax.set_ylabel('egtm')
    ax.legend()
    ax.set_title(f"{name}\nrmse={rmse:.5f}\nmae={mae:.5f}")
    plt.show()

In [26]:
# Show the frame as this cell's output for a quick look at its columns.
dataset

Unnamed: 0,reportts,acnum,pos,egtm,fltdes,dep,arr,ivs12,ibe,iaie,...,votm,vsva,w14,pf,wai,nai,prv,hpv,xf,reason
0,2018-12-24 10:53:22,VQ-BGU,1,44.437,8990.0,EDHI,UUDD,,,,...,,0.0,1160.0,0.53,0.0,0.0,1.0,0.0,0.0,
1,2018-12-25 15:23:23,VQ-BGU,1,44.379,1024.0,URSS,UUDD,,,,...,,0.0,1179.0,0.53,0.0,0.0,1.0,0.0,0.0,
2,2018-12-25 20:49:27,VQ-BGU,1,43.742,217.0,UUDD,UNBB,,,,...,,0.0,1302.0,0.51,0.0,0.0,1.0,0.0,0.0,
3,2018-12-26 11:42:26,VQ-BGU,1,46.443,1045.0,UUDD,URSS,,,,...,,0.0,1252.0,0.01,0.0,1.0,0.0,0.0,0.0,
4,2018-12-26 15:19:13,VQ-BGU,1,47.660,1046.0,URSS,UUDD,,,,...,,0.0,1148.0,0.01,0.0,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
830,2020-02-05 05:07:31,VQ-BGU,1,17.845,2508.0,UNNT,UUDD,,,,...,,0.0,1177.0,0.03,0.0,0.0,0.0,0.0,0.0,
831,2020-02-09 02:18:44,VQ-BGU,1,17.774,3062.0,UHBB,UUDD,,,,...,,0.0,1385.0,0.03,0.0,1.0,0.0,0.0,0.0,
832,2020-02-10 02:04:56,VQ-BGU,1,17.424,3062.0,UHBB,UUDD,,,,...,,0.0,1323.0,0.03,0.0,0.0,0.0,0.0,0.0,
833,2020-02-10 17:24:01,VQ-BGU,1,17.013,3061.0,UUDD,UHBB,,,,...,,0.0,1321.0,0.51,0.0,0.0,1.0,0.0,0.0,


In [25]:
params = X.columns

# Screen every ordered pair of feature columns: build the single ratio feature
# param1 / param2, fit a one-feature linear regression under scikit-learn's
# default cross-validation, and report the pairs whose out-of-fold RMSE
# against `y` is below 4.5.
for param1 in params:
    for param2 in params:
        if param1 == param2:
            continue

        # Division can yield NaN (0 / 0) and +/-inf (x / 0). NaN becomes 0 and
        # both infinities become the sentinel 1000 (the sign of -inf is
        # deliberately not preserved).
        x_aug = np.nan_to_num(
            np.array(X[param1] / X[param2]).reshape(-1, 1),
            nan=0.0, posinf=1000.0, neginf=1000.0,
        )

        model = LinearRegression(n_jobs=-1)
        preds = cross_val_predict(model, x_aug, y, n_jobs=-1)

        # `squared=False` was removed from mean_squared_error in
        # scikit-learn 1.6; take the square root explicitly instead.
        rmse = float(np.sqrt(mean_squared_error(y, preds)))

        if rmse < 4.5:
            print(f'{param1} / {param2} rmse = {rmse}')

n2p / naiup rmse = 4.484175462476966
naiup / hpcbf rmse = 4.41684347029117
naiup / n2c5 rmse = 4.422046010957668
naiup / n2p rmse = 4.303100013180294


In [24]:
# Screen the reciprocal of every feature column: fit a one-feature linear
# regression on 1 / column under scikit-learn's default cross-validation and
# report the columns whose out-of-fold RMSE against `y` is below 5.
# Iterating X.columns directly keeps this cell independent of `params`
# being defined by another cell.
for param1 in X.columns:
    # 1 / 0 yields inf (and NaN inputs stay NaN). NaN becomes 0 and both
    # infinities become the sentinel 1000.
    x_aug = np.nan_to_num(
        np.array(1 / X[param1]).reshape(-1, 1),
        nan=0.0, posinf=1000.0, neginf=1000.0,
    )

    model = LinearRegression(n_jobs=-1)
    preds = cross_val_predict(model, x_aug, y)

    # `squared=False` was removed from mean_squared_error in
    # scikit-learn 1.6; take the square root explicitly instead.
    rmse = float(np.sqrt(mean_squared_error(y, preds)))

    if rmse < 5:
        print(f'{param1} rmse = {rmse}')

naiup rmse = 4.674000377378305
