Hypothesis: egtm = egt redline - egt

Idea: the egt redline may depend linearly on engine hours. Under the hypothesis above, egt redline = egt + egtm, so I want to plot (egt + egtm) vs engine hours.

In [3]:
import pandas as pd
import matplotlib.pyplot as plt 

# Load the sample CSV, parsing 'reportts' as datetimes, and order the rows
# chronologically by that column.
dataset = (
  pd.read_csv('./small-sample-BGU-30.csv', parse_dates=['reportts'])
  .sort_values('reportts')
)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

In [5]:
# Hand-picked subset of columns: used below for the reduced linear model and
# as the base columns for the squared / pairwise-product features.
important_features = [
  'egt', 'naiup', 'nait', 'tec',
  'aoc', 'ecyc', 'esn', 'ehrs',
  'fdp', 'ps14', 'w14', 'egtb',
]

In [6]:
# Regression target, kept as a one-column frame.
Y = dataset[['egtm']]

# Columns that are not used as model inputs (this includes the target 'egtm').
# The original list named 'reportts' twice; each label is listed once here.
NON_FEATURE_COLUMNS = [
    'reportts', 'acnum', 'pos', 'dep', 'arr',
    'egtm', 'fltdes',
    'dmusw', 'exswpn', 'reason',
]

# Sentinel written into every missing feature value.
# NOTE(review): -100 becomes an ordinary number for the linear model — confirm
# that this is preferable to imputation or to dropping the affected rows.
MISSING_VALUE_FILL = -100

X = dataset.drop(columns=NON_FEATURE_COLUMNS).fillna(MISSING_VALUE_FILL)

# Discard every remaining column whose name contains 'stw'.
X = X.loc[:, ~X.columns.str.contains('stw')]

In [7]:
def train_model(X, y):
  """Fit a standardized linear regression and score it on a hold-out split.

  Rows whose target is missing are removed from both `X` and `y` (matched by
  index label) before a fixed 67/33 train/test split (random_state=40). The
  scaler is fitted on the training part only and applied to both parts.

  Parameters
  ----------
  X : pandas.DataFrame
      Feature matrix sharing its index with `y`.
  y : pandas.Series or single-column pandas.DataFrame
      Target values. The original implementation ignored this argument and
      always read the global `Y['egtm']`; the argument is now actually used.

  Returns
  -------
  tuple
      `(rmse, mae, model, preds)`. The first element is the ROOT mean squared
      error on the test part (callers in this notebook bind it to `mse`),
      `mae` is the mean absolute error, `model` the fitted LinearRegression
      and `preds` a frame with the true ('y') and predicted ('pred') values.

  Raises
  ------
  ValueError
      If `y` is a DataFrame with more than one column.
  """
  # Callers pass the one-column frame `Y`; reduce it to a Series.
  if isinstance(y, pd.DataFrame):
    if y.shape[1] != 1:
      raise ValueError('y must be a Series or a single-column DataFrame')
    y = y.iloc[:, 0]

  # Keep only the rows that have a target value, in X and y alike.
  has_target = y.notna()
  x = X[has_target]
  y = y[has_target]

  X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=40)

  scaler = StandardScaler()
  scaler.fit(X_train)

  X_train = scaler.transform(X_train)
  X_test = scaler.transform(X_test)

  model = LinearRegression(n_jobs=-1)
  model.fit(X_train, y_train)

  predicted = model.predict(X_test)
  preds = pd.DataFrame({'y': y_test, 'pred': predicted})
  # Take the square root explicitly instead of passing the deprecated
  # `squared=False` keyword; for a single target the value is the same.
  mse = np.sqrt(mean_squared_error(y_test, predicted))
  mae = mean_absolute_error(y_test, predicted)

  return mse, mae, model, preds

In [8]:
# Baseline: linear model on every column of X. train_model scores with
# squared=False, so the value bound to `mse` here is an RMSE.
mse, mae, result_model, pred = train_model(X, Y)
mse

128.4805798140631

In [9]:
# Same pipeline restricted to the hand-picked `important_features` columns;
# as above, the displayed value is an RMSE.
mse, mae, result_model, pred = train_model(X[important_features], Y)
mse

2.4396958561090103

Let's do some feature generation

In [10]:
# Second-order expansion of the selected features: for every feature f add
# its square ('<f>_2') and its product with every other feature k
# ('<f>_m_<k>'). Both '<f>_m_<k>' and '<k>_m_<f>' are produced and hold the
# same values, exactly as before.
#
# The derived columns are collected first and attached in a single concat:
# inserting ~144 columns one at a time fragments the frame and can trigger
# pandas' PerformanceWarning. Column names, order and values are unchanged.
base_features = X[important_features]
derived = {}
for f in important_features:
  derived[f + '_2'] = base_features[f] ** 2
  for k in important_features:
    if f != k:
      derived[f + '_m_' + k] = base_features[f] * base_features[k]

X_aug = pd.concat([base_features, pd.DataFrame(derived)], axis=1)

In [11]:
# Same linear pipeline on the augmented frame (base features plus squares and
# pairwise products); the displayed value is an RMSE.
mse, mae, result_model, pred = train_model(X_aug, Y)
mse

66.8665293186491

In [12]:
# Inspect the augmented feature frame built above.
X_aug

Unnamed: 0,egt,naiup,nait,tec,aoc,ecyc,esn,ehrs,fdp,ps14,...,egtb_m_naiup,egtb_m_nait,egtb_m_tec,egtb_m_aoc,egtb_m_ecyc,egtb_m_esn,egtb_m_ehrs,egtb_m_fdp,egtb_m_ps14,egtb_m_w14
0,800.1,128.8,4.0,13.0,3.6,0,0.0,0,11.3,15.991,...,59776.08,1856.4,6033.3,1670.76,0.0,0.0,0.0,5244.33,7421.4231,538356.0
510,802.4,127.6,3.0,10.0,3.5,0,0.0,0,10.4,15.895,...,59027.76,1387.8,4626.0,1619.10,0.0,0.0,0.0,4811.04,7353.0270,536616.0
1,851.4,129.0,12.0,24.0,3.5,2,0.0,4,12.0,16.026,...,64280.70,5979.6,11959.2,1744.05,996.6,0.0,1993.2,5979.60,7985.7558,587495.7
511,854.0,128.3,12.0,23.0,3.4,2,0.0,4,10.8,15.873,...,63919.06,5978.4,11458.6,1693.88,996.4,0.0,1992.8,5380.56,7907.9286,587377.8
2,851.6,132.3,-3.0,8.0,3.5,3,0.0,6,11.4,16.482,...,62472.06,-1416.6,3777.6,1652.70,1416.6,0.0,2833.2,5383.08,7782.8004,614804.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507,872.1,119.1,-8.0,2.0,3.6,205,771009.0,957,12.5,16.848,...,60181.23,-4042.4,1010.6,1819.08,103586.5,389590847.7,483572.1,6316.25,8513.2944,677102.0
1018,906.7,109.8,29.0,38.0,3.5,206,771035.0,964,12.6,16.001,...,59237.10,15645.5,20501.0,1888.25,111137.0,415973382.5,520078.0,6797.70,8632.5395,615030.0
508,911.1,112.7,30.0,39.0,3.4,206,771009.0,964,13.1,16.053,...,60531.17,16113.0,20946.9,1826.14,110642.6,414108933.9,517764.4,7036.01,8622.0663,610682.7
509,799.3,118.3,-17.0,-5.0,3.7,279,771009.0,1355,12.8,16.822,...,53731.86,-7721.4,-2271.0,1680.54,126721.8,350192287.8,615441.0,5813.76,7640.5524,606811.2


Equation discovery

In [13]:
# !python3 -m pysr install
# !pip3 install -U pysr -q

In [14]:
from pysr import PySRRegressor

# Wider candidate set for symbolic regression. NOTE: this rebinds
# `important_features`, replacing the 12-column list defined earlier.
important_features = [
'naiup', 'nait', 'tec', 'aoc', 'ecyc', 'esn', 'ehrs', 
'fdp', 'ps14', 'w14', 'egtb', 'odp', 'vb2', 'bbf', 
'vorrc', 'baf', 't2_peak', 'acct', 't3', 'dph', 'pb', 
'alt_peak', 'n1max', 'n1msa', 'oat_peak', 'shptp', 'egt', 
'joip', 'oiq', 'egt_peak'
]

# Search over +, -, *, / plus square and a custom reciprocal operator,
# scored with an element-wise squared-error loss.
model = PySRRegressor(
    niterations=1000, 
    binary_operators=["*", "+", "-", "/"],
    unary_operators=[
        "square",
        "inv(x) = 1/x",
    ],
    # Tells the sympy export how to interpret the custom `inv` operator.
    extra_sympy_mappings={"inv": lambda x: 1 / x},
    loss="loss(prediction, target) = (prediction - target)^2",
)

# Fit only on rows that have a target value, mirroring what train_model does;
# a missing 'egtm' would otherwise be handed to the regressor as NaN.
has_target = Y['egtm'].notna()
y = Y.loc[has_target, 'egtm']
x = X.loc[has_target, important_features]

model.fit(x, y)



Compiling Julia backend...


Juliaup configuration is locked by another process, waiting for it to unlock.
Juliaup configuration is locked by another process, waiting for it to unlock.
Juliaup configuration is locked by another process, waiting for it to unlock.


In [None]:
# Show the expression string stored under row label 13 of the fitted
# model's equations table.
model.equations_.loc[13].equation

'((((naiup + (square(square((square(baf) * 0.48254767) / (aoc + 0.6673794))) / odp)) - n1max) * 0.015399169) * vorrc)'

In [None]:
# Predict with the fitted symbolic model.
# NOTE(review): `X` has more columns than the `x` frame used in fit;
# presumably PySR selects the fitted features by name — confirm, or pass `x`.
model.predict(X)

array([34.93426448, 33.9062822 , 33.62358708, 32.99823119, 38.8662967 ,
       37.92397961, 36.76749955, 36.08217803, 34.36030771, 34.10331214,
       36.24494189, 35.73095075, 42.85829456, 41.74464709, 37.10159379,
       37.79548183, 31.73895289, 32.33860922, 37.16155942, 37.76121575,
       31.80748505, 31.89315024, 35.58531993, 35.31975784, 32.10731321,
       32.10731321, 37.09302727, 36.23637537, 36.21067581, 35.35402391,
       38.01821132, 38.78919803, 38.55790202, 37.67555056, 33.73495182,
       34.16327777, 35.77378334, 35.2597922 , 36.73323347, 37.3328898 ,
       39.48308607, 38.44653727, 39.19182442, 38.24950734, 36.45053835,
       35.42255606, 38.0524774 , 39.08045968, 37.5556193 , 38.49793639,
       35.58531993, 36.5190705 , 38.06104392, 37.21295854, 36.23637537,
       37.26435765, 37.5556193 , 36.78463259, 34.1290117 , 34.72866803,
       36.33060708, 36.93026341, 39.95424462, 38.32660601, 37.70125012,
       37.18725898, 38.74636544, 37.98394525, 39.35458829, 40.03

In [None]:
# Display the target Series that was passed to model.fit, for comparison
# with the predictions above.
y

0      44.437
453    45.869
1      44.379
454    44.904
2      43.742
        ...  
450    22.152
904    20.216
451    22.151
452    18.218
905    17.672
Name: egtm, Length: 906, dtype: float64

In [None]:
# Side-by-side check: 'naiup' shifted down by a constant (column '1') against
# the recorded 'egtm' (column '2').
# NOTE(review): 95.12702 is presumably a constant taken from a discovered
# equation — its origin is not shown in this notebook; confirm.
pd.DataFrame({
    '1': dataset['naiup'].sub(95.12702),
    '2': dataset['egtm'],
})

Unnamed: 0,1,2
0,33.67298,44.437
453,32.47298,45.869
1,33.87298,44.379
454,33.17298,44.904
2,37.17298,43.742
...,...,...
450,23.97298,22.152
904,14.67298,20.216
451,17.57298,22.151
452,23.17298,18.218


In [None]:
# Toy problem with a known closed form, used to sanity-check the symbolic
# regressor. A seeded generator makes the frame identical on every run
# (the original drew from the unseeded global numpy state).
TOY_SEED = 0
rng = np.random.default_rng(TOY_SEED)

test = pd.DataFrame({
    'x1': rng.random(10) * 2,   # uniform in [0, 2)
    'x2': rng.random(10) * 5,   # uniform in [0, 5)
})

# Ground truth: y = -10.4 * x1^2 + 35 * x1 * x2 - 45.5
test['y'] = -10.4 * (test['x1'] ** 2) + 35 * test['x1'] * test['x2'] - 45.5

test


Unnamed: 0,x1,x2,y
0,0.696635,2.355721,6.89061
1,0.669685,0.088991,-48.078323
2,0.080931,4.371698,-33.184894
3,1.259352,0.811445,-26.227722
4,0.259401,1.550279,-32.124753
5,0.185782,3.89159,-20.554368
6,0.66702,4.561228,56.357877
7,0.494335,3.000906,3.879395
8,0.10103,4.789403,-28.670639
9,1.65486,4.317049,176.062862


In [None]:

# Sanity check on the toy frame `test`: a shorter search using only the
# polynomial unary operators. Other unary operators (cos, exp, sin, inv) were
# tried earlier and are intentionally left out here.
toy_inputs = test[['x1', 'x2']]
toy_target = test['y']

model2 = PySRRegressor(
    niterations=100,
    binary_operators=["*", "+", "-", "/"],
    unary_operators=["square", "cube"],
    loss="loss(prediction, target) = (prediction - target)^2",
)

model2.fit(toy_inputs, toy_target)



Started!

Expressions evaluated per second: 5.250e+05
Head worker occupation: 21.8%. This is high, and will prevent efficient resource usage. Increase `ncyclesperiteration` to reduce load on head worker.
Progress: 1009 / 1500 total iterations (67.267%)
Hall of Fame:
---------------------------------------------------------------------------------------------------
Complexity  Loss       Score     Equation
1           3.951e+03  1.594e+01  y = x2
2           3.698e+03  6.640e-02  y = square(x2)
3           1.698e+03  7.780e-01  y = cube(cube(x1))
4           1.029e+03  5.015e-01  y = (x1 * cube(x2))
5           8.796e+02  1.564e-01  y = (square(x1) * cube(x2))
6           6.482e+02  3.052e-01  y = ((cube(x2) * x1) + -19.497)
7           2.481e+01  3.263e+00  y = (((x1 * 31.443) * x2) - 45.991)
9           5.687e+00  7.366e-01  y = (((x1 / 0.029664) * (x2 + -0.34145)) - 43.371)
11          3.800e+00  2.016e-01  y = (((x1 / 0.029664) * (x2 - (x1 * 0.25755))) - 43.371)
12          2.752e+0

In [None]:
# Inspect the row with label 11 of the toy model's equations table.
# Uses the fitted attribute `equations_`, consistent with the lookup on
# `model` earlier in this notebook (the un-suffixed `equations` is the old
# spelling).
model2.equations_.loc[11]



complexity                                                      15
loss                                                       1.00354
score                                                     0.112253
equation         (((((x1 * 33.92955) + -0.26704854) - -0.995721...
sympy_format     (-0.2250931*x1 + x2)*(33.92955*x1 + 0.72867322...
lambda_format    PySRFunction(X=>(-0.2250931*x1 + x2)*(33.92955...
Name: 11, dtype: object