### **Эксперимент 5.**

Рассчитать приведенную температуру (через egt, tat, ... theta) и n1. Отбросить `egt, tat, n1, XF, HPV, NF`.

Использовать конфигурацию эксперимента 4. 

Сохранить и записать использованные константы.

Note: выяснилось, что если убрать NF то качество сильно падает. разобраться, почему

In [1]:
import pandas as pd
from typing import List

y_cols = ['egtm']
meta_cols = ['reportts', 'acnum', 'pos']
selected_params = ['egt', 'tat', 'n1a', 'n2a', 'wai', 'nai', 'prv', 'alt', 'mn', 'ff', 'nf']


In [2]:
fleet = ['VQ-BGU', 'VQ-BDU']

bgu = pd.read_csv('./takeoff-merged-VQ-BGU-30s.csv', \
  usecols=(selected_params + y_cols + meta_cols), parse_dates=['reportts']).dropna(subset=['egtm'])
bdu = pd.read_csv('./takeoff-merged-VQ-BDU-30s.csv', \
  usecols=(selected_params + y_cols + meta_cols), parse_dates=['reportts']).dropna(subset=['egtm'])


In [3]:
bgu.head()

Unnamed: 0,reportts,acnum,pos,egtm,alt,egt,ff,mn,n1a,n2a,nf,tat,wai,nai,prv
0,2018-12-24 10:53:22,VQ-BGU,1,44.437,1418.0,800.1,2783.0,0.257,77.95,87.63,77.92,2.0,0.0,0.0,1.0
1,2018-12-25 15:23:23,VQ-BGU,1,44.379,2005.0,851.4,3185.0,0.254,82.7,90.22,82.71,10.2,0.0,0.0,1.0
2,2018-12-25 20:49:27,VQ-BGU,1,43.742,1739.0,851.6,3637.0,0.262,86.02,89.57,86.1,-7.2,0.0,0.0,1.0
3,2018-12-26 11:42:26,VQ-BGU,1,46.443,1672.0,826.3,3298.0,0.25,83.73,88.74,83.79,-6.2,0.0,1.0,0.0
4,2018-12-26 15:19:13,VQ-BGU,1,47.66,934.0,789.5,2749.0,0.22,78.14,87.68,78.11,5.2,0.0,0.0,0.0


#### Подготовка

In [4]:
def get_recursive_features(data: List[pd.DataFrame], features = [], n_back = 1):
  result = []
  rest_features = list(set(data[0].columns) - set(features))
  assert len(rest_features + features) == len(data[0].columns)
  
  for acdata in data:
    for pos in [1, 2]:
      df = acdata[acdata['pos'] == pos].copy().reset_index()
      if df.shape[0] == 0:
        continue
      X = df[features]
      X_aug = X.copy()
      for offset in range(1, n_back + 1):
        features_back = [f"{i}_{offset}" for i in features]
        X_aug.loc[0:offset, features_back] =  X.iloc[0,:].to_numpy()
        X_aug.loc[offset:, features_back] = X.iloc[:-offset,:].to_numpy()
      
      X_aug.loc[:, rest_features] = df[rest_features]
      result.append(X_aug)
  
  result = pd.concat(result) \
              .sort_values('reportts' if 'reportts' in rest_features else 'pos') \
              .reset_index() \
              .drop(columns=['index'])
  return result

In [5]:
data = get_recursive_features(
  [bgu, bdu], 
  n_back=13,
  features=selected_params
)

In [6]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def train_model(X, y, model = 'linreg', alpha=0.2):
  assert len(X) == len(y)
  train_i = int(len(X) * 75 / 100)
  X_train, y_train = X[0:train_i], y[0:train_i]
  X_test, y_test = X[train_i:], y[train_i:]

  model = Ridge(alpha=alpha)

  model.fit(X_train, y_train)

  predicted_train = model.predict(X_train)
  mse = mean_squared_error(y_train, predicted_train, squared=False)
  mae = mean_absolute_error(y_train, predicted_train)
  r2 = r2_score(y_train, predicted_train)

  predicted_test = model.predict(X_test)
  mse = mean_squared_error(y_test, predicted_test, squared=False)
  r2 = r2_score(y_test, predicted_test)

  return mse, mae, r2, model, predicted_train, predicted_test, train_i, y_test

In [7]:
import matplotlib.pyplot as plt

# Get exponential rolling average with smothing factor alpha
def smooth(x: pd.Series, alpha=0.5):
  return pd.Series(x).ewm(alpha=alpha, adjust=False).mean().to_list()

def plot_predictions(data, acnum, pos, train_i, predicted_test, predicted_train, is_smooth=True, figsize=(14, 7), title=None):
  data.loc[:train_i-1, 'pred_train'] = predicted_train
  data.loc[train_i:, 'pred_test'] = predicted_test

  sub = data[(data['acnum'] == acnum) & (data['pos'] == pos)]
  train_i2 = sub['pred_train'].count()

  plt.figure(figsize=figsize)

  if is_smooth:
    plt.plot(sub['reportts'][:train_i2], smooth(sub['pred_train'][:train_i2], alpha=1/10), '-')
    plt.plot(sub['reportts'], smooth(sub['pred_test'], alpha=1/10), '-')
  else:
    plt.scatter(sub['reportts'][:train_i2], sub['pred_train'][:train_i2], s=2)
    plt.scatter(sub['reportts'], sub['pred_test'], s=2)

  plt.plot(sub['reportts'], sub['egtm'], '-', color='#2ca02c')

  plt.title(f'Linear model of EGTM on {acnum} engine {pos}, Gas path params' if title is None else title)
  plt.legend(['train_pred', 'test_pred', 'true'])
  plt.show()

#### Baseline из эксперимента 4

In [8]:
X = data.drop(columns=(meta_cols + y_cols))
y = data['egtm']

In [9]:
mse, mae, r2, model, predicted_train, predicted_test, train_i, y_test = train_model(X, y)

print(f'Baseline: rmse = {mse:.3f} mae = {mae:.3f} r2 = {r2:.3f}')


Baseline: rmse = 1.768 mae = 1.304 r2 = 0.440


#### Коррекция

In [10]:
def correct(data):
  theta = (data['tat'] + 273.15) / 288.15

  alpha = 0.5

  data_k = data.copy().drop(columns=['egt'])
  # data_k['n1k'] = data['n1a'] / (theta ** alpha)
  # data_k['n2k'] = data['n2a'] / (theta ** alpha)
  data_k['egtk'] = (data['egt'] + 273.15) / theta

  return data_k


In [11]:
selected_params = ['egtk', 'tat', 'n1a', 'n2a', 'wai', 'nai', 'prv', 'alt', 'mn', 'ff', 'nf']

data = get_recursive_features(
  [correct(bgu), correct(bdu)], 
  n_back=13,
  features=selected_params
)


In [12]:
X = data.drop(columns=(meta_cols + y_cols))
y = data['egtm']

mse, mae, r2, model, predicted_train, predicted_test, train_i, y_test = train_model(X, y)

print(f'Baseline: rmse = {mse:.3f} mae = {mae:.3f} r2 = {r2:.3f}')


Baseline: rmse = 1.607 mae = 1.106 r2 = 0.537


Wow, just the egt correction gained us 0.1 increase in R squared!