In [31]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_error
from catboost import CatBoostRegressor

pd.set_option('display.max_columns', 200)

X_train_dataset = pd.read_csv("X.csv")
y_train_dataset = pd.read_csv("y.csv")

df = pd.merge(X_train_dataset, y_train_dataset, on=["engine_id", "flight_datetime", "flight_phase"])

meta_columns = [
  "engine_id", "aircraft_id", "flight_datetime",
  "flight_phase", "engine_family", "engine_type", "manufacturer",
  "aircraft_family", "aircraft_type", "aircraft_grp", "ac_manufacturer"
]

In [32]:
output_cols = ['DEGT', 'DELN1', 'DELFN', 'EGTHDM']

Prepare dataset

In [33]:
df = pd.merge(X_train_dataset, y_train_dataset, on=["engine_id", "flight_datetime", "flight_phase"])
df = df.drop(columns=meta_columns)

In [34]:
def get_score(predict_field):
  data = df[df[predict_field].notna()]
  data = data.fillna(-1000)
  X = data.drop(columns=[predict_field])
  y = data[predict_field]
  
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=555)
  
  model = CatBoostRegressor(iterations=200, verbose=False)
  model.fit(X_train, y_train)
  catboost_predicted = model.predict(X_test)
  catboost_score = mean_squared_error(y_test, catboost_predicted, squared=False)
  catboost_predicted = model.predict(X)

  return catboost_score

Baseline: 

In [35]:
scores = pd.DataFrame(columns=["parameter", "rmse", ".0", ".1", ".5", ".9", "1"])

for param in tqdm(output_cols):
    try:
        score = get_score(param)
        qua = df[param].quantile([.0, .1, .5, .9, 1])
        qua = list(qua)
        scores.loc[len(scores)] = [param, score] + qua
    except:
        continue

100%|██████████| 4/4 [00:01<00:00,  2.40it/s]


In [36]:
scores

Unnamed: 0,parameter,rmse,.0,.1,.5,.9,1
0,DEGT,2.09956,-184.170715,-27.1604,6.281616,30.211646,80.758423
1,DELN1,0.102668,-1.588493,3.747295,6.333039,10.703587,17.277214
2,DELFN,0.16424,-3.178211,9.006846,15.801858,26.496931,39.112564
3,EGTHDM,1.105356,-55.431641,19.796198,52.873406,73.615068,286.805603


Try: 
- autocorellation
- PCA
- fill nan-s with average value

### Filling NA-s

In [37]:
def get_score_na_mean(predict_field):
  data = df[df[predict_field].notna()]
  data = data.fillna(data.mean())
  X = data.drop(columns=[predict_field])
  y = data[predict_field]
  
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=555)
  
  model = CatBoostRegressor(iterations=200, verbose=False)
  model.fit(X_train, y_train)
  catboost_predicted = model.predict(X_test)
  catboost_score = mean_squared_error(y_test, catboost_predicted, squared=False)
  catboost_predicted = model.predict(X)

  return catboost_score

In [38]:
scores2 = pd.DataFrame(columns=["parameter", "rmse", ".0", ".1", ".5", ".9", "1"])

for param in tqdm(output_cols):
    try:
        score = get_score_na_mean(param)
        qua = df[param].quantile([.0, .1, .5, .9, 1])
        qua = list(qua)
        scores2.loc[len(scores2)] = [param, score] + qua
    except:
        continue

100%|██████████| 4/4 [00:01<00:00,  2.06it/s]


In [39]:
scores2

Unnamed: 0,parameter,rmse,.0,.1,.5,.9,1
0,DEGT,2.102843,-184.170715,-27.1604,6.281616,30.211646,80.758423
1,DELN1,0.105144,-1.588493,3.747295,6.333039,10.703587,17.277214
2,DELFN,0.160296,-3.178211,9.006846,15.801858,26.496931,39.112564
3,EGTHDM,1.036272,-55.431641,19.796198,52.873406,73.615068,286.805603


In [41]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [59]:
def get_score_pca(predict_field):
  data = df[df[predict_field].notna()]
  data = data.fillna(-1000)
  X = data.drop(columns=[predict_field])
  y = data[predict_field]
  
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=555)
  
  scaler = StandardScaler()
  scaler.fit(X_train)

  X_train = scaler.transform(X_train)
  X_test = scaler.transform(X_test)
  
  pca = PCA()
  pca.fit(X_train)

  X_train = pca.transform(X_train)
  X_test = pca.transform(X_test)
  
  model = CatBoostRegressor(iterations=200, verbose=False)
  model.fit(X_train, y_train)
  catboost_predicted = model.predict(X_test)
  catboost_score = mean_squared_error(y_test, catboost_predicted, squared=False)
  catboost_predicted = model.predict(X)

  return catboost_score

In [60]:
scores3 = pd.DataFrame(columns=["parameter", "rmse", ".0", ".1", ".5", ".9", "1"])

for param in tqdm(output_cols):
  try:
    score = get_score_pca(param)
    qua = df[param].quantile([.0, .1, .5, .9, 1])
    qua = list(qua)
    scores3.loc[len(scores3)] = [param, score] + qua
  except:
    continue

100%|██████████| 4/4 [00:04<00:00,  1.12s/it]


In [61]:
scores3

Unnamed: 0,parameter,rmse,.0,.1,.5,.9,1
0,DEGT,1.279146,-184.170715,-27.1604,6.281616,30.211646,80.758423
1,DELN1,0.256455,-1.588493,3.747295,6.333039,10.703587,17.277214
2,DELFN,0.571339,-3.178211,9.006846,15.801858,26.496931,39.112564
3,EGTHDM,1.98063,-55.431641,19.796198,52.873406,73.615068,286.805603
