In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
# NOTE: `dirname` and `filename` keep their last-iteration values after this loop,
# and the data-loading cell further down passes them to pd.read_csv, so this loop
# must run before that cell.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Data

In [None]:
# Load the whitespace-delimited, header-less data file into a DataFrame.
# NOTE(review): `dirname` and `filename` are the values left behind by the os.walk
# loop in the first cell (the last file it printed). This presumably relies on the
# input directory containing only the dataset file -- TODO confirm.
# `sep=r'\s+'` is the documented replacement for the deprecated `delim_whitespace=True`.
data = pd.read_csv(os.path.join(dirname, filename), sep=r'\s+', header=None)

# The file has no header row, so column names are assigned by position.
data.columns = [
    'lever_position', 'ship_speed', 'gt_shaft', 'gt_rate', 'gg_rate', 'sp_torque', 'pp_torque',
    'hpt_temp', 'gt_c_i_temp', 'gt_c_o_temp', 'hpt_pressure', 'gt_c_i_pressure', 'gt_c_o_pressure',
    'gt_exhaust_pressure', 'turbine_inj_control', 'fuel_flow', 'gt_c_decay', 'gt_t_decay',
]

# Discard any row that contains a missing value, then show summary statistics.
data = data.dropna()
data.describe()

# Analyze data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
data.describe()

In [None]:
# Pairwise correlation between all columns, shown to 4 decimal places.
data.corr().round(4)

# Eliminate unnecessary features
* We can easily see that `gt_c_i_temp` has std = 0, which means it is a constant and therefore has no effect on our final result => drop the `gt_c_i_temp` column.
* The correlation of `gt_c_i_pressure` with all other columns is 0 => we can drop `gt_c_i_pressure`.

In [None]:
# Remove both uninformative columns in a single drop call.
data = data.drop(columns=['gt_c_i_temp', 'gt_c_i_pressure'])

In [None]:
# Preview the first rows to confirm the two columns were dropped.
data.head()

# Split data into X (features) and Y (responses)

In [None]:
# Feature matrix: the 14 remaining input columns, selected explicitly by name.
X = data.loc[:, [
    'lever_position', 'ship_speed', 'gt_shaft', 'gt_rate', 'gg_rate', 'sp_torque',
    'pp_torque', 'hpt_temp', 'gt_c_o_temp', 'hpt_pressure', 'gt_c_o_pressure',
    'gt_exhaust_pressure', 'turbine_inj_control', 'fuel_flow',
]]

# The two response columns, kept both individually and side by side in one frame.
Y1 = data['gt_c_decay']
Y2 = data['gt_t_decay']
Y = pd.concat([Y1, Y2], axis=1)
Y

# Further Correlation Analysis
* The features are highly correlated with each other but have very low correlation with the responses, so we should assume that the features and responses may have a non-linear relationship.

In [None]:
# Annotated heatmap of the correlation matrix over every remaining column
# (features and responses), with coefficients rounded to 4 decimal places.
corr_mat = data.corr().round(4)
fig, ax = plt.subplots(figsize=(18, 9))
sns.heatmap(corr_mat, annot=True, ax=ax)
plt.show()

# Further Correlation Analysis
* Our responses Y1 and Y2 have an extremely small correlation => we can consider them independent and treat them as two different responses that are affected by different features.

In [None]:
# Annotated heatmap of the correlation between the two response columns only.
corr_mat = Y.corr()
fig, ax = plt.subplots(figsize=(18, 9))
sns.heatmap(corr_mat, annot=True, ax=ax)
plt.show()

# Normality Test
From the normality test we conclude that our data do not follow a normal distribution => linear regression is not viable

In [None]:
# One-sample Kolmogorov-Smirnov normality check for every column (5% significance level).
#
# stats.kstest(x, 'norm') compares x against the *standard* normal N(0, 1). Raw columns
# have arbitrary location and scale, so testing them directly rejects every column no
# matter what its shape is. Each column is therefore standardised (zero mean, unit
# standard deviation) before the test.
# NOTE: estimating mean/std from the same sample makes the KS p-value conservative
# (the Lilliefors situation), so treat borderline "is normal" results with caution.
for col in data.columns:
    values = data[col]
    spread = values.std()
    if spread == 0:
        # A constant column cannot be normal, and standardising it would divide by zero
        # and produce a NaN p-value that falls through to the "is normal" branch.
        print('Feature: %s is not normal'%col)
        continue
    stat, p = stats.kstest((values - values.mean()) / spread, 'norm')
    if p <= 0.05:
        print('Feature: %s is not normal'%col)
    else:
        print('Feature: %s is normal'%col)

# Split data for training model

In [None]:
from sklearn.model_selection import train_test_split

# A fixed random_state makes the splits, and therefore every score computed from them,
# reproducible across "Restart & Run All". Because X, Y, Y1 and Y2 all have the same
# number of rows, reusing the same seed also puts the same rows in each train/test set.
# Joint split with both responses together.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42)
# Per-response splits used by the Y1 and Y2 model-selection cells.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, Y1, random_state=42)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, Y2, random_state=42)

In [None]:
# Display the training responses from the joint (Y1 and Y2) split.
y_train

# Machine Learning library Import

In [None]:
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor #Ensemble using averaging method
from xgboost import XGBRegressor #Ensemble using boosting method
from sklearn.metrics import mean_squared_error

# Model selection
By training every model with its default settings, we will find out which model performs best out of the box.


In [None]:
# Candidate regressors, all with default hyper-parameters.
# The tree, bagging and boosting estimators involve randomness, so they are seeded
# to make the reported scores reproducible. SVR and KNN take no seed.
svr = SVR()
knn = KNeighborsRegressor()
tree = DecisionTreeRegressor(random_state=42)
bagg = BaggingRegressor(random_state=42)
xgb = XGBRegressor(random_state=42)

# Y1 model selection
Bagging Regressor is the best model for Y1

In [None]:
# Fit every candidate model on the Y1 split and record its train/test score.
# For these regressors `.score` returns the coefficient of determination (R^2);
# the printed label keeps the notebook's "Accuracy" wording.
# NOTE: the estimator objects are shared notebook globals and are refit on Y2 by the
# Y2 model-selection cell, so after that cell runs they no longer hold the Y1 fits.
y1_candidates = [
    ('SVRegression', svr),
    ('KNN Regressor', knn),
    ('Decision Tree', tree),
    ('Bagging Regressor', bagg),
    ('XG Boost Regressor', xgb),
]

y1_train = []
y1_test = []
for label, estimator in y1_candidates:
    estimator.fit(X_train1, y_train1)
    # Score once per split and reuse the value for both the printout and the list.
    train_score = estimator.score(X_train1, y_train1)
    test_score = estimator.score(X_test1, y_test1)
    print('Accuracy of {} on training set: {:.4f}'.format(label, train_score))
    print('Accuracy of {} on test set: {:.4f}'.format(label, test_score))
    y1_train.append(train_score)
    y1_test.append(test_score)

# Y2 model selection
Bagging Regressor is the best model for Y2

In [None]:
# Fit every candidate model on the Y2 split and record its train/test score.
# For these regressors `.score` returns the coefficient of determination (R^2);
# the printed label keeps the notebook's "Accuracy" wording.
# NOTE: this refits the same estimator objects used by the Y1 model-selection cell,
# so afterwards they hold the Y2 fits.
y2_candidates = [
    ('SVRegression', svr),
    ('KNN Regressor', knn),
    ('Decision Tree', tree),
    ('Bagging Regressor', bagg),
    ('XG Boost Regressor', xgb),
]

y2_train = []
y2_test = []
for label, estimator in y2_candidates:
    estimator.fit(X_train2, y_train2)
    # Score once per split and reuse the value for both the printout and the list.
    train_score = estimator.score(X_train2, y_train2)
    test_score = estimator.score(X_test2, y_test2)
    print('Accuracy of {} on training set: {:.4f}'.format(label, train_score))
    print('Accuracy of {} on test set: {:.4f}'.format(label, test_score))
    y2_train.append(train_score)
    y2_test.append(test_score)

# Model Conclusion
* Bagging Regressor gives quite good accuracy for both the Y1 and Y2 models, with 99.4% and 98.3% respectively.
* Here I only consider a few algorithms, each representing one kind of method. There might be an algorithm with higher accuracy that I haven't discovered.

In [None]:
# Summary table of the Y1 scores, one row per model, sorted by test score (ascending).
model=['SVRegression','KNN Regressor','Decision Tree','Bagging Regressor','XG Boost Regressor']
# Building the frame from a dict keeps the score columns numeric; the previous
# list-of-lists + transpose construction produced object-dtype columns.
mod1 = pd.DataFrame({'model': model, 'Train Accuracy': y1_train, 'Test Accuracy': y1_test})
# set_index returns a new frame, so it must be chained for the index to take effect
# in the displayed table. `mod1` itself keeps 'model' as a regular column.
mod1.set_index('model').sort_values('Test Accuracy')

In [None]:
# Summary table of the Y2 scores, one row per model, sorted by test score (ascending).
model=['SVRegression','KNN Regressor','Decision Tree','Bagging Regressor','XG Boost Regressor']
# Building the frame from a dict keeps the score columns numeric; the previous
# list-of-lists + transpose construction produced object-dtype columns.
mod2 = pd.DataFrame({'model': model, 'Train Accuracy': y2_train, 'Test Accuracy': y2_test})
# set_index returns a new frame, so it must be chained for the index to take effect
# in the displayed table. `mod2` itself keeps 'model' as a regular column.
mod2.set_index('model').sort_values('Test Accuracy')