In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the data directory
# used by the following cells lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [None]:
# Directory on the mounted Drive that the trip CSV files are read from.
import os
cwd = '/content/drive/MyDrive/GT'
print(cwd)

In [None]:
# Trip recordings that make up the dataset; names are joined with `cwd` when loaded.
filename = [
    'R002SMDI_No_29_TripNo_12_TsDataTintrExhData.csv',
    'R002SMDI_No_115_TripNo_102_TsDataTintrExhData.csv',
    'R002SMDI_No_356_TripNo_377_TsDataTintrExhData.csv',
    'R002SMDI_No_489_TripNo_522_TsDataTintrExhData.csv',
]
# Same list object, exposed under the name the loading cell iterates over.
trainingDataFile = filename
print(trainingDataFile)

In [None]:
# Regression target column(s).
label = ['NOxFlowRateValidTail']

# Columns read from every trip CSV (passed as `usecols`). Besides the model
# inputs this list also carries:
#   - 'NOxRawSensor1_2', which the cleaning cell filters on and then drops;
#   - the target column itself, so that it is loaded together with the inputs.
# NOTE(review): a comment in the normalisation-statistics cell says
# 'NOxFlowRateTail' holds the same values as the target; since it stays in the
# model inputs this looks like target leakage — confirm before trusting scores.
features = [
    'calLoad',
    'coolantTemp',
    'intMAP',
    'engSpeed',
    'vehSpeed',
    'intAirTemp',
    'MAF',
    'absTPS',
    'distMILAct',
    'fuelrailPress',
    'baroPress',
    'catTemp',
    'voltage',
    'ambTemp',
    'accPedalD',
    'accPedalE',
    'fuelRate',
    'actEngPerTorque',
    # 'engRefTorque',
    'MAFSensor',
    'compressorInletTemp',
    'exhFlowrate',
    'NOxRawSensor1_2',
    'O2RawSensor1_2',
    'CO2FlowRate',
    'NOxFlowRateTail',
    'NOxFlowRateValidTail',
]
print(len(features))

In [None]:
# Read every trip file (only the columns listed in `features`) and stack the
# rows into one frame with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling
# it inside a loop re-copies all accumulated rows on every pass, so the frames
# are collected first and concatenated once. The loop variable is also named so
# that it no longer overwrites the `filename` list defined earlier.
trip_frames = [
    pd.read_csv(os.path.join(cwd, trip_file), usecols=features)
    for trip_file in trainingDataFile
]
full_data = pd.concat(trip_frames, ignore_index=True, sort=False)

In [None]:
# (rows, columns) of the combined trip data.
full_data.shape

In [None]:
# Preview the first rows of the combined trip data.
full_data.head()

In [None]:
# Column dtypes and non-null counts (shows which columns contain missing values).
full_data.info()

In [None]:
# Summary statistics for every numeric column.
full_data.describe()

In [None]:
# Row filters applied before modelling.
RPM_min = 100    # rows below this engine speed are treated as engine-stop and removed
NOx_max = 1649   # removes clipped NOx readings (the sensor maxes out at 1650 ppm)
NOx_min = 0      # removes negative NOx readings

engine_running = full_data['engSpeed'] >= RPM_min
nox_in_range = full_data['NOxRawSensor1_2'].between(NOx_min, NOx_max)  # bounds inclusive

# Keep rows passing both filters; the raw sensor column was only needed for
# the filter, so it is dropped afterwards.
full_data = full_data.loc[engine_running & nox_in_range].drop(columns=['NOxRawSensor1_2'])

In [None]:
# (rows, columns) remaining after the engine-speed / NOx range filters.
full_data.shape

In [None]:
# Discard every row that has a missing value in at least one column.
cleaned_up_data = full_data.dropna(axis=0, how='any')
print(cleaned_up_data.shape)
# Sanity check: prints False when no NaN is left.
print(cleaned_up_data.isnull().values.any())

In [None]:
# Bin the target into five equal-count quantile classes (labels 1-5) that the
# stratified splits below use to keep the target distribution balanced.
# The bins are stored under the 'NOxRawSensor1_2' name (the raw sensor column of
# that name was dropped during cleaning) because the later cells stratify on it.
# .assign builds a new frame instead of writing a column into the dropna()
# result, which avoids pandas' SettingWithCopyWarning and keeps the cell
# re-runnable.
cleaned_up_data = cleaned_up_data.assign(
    NOxRawSensor1_2=pd.qcut(cleaned_up_data['NOxFlowRateValidTail'],
                            q=5,
                            labels=[1, 2, 3, 4, 5])
)
cleaned_up_data['NOxRawSensor1_2'].hist()
# Share of rows per bin.
cleaned_up_data['NOxRawSensor1_2'].value_counts()/len(cleaned_up_data)

In [None]:
# Column dtypes and non-null counts after cleaning and binning.
cleaned_up_data.info()

In [None]:
# Preview the cleaned data, including the bin column added above.
cleaned_up_data.head()

In [None]:
# Renumber the rows 0..n-1 (the filters and dropna left gaps in the index) so
# that the positional indices returned by the splitter match the index labels.
print(len(cleaned_up_data))
reindexed_data = cleaned_up_data.reset_index(drop=True)
print(len(reindexed_data))
# Share of rows per bin; resetting the index does not change it.
reindexed_data['NOxRawSensor1_2'].value_counts()/len(reindexed_data)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Hold out 20% of the rows as the test set, stratified on the quantile bins.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so take the single (train, test) index pair directly.
train_validation_index, test_index = next(
    split.split(reindexed_data, reindexed_data['NOxRawSensor1_2'])
)
# The splitter returns positional indices, so select with .iloc rather than a
# label lookup. Sorting keeps the selected rows in their original row order, and
# .copy() yields independent frames that later cells can modify in place.
train_validation_set = reindexed_data.iloc[np.sort(train_validation_index)].copy()
test_set = reindexed_data.iloc[np.sort(test_index)].copy()

In [None]:
# Number of rows kept for training + validation.
len(train_validation_set)

In [None]:
# Number of rows held out for testing.
len(test_set)

In [None]:
# Renumber the train+validation rows 0..n-1 so the second split's positional
# indices match the index labels.
reindexed_train_validation_set = train_validation_set.reset_index(drop=True)
print(len(reindexed_train_validation_set))

In [None]:
# Display the re-indexed train+validation frame.
reindexed_train_validation_set

In [None]:
# Split the remaining rows 80/20 into training and validation sets, again
# stratified on the quantile bins.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so take the single (train, validation) index pair directly.
train_index, validation_index = next(
    split.split(reindexed_train_validation_set,
                reindexed_train_validation_set['NOxRawSensor1_2'])
)
# The splitter returns positional indices, so select with .iloc rather than a
# label lookup. Sorting keeps the selected rows in their original row order, and
# .copy() yields independent frames that later cells can modify in place.
train_set = reindexed_train_validation_set.iloc[np.sort(train_index)].copy()
validation_set = reindexed_train_validation_set.iloc[np.sort(validation_index)].copy()

In [None]:
# Separate each subset into model inputs (X_*) and target (y_*).
# The subset frames are left untouched (no pop through an alias), so this cell
# can be re-run without a KeyError on the already-removed target column.
# X drops the target column(s) and the stratification bins; errors='ignore'
# tolerates the bin column having already been removed from the subset frames.
non_feature_cols = label + ['NOxRawSensor1_2']

X_train = train_set.drop(columns=non_feature_cols, errors='ignore')
y_train = train_set[label].copy()

X_valid = validation_set.drop(columns=non_feature_cols, errors='ignore')
y_valid = validation_set[label].copy()

X_test = test_set.drop(columns=non_feature_cols, errors='ignore')
y_test = test_set[label].copy()

In [None]:
# The quantile-bin column was only needed for stratified splitting; remove it
# from each subset in place.
for subset_frame in (train_set, validation_set, test_set):
    subset_frame.drop(columns=["NOxRawSensor1_2"], inplace=True)

In [None]:
# Display the training subset after the bin column has been dropped.
train_set

In [None]:
# Column names of the training inputs.
print(X_train.keys())

In [None]:
# Column names of the training target frame.
print(y_train.keys())

In [None]:
def build_model(n_hidden=1, n_neurons=30, learning_rate=3e-3, input_shape=[X_train.shape[1]]):
    """Build and compile a fully connected regression network.

    Args:
        n_hidden: Number of hidden Dense layers (ReLU activation).
        n_neurons: Units in each hidden layer.
        learning_rate: Learning rate of the SGD optimizer.
        input_shape: Shape of one input sample. The default is evaluated once,
            when this cell runs, from the column count `X_train` has at that
            moment; the list is only read, never modified.

    Returns:
        A Sequential model with a single linear output unit, compiled with
        mean-squared-error loss.
    """
    # Use tf.keras throughout: `tf` is imported at the top of the notebook,
    # whereas the bare `keras` name is only imported by a later cell, so calling
    # this function before that cell had run raised NameError. It also avoids
    # mixing two Keras namespaces in one model.
    model = tf.keras.models.Sequential()
    # Only the first layer added receives input_shape; with n_hidden == 0 that
    # is the output layer.
    options = {"input_shape": input_shape}
    for _ in range(n_hidden):
        model.add(tf.keras.layers.Dense(n_neurons, activation="relu", **options))
        options = {}
    model.add(tf.keras.layers.Dense(1, **options))
    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    model.compile(loss="mse", optimizer=optimizer)
    return model

In [None]:
# Wrap build_model as a scikit-learn style regressor so it can be handed to
# RandomizedSearchCV.
# NOTE(review): tf.keras.wrappers.scikit_learn appears to have been removed from
# recent TensorFlow releases — confirm the TensorFlow version in use provides it.
keras_reg = tf.keras.wrappers.scikit_learn.KerasRegressor(build_model)

In [None]:
import json

# Per-feature mean and standard deviation, computed on the training split only.
mean_vec = X_train.mean()
std_vec = X_train.std()
print(mean_vec)
print(std_vec)

# The target column is not part of X_train, so mean_vec / std_vec carry no entry
# for it. Take the target statistics straight from y_train instead of borrowing
# those of the 'NOxFlowRateTail' feature: the result is the same whenever the two
# columns really are identical, and stays correct if they are not or if that
# feature is removed.
# NOTE(review): if 'NOxFlowRateTail' does hold the same values as the target, as
# previously observed, keeping it among the inputs leaks the target into the
# model — confirm whether it should be a feature at all.
mean_NOx = y_train['NOxFlowRateValidTail'].mean()
std_NOx = y_train['NOxFlowRateValidTail'].std()

# Persist the feature statistics as JSON: each file holds a one-element list
# containing a {column name: value} mapping.
mean_data = [dict(mean_vec)]
std_data = [dict(std_vec)]

with open('mean_data_w_20_RDEs_only.json', 'w') as f:
    json.dump(mean_data, f)

with open('std_data_w_20_RDEs_only.json', 'w') as f:
    json.dump(std_data, f)

In [None]:
# Standardise inputs and target with the training-split statistics.
# A constant feature has a standard deviation of 0; dividing by it would fill
# that column with NaN/inf and poison the loss, so such columns are scaled by 1
# instead (after centring they are simply all zeros).
safe_std = std_vec.replace(0, 1.0)

X_train = (X_train - mean_vec) / safe_std
y_train['NOxFlowRateValidTail'] = (y_train['NOxFlowRateValidTail'] - mean_NOx) / std_NOx

X_valid = (X_valid - mean_vec) / safe_std
y_valid['NOxFlowRateValidTail'] = (y_valid['NOxFlowRateValidTail'] - mean_NOx) / std_NOx

X_test = (X_test - mean_vec) / safe_std
y_test['NOxFlowRateValidTail'] = (y_test['NOxFlowRateValidTail'] - mean_NOx) / std_NOx

In [None]:
# From here on the splits are plain NumPy arrays rather than DataFrames.
X_train, y_train, X_valid, y_valid, X_test, y_test = (
    np.array(frame)
    for frame in (X_train, y_train, X_valid, y_valid, X_test, y_test)
)

In [None]:
import time

# Wall-clock reference taken just before the hyper-parameter search.
# NOTE(review): `start` is reassigned before the final training run and nothing
# prints the elapsed time in between, so this first timestamp is never reported.
start = time.time()

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import keras  # kept: the bare `keras` name may be referenced by other cells

# Candidate values sampled by the random search.
param_distribs = {
    "n_hidden": [2, 3, 4, 5, 6],
    "n_neurons": [32, 64, 128, 256, 512],  # range 32-512 (step of 32 / powers of two)
    "learning_rate": [5e-3, 5e-4, 5e-5]    # centred on 5e-4, with 5e-3 and 5e-5 (initial / decay)
}
# random_state makes the 10 sampled parameter combinations repeatable across
# runs, matching the fixed seed used for the data splits.
rnd_search_cv = RandomizedSearchCV(keras_reg, param_distribs, n_iter=10, cv=5,
                                   random_state=42)
# Extra keyword arguments are forwarded to the Keras fit call of every candidate.
# The callback comes from tf.keras, the same namespace as the optimizer used to
# build the model.
rnd_search_cv.fit(X_train, y_train, epochs=50,
                  validation_data=(X_valid, y_valid),
                  callbacks=[tf.keras.callbacks.EarlyStopping(patience=10)])

In [None]:
# Parameter combination selected by the random search.
rnd_search_cv.best_params_

In [None]:
# -rnd_search_cv.best_score_

In [None]:
 model = rnd_search_cv.best_estimator_.model

 model.compile(optimizer='rmsprop',
             loss='mse',
             metrics=['mse'])

model.summary()

In [None]:
# model = tf.keras.Sequential([
#     tf.keras.layers.Dense(2048, activation='relu', input_shape=(X_train.shape[1],)),
#     tf.keras.layers.Dense(2048, activation='relu'),
#     tf.keras.layers.Dense(4096, activation='relu'),
#     tf.keras.layers.Dense(4096, activation='relu'),
#     tf.keras.layers.Dense(4096, activation='relu'),
#     tf.keras.layers.Dense(4096, activation='relu'),
#     tf.keras.layers.Dense(len(label))
# ])

# model.compile(optimizer='rmsprop',
#              loss='mse',
#              metrics=['mse'])

# model.summary()

In [None]:
import time

# Wall-clock reference for the final training run; the elapsed time is printed
# after model.fit finishes.
start = time.time()

In [None]:
# Train the selected model for 300 epochs in batches of 128, evaluating on the
# validation split after every epoch; the per-epoch metrics are kept in `history`.
history = model.fit(X_train, y_train, epochs=300, batch_size=128, validation_data=(X_valid, y_valid))

In [None]:
# Seconds elapsed since `start` was last set (the printed label means "computation time").
print("계산시간 :", time.time() - start)

In [None]:
# Per-epoch training and validation curves recorded by model.fit.
training_log = history.history
loss, mse = training_log['loss'], training_log['mse']
val_loss, val_mse = training_log['val_loss'], training_log['val_mse']

In [None]:
import matplotlib.pyplot as plt

epochs = range(len(loss))

# One figure per quantity: training values as blue dots, validation as a red line.
curve_specs = (
    (loss, 'Training loss', val_loss, 'Validation loss', 'Training and validation loss'),
    (mse, 'Training MSE', val_mse, 'Validation MSE', 'Training and validation MSE'),
)
for train_curve, train_name, valid_curve, valid_name, figure_title in curve_specs:
    plt.plot(epochs, train_curve, 'bo', label=train_name)
    plt.plot(epochs, valid_curve, 'r', label=valid_name)
    plt.title(figure_title)
    plt.legend()
    plt.show()

In [None]:
# Model predictions for the (normalised) training inputs.
y_pred = model.predict(X_train)

In [None]:
# Parity plot for the training split: each point is (prediction, actual), both
# in normalised units; the black line is the ideal y = x.
plt.plot(y_pred, y_train, 'o')
x = np.arange(-2,30)
y = x
plt.plot(x, y, 'k')
# Label the figure so it can be read on its own.
plt.xlabel('Predicted NOx flow rate (normalised)')
plt.ylabel('Actual NOx flow rate (normalised)')
plt.title('Training set: predicted vs. actual')
plt.show()

In [None]:
from sklearn.metrics import r2_score
# r2_score expects (y_true, y_pred). R² is not symmetric, so passing the
# predictions first reported a different (wrong) score.
print(r2_score(y_train, y_pred))

In [None]:
# Model predictions for the (normalised) test inputs.
y_pred_test = model.predict(X_test)

In [None]:
# Parity plot for the test split: each point is (prediction, actual), both in
# normalised units; the black line is the ideal y = x.
plt.plot(y_pred_test, y_test, 'o')
x = np.arange(-2,20)
y = x
plt.plot(x, y, 'k')
# Label the figure so it can be read on its own.
plt.xlabel('Predicted NOx flow rate (normalised)')
plt.ylabel('Actual NOx flow rate (normalised)')
plt.title('Test set: predicted vs. actual')
plt.show()

In [None]:
# r2_score expects (y_true, y_pred). R² is not symmetric, so passing the
# predictions first reported a different (wrong) score.
print(r2_score(y_test, y_pred_test))

In [None]:
# Save the trained model under 'NOx_Prediction_model' (path relative to the
# current working directory).
model.save('NOx_Prediction_model')

In [None]:
# Reload the model that was just saved, to confirm it can be restored from disk.
new_model = tf.keras.models.load_model('NOx_Prediction_model')

# Check its architecture
new_model.summary()