In [4]:
The dataset contains 36733 instances of 11 sensor measures aggregated over one hour (by means of average or sum)
from a gas turbine. 
The Dataset includes gas turbine parameters (such as Turbine Inlet Temperature and Compressor Discharge pressure) 
in addition to the ambient variables.

Problem statement: predicting turbine energy yield (TEY) using ambient variables as features.


Attribute Information:

The explanations of sensor measurements and their brief statistics are given below.

Variable (Abbr.) Unit Min Max Mean
Ambient temperature (AT) C -6.23 37.10 17.71
Ambient pressure (AP) mbar 985.85 1036.56 1013.07
Ambient humidity (AH) (%) 24.08 100.20 77.87
Air filter difference pressure (AFDP) mbar 2.09 7.61 3.93
Gas turbine exhaust pressure (GTEP) mbar 17.70 40.72 25.56
Turbine inlet temperature (TIT) C 1000.85 1100.89 1081.43
Turbine after temperature (TAT) C 511.04 550.61 546.16
Compressor discharge pressure (CDP) mbar 9.85 15.16 12.06
Turbine energy yield (TEY) MWH 100.02 179.50 133.51
Carbon monoxide (CO) mg/m3 0.00 44.10 2.37
Nitrogen oxides (NOx) mg/m3 25.90 119.91 65.29

SyntaxError: invalid syntax (Temp/ipykernel_2444/317173812.py, line 1)


* Loaded the dataset for Gas Turbines
* Deleted gas turbine parameters (2 columns) in accordance with the problem statement.
* Predicted TEY values using two approaches:

* 1. Standardized only predictor variables after train_test_split
     Applied ANN model
     Calculated best parameters for the batch size and no. of epochs
     Trained the model with best parameters
     Predicted TEY with an accuracy of 99.60 % (accuracy = 100 - mean absolute percent error on the test set)

* 2. Standardized both predictor & response variables before train_test_split
     Applied ANN model
     Calculated best parameters for the batch size and no. of epochs
     Trained the model with best parameters
     Descaled the predictor & response variables
     Predicted TEY with an accuracy of 99.57 % (accuracy = 100 - mean absolute percent error on the test set)
     

In [5]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense

In [6]:
# Location of the gas-turbine CSV. A raw string keeps every backslash literal;
# the previous literal mixed escaped backslashes with the invalid escape
# sequence "\A". The resulting path value is unchanged.
# NOTE(review): absolute, machine-specific path -- consider a path relative to
# the notebook so it can run on another machine.
DATA_PATH = r"E:\DATA SCIENCE\LMS\ASSIGNMENT\MY ASSIGNMENT\Neural Networks\gas_turbines.csv"

ds = pd.read_csv(DATA_PATH)
ds

Unnamed: 0,AT,AP,AH,AFDP,GTEP,TIT,TAT,TEY,CDP,CO,NOX
0,6.8594,1007.9,96.799,3.5000,19.663,1059.2,550.00,114.70,10.605,3.1547,82.722
1,6.7850,1008.4,97.118,3.4998,19.728,1059.3,550.00,114.72,10.598,3.2363,82.776
2,6.8977,1008.8,95.939,3.4824,19.779,1059.4,549.87,114.71,10.601,3.2012,82.468
3,7.0569,1009.2,95.249,3.4805,19.792,1059.6,549.99,114.72,10.606,3.1923,82.670
4,7.3978,1009.7,95.150,3.4976,19.765,1059.7,549.98,114.72,10.612,3.2484,82.311
...,...,...,...,...,...,...,...,...,...,...,...
15034,9.0301,1005.6,98.460,3.5421,19.164,1049.7,546.21,111.61,10.400,4.5186,79.559
15035,7.8879,1005.9,99.093,3.5059,19.414,1046.3,543.22,111.78,10.433,4.8470,79.917
15036,7.2647,1006.3,99.496,3.4770,19.530,1037.7,537.32,110.19,10.483,7.9632,90.912
15037,7.0060,1006.8,99.008,3.4486,19.377,1043.2,541.24,110.74,10.533,6.2494,93.227


In [7]:
# Remove the two gas-turbine parameter columns (TIT, CDP) from the frame.
# Rebinding `ds` (instead of inplace=True) together with errors='ignore' lets
# this cell be re-run without a KeyError once the columns are already gone.
ds = ds.drop(columns=['TIT', 'CDP'], errors='ignore')
ds

Unnamed: 0,AT,AP,AH,AFDP,GTEP,TAT,TEY,CO,NOX
0,6.8594,1007.9,96.799,3.5000,19.663,550.00,114.70,3.1547,82.722
1,6.7850,1008.4,97.118,3.4998,19.728,550.00,114.72,3.2363,82.776
2,6.8977,1008.8,95.939,3.4824,19.779,549.87,114.71,3.2012,82.468
3,7.0569,1009.2,95.249,3.4805,19.792,549.99,114.72,3.1923,82.670
4,7.3978,1009.7,95.150,3.4976,19.765,549.98,114.72,3.2484,82.311
...,...,...,...,...,...,...,...,...,...
15034,9.0301,1005.6,98.460,3.5421,19.164,546.21,111.61,4.5186,79.559
15035,7.8879,1005.9,99.093,3.5059,19.414,543.22,111.78,4.8470,79.917
15036,7.2647,1006.3,99.496,3.4770,19.530,537.32,110.19,7.9632,90.912
15037,7.0060,1006.8,99.008,3.4486,19.377,541.24,110.74,6.2494,93.227


In [8]:
# Data-quality overview: column dtypes, then the number of missing values per
# column, followed by summary statistics for every column.
for report in (ds.dtypes, ds.isna().sum()):
    print(report)
ds.describe(include='all')

AT      float64
AP      float64
AH      float64
AFDP    float64
GTEP    float64
TAT     float64
TEY     float64
CO      float64
NOX     float64
dtype: object
AT      0
AP      0
AH      0
AFDP    0
GTEP    0
TAT     0
TEY     0
CO      0
NOX     0
dtype: int64


Unnamed: 0,AT,AP,AH,AFDP,GTEP,TAT,TEY,CO,NOX
count,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0
mean,17.764381,1013.19924,79.124174,4.200294,25.419061,545.396183,134.188464,1.972499,68.190934
std,7.574323,6.41076,13.793439,0.760197,4.173916,7.866803,15.829717,2.222206,10.470586
min,0.5223,985.85,30.344,2.0874,17.878,512.45,100.17,0.000388,27.765
25%,11.408,1008.9,69.75,3.7239,23.294,542.17,127.985,0.858055,61.3035
50%,18.186,1012.8,82.266,4.1862,25.082,549.89,133.78,1.3902,66.601
75%,23.8625,1016.9,90.0435,4.5509,27.184,550.06,140.895,2.1604,73.9355
max,34.929,1034.2,100.2,7.6106,37.402,550.61,174.61,44.103,119.89


In [9]:
# Pairwise correlation between all columns (Pearson, which is the pandas default).
ds.corr(method='pearson')

Unnamed: 0,AT,AP,AH,AFDP,GTEP,TAT,TEY,CO,NOX
AT,1.0,-0.412953,-0.549432,-0.099333,-0.049103,0.338569,-0.207495,-0.088588,-0.600006
AP,-0.412953,1.0,0.042573,0.040318,0.078575,-0.223479,0.146939,0.041614,0.256744
AH,-0.549432,0.042573,1.0,-0.119249,-0.202784,0.010859,-0.110272,0.165505,0.143061
AFDP,-0.099333,0.040318,-0.119249,1.0,0.744251,-0.571541,0.717995,-0.334207,-0.037299
GTEP,-0.049103,0.078575,-0.202784,0.744251,1.0,-0.756884,0.977042,-0.508259,-0.208496
TAT,0.338569,-0.223479,0.010859,-0.571541,-0.756884,1.0,-0.720356,0.063404,0.009888
TEY,-0.207495,0.146939,-0.110272,0.717995,0.977042,-0.720356,1.0,-0.541751,-0.102631
CO,-0.088588,0.041614,0.165505,-0.334207,-0.508259,0.063404,-0.541751,1.0,0.316743
NOX,-0.600006,0.256744,0.143061,-0.037299,-0.208496,0.009888,-0.102631,0.316743,1.0


In [10]:
# Put the response column (TEY) first so every remaining column is a predictor.
tey_column = ds.pop('TEY')
ds.insert(loc=0, column='TEY', value=tey_column)
print(ds.head(5))

# Response: TEY kept as a one-column DataFrame; predictors: everything after it.
y = ds[['TEY']]
x = ds.iloc[:, 1:]

      TEY      AT      AP      AH    AFDP    GTEP     TAT      CO     NOX
0  114.70  6.8594  1007.9  96.799  3.5000  19.663  550.00  3.1547  82.722
1  114.72  6.7850  1008.4  97.118  3.4998  19.728  550.00  3.2363  82.776
2  114.71  6.8977  1008.8  95.939  3.4824  19.779  549.87  3.2012  82.468
3  114.72  7.0569  1009.2  95.249  3.4805  19.792  549.99  3.1923  82.670
4  114.72  7.3978  1009.7  95.150  3.4976  19.765  549.98  3.2484  82.311


### Standardizing only predictor variable - after train test split

In [11]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

# Learn the scaling statistics (mean / std) from the training predictors only
# and apply the SAME transformation to the test predictors. Fitting a second
# scaler on the test set would standardise train and test with different
# statistics, so the model would see test inputs on a different scale than
# the one it was trained on.
scaler_train = StandardScaler()

x_train_scaled = scaler_train.fit_transform(x_train)  # fit + transform: train predictors
x_test_scaled = scaler_train.transform(x_test)        # transform only: test predictors

print(x_train_scaled.shape)
print(x_test_scaled.shape)
print(y_train.shape)
print(y_test.shape)

# Convert the test response from a DataFrame to a plain NumPy array
# (drops the column header).
y_test = y_test.values

(12031, 8)
(3008, 8)
(12031, 1)
(3008, 1)


In [12]:
# Regression network for approach 1. The output is continuous, so the output
# layer has no activation function.
# The input width is taken from the training matrix instead of a hard-coded 8,
# so the cell keeps working if the feature set changes (this matches how
# toFindBestParams builds its models).
model = Sequential()
model.add(Dense(units=50, activation='relu', kernel_initializer='normal',
                input_dim=x_train_scaled.shape[1]))  # first layer, receives the predictors
model.add(Dense(units=20, activation='tanh', kernel_initializer='normal'))  # hidden layer
model.add(Dense(units=1, kernel_initializer='normal'))  # o/p layer

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['MeanSquaredError'])
model.fit(x_train_scaled, y_train, batch_size=50, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x25f9bf95580>

In [None]:
def toFindBestParams(x_train_scaled, y_train, x_test_scaled, y_test):
    """Grid-search batch size and number of epochs for the ANN regressor.

    For every (batch size, epochs) combination a fresh 50-relu / 20-tanh /
    1-linear network is trained on the training data and scored on the test
    data as ``100 - MAPE`` (mean absolute percent error).

    Parameters
    ----------
    x_train_scaled : array-like, 2-D
        Training predictors; the number of columns sets the network input width.
    y_train : array-like
        Training response.
    x_test_scaled : array-like, 2-D
        Test predictors.
    y_test : array-like
        Test response; must not contain zeros because it is the MAPE denominator.

    Returns
    -------
    pandas.DataFrame
        One row per combination with columns 'batchsize', 'epochs', 'Accuracy'.

    Notes
    -----
    NOTE(review): the combinations are ranked on the test set, so the selected
    "best" score is optimistically biased; a separate validation split would
    avoid that.
    """
    # defining list of hyperparameters
    batch_size_list = [5, 10, 15, 20]
    epoch_list = [5, 10, 50, 100]

    # Column vector of true values, so it lines up with model.predict's (n, 1) output.
    y_true = np.asarray(y_test, dtype=float).reshape(-1, 1)

    # Rows are collected in a list and turned into a DataFrame once at the end:
    # DataFrame.append was removed in pandas 2.0 and re-copied the table on every call.
    results = []

    for batch_trial in batch_size_list:
        for epochs_trial in epoch_list:

            # A new model per combination so no weights carry over between trials.
            model = Sequential()
            model.add(Dense(units=50, input_dim=x_train_scaled.shape[1],
                            kernel_initializer='normal', activation='relu'))
            model.add(Dense(units=20, kernel_initializer='normal', activation='tanh'))
            # Single linear output neuron: one continuous value is predicted.
            model.add(Dense(1, kernel_initializer='normal'))

            model.compile(optimizer='adam', loss='mean_squared_error')
            model.fit(x_train_scaled, y_train, batch_size=batch_trial, epochs=epochs_trial, verbose=0)

            y_pred = np.asarray(model.predict(x_test_scaled, verbose=0)).reshape(-1, 1)

            # abs() on the denominator keeps each percent error non-negative
            # even if the response contains negative values.
            MAPE = np.mean(100 * (np.abs(y_true - y_pred) / np.abs(y_true)))
            accuracy = 100 - MAPE

            results.append([batch_trial, epochs_trial, accuracy])

            # printing the results of the current iteration
            print('batch_size:', batch_trial,'-', 'epochs:',epochs_trial, 'Accuracy:',accuracy)

    return pd.DataFrame(results, columns=['batchsize','epochs','Accuracy'])

# Run the batch-size / epochs grid search for approach 1 (scaled predictors,
# unscaled response) and display the resulting score table.
finalParamTable_1 = toFindBestParams(
    x_train_scaled,
    y_train,
    x_test_scaled,
    y_test,
)
finalParamTable_1

batch_size: 5 - epochs: 5 Accuracy: 98.61386024081793


In [None]:
# Show the grid-search row with the highest 'Accuracy'.
# drop=True renumbers the rows 0..n-1 without inserting the old index as an
# extra 'index' column, so the cell can be re-run any number of times.
finalParamTable_1 = finalParamTable_1.reset_index(drop=True)
finalParamTable_1.loc[finalParamTable_1['Accuracy'].idxmax()]

## Training Model - using best params 

In [None]:
# Hyperparameters chosen from the grid search for approach 1.
# NOTE(review): hand-copied values -- confirm they match the best row of finalParamTable_1.
BEST_BATCH_SIZE_1 = 20
BEST_EPOCHS_1 = 100

# Build a fresh network instead of re-compiling the global `model`: that model
# was already fitted in an earlier cell, so fitting it again would continue
# from those weights rather than train "with the best params" only.
model = Sequential()
model.add(Dense(units=50, activation='relu', kernel_initializer='normal',
                input_dim=x_train_scaled.shape[1]))
model.add(Dense(units=20, activation='tanh', kernel_initializer='normal'))
model.add(Dense(units=1, kernel_initializer='normal'))  # linear output for regression

model.compile(optimizer='adam', loss='mean_squared_error')
# fitting the model to best params
model.fit(x_train_scaled, y_train, batch_size=BEST_BATCH_SIZE_1, epochs=BEST_EPOCHS_1, verbose=0)

In [None]:
# generating predictions for test data
y_predict_test = model.predict(x_test_scaled)

# Table of the unscaled test predictors plus actual and predicted response.
# The 'Price' / 'Predicted Price' columns hold the actual and predicted TEY.
# An explicit copy keeps the added columns out of x_test itself;
# pd.DataFrame(x_test) does not guarantee an independent frame.
final_table = x_test.copy()
# ravel(): y_test and the predictions are (n, 1) arrays; a column needs 1-D data.
final_table['Price'] = np.ravel(y_test)
final_table['Predicted Price'] = np.ravel(y_predict_test)
print(final_table.shape)
final_table.head(10)

In [None]:
# Absolute percent error of each test prediction: |actual - predicted| / actual, in percent.
APE = (
    final_table['Price']
    .sub(final_table['Predicted Price'])
    .abs()
    .div(final_table['Price'])
    .mul(100)
)
print('The Accuracy for Test Data -- ANN model = ', 100-np.mean(APE))

# Keep the per-row error next to the values it was computed from.
final_table['APE'] = APE
final_table.head()

### Standardizing both Predictor & Response variable  - before train test split

In [None]:
### Standardization of data ###
# NOTE(review): both scalers are fitted on ALL rows before the split, so the
# test rows contribute to the mean/std used for scaling. That is the approach
# this section sets out to evaluate, but it leaks test information.
scaler_x = StandardScaler()
scaler_y = StandardScaler()

# Storing the fit object for later reference (used to inverse-transform predictions)
x_scaler_fit = scaler_x.fit(x)
y_scaler_fit = scaler_y.fit(y)

# Generating the standardized values of X and y.
# Stored under new names instead of overwriting x / y: re-running this cell
# would otherwise fit the scalers on already-standardized data, and
# inverse_transform would no longer return values on the original scale.
x_scaled = x_scaler_fit.transform(x)
y_scaled = y_scaler_fit.transform(y)

# Split the data into training and testing set
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y_scaled, test_size=0.20, random_state=42)

# Shape of Training and Test datasets
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

In [None]:
# Regression network for approach 2 (standardized predictors and response).
# The output is continuous, so the output layer has no activation function.
# The input width is taken from the training matrix instead of a hard-coded 8,
# matching how toFindBestParams builds its models.
model = Sequential()
model.add(Dense(units=50, activation='relu', kernel_initializer='normal',
                input_dim=x_train.shape[1]))  # first layer, receives the predictors
model.add(Dense(units=20, activation='tanh', kernel_initializer='normal'))  # hidden layer
model.add(Dense(units=1, kernel_initializer='normal'))  # o/p layer

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model.fit(x_train, y_train, batch_size=50, epochs=100, verbose=1)

In [None]:
def toFindBestParams(x_train, y_train, x_test, y_test, y_scaler=None):
    """Grid-search batch size and number of epochs for the ANN regressor.

    For every (batch size, epochs) combination a fresh 50-relu / 20-tanh /
    1-linear network is trained on the training data and scored on the test
    data as ``100 - MAPE`` (mean absolute percent error).

    Parameters
    ----------
    x_train : array-like, 2-D
        Training predictors; the number of columns sets the network input width.
    y_train : array-like
        Training response.
    x_test : array-like, 2-D
        Test predictors.
    y_test : array-like
        Test response, on the same scale as ``y_train``.
    y_scaler : fitted scaler with ``inverse_transform``, optional
        When given, the true and predicted responses are mapped back to the
        original scale before the percent error is computed. A percent error
        on a standardized response divides by values that are signed and
        close to zero, which makes the score meaningless. When omitted, the
        error is computed on the values as passed in.

    Returns
    -------
    pandas.DataFrame
        One row per combination with columns 'batchsize', 'epochs', 'Accuracy'.

    Notes
    -----
    NOTE(review): the combinations are ranked on the test set, so the selected
    "best" score is optimistically biased; a separate validation split would
    avoid that.
    """
    # defining list of hyperparameters
    batch_size_list = [5, 10, 15, 20]
    epoch_list = [5, 10, 50, 100]

    # Column vector of true values, so it lines up with model.predict's (n, 1) output.
    y_true = np.asarray(y_test, dtype=float).reshape(-1, 1)
    if y_scaler is not None:
        y_true = y_scaler.inverse_transform(y_true)

    # Rows are collected in a list and turned into a DataFrame once at the end:
    # DataFrame.append was removed in pandas 2.0 and re-copied the table on every call.
    results = []

    for batch_trial in batch_size_list:
        for epochs_trial in epoch_list:

            # A new model per combination so no weights carry over between trials.
            model = Sequential()
            model.add(Dense(units=50, input_dim=x_train.shape[1],
                            kernel_initializer='normal', activation='relu'))
            model.add(Dense(units=20, kernel_initializer='normal', activation='tanh'))
            # The output neuron is 1 as o/p is continuous; no activation needed.
            model.add(Dense(1, kernel_initializer='normal'))

            model.compile(optimizer='adam', loss='mean_squared_error')
            model.fit(x_train, y_train, batch_size=batch_trial, epochs=epochs_trial, verbose=0)

            y_pred = np.asarray(model.predict(x_test, verbose=0)).reshape(-1, 1)
            if y_scaler is not None:
                y_pred = y_scaler.inverse_transform(y_pred)

            # abs() on the denominator keeps each percent error non-negative.
            MAPE = np.mean(100 * (np.abs(y_true - y_pred) / np.abs(y_true)))
            accuracy = 100 - MAPE

            results.append([batch_trial, epochs_trial, accuracy])

            # printing the results of the current iteration
            print('batch_size:', batch_trial,'-', 'epochs:',epochs_trial, 'Accuracy:',accuracy)

    return pd.DataFrame(results, columns=['batchsize','epochs','Accuracy'])

# Calling the function. The response was standardized for this approach, so
# the fitted response scaler is passed in to score on the original TEY scale.
finalParamTable = toFindBestParams(x_train, y_train, x_test, y_test, y_scaler=y_scaler_fit)
finalParamTable

In [None]:
# Show the grid-search row with the highest 'Accuracy'.
# drop=True renumbers the rows 0..n-1 without inserting the old index as an
# extra 'index' column, so the cell can be re-run any number of times.
finalParamTable = finalParamTable.reset_index(drop=True)
finalParamTable.loc[finalParamTable['Accuracy'].idxmax()]

In [None]:
# Hyperparameters chosen from the grid search for approach 2.
# NOTE(review): hand-copied values -- confirm they match the best row of finalParamTable.
BEST_BATCH_SIZE_2 = 10
BEST_EPOCHS_2 = 50

# Build a fresh network instead of re-compiling the global `model`: that model
# was already fitted in an earlier cell, so fitting it again would continue
# from those weights rather than train "with the best params" only.
model = Sequential()
model.add(Dense(units=50, activation='relu', kernel_initializer='normal',
                input_dim=x_train.shape[1]))
model.add(Dense(units=20, activation='tanh', kernel_initializer='normal'))
model.add(Dense(units=1, kernel_initializer='normal'))  # linear output for regression

# fitting the model to best params
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(x_train, y_train, batch_size=BEST_BATCH_SIZE_2, epochs=BEST_EPOCHS_2, verbose=0)

In [None]:
# generating predictions for test data (still on the standardized scale)
y_predict_test = model.predict(x_test)

# scaling back test response to the original scale
y_test_original = y_scaler_fit.inverse_transform(y_test)

# scaling the predicted response back to the original scale
y_predict_test = y_scaler_fit.inverse_transform(y_predict_test)

# scaling the test input data back to original
x_test_original = x_scaler_fit.inverse_transform(x_test)

# Table of de-scaled test predictors plus actual and predicted response.
# The 'Price' / 'Predicted Price' columns hold the actual and predicted TEY.
# The predictors are every column of `ds` after the first (TEY), so those
# names label the features instead of anonymous 0..7 column numbers.
final_table_1 = pd.DataFrame(x_test_original, columns=ds.columns[1:])
# ravel(): the de-scaled arrays are (n, 1); a column needs 1-D data.
final_table_1['Price'] = np.ravel(y_test_original)
final_table_1['Predicted Price'] = np.ravel(y_predict_test)
print(final_table_1.shape)
final_table_1

In [None]:
# Absolute percent error of each de-scaled test prediction:
# |actual - predicted| / actual, in percent.
APE_1 = (
    final_table_1['Price']
    .sub(final_table_1['Predicted Price'])
    .abs()
    .div(final_table_1['Price'])
    .mul(100)
)
print('The Accuracy for Test Data -- ANN model = ', 100-np.mean(APE_1))

# Keep the per-row error next to the values it was computed from.
final_table_1['APE'] = APE_1
final_table_1.head(10)