# Using Neural Network on Gas Turbines Dataset


In [None]:
#import modin.pandas as pd
import pandas as pd
import numpy as np 
from keras.layers import Dense
from keras_tuner.tuners import RandomSearch
from keras import Sequential
#import ray
#ray.init(runtime_env={'env_vars': {'__MODIN_AUTOIMPORT_PANDAS__': '1'}})

In [None]:
# Load the gas-turbine dataset from the local CSV file.
csv_path = "C:\\Users\\Hi\\Desktop\\Python Datasets\\gas_turbines.csv"
gas = pd.read_csv(csv_path)

In [None]:
# Notebook display: show the loaded DataFrame as the cell's output.
gas

In [None]:
# Move the target column 'TEY' to position 0 so that the remaining columns
# (the features) all follow it.
first_column = gas.pop('TEY')
gas.insert(loc=0, column='TEY', value=first_column)
gas

# EDA

In [None]:
from matplotlib import pyplot as plt


# One box plot per column, laid out on a 3x4 grid, to inspect the spread of
# each column and spot outliers visually.
raw_box_opts = dict(kind='box', subplots=True, layout=(3, 4), figsize=(20, 20))
gas.plot(**raw_box_opts)
plt.show()

In [None]:
from scipy import stats

In [None]:
# Z-score outlier filter: keep only the rows in which every column lies
# within three standard deviations of that column's mean.
zsc = stats.zscore(gas)
zscores = np.abs(zsc)  # magnitude only, so a single threshold covers both tails
filter_zscores = (zscores < 3).all(axis=1)
# .copy() makes `filtered` an independent frame. Without it the later
# in-place edits (adding the 'anomaly' column, dropping rows) operate on a
# slice of `gas` and pandas raises SettingWithCopyWarning.
filtered = gas[filter_zscores].copy()
filtered.shape  # original note: 492 rows were removed as outliers (|z| >= 3)

In [None]:
# Shape of the unfiltered frame, for comparison with `filtered.shape`.
gas.shape

In [None]:
# Notebook display: the rows that survived the z-score filter.
filtered

In [None]:
# Redraw the per-column box plots (4x3 grid) on the z-score-filtered data.
filtered_box_opts = dict(kind='box', layout=(4, 3), figsize=(20, 20), subplots=True)
filtered.plot(**filtered_box_opts)
plt.show()

In [None]:
# Second outlier-screening pass, this time with an Isolation Forest.
from sklearn.ensemble import IsolationForest

# `contamination` is the assumed share of outliers (10% here); it should be
# chosen from domain knowledge.
clf = IsolationForest(contamination=0.10, random_state=20)
# fit_predict(X) is fit(X) followed by predict(X); rows labelled -1 are the
# ones treated as anomalies further down.
filtered['anomaly'] = clf.fit_predict(filtered)
filtered

In [None]:
# Shape before the anomalous rows are dropped (includes the 'anomaly' column).
filtered.shape

In [None]:
# Remove every row the Isolation Forest labelled -1 (original note: 1455
# records dropped under the 10% contamination assumption).
is_anomaly = filtered['anomaly'] == -1
filtered.drop(index=filtered.loc[is_anomaly].index, inplace=True)
filtered.shape

#Using PPS Score to identify correlation
import ppscore as pps


In [None]:
# The 'anomaly' label column has served its purpose; remove it so that it
# is not used as a feature.
filtered = filtered.drop(columns=['anomaly'])
filtered

In [None]:
# Automatic EDA with Sweetviz: analyse the cleaned frame and write the
# report to an HTML file.
import sweetviz as sv

report_path = 'EDA_of_Gas_Turbines.html'
sweet_report = sv.analyze(filtered)
sweet_report.show_html(report_path)

In [None]:
import seaborn as sns

# Annotated heatmap of the pairwise column correlations of the cleaned data.
corr_matrix = filtered.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True)
plt.show()

## There are three multicollinearity problems between columns: TAT vs GTEP at -0.83, TAT vs CDP at -0.82, and TIT vs CO at -0.74.
## We treat a pair of independent features (columns) as multicollinear when the absolute value of their correlation exceeds 0.7.

## However, neural networks handle multicollinearity very well: backpropagation and non-linear activation functions make multicollinearity largely unimportant for prediction.

In [None]:
# Column 0 holds the target (TEY was moved there earlier); every later
# column is a feature.
Y = filtered.iloc[:, [0]]  # list selector keeps Y as a one-column DataFrame
X = filtered.iloc[:, 1:]
Y

# Creating Model by using Hyperparameter Tuning 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, Y, test_size=0.3, random_state=0)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Convert all four splits from pandas objects to NumPy arrays before they
# are passed to Keras.
x_train, y_train, x_test, y_test = map(
    np.asarray, (x_train, y_train, x_test, y_test)
)

In [None]:
from tensorflow import keras
from keras_tuner.tuners import RandomSearch
from keras.layers import Dropout, Dense
from keras import Sequential
# Candidate activation names.
# NOTE(review): this list is not referenced anywhere else in the visible
# code; build_model hard-codes its own choices. 'leakyrelu' should be checked
# against the activation identifiers Keras accepts before this list is used.
activation=['tanh','relu','leakyrelu','elu','sigmoid']

In [None]:
def build_model(hp):
    """Build and compile a regression network for a Keras Tuner search.

    The tuner chooses the number of hidden blocks (2 to 20). Each block is a
    Dense layer followed by a Dropout layer, with its own tunable unit count,
    activation and dropout rate. A single linear output unit is appended
    after the hidden stack, and the model is compiled with Adamax and
    mean absolute error.

    Args:
        hp: Keras Tuner hyperparameter object used to register and sample
            the tunable values.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    for i in range(hp.Int('num_Layers', 2, 20)):
        model.add(Dense(
            units=hp.Int('units_' + str(i), min_value=8, max_value=132, step=8),
            activation=hp.Choice('activation' + str(i), values=['tanh', 'relu', 'sigmoid']),
        ))
        model.add(Dropout(
            hp.Choice('dropout' + str(i), values=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
        ))

    # The output layer and compile step belong after the loop. Inside the
    # loop they inserted a 1-unit linear bottleneck after every hidden block
    # and recompiled the model on each iteration.
    model.add(Dense(1, activation='linear'))
    model.compile(
        optimizer=keras.optimizers.Adamax(hp.Choice('learning_rate', [0.1, 0.01, 0.001])),
        loss='mean_absolute_error',
        metrics=['mean_absolute_error'],
    )
    return model


In [None]:
# Random search over build_model's hyperparameter space, ranked by validation
# MAE: 5 trials with 3 executions each, results stored under
# project/Gas_Turbines.
tuner = RandomSearch(
    hypermodel=build_model,
    objective='val_mean_absolute_error',
    max_trials=5,
    executions_per_trial=3,
    directory='project',
    project_name='Gas_Turbines',
    overwrite=True,
)

In [None]:
# Print the hyperparameter search space the tuner registered from build_model.
tuner.search_space_summary()

In [None]:
# Score trials on a validation split carved out of the training data.
# Validating on (x_test, y_test) would let the test set steer hyperparameter
# selection, so later test metrics would no longer be an unbiased estimate.
tuner.search(x_train, y_train, epochs=100, validation_split=0.2)

In [None]:
# Notebook display: hyperparameter values of the first (best-ranked) entry.
tuner.get_best_hyperparameters()[0].values

In [None]:
# Take the single best model found by the search.
best_models = tuner.get_best_models(num_models=1)
model = best_models[0]

In [None]:
# Continue training the tuned model. The epoch counter starts at 5 and runs
# up to epoch index 100; the test split is used here for monitoring only.
model.fit(
    x_train,
    y_train,
    epochs=100,
    initial_epoch=5,
    validation_data=(x_test, y_test),
)

In [None]:
# Print the layer-by-layer summary of the tuned model.
model.summary()

In [None]:
from sklearn.model_selection import GridSearchCV
from keras.optimizers import Adamax
from keras.wrappers.scikit_learn import KerasRegressor

In [None]:
# The code below takes more than 4 hours to run; a GPU is needed to run it quickly.

def create_model(learning_rate, dropout_rate, activation_function, init, neuron1, neuron2):
    """Build and compile a two-hidden-layer regression network for grid search.

    Args:
        learning_rate: Learning rate passed to the Adamax optimizer.
        dropout_rate: Dropout rate applied after each hidden layer.
        activation_function: Activation used by both hidden layers.
        init: Kernel initializer used by both hidden layers.
        neuron1: Number of units in the first hidden layer.
        neuron2: Number of units in the second hidden layer.

    Returns:
        The compiled Sequential model with a single linear output unit.
    """
    model = Sequential()
    model.add(Dense(neuron1, kernel_initializer=init, activation=activation_function))
    model.add(Dropout(dropout_rate))
    # No input_dim here: only the first layer of a Sequential model defines
    # the input size; later layers infer it from the previous layer.
    model.add(Dense(neuron2, kernel_initializer=init, activation=activation_function))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='linear'))

    adamax = Adamax(learning_rate=learning_rate)
    # 'rmsprop' is an optimizer name, not a loss, and makes compile() fail.
    # Mean absolute error matches the reported metric and the loss used by
    # build_model.
    model.compile(optimizer=adamax, loss='mean_absolute_error', metrics=['mean_absolute_error'])
    return model

# Wrap the Keras builder so that scikit-learn can drive it as a regressor.
model = KerasRegressor(build_fn=create_model, verbose=0)

# Candidate values for each hyperparameter explored by the grid search.
batch_size = [10, 20, 40]
epochs = [10, 50, 100]
learning_rate = [0.1, 0.01, 0.001]
dropout_rate = [0.0, 0.1, 0.2]
activation_function = ['softmax', 'relu', 'tanh', 'linear']
init = ['uniform', 'normal', 'zero']
neuron1 = [4, 8, 16]
neuron2 = [2, 4, 8]

# Collect the candidates into the mapping GridSearchCV expects.
params_grid = {
    'batch_size': batch_size,
    'epochs': epochs,
    'learning_rate': learning_rate,
    'dropout_rate': dropout_rate,
    'activation_function': activation_function,
    'init': init,
    'neuron1': neuron1,
    'neuron2': neuron2,
}

# Exhaustive search over every combination, fitted on the full X / Y frames
# with GridSearchCV's default cross-validation and scoring.
grid = GridSearchCV(estimator=model, param_grid=params_grid)
grid_result = grid.fit(X, Y)


# Report the best score and parameters, then the mean and standard deviation
# of the test score for every parameter combination that was tried.
print(f'Best : {grid_result.best_score_}, using {grid_result.best_params_}')
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for mean, stdev, param in zip(means, stds, params):
    print(f'{mean},{stdev} with: {param}')