In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
kaggle_input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in kaggle_input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Overview**

In this notebook, the performance of different regression models is compared using a Combined Cycle Power Plant (CCPP) dataset.

The dataset contains 9568 data points collected from a Combined Cycle Power Plant over 6 years (2006-2011), when the power plant was set to work with full load. Features consist of hourly average ambient variables Temperature (AT), Ambient Pressure (AP), Relative Humidity (RH) and Exhaust Vacuum (V) to predict the net hourly electrical energy output (PE) of the plant.
A CCPP is composed of gas turbines (GT), steam turbines (ST) and heat recovery steam generators. In a CCPP, the electricity is generated by gas and steam turbines, which are combined in one cycle, and is transferred from one turbine to another. While the Vacuum is collected from and has an effect on the Steam Turbine, the other three ambient variables affect the GT performance. The averages are taken from various sensors located around the plant that record the ambient variables every second. The variables are given without normalization.

**Exploratory Data Analysis**

In [None]:
# Read the CCPP measurements from the CSV in the Kaggle input directory
CCPP_CSV_PATH = '../input/airpressure/Folds5x2_pp.csv'
dataset = pd.read_csv(CCPP_CSV_PATH)

In [None]:
# Print a concise summary of the loaded frame (columns, dtypes, non-null counts)
dataset.info()

In [None]:
# Display the first rows of the frame as a quick sanity check of the load
dataset.head()

In [None]:
# Display descriptive statistics for the columns of the frame
dataset.describe()

In [None]:
# Heat map of the pairwise correlations between all columns
corr_matrix = dataset.corr()
corr_fig, corr_ax = plt.subplots(figsize=(5, 5))
sns.heatmap(corr_matrix, ax=corr_ax, annot=True, fmt='.2f', linewidths=.5)
plt.show()

**1. Linear Regression**

In [None]:
# Simple regression uses only the first column (AT): its correlation with PE is -0.95
at_column = dataset.iloc[:, :1]   # a slice (not a scalar index) keeps the result 2-D
pe_column = dataset.iloc[:, -1]   # PE, the last column
X1 = at_column.values
y1 = pe_column.values

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable
from sklearn.model_selection import train_test_split
split_1 = train_test_split(X1, y1, test_size=0.2, random_state=0)
X1_train, X1_test, y1_train, y1_test = split_1

In [None]:
# Fit a one-feature linear model (PE as a function of AT) on the training split
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X=X1_train, y=y1_train)

In [None]:
# Predict PE for the held-out rows and print predictions next to the true values
y1_pred = regressor.predict(X1_test)
np.set_printoptions(precision=2)  # show two decimals in printed arrays
# column 0: predicted value, column 1: actual test value
print(np.column_stack((y1_pred, y1_test)))

In [None]:
# Score the simple linear model on the test set with r2_score
from sklearn.metrics import r2_score
lr_score = r2_score(y1_test, y1_pred)
print(lr_score)
# Start the dictionary that collects the test score of every model in the notebook
scores = {'lr_score': lr_score}

In [None]:
# Plot the test observations (red) against the fitted regression line (blue)
fig_lr, ax_lr = plt.subplots()
ax_lr.scatter(X1_test, y1_test, color='red')
ax_lr.plot(X1_test, regressor.predict(X1_test), color='blue')
ax_lr.set_title('AT vs PE')
ax_lr.set_xlabel('AT')
ax_lr.set_ylabel('PE')
plt.show()

**2. Multiple Linear Regression**

In [None]:
# Every column except the last is a predictor; the last column is the target
predictors_2 = dataset.iloc[:, :-1]
target_2 = dataset.iloc[:, -1]
X2, y2 = predictors_2.values, target_2.values

In [None]:
# 80/20 train/test split with a fixed seed, as used for the other models
from sklearn.model_selection import train_test_split
split_2 = train_test_split(X2, y2, test_size=0.2, random_state=0)
X2_train, X2_test, y2_train, y2_test = split_2

In [None]:
# Fit a linear model on all predictor columns using the training split
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X=X2_train, y=y2_train)

In [None]:
# Predict the test set and print predicted vs. actual values side by side
y2_pred = regressor.predict(X2_test)
np.set_printoptions(precision=2)  # number of decimals in printed arrays
print(np.column_stack((y2_pred, y2_test)))

In [None]:
# r2_score of the multiple linear model on the test set, stored for the final comparison
mlr_score = r2_score(y2_test, y2_pred)
print(mlr_score)
scores['mlr_score'] = mlr_score

In [None]:
# Final equation: print the fitted intercept and coefficients, then the model as a formula
b0 = regressor.intercept_
coefs = regressor.coef_
print (b0, coefs)
# NOTE(review): the labels assume the predictor columns are ordered AT, V, AP, RH - confirm against dataset.columns
equation_template = "PE = {0:.2f} + {1:.2f}*AT + {2:.2f}*V + {3:.2f}*AP + {4:.2f}*RH"
print (equation_template.format(b0, *coefs[:4]))

**3. Polynomial Regression**

In [None]:
# Predictors are all columns but the last; the target is the last column
predictors_3 = dataset.iloc[:, :-1]
target_3 = dataset.iloc[:, -1]
X3 = predictors_3.values
y3 = target_3.values

In [None]:
# 80/20 train/test split with a fixed seed
from sklearn.model_selection import train_test_split
split_3 = train_test_split(X3, y3, test_size=0.2, random_state=0)
X3_train, X3_test, y3_train, y3_test = split_3

In [None]:
# Polynomial regression: expand the training features to degree-2 terms,
# then fit an ordinary linear model on the expanded matrix
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
poly_reg = PolynomialFeatures(degree=2)
poly_reg.fit(X3_train)
X3_poly = poly_reg.transform(X3_train)
regressor = LinearRegression()
regressor.fit(X=X3_poly, y=y3_train)

In [None]:
# Apply the already-fitted polynomial expansion to the test features, then predict
X3_test_poly = poly_reg.transform(X3_test)
y3_pred = regressor.predict(X3_test_poly)
np.set_printoptions(precision=2)
# column 0: predicted value, column 1: actual test value
print(np.column_stack((y3_pred, y3_test)))

In [None]:
# r2_score of the polynomial model on the test set, stored for the final comparison
pr_score = r2_score(y3_test, y3_pred)
print(pr_score)
scores['pr_score'] = pr_score

**4. Support Vector Regression (SVR)**

In [None]:
# Predictors are all columns but the last; the target is the last column
predictors_4 = dataset.iloc[:, :-1]
target_4 = dataset.iloc[:, -1]
X4 = predictors_4.values
# StandardScaler expects a 2-D array as input, so keep the target as a single column
y4 = target_4.values.reshape(-1, 1)

In [None]:
# 80/20 train/test split with a fixed seed
from sklearn.model_selection import train_test_split
split_4 = train_test_split(X4, y4, test_size=0.2, random_state=0)
X4_train, X4_test, y4_train, y4_test = split_4

In [None]:
# Feature Scaling: standardise predictors and target with separate scalers,
# both fitted on the training split only
from sklearn.preprocessing import StandardScaler
sc_X, sc_y = StandardScaler(), StandardScaler()
# NOTE(review): the scaled arrays overwrite X4_train / y4_train, so re-running this
# cell refits the scalers on already-scaled data; re-run the split cell first
X4_train = sc_X.fit_transform(X4_train)
y4_train = sc_y.fit_transform(y4_train)

In [None]:
# Training the SVR model on the (scaled) Training set
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')   # Gaussian Radial Basis Function kernel
# y4_train is an (n, 1) column because StandardScaler needs 2-D input, but SVR.fit
# expects a 1-D target; flatten it to avoid scikit-learn's DataConversionWarning
regressor.fit(X4_train, y4_train.ravel())

In [None]:
# Predicting the Test set results
# The test features must be scaled with the scaler fitted on the training features
y4_pred_scaled = regressor.predict(sc_X.transform(X4_test))
# SVR.predict returns a 1-D array, while StandardScaler.inverse_transform requires a
# 2-D array (current scikit-learn raises ValueError otherwise), so reshape the
# predictions into a single column before mapping them back to the original PE scale
y4_pred = sc_y.inverse_transform(y4_pred_scaled.reshape(-1, 1))
np.set_printoptions(precision=2)
# column 0: predicted value, column 1: actual test value
print(np.concatenate
      ((y4_pred.reshape(len(y4_pred),1), y4_test.reshape(len(y4_test),1)),
       axis=1))

In [None]:
# r2_score of the SVR model on the test set (original PE scale), stored for the final comparison
svr_score = r2_score(y4_test, y4_pred)
print(svr_score)
scores['svr_score'] = svr_score

**5. Decision Tree Regression**

In [None]:
# Predictors are all columns but the last; the target is the last column
predictors_5 = dataset.iloc[:, :-1]
target_5 = dataset.iloc[:, -1]
X5, y5 = predictors_5.values, target_5.values

In [None]:
# 80/20 train/test split with a fixed seed
from sklearn.model_selection import train_test_split
split_5 = train_test_split(X5, y5, test_size=0.2, random_state=0)
X5_train, X5_test, y5_train, y5_test = split_5

In [None]:
# Fit a decision tree regressor on the training split (random_state fixed for repeatability)
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X=X5_train, y=y5_train)

In [None]:
# Predict the test set and print predicted vs. actual values side by side
y5_pred = regressor.predict(X5_test)
np.set_printoptions(precision=2)
print(np.column_stack((y5_pred, y5_test)))

In [None]:
# r2_score of the decision tree on the test set, stored for the final comparison
dtr_score = r2_score(y5_test, y5_pred)
print(dtr_score)
scores['dtr_score'] = dtr_score

**6. Random Forest Regression**

In [None]:
# Predictors are all columns but the last; the target is the last column
predictors_6 = dataset.iloc[:, :-1]
target_6 = dataset.iloc[:, -1]
X6 = predictors_6.values
y6 = target_6.values

In [None]:
# 80/20 train/test split with a fixed seed
from sklearn.model_selection import train_test_split
split_6 = train_test_split(X6, y6, test_size=0.2, random_state=0)
X6_train, X6_test, y6_train, y6_test = split_6

In [None]:
# Fit a random forest of 10 trees on the training split (not the whole dataset)
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=10, random_state=0)
regressor.fit(X=X6_train, y=y6_train)

In [None]:
# Predict the test set and print predicted vs. actual values side by side
y6_pred = regressor.predict(X6_test)
np.set_printoptions(precision=2)
print(np.column_stack((y6_pred, y6_test)))

In [None]:
# r2_score of the random forest on the test set, stored for the final comparison
rfr_score = r2_score(y6_test, y6_pred)
print(rfr_score)
scores['rfr_score'] = rfr_score

**7. Artificial Neural Network (ANN)**

In [None]:
# Predictors are all columns but the last; the target is the last column
predictors_7 = dataset.iloc[:, :-1]
target_7 = dataset.iloc[:, -1]
X7, y7 = predictors_7.values, target_7.values

In [None]:
# 80/20 train/test split with a fixed seed
from sklearn.model_selection import train_test_split
split_7 = train_test_split(X7, y7, test_size=0.2, random_state=0)
X7_train, X7_test, y7_train, y7_test = split_7

In [None]:
# Libraries for the ANN. Sequential and Dense are taken from tensorflow.keras, the same
# package imported on the line below, rather than from the standalone `keras` package,
# so the model is not assembled from two different Keras installations.
import random
import tensorflow.keras
from tensorflow.keras.models import Sequential # container for a linear stack of layers
from tensorflow.keras.layers import Dense # fully connected layer

# Weight initialisation and batch shuffling are random; seed the Python, NumPy and
# TensorFlow generators before any layer is created so the ANN score is repeatable
# when the notebook is run top to bottom.
random.seed(0)
np.random.seed(0)
tensorflow.random.set_seed(0)

regressor = Sequential() # initializing the ANN
regressor.add(Dense(units = 6, activation = 'relu')) # first hidden layer (no input shape is given, so it is inferred from the data)
regressor.add(Dense(units = 1)) # output layer (no activation is needed in regression)
regressor.compile(optimizer = 'adam', loss = 'mean_squared_error') # compiling the ANN

In [None]:
# Train the network for 100 passes over the training split in mini-batches of 32 rows
regressor.fit(x=X7_train, y=y7_train, epochs=100, batch_size=32)

In [None]:
# Predict the test set and print predicted vs. actual values side by side
y7_pred = regressor.predict(X7_test)
np.set_printoptions(precision=2)
# column 0: predicted value, column 1: actual test value
print(np.column_stack((y7_pred, y7_test)))

In [None]:
# r2_score of the neural network on the test set, stored for the final comparison
ann_score = r2_score(y7_test, y7_pred)
print(ann_score)
scores['ann_score'] = ann_score

**8. Conclusion**

In [None]:
# Print the dictionary of test-set r2 scores collected from the model sections above
print(scores)

As you can see above, the Random Forest Regression model has the highest r2_score (0.96), which means that, among the models compared, it gives the best prediction of the net hourly electrical energy output (PE) of the plant.